diff --git a/CLAUDE.md b/CLAUDE.md index 24d6137c..761ca356 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -62,6 +62,7 @@ make lint # Run linter ./msgvault stats # Show archive stats # Apple Mail import +./msgvault import-messenger --me you@facebook.messenger ~/facebook-dyi # Import Facebook Messenger DYI ./msgvault import-emlx # Auto-discover accounts ./msgvault import-emlx ~/Library/Mail # Explicit mail directory ./msgvault import-emlx --account me@gmail.com # Specific account(s) @@ -196,6 +197,10 @@ The TUI automatically builds/updates the Parquet cache on launch when new messag Sync is **read-only** - no modifications to Gmail. +## Test Data + +Never use real people's names, email addresses, or identifiers in test fixtures. Use obviously synthetic names: `alice`, `bob`, `Test User`, `user@example.com`. Before committing test data, verify no real PII is present. + ## Go Development After making any Go code changes, always run `go fmt ./...` and `go vet ./...` before committing. Stage ALL resulting changes, including formatting-only files. diff --git a/cmd/msgvault/cmd/build_cache_messenger_test.go b/cmd/msgvault/cmd/build_cache_messenger_test.go new file mode 100644 index 00000000..ffa1ff0c --- /dev/null +++ b/cmd/msgvault/cmd/build_cache_messenger_test.go @@ -0,0 +1,84 @@ +package cmd + +import ( + "context" + "database/sql" + "path/filepath" + "testing" + + _ "github.com/marcboeker/go-duckdb" + "github.com/wesm/msgvault/internal/fbmessenger" + "github.com/wesm/msgvault/internal/store" +) + +// TestBuildCache_AfterMessengerImport verifies invariant #3 from the plan: +// after importing Messenger JSON and running buildCache, the resulting +// Parquet partition files exist and contain the expected row count. 
+func TestBuildCache_AfterMessengerImport(t *testing.T) { + tmp := t.TempDir() + dbPath := filepath.Join(tmp, "msgvault.db") + analyticsDir := filepath.Join(tmp, "analytics") + + st, err := store.Open(dbPath) + if err != nil { + t.Fatalf("open store: %v", err) + } + if err := st.InitSchema(); err != nil { + t.Fatalf("init schema: %v", err) + } + + fixture, err := filepath.Abs("../../../internal/fbmessenger/testdata/json_simple") + if err != nil { + t.Fatal(err) + } + summary, err := fbmessenger.ImportDYI(context.Background(), st, fbmessenger.ImportOptions{ + Me: "wes@facebook.messenger", + RootDir: fixture, + Format: "auto", + AttachmentsDir: t.TempDir(), + }) + if err != nil { + t.Fatalf("ImportDYI: %v", err) + } + if err := st.Close(); err != nil { + t.Fatal(err) + } + if summary.MessagesAdded != 4 { + t.Fatalf("imported %d want 4", summary.MessagesAdded) + } + + result, err := buildCache(dbPath, analyticsDir, false) + if err != nil { + t.Fatalf("buildCache: %v", err) + } + if result.Skipped { + t.Fatal("buildCache unexpectedly skipped") + } + + duckdb, err := sql.Open("duckdb", "") + if err != nil { + t.Fatalf("open duckdb: %v", err) + } + defer func() { _ = duckdb.Close() }() + + var n int + pattern := filepath.Join(analyticsDir, "messages", "**", "*.parquet") + if err := duckdb.QueryRow( + `SELECT COUNT(*) FROM read_parquet(?, hive_partitioning=true)`, pattern, + ).Scan(&n); err != nil { + t.Fatalf("duckdb scan: %v", err) + } + if n != 4 { + t.Errorf("parquet messages=%d want 4", n) + } + + var mtype string + if err := duckdb.QueryRow( + `SELECT DISTINCT message_type FROM read_parquet(?, hive_partitioning=true)`, pattern, + ).Scan(&mtype); err != nil { + t.Fatalf("duckdb message_type: %v", err) + } + if mtype != "fbmessenger" { + t.Errorf("message_type=%q want fbmessenger", mtype) + } +} diff --git a/cmd/msgvault/cmd/import_messenger.go b/cmd/msgvault/cmd/import_messenger.go new file mode 100644 index 00000000..723e6034 --- /dev/null +++ 
b/cmd/msgvault/cmd/import_messenger.go @@ -0,0 +1,156 @@ +package cmd + +import ( + "context" + "fmt" + "os" + "os/signal" + "syscall" + "time" + + "github.com/spf13/cobra" + "github.com/wesm/msgvault/internal/fbmessenger" + "github.com/wesm/msgvault/internal/store" +) + +var ( + importMessengerMe string + importMessengerFormat string + importMessengerLimit int + importMessengerNoResume bool + importMessengerCheckpointEvery int +) + +var importMessengerCmd = &cobra.Command{ + Use: "import-messenger <source-dir>", + Short: "Import Facebook Messenger from a Download Your Information export", + Long: `Import Facebook Messenger conversations from a DYI export (JSON or HTML). + +Both JSON and HTML DYI formats are supported. When a thread contains both, the +JSON form wins because it preserves timestamps at millisecond precision and +reactions with relational fidelity. Use --format both to import both copies +into a single conversation with disambiguated source_message_id values. + +Participants are synthesized as <slug>@facebook.messenger addresses. Two +participants whose display names produce the same slug are merged with a +warning — DYI exports do not expose stable user IDs, so this is the best we +can do without false-splitting one person into two phantom participants. + +Your own identifier is required via --me and must itself be a +<slug>@facebook.messenger address; this value becomes the source identifier +and drives is_from_me on outbound messages. + +HTML exports do not expose timezone information; timestamps are stored as +UTC. JSON exports have millisecond-precision timestamps that are preserved +verbatim.
+ +Examples: + msgvault import-messenger --me test.user@facebook.messenger ~/downloads/facebook-export + msgvault import-messenger --me test.user@facebook.messenger --format both ./dyi + msgvault import-messenger --me test.user@facebook.messenger --limit 100 ./dyi +`, + Args: cobra.ExactArgs(1), + SilenceUsage: true, + RunE: func(cmd *cobra.Command, args []string) error { + if err := MustBeLocal("import-messenger"); err != nil { + return err + } + return runImportMessenger(cmd, args[0]) + }, +} + +func runImportMessenger(cmd *cobra.Command, rootDir string) error { + if info, err := os.Stat(rootDir); err != nil { + return fmt.Errorf("source directory not found: %w", err) + } else if !info.IsDir() { + return fmt.Errorf("source path is not a directory: %s", rootDir) + } + + dbPath := cfg.DatabaseDSN() + s, err := store.Open(dbPath) + if err != nil { + return fmt.Errorf("open database: %w", err) + } + defer func() { _ = s.Close() }() + + if err := s.InitSchema(); err != nil { + return fmt.Errorf("init schema: %w", err) + } + + ctx, cancel := context.WithCancel(cmd.Context()) + defer cancel() + + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) + defer signal.Stop(sigChan) + go func() { + select { + case <-sigChan: + _, _ = fmt.Fprintln(cmd.ErrOrStderr(), "\nInterrupted. 
Saving checkpoint...") + cancel() + case <-ctx.Done(): + } + }() + + opts := fbmessenger.ImportOptions{ + Me: importMessengerMe, + RootDir: rootDir, + Format: importMessengerFormat, + AttachmentsDir: cfg.AttachmentsDir(), + Limit: importMessengerLimit, + NoResume: importMessengerNoResume, + CheckpointEvery: importMessengerCheckpointEvery, + Logger: logger, + } + + _, _ = fmt.Fprintf(cmd.OutOrStdout(), "Importing Facebook Messenger DYI from %s\n", rootDir) + _, _ = fmt.Fprintf(cmd.OutOrStdout(), "Me: %s\n", importMessengerMe) + _, _ = fmt.Fprintln(cmd.OutOrStdout()) + + summary, err := fbmessenger.ImportDYI(ctx, s, opts) + if err != nil { + if ctx.Err() != nil { + _, _ = fmt.Fprintln(cmd.OutOrStdout(), "\nImport interrupted. Re-run to continue.") + rebuildCacheAfterWrite(dbPath) + return nil + } + return fmt.Errorf("import failed: %w", err) + } + + _, _ = fmt.Fprintln(cmd.OutOrStdout()) + if summary.WasResumed { + _, _ = fmt.Fprintln(cmd.OutOrStdout(), "Resumed from checkpoint.") + } + _, _ = fmt.Fprintln(cmd.OutOrStdout(), "Import complete!") + _, _ = fmt.Fprintf(cmd.OutOrStdout(), " Duration: %s\n", summary.Duration.Round(time.Millisecond)) + _, _ = fmt.Fprintf(cmd.OutOrStdout(), " Threads: %d processed, %d skipped\n", + summary.ThreadsProcessed, summary.ThreadsSkipped) + _, _ = fmt.Fprintf(cmd.OutOrStdout(), " Files skipped: %d (unrecognized siblings)\n", summary.FilesSkipped) + _, _ = fmt.Fprintf(cmd.OutOrStdout(), " Messages: %d processed, %d added, %d skipped\n", + summary.MessagesProcessed, summary.MessagesAdded, summary.MessagesSkipped) + _, _ = fmt.Fprintf(cmd.OutOrStdout(), " Participants: %d\n", summary.ParticipantsResolved) + _, _ = fmt.Fprintf(cmd.OutOrStdout(), " Attachments: %d found, %d stored\n", summary.AttachmentsFound, summary.AttachmentsStored) + _, _ = fmt.Fprintf(cmd.OutOrStdout(), " Reactions: %d\n", summary.ReactionsAdded) + if summary.Errors > 0 { + _, _ = fmt.Fprintf(cmd.OutOrStdout(), " Errors: %d\n", summary.Errors) + } + if 
summary.MessagesAdded > 0 && summary.FromMeCount == 0 { + _, _ = fmt.Fprintf(cmd.OutOrStdout(), + "\n Warning: no messages matched --me %q (slug: %q).\n"+ + " The --me value must match the slug of your display name in the export.\n", + importMessengerMe, fbmessenger.Slug(fbmessenger.StripDomain(importMessengerMe))) + } + + rebuildCacheAfterWrite(dbPath) + return nil +} + +func init() { + importMessengerCmd.Flags().StringVar(&importMessengerMe, "me", "", "your @facebook.messenger identifier (required)") + importMessengerCmd.Flags().StringVar(&importMessengerFormat, "format", "auto", "format to import: auto|json|html|both") + importMessengerCmd.Flags().IntVar(&importMessengerLimit, "limit", 0, "limit number of messages (for testing)") + importMessengerCmd.Flags().BoolVar(&importMessengerNoResume, "no-resume", false, "ignore any existing checkpoint and start fresh") + importMessengerCmd.Flags().IntVar(&importMessengerCheckpointEvery, "checkpoint-interval", 200, "checkpoint every N messages") + _ = importMessengerCmd.MarkFlagRequired("me") + rootCmd.AddCommand(importMessengerCmd) +} diff --git a/cmd/msgvault/cmd/import_messenger_e2e_test.go b/cmd/msgvault/cmd/import_messenger_e2e_test.go new file mode 100644 index 00000000..84201560 --- /dev/null +++ b/cmd/msgvault/cmd/import_messenger_e2e_test.go @@ -0,0 +1,156 @@ +package cmd + +import ( + "bytes" + "context" + "io" + "path/filepath" + "strings" + "testing" + + "github.com/wesm/msgvault/internal/store" +) + +func saveMessengerState(t *testing.T) func() { + t.Helper() + prevCfg := cfg + prevLogger := logger + prevMe := importMessengerMe + prevFormat := importMessengerFormat + prevLimit := importMessengerLimit + prevNoResume := importMessengerNoResume + prevCheckpoint := importMessengerCheckpointEvery + prevCfgFile := cfgFile + prevHomeDir := homeDir + prevVerbose := verbose + prevOut := rootCmd.OutOrStdout() + prevErr := rootCmd.ErrOrStderr() + return func() { + cfg = prevCfg + logger = prevLogger + importMessengerMe 
= prevMe + importMessengerFormat = prevFormat + importMessengerLimit = prevLimit + importMessengerNoResume = prevNoResume + importMessengerCheckpointEvery = prevCheckpoint + cfgFile = prevCfgFile + homeDir = prevHomeDir + verbose = prevVerbose + rootCmd.SetOut(prevOut) + rootCmd.SetErr(prevErr) + rootCmd.SetArgs(nil) + } +} + +func TestImportMessenger_JSON_EndToEnd(t *testing.T) { + tmp := t.TempDir() + t.Cleanup(saveMessengerState(t)) + + fixture, err := filepath.Abs("../../../internal/fbmessenger/testdata/json_simple") + if err != nil { + t.Fatal(err) + } + + var stdout bytes.Buffer + rootCmd.SetOut(&stdout) + rootCmd.SetErr(io.Discard) + rootCmd.SetArgs([]string{ + "--home", tmp, + "import-messenger", + "--me", "test.user@facebook.messenger", + fixture, + }) + if err := rootCmd.ExecuteContext(context.Background()); err != nil { + t.Fatalf("import-messenger: %v", err) + } + if !strings.Contains(stdout.String(), "Import complete") { + t.Errorf("stdout missing Import complete: %q", stdout.String()) + } + + st, err := store.Open(filepath.Join(tmp, "msgvault.db")) + if err != nil { + t.Fatalf("open store: %v", err) + } + t.Cleanup(func() { _ = st.Close() }) + + var n int + if err := st.DB().QueryRow("SELECT COUNT(*) FROM messages WHERE message_type='fbmessenger'").Scan(&n); err != nil { + t.Fatal(err) + } + if n != 4 { + t.Errorf("messages=%d want 4", n) + } + if err := st.DB().QueryRow("SELECT COUNT(*) FROM participants WHERE email_address='test.user@facebook.messenger'").Scan(&n); err != nil { + t.Fatal(err) + } + if n != 1 { + t.Errorf("self participant count=%d want 1", n) + } +} + +func TestImportMessenger_HTML_EndToEnd(t *testing.T) { + tmp := t.TempDir() + t.Cleanup(saveMessengerState(t)) + + fixture, err := filepath.Abs("../../../internal/fbmessenger/testdata/html_simple") + if err != nil { + t.Fatal(err) + } + + var stdout bytes.Buffer + rootCmd.SetOut(&stdout) + rootCmd.SetErr(io.Discard) + rootCmd.SetArgs([]string{ + "--home", tmp, + "import-messenger", + 
"--me", "test.user@facebook.messenger", + fixture, + }) + if err := rootCmd.ExecuteContext(context.Background()); err != nil { + t.Fatalf("import-messenger: %v", err) + } + if !strings.Contains(stdout.String(), "Import complete") { + t.Errorf("stdout missing Import complete: %q", stdout.String()) + } + st, err := store.Open(filepath.Join(tmp, "msgvault.db")) + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = st.Close() }) + + var n int + if err := st.DB().QueryRow("SELECT COUNT(*) FROM messages WHERE message_type='fbmessenger'").Scan(&n); err != nil { + t.Fatal(err) + } + if n != 3 { + t.Errorf("messages=%d want 3", n) + } + var rawFormat string + if err := st.DB().QueryRow("SELECT DISTINCT raw_format FROM message_raw").Scan(&rawFormat); err != nil { + t.Fatal(err) + } + if rawFormat != "fbmessenger_html" { + t.Errorf("raw_format=%q want fbmessenger_html", rawFormat) + } +} + +func TestImportMessenger_MissingDir(t *testing.T) { + tmp := t.TempDir() + t.Cleanup(saveMessengerState(t)) + + rootCmd.SetOut(io.Discard) + rootCmd.SetErr(io.Discard) + rootCmd.SetArgs([]string{ + "--home", tmp, + "import-messenger", + "--me", "test.user@facebook.messenger", + filepath.Join(tmp, "does", "not", "exist"), + }) + err := rootCmd.ExecuteContext(context.Background()) + if err == nil { + t.Fatal("expected error for missing dir") + } + if !strings.Contains(err.Error(), "not found") && !strings.Contains(err.Error(), "no such") { + t.Errorf("error should describe missing path, got %v", err) + } +} diff --git a/cmd/msgvault/cmd/root.go b/cmd/msgvault/cmd/root.go index f2c8d008..24f1ec0b 100644 --- a/cmd/msgvault/cmd/root.go +++ b/cmd/msgvault/cmd/root.go @@ -225,9 +225,10 @@ func Execute() error { // Installs a panic recovery and closes the log file handler on // return so every run ends cleanly in the log. func ExecuteContext(ctx context.Context) error { - // Defers run LIFO: close the log file first, then recover - // panics. 
This ensures the panic record is written while the - // file handle is still open. + // Defer ordering is load-bearing. LIFO means recoverAndLogPanic + // runs before the log-file close. Because recoverAndLogPanic calls + // os.Exit (which skips remaining defers), it closes logResult + // itself before exiting. Do not reorder these defers. defer func() { if logResult != nil { logResult.Close() diff --git a/internal/fbmessenger/convergence_test.go b/internal/fbmessenger/convergence_test.go new file mode 100644 index 00000000..d694cf2b --- /dev/null +++ b/internal/fbmessenger/convergence_test.go @@ -0,0 +1,67 @@ +package fbmessenger + +import ( + "reflect" + "regexp" + "sort" + "strings" + "testing" + "time" +) + +var convergenceWS = regexp.MustCompile(`\s+`) + +func normalizeConvergence(s string) string { + return strings.TrimSpace(convergenceWS.ReplaceAllString(s, " ")) +} + +func TestJSONHTMLConvergence_Simple(t *testing.T) { + jsonRoot := "testdata/json_simple" + htmlRoot := "testdata/html_simple" + jsonTh, err := ParseJSONThread(jsonRoot, threadDir(t, jsonRoot, "inbox", "alice_ABC123")) + if err != nil { + t.Fatalf("json: %v", err) + } + htmlTh, err := ParseHTMLThread(htmlRoot, threadDir(t, htmlRoot, "inbox", "alice_ABC123")) + if err != nil { + t.Fatalf("html: %v", err) + } + if len(jsonTh.Messages) != len(htmlTh.Messages) { + t.Fatalf("message count: json=%d html=%d", len(jsonTh.Messages), len(htmlTh.Messages)) + } + // Participants equal by slug. + var jSlugs, hSlugs []string + for _, p := range jsonTh.Participants { + jSlugs = append(jSlugs, Slug(p.Name)) + } + for _, p := range htmlTh.Participants { + hSlugs = append(hSlugs, Slug(p.Name)) + } + sort.Strings(jSlugs) + sort.Strings(hSlugs) + if !reflect.DeepEqual(jSlugs, hSlugs) { + t.Errorf("participant slugs differ: json=%v html=%v", jSlugs, hSlugs) + } + // Per-message bodies and timestamps. 
+ // + // Reactions are a JSON-only feature (HTML exports do not expose + // reaction metadata), so we compare bodies on their common ground: + // the JSON body with its trailing "[reacted: ...]" suffix stripped. + // Dual-path reaction coverage lives in TestImportDYI_ReactionsDualPath. + for i := range jsonTh.Messages { + jb := normalizeConvergence(stripReactionSuffix(jsonTh.Messages[i].Body)) + hb := normalizeConvergence(htmlTh.Messages[i].Body) + if jb != hb { + t.Errorf("message[%d] body differs:\n json=%q\n html=%q", i, jb, hb) + } + jt := jsonTh.Messages[i].SentAt.Truncate(time.Minute) + ht := htmlTh.Messages[i].SentAt.Truncate(time.Minute) + if !jt.Equal(ht) { + t.Errorf("message[%d] timestamp differs: json=%v html=%v", i, jt, ht) + } + if Slug(jsonTh.Messages[i].SenderName) != Slug(htmlTh.Messages[i].SenderName) { + t.Errorf("message[%d] sender differs: json=%q html=%q", + i, jsonTh.Messages[i].SenderName, htmlTh.Messages[i].SenderName) + } + } +} diff --git a/internal/fbmessenger/discover.go b/internal/fbmessenger/discover.go new file mode 100644 index 00000000..e5213e76 --- /dev/null +++ b/internal/fbmessenger/discover.go @@ -0,0 +1,302 @@ +package fbmessenger + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "sort" + "strings" +) + +// ThreadDir is one discovered DYI thread directory ready for parsing. +type ThreadDir struct { + // Path is the absolute filesystem path to the thread directory. + // For E2EE flat exports, this is the parent directory containing + // the JSON file. + Path string + // FilePath is set for E2EE flat exports where each thread is a + // single JSON file (e.g. "Name_N.json"). Empty for DYI exports. + FilePath string + // Section is the DYI section name (e.g. "inbox", "archived_threads"). + Section string + // Name is the thread directory basename (e.g. "testuser_ABC123XYZ"). + // For E2EE exports, this is the filename without extension. + Name string + // Format is "json", "html", "both", or "e2ee_json". 
+ Format string +} + +// knownSections are the DYI subdirectories we walk for threads. +var knownSections = []string{ + "inbox", + "archived_threads", + "filtered_threads", + "message_requests", + "e2ee_cutover", +} + +// messagesRootCandidates returns candidate "messages" roots inside a DYI +// export, handling the post-2024 ("your_activity_across_facebook/messages"), +// 2025+ ("your_facebook_activity/messages"), and pre-2024 ("messages") layouts. +func messagesRootCandidates(root string) []string { + return []string{ + filepath.Join(root, "your_activity_across_facebook", "messages"), + filepath.Join(root, "your_facebook_activity", "messages"), + filepath.Join(root, "messages"), + } +} + +// Discover walks a DYI export root and returns one ThreadDir per thread +// directory found, sorted by (Section, Name). Absolute and relative root +// inputs yield the same logical result (paths are converted to absolute). +func Discover(root string) ([]ThreadDir, error) { + absRoot, err := filepath.Abs(root) + if err != nil { + return nil, fmt.Errorf("fbmessenger: abs root: %w", err) + } + info, err := os.Stat(absRoot) + if err != nil { + return nil, fmt.Errorf("fbmessenger: stat root: %w", err) + } + if !info.IsDir() { + return nil, fmt.Errorf("fbmessenger: root is not a directory: %s", absRoot) + } + + var out []ThreadDir + scannedE2EE := make(map[string]bool) + for _, candidate := range messagesRootCandidates(absRoot) { + info, err := os.Stat(candidate) + if err != nil || !info.IsDir() { + continue + } + // Check for DYI section subdirectories. 
+ for _, section := range knownSections { + sectionDir := filepath.Join(candidate, section) + entries, err := os.ReadDir(sectionDir) + if err != nil { + continue + } + for _, entry := range entries { + if !entry.IsDir() { + continue + } + name := entry.Name() + if strings.HasPrefix(name, ".") { + continue + } + threadPath := filepath.Join(sectionDir, name) + format, ok := detectFormat(threadPath) + if !ok { + continue + } + out = append(out, ThreadDir{ + Path: threadPath, + Section: section, + Name: name, + Format: format, + }) + } + } + // Check for E2EE flat export: *.json files directly in the + // messages root (no section subdirectories). + if !scannedE2EE[candidate] { + scannedE2EE[candidate] = true + out = append(out, discoverE2EEFlat(candidate)...) + } + } + + // Also check if absRoot itself is an E2EE flat export directory + // (user passed the messages dir directly). + if !scannedE2EE[absRoot] { + out = append(out, discoverE2EEFlat(absRoot)...) + } + + sort.Slice(out, func(i, j int) bool { + if out[i].Section != out[j].Section { + return out[i].Section < out[j].Section + } + return out[i].Name < out[j].Name + }) + return out, nil +} + +// discoverE2EEFlat scans a directory for E2EE flat export files: JSON +// files like "Name_N.json" sitting directly in the directory (not inside +// section subdirectories). Each file becomes one ThreadDir with Format +// "e2ee_json". +func discoverE2EEFlat(dir string) []ThreadDir { + entries, err := os.ReadDir(dir) + if err != nil { + return nil + } + var out []ThreadDir + for _, entry := range entries { + if entry.IsDir() { + continue + } + name := entry.Name() + if strings.HasPrefix(name, ".") { + continue + } + if !strings.HasSuffix(name, ".json") { + continue + } + // Skip known non-thread JSON files. + if isKnownMetadataFile(name) { + continue + } + full := filepath.Join(dir, name) + // Probe the top-level JSON shape via a streaming decoder so + // only actual threads enter the indexed list. 
Keeping the list + // stable across runs matters because per-thread checkpoints + // resume by index — if metadata files joined the list one run + // and dropped the next (e.g. after a Facebook DYI schema change + // or an allowlist update) the saved index would point past the + // next real thread. On any I/O or JSON error we fall through + // and include the file so ParseE2EEJSONFile can classify it + // (corrupt files still raise ErrCorruptJSON instead of being + // silently dropped at discovery). + if shape := probeE2EEShape(full); shape == e2eeShapeNotThread { + continue + } + threadName := strings.TrimSuffix(name, ".json") + out = append(out, ThreadDir{ + Path: dir, + FilePath: full, + Section: "e2ee_cutover", + Name: threadName, + Format: "e2ee_json", + }) + } + return out +} + +type e2eeShape int + +const ( + // e2eeShapeUnknown: the probe couldn't read or decode the file; the + // caller should include it so the full parser can classify it. + e2eeShapeUnknown e2eeShape = iota + // e2eeShapeThread: the file is a JSON object with at least both of + // "participants" and "messages" at the top level. + e2eeShapeThread + // e2eeShapeNotThread: the file is valid JSON but not a thread — + // non-object shape (array/scalar), or an object missing both + // "participants" and "messages". Objects that have exactly one of + // the two are reported as e2eeShapeUnknown so the parser can raise + // ErrCorruptJSON rather than being silently dropped. + e2eeShapeNotThread +) + +// probeE2EEShape classifies the top-level shape of a candidate E2EE +// flat-export JSON file without decoding the entire body. It streams +// tokens with json.Decoder and stops as soon as both "participants" +// and "messages" keys have been seen. 
+func probeE2EEShape(filePath string) e2eeShape { + f, err := os.Open(filePath) + if err != nil { + return e2eeShapeUnknown + } + defer func() { _ = f.Close() }() + dec := json.NewDecoder(f) + tok, err := dec.Token() + if err != nil { + return e2eeShapeUnknown + } + d, ok := tok.(json.Delim) + if !ok || d != '{' { + return e2eeShapeNotThread + } + var hasP, hasM bool + for dec.More() { + tok, err := dec.Token() + if err != nil { + return e2eeShapeUnknown + } + key, ok := tok.(string) + if !ok { + return e2eeShapeUnknown + } + if key == "participants" { + hasP = true + } + if key == "messages" { + hasM = true + } + if hasP && hasM { + return e2eeShapeThread + } + var skip json.RawMessage + if err := dec.Decode(&skip); err != nil { + return e2eeShapeUnknown + } + } + if !hasP && !hasM { + return e2eeShapeNotThread + } + // Object with exactly one of the two keys — let the parser raise + // ErrCorruptJSON rather than silently dropping it. + return e2eeShapeUnknown +} + +// isKnownMetadataFile returns true for JSON filenames that are DYI +// metadata rather than thread exports. +func isKnownMetadataFile(name string) bool { + switch name { + case "autofill_information.json", + "chat_invites_received.json", + "community_chats_settings.json", + "encrypted_messaging_backup_settings.json", + "information_about_your_devices.json", + "messaging_settings.json", + "messenger_active_status_platform_settings.json", + "messenger_active_status_settings.json", + "messenger_ui_settings.json", + "secret_conversations.json", + "support_messages.json", + "your_chat_settings_on_web.json", + "your_end-to-end_encryption_enabled_messenger_device.json", + "your_messenger_app_install_information.json", + "your_responsiveness_in_messaging_threads.json": + return true + } + return false +} + +// detectFormat inspects a thread directory and reports whether it has a +// numbered message_.json, a message_*.html, or both. Directories with +// neither are ignored (returns ok=false). 
The JSON match mirrors +// ParseJSONThread's regex so a thread with only an unnumbered JSON +// sibling (e.g. message_final.json) is not misclassified as JSON and +// does not crowd out a sibling HTML file. +func detectFormat(threadPath string) (string, bool) { + entries, err := os.ReadDir(threadPath) + if err != nil { + return "", false + } + hasJSON, hasHTML := false, false + for _, e := range entries { + if e.IsDir() { + continue + } + name := e.Name() + if strings.HasPrefix(name, ".") || name == "autofill_information.json" { + continue + } + if reMessageFile.MatchString(name) { + hasJSON = true + } else if strings.HasPrefix(name, "message_") && strings.HasSuffix(name, ".html") { + hasHTML = true + } + } + switch { + case hasJSON && hasHTML: + return "both", true + case hasJSON: + return "json", true + case hasHTML: + return "html", true + } + return "", false +} diff --git a/internal/fbmessenger/discover_test.go b/internal/fbmessenger/discover_test.go new file mode 100644 index 00000000..05a4fc25 --- /dev/null +++ b/internal/fbmessenger/discover_test.go @@ -0,0 +1,214 @@ +package fbmessenger + +import ( + "os" + "path/filepath" + "reflect" + "sort" + "testing" +) + +func TestDiscover_JSONSimple(t *testing.T) { + dirs, err := Discover("testdata/json_simple") + if err != nil { + t.Fatalf("Discover: %v", err) + } + // json_simple has one inbox thread and one archived thread. 
+ want := []struct { + section, name, format string + }{ + {"archived_threads", "zoe_ARCH", "json"}, + {"inbox", "alice_ABC123", "json"}, + } + if len(dirs) != len(want) { + t.Fatalf("discovered %d threads, want %d: %+v", len(dirs), len(want), dirs) + } + for i, w := range want { + if dirs[i].Section != w.section { + t.Errorf("[%d] section=%q want %q", i, dirs[i].Section, w.section) + } + if dirs[i].Name != w.name { + t.Errorf("[%d] name=%q want %q", i, dirs[i].Name, w.name) + } + if dirs[i].Format != w.format { + t.Errorf("[%d] format=%q want %q", i, dirs[i].Format, w.format) + } + if !filepath.IsAbs(dirs[i].Path) { + t.Errorf("[%d] path not absolute: %q", i, dirs[i].Path) + } + } +} + +func TestDiscover_HTMLOnly(t *testing.T) { + dirs, err := Discover("testdata/html_simple") + if err != nil { + t.Fatalf("Discover: %v", err) + } + if len(dirs) != 1 { + t.Fatalf("discovered %d threads, want 1", len(dirs)) + } + if dirs[0].Format != "html" { + t.Errorf("format=%q want html", dirs[0].Format) + } +} + +func TestDiscover_Both(t *testing.T) { + dirs, err := Discover("testdata/mixed") + if err != nil { + t.Fatalf("Discover: %v", err) + } + if len(dirs) != 1 { + t.Fatalf("discovered %d threads, want 1", len(dirs)) + } + if dirs[0].Format != "both" { + t.Errorf("format=%q want both", dirs[0].Format) + } +} + +func TestDiscover_AbsoluteAndRelativeInvariance(t *testing.T) { + rel, err := Discover("testdata/json_simple") + if err != nil { + t.Fatalf("relative Discover: %v", err) + } + absRoot, err := filepath.Abs("testdata/json_simple") + if err != nil { + t.Fatal(err) + } + abs, err := Discover(absRoot) + if err != nil { + t.Fatalf("absolute Discover: %v", err) + } + sort.Slice(rel, func(i, j int) bool { return rel[i].Path < rel[j].Path }) + sort.Slice(abs, func(i, j int) bool { return abs[i].Path < abs[j].Path }) + if !reflect.DeepEqual(rel, abs) { + t.Errorf("relative vs absolute differ:\nrel=%+v\nabs=%+v", rel, abs) + } +} + +func 
TestDiscover_IgnoresHiddenAndMediaSubdirs(t *testing.T) { + // json_with_media contains a photos/ subdir with tiny.png; it must + // not be returned as a thread dir. + dirs, err := Discover("testdata/json_with_media") + if err != nil { + t.Fatalf("Discover: %v", err) + } + if len(dirs) != 1 { + t.Fatalf("discovered %d threads, want 1", len(dirs)) + } + if dirs[0].Name != "bob_XYZ789" { + t.Errorf("name=%q want bob_XYZ789", dirs[0].Name) + } + // None of the returned paths should point at photos/, videos/, etc. + for _, d := range dirs { + base := filepath.Base(d.Path) + if base == "photos" || base == "videos" { + t.Errorf("unexpected media subdir yielded: %q", d.Path) + } + } +} + +func TestDiscover_AlternateLayouts(t *testing.T) { + // Verify all three messagesRootCandidates layouts are discovered. + layouts := []string{ + filepath.Join("your_activity_across_facebook", "messages"), + filepath.Join("your_facebook_activity", "messages"), + "messages", + } + for _, layout := range layouts { + t.Run(layout, func(t *testing.T) { + tmp := t.TempDir() + threadDir := filepath.Join(tmp, layout, "inbox", "testthread_1") + if err := os.MkdirAll(threadDir, 0755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile( + filepath.Join(threadDir, "message_1.json"), + []byte(`{"participants":[{"name":"A"}],"messages":[]}`), + 0644, + ); err != nil { + t.Fatal(err) + } + dirs, err := Discover(tmp) + if err != nil { + t.Fatalf("Discover: %v", err) + } + if len(dirs) != 1 { + t.Fatalf("discovered %d threads, want 1: %+v", len(dirs), dirs) + } + if dirs[0].Name != "testthread_1" { + t.Errorf("name=%q want testthread_1", dirs[0].Name) + } + }) + } +} + +// TestDiscover_UnnumberedJSONSiblingNotMisclassified guards against a thread +// directory that contains valid HTML plus only an unnumbered JSON sibling +// (e.g. message_final.json) being classified as JSON or "both". 
ParseJSONThread +// only accepts numbered message_.json files, so misclassifying as JSON +// would cause the thread to fail in auto/json mode and the valid HTML to +// be skipped. +func TestDiscover_UnnumberedJSONSiblingNotMisclassified(t *testing.T) { + tmp := t.TempDir() + threadDir := filepath.Join(tmp, "your_activity_across_facebook", "messages", "inbox", "foo_1") + if err := os.MkdirAll(threadDir, 0755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile( + filepath.Join(threadDir, "message_1.html"), + []byte(`hi`), + 0644, + ); err != nil { + t.Fatal(err) + } + if err := os.WriteFile( + filepath.Join(threadDir, "message_final.json"), + []byte(`{"unrelated":true}`), + 0644, + ); err != nil { + t.Fatal(err) + } + + dirs, err := Discover(tmp) + if err != nil { + t.Fatalf("Discover: %v", err) + } + if len(dirs) != 1 { + t.Fatalf("discovered %d threads, want 1: %+v", len(dirs), dirs) + } + if dirs[0].Format != "html" { + t.Errorf("format=%q want html (unnumbered JSON sibling must not promote thread to json/both)", dirs[0].Format) + } +} + +func TestDiscover_IgnoresDSStore(t *testing.T) { + // Create a temp DYI tree with a .DS_Store at the thread level; it + // must not turn it into a thread dir. + tmp := t.TempDir() + threadDir := filepath.Join(tmp, "your_activity_across_facebook", "messages", "inbox", "foo_1") + if err := os.MkdirAll(threadDir, 0755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(threadDir, "message_1.json"), []byte(`{"participants":[{"name":"A"}],"messages":[]}`), 0644); err != nil { + t.Fatal(err) + } + // Add a .DS_Store sibling and a .hidden dir at section level. 
+ section := filepath.Dir(threadDir) + if err := os.WriteFile(filepath.Join(section, ".DS_Store"), []byte("x"), 0644); err != nil { + t.Fatal(err) + } + if err := os.Mkdir(filepath.Join(section, ".hidden"), 0755); err != nil { + t.Fatal(err) + } + + dirs, err := Discover(tmp) + if err != nil { + t.Fatalf("Discover: %v", err) + } + if len(dirs) != 1 { + t.Fatalf("discovered %d threads, want 1: %+v", len(dirs), dirs) + } + if dirs[0].Name != "foo_1" { + t.Errorf("name=%q want foo_1", dirs[0].Name) + } +} diff --git a/internal/fbmessenger/e2ee_parser.go b/internal/fbmessenger/e2ee_parser.go new file mode 100644 index 00000000..7c350151 --- /dev/null +++ b/internal/fbmessenger/e2ee_parser.go @@ -0,0 +1,211 @@ +package fbmessenger + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "sort" + "strings" + "time" +) + +// rawE2EEExport is the shape of a single E2EE thread JSON file. +type rawE2EEExport struct { + Participants []string `json:"participants"` + ThreadName string `json:"threadName"` + Messages []rawE2EEMessage `json:"messages"` +} + +type rawE2EEMedia struct { + URI string `json:"uri"` +} + +type rawE2EEReaction struct { + Actor string `json:"actor"` + Reaction string `json:"reaction"` +} + +type rawE2EEMessage struct { + SenderName string `json:"senderName"` + Text string `json:"text"` + Timestamp int64 `json:"timestamp"` + Type string `json:"type"` + IsUnsent bool `json:"isUnsent"` + Media []rawE2EEMedia `json:"media"` + Reactions []rawE2EEReaction `json:"reactions"` +} + +// missingThreadKey names the missing top-level key for the corrupt- +// thread error message, given which of "participants"/"messages" is +// present. +func missingThreadKey(hasP, hasM bool) string { + switch { + case !hasP: + return "participants" + case !hasM: + return "messages" + } + return "" +} + +// ParseE2EEJSONFile parses a single E2EE flat-export JSON file and +// returns a populated Thread. 
rootDir is the export root (used for +// resolving media paths); filePath is the absolute path to the JSON file. +func ParseE2EEJSONFile(rootDir, filePath string) (*Thread, error) { + data, err := os.ReadFile(filePath) + if err != nil { + return nil, fmt.Errorf("fbmessenger: read e2ee file: %w", err) + } + // Classify the top-level shape before doing the strongly-typed + // decode. discover.probeE2EEShape already filters out obvious + // non-thread JSON, but the parser defends against direct callers + // and against a discovery layer that defers to it on ambiguous + // shapes. Distinguish three cases: + // - neither "participants" nor "messages": not a thread → silent skip + // - exactly one of the two: malformed → ErrCorruptJSON so the + // importer logs and counts it + // - both present, or non-object: object→full decode; non-object→not a thread + var top any + if err := json.Unmarshal(data, &top); err != nil { + return nil, fmt.Errorf("%w: %s: %v", ErrCorruptJSON, filepath.Base(filePath), err) + } + obj, ok := top.(map[string]any) + if !ok { + return nil, fmt.Errorf("%w: %s", ErrNotE2EEThread, filepath.Base(filePath)) + } + _, hasP := obj["participants"] + _, hasM := obj["messages"] + switch { + case !hasP && !hasM: + return nil, fmt.Errorf("%w: %s", ErrNotE2EEThread, filepath.Base(filePath)) + case hasP != hasM: + return nil, fmt.Errorf("%w: %s: missing %s", ErrCorruptJSON, filepath.Base(filePath), missingThreadKey(hasP, hasM)) + } + var decoded rawE2EEExport + if err := json.Unmarshal(data, &decoded); err != nil { + return nil, fmt.Errorf("%w: %s: %v", ErrCorruptJSON, filepath.Base(filePath), err) + } + + thread := &Thread{ + DirName: strings.TrimSuffix(filepath.Base(filePath), ".json"), + Title: DecodeMojibake(decoded.ThreadName), + Format: "e2ee_json", + RawBytes: data, + } + + // Participants. 
+ seen := make(map[string]bool, len(decoded.Participants)) + for _, name := range decoded.Participants { + name = DecodeMojibake(name) + if seen[name] { + continue + } + seen[name] = true + thread.Participants = append(thread.Participants, Participant{Name: name}) + } + + if len(thread.Participants) <= 2 { + thread.ConvType = "direct_chat" + } else { + thread.ConvType = "group_chat" + } + + // Sort messages chronologically. + sort.SliceStable(decoded.Messages, func(i, j int) bool { + return decoded.Messages[i].Timestamp < decoded.Messages[j].Timestamp + }) + + // The media directory sits alongside the JSON files. + mediaDir := filepath.Dir(filePath) + absMediaDir, err := filepath.Abs(mediaDir) + if err != nil { + return nil, fmt.Errorf("fbmessenger: abs media dir: %w", err) + } + + thread.Messages = make([]Message, 0, len(decoded.Messages)) + msgIdx := 0 + for _, m := range decoded.Messages { + if m.IsUnsent { + continue + } + msg := Message{ + Index: msgIdx, + SenderName: DecodeMojibake(m.SenderName), + Type: m.Type, + } + msgIdx++ + if m.Timestamp > 0 { + msg.SentAt = time.UnixMilli(m.Timestamp).UTC() + } + msg.Body = renderE2EEBody(m) + msg.Attachments = resolveE2EEAttachments(absMediaDir, m) + for _, r := range m.Reactions { + msg.Reactions = append(msg.Reactions, Reaction{ + Actor: DecodeMojibake(r.Actor), + Reaction: DecodeMojibake(r.Reaction), + }) + } + if len(msg.Reactions) > 0 { + msg.Body = appendReactionSummary(msg.Body, msg.Reactions) + } + thread.Messages = append(thread.Messages, msg) + } + return thread, nil +} + +// renderE2EEBody computes the body string for one E2EE message. +func renderE2EEBody(m rawE2EEMessage) string { + if m.Text != "" { + return DecodeMojibake(m.Text) + } + if len(m.Media) > 0 { + return "[media]" + } + return "" +} + +// resolveE2EEAttachments resolves media URIs for an E2EE message. +// E2EE media URIs are relative like "./media/uuid.jpeg". 
+func resolveE2EEAttachments(absDir string, m rawE2EEMessage) []Attachment { + var out []Attachment + for _, media := range m.Media { + if media.URI == "" { + continue + } + // Strip leading "./" from relative URIs. + rel := strings.TrimPrefix(media.URI, "./") + abs := filepath.Join(absDir, rel) + // Verify the resolved path stays inside absDir. + absClean, err := filepath.Abs(abs) + if err != nil { + continue + } + if !strings.HasPrefix(absClean, absDir+string(filepath.Separator)) && absClean != absDir { + continue + } + out = append(out, Attachment{ + URI: media.URI, + AbsPath: absClean, + Kind: guessE2EEKind(media.URI), + Filename: filepath.Base(media.URI), + MimeType: guessMime(media.URI), + }) + } + return out +} + +func guessE2EEKind(uri string) string { + ext := strings.ToLower(filepath.Ext(uri)) + switch ext { + case ".jpg", ".jpeg", ".png", ".webp": + return "photo" + case ".mp4", ".mov": + return "video" + case ".mp3", ".wav", ".ogg": + return "audio" + case ".gif": + return "gif" + } + return "file" +} diff --git a/internal/fbmessenger/e2ee_parser_test.go b/internal/fbmessenger/e2ee_parser_test.go new file mode 100644 index 00000000..324e08f7 --- /dev/null +++ b/internal/fbmessenger/e2ee_parser_test.go @@ -0,0 +1,266 @@ +package fbmessenger + +import ( + "errors" + "os" + "path/filepath" + "strings" + "testing" +) + +func TestParseE2EEJSONFile_Simple(t *testing.T) { + root := "testdata/e2ee_simple" + absRoot, err := filepath.Abs(root) + if err != nil { + t.Fatal(err) + } + filePath := filepath.Join(absRoot, "your_activity_across_facebook", "messages", "alice_1.json") + + th, err := ParseE2EEJSONFile(absRoot, filePath) + if err != nil { + t.Fatalf("parse: %v", err) + } + if th.ConvType != "direct_chat" { + t.Errorf("conv_type=%q want direct_chat", th.ConvType) + } + if len(th.Participants) != 2 { + t.Errorf("participants=%d want 2", len(th.Participants)) + } + if th.Format != "e2ee_json" { + t.Errorf("format=%q want e2ee_json", th.Format) + } + if 
th.DirName != "alice_1" { + t.Errorf("dir_name=%q want alice_1", th.DirName) + } + // Unsent message should be filtered out. + if len(th.Messages) != 3 { + t.Fatalf("messages=%d want 3", len(th.Messages)) + } + // Messages must be chronological ascending. + for i := 1; i < len(th.Messages); i++ { + if th.Messages[i-1].SentAt.After(th.Messages[i].SentAt) { + t.Errorf("messages out of order at %d", i) + } + } + // Mojibake repair: message 1 body must contain "café". + if !strings.Contains(th.Messages[1].Body, "café") { + t.Errorf("mojibake not repaired: body=%q", th.Messages[1].Body) + } + // Reactions appended to body. + if !strings.Contains(th.Messages[1].Body, "[reacted:") { + t.Errorf("reactions not appended: body=%q", th.Messages[1].Body) + } + if len(th.Messages[1].Reactions) != 1 { + t.Errorf("reactions=%d want 1", len(th.Messages[1].Reactions)) + } + // Index monotonic. + for i, m := range th.Messages { + if m.Index != i { + t.Errorf("index[%d]=%d want %d", i, m.Index, i) + } + } +} + +func TestParseE2EEJSONFile_Group(t *testing.T) { + root := "testdata/e2ee_simple" + absRoot, err := filepath.Abs(root) + if err != nil { + t.Fatal(err) + } + filePath := filepath.Join(absRoot, "your_activity_across_facebook", "messages", "group_2.json") + + th, err := ParseE2EEJSONFile(absRoot, filePath) + if err != nil { + t.Fatalf("parse: %v", err) + } + if th.ConvType != "group_chat" { + t.Errorf("conv_type=%q want group_chat", th.ConvType) + } + if len(th.Participants) != 3 { + t.Errorf("participants=%d want 3", len(th.Participants)) + } +} + +func TestParseE2EEJSONFile_MediaResolution(t *testing.T) { + root := "testdata/e2ee_simple" + absRoot, err := filepath.Abs(root) + if err != nil { + t.Fatal(err) + } + filePath := filepath.Join(absRoot, "your_activity_across_facebook", "messages", "group_2.json") + + th, err := ParseE2EEJSONFile(absRoot, filePath) + if err != nil { + t.Fatalf("parse: %v", err) + } + if len(th.Messages) != 2 { + t.Fatalf("messages=%d want 2", 
len(th.Messages)) + } + // Second message has a media attachment. + m := th.Messages[1] + if m.Body != "[media]" { + t.Errorf("body=%q want [media]", m.Body) + } + if len(m.Attachments) != 1 { + t.Fatalf("attachments=%d want 1", len(m.Attachments)) + } + att := m.Attachments[0] + if att.Kind != "photo" { + t.Errorf("kind=%q want photo", att.Kind) + } + if att.Filename != "photo.jpg" { + t.Errorf("filename=%q want photo.jpg", att.Filename) + } + if _, err := os.Stat(att.AbsPath); err != nil { + t.Errorf("attachment file should exist on disk: %v", err) + } +} + +func TestParseE2EEJSONFile_NotAThread(t *testing.T) { + tmp := t.TempDir() + cases := map[string]string{ + "array.json": `[{"any": "list"}]`, + "scalar.json": `"a string"`, + "no_keys.json": `{"setting": true, "version": 2}`, + } + for name, body := range cases { + p := filepath.Join(tmp, name) + if err := os.WriteFile(p, []byte(body), 0o644); err != nil { + t.Fatal(err) + } + _, err := ParseE2EEJSONFile(tmp, p) + if !errors.Is(err, ErrNotE2EEThread) { + t.Errorf("%s: expected ErrNotE2EEThread, got %v", name, err) + } + } +} + +// TestParseE2EEJSONFile_PartialObjectCorrupt verifies that an object +// with exactly one of "participants"/"messages" is classified as corrupt +// rather than silently skipped — a partial export with missing +// messages must not vanish silently. 
+func TestParseE2EEJSONFile_PartialObjectCorrupt(t *testing.T) { + tmp := t.TempDir() + cases := map[string]string{ + "only_p.json": `{"participants": ["A", "B"]}`, + "only_msg.json": `{"messages": [{"senderName":"A","text":"x","timestamp":1}]}`, + } + for name, body := range cases { + p := filepath.Join(tmp, name) + if err := os.WriteFile(p, []byte(body), 0o644); err != nil { + t.Fatal(err) + } + _, err := ParseE2EEJSONFile(tmp, p) + if !errors.Is(err, ErrCorruptJSON) { + t.Errorf("%s: expected ErrCorruptJSON, got %v", name, err) + } + } +} + +func TestParseE2EEJSONFile_CorruptJSON(t *testing.T) { + tmp := t.TempDir() + badFile := filepath.Join(tmp, "bad.json") + if err := os.WriteFile(badFile, []byte(`{not valid json`), 0644); err != nil { + t.Fatal(err) + } + _, err := ParseE2EEJSONFile(tmp, badFile) + if err == nil { + t.Fatal("expected error, got nil") + } + if !errors.Is(err, ErrCorruptJSON) { + t.Errorf("expected ErrCorruptJSON, got %v", err) + } +} + +func TestParseE2EEJSONFile_PathEscapeRejected(t *testing.T) { + tmp := t.TempDir() + body := `{ + "participants": ["A", "B"], + "threadName": "test", + "messages": [{ + "senderName": "A", + "text": "", + "timestamp": 1600000000000, + "type": "Generic", + "media": [{"uri": "../../etc/passwd"}] + }] + }` + filePath := filepath.Join(tmp, "evil.json") + if err := os.WriteFile(filePath, []byte(body), 0644); err != nil { + t.Fatal(err) + } + th, err := ParseE2EEJSONFile(tmp, filePath) + if err != nil { + t.Fatalf("parse: %v", err) + } + if len(th.Messages) != 1 { + t.Fatalf("messages=%d want 1", len(th.Messages)) + } + // Path escape should be rejected — no attachments resolved. 
+ if len(th.Messages[0].Attachments) != 0 { + t.Errorf("path escape not rejected: attachments=%+v", th.Messages[0].Attachments) + } +} + +func TestDiscover_E2EEFlat(t *testing.T) { + dirs, err := Discover("testdata/e2ee_simple") + if err != nil { + t.Fatalf("Discover: %v", err) + } + // e2ee_simple has two JSON files at the messages root: alice_1.json + // and group_2.json. Both should be discovered as e2ee_json threads. + if len(dirs) != 2 { + t.Fatalf("discovered %d threads, want 2: %+v", len(dirs), dirs) + } + for _, d := range dirs { + if d.Format != "e2ee_json" { + t.Errorf("format=%q want e2ee_json for %q", d.Format, d.Name) + } + if d.Section != "e2ee_cutover" { + t.Errorf("section=%q want e2ee_cutover for %q", d.Section, d.Name) + } + if d.FilePath == "" { + t.Errorf("FilePath should be set for E2EE thread %q", d.Name) + } + if !filepath.IsAbs(d.Path) { + t.Errorf("path not absolute: %q", d.Path) + } + } + // Sorted by name. + if dirs[0].Name != "alice_1" { + t.Errorf("dirs[0].Name=%q want alice_1", dirs[0].Name) + } + if dirs[1].Name != "group_2" { + t.Errorf("dirs[1].Name=%q want group_2", dirs[1].Name) + } +} + +// TestDiscover_E2EEFlatRejectsNonThreadJSON verifies that a directory +// containing both real thread files and unknown non-thread JSON blobs +// (e.g. a new DYI metadata file Facebook may add) discovers only the +// thread files. Keeping the indexed list stable across runs is required +// for checkpoint-by-thread-index resume. 
+func TestDiscover_E2EEFlatRejectsNonThreadJSON(t *testing.T) { + tmp := t.TempDir() + thread := `{"participants":["A","B"],"threadName":"t","messages":[]}` + if err := os.WriteFile(filepath.Join(tmp, "real_1.json"), []byte(thread), 0o644); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(tmp, "metadata.json"), []byte(`{"setting":true,"version":3}`), 0o644); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(tmp, "list.json"), []byte(`[1,2,3]`), 0o644); err != nil { + t.Fatal(err) + } + + dirs, err := Discover(tmp) + if err != nil { + t.Fatalf("Discover: %v", err) + } + if len(dirs) != 1 { + t.Fatalf("discovered %d files, want 1: %+v", len(dirs), dirs) + } + if dirs[0].Name != "real_1" { + t.Errorf("dirs[0].Name=%q want real_1", dirs[0].Name) + } +} diff --git a/internal/fbmessenger/html_parser.go b/internal/fbmessenger/html_parser.go new file mode 100644 index 00000000..31deb09f --- /dev/null +++ b/internal/fbmessenger/html_parser.go @@ -0,0 +1,412 @@ +package fbmessenger + +import ( + "fmt" + "os" + "path/filepath" + "regexp" + "sort" + "strings" + "time" + + "golang.org/x/net/html" +) + +// maxBodyLinesBeforeTimestamp limits the look-ahead window when scanning +// for a timestamp line after a sender name in HTML message blocks. +const maxBodyLinesBeforeTimestamp = 64 + +// htmlTimeLayouts are the candidate layouts the HTML parser tries when +// interpreting a timestamp string. All parsed times are treated as UTC; +// Facebook HTML exports do not expose a timezone (plan D6). +var htmlTimeLayouts = []string{ + "Jan 2, 2006, 3:04 PM", + "Jan 2, 2006 at 3:04 PM", + "2 Jan 2006, 15:04", + "Jan 2, 2006, 3:04:05 PM", +} + +// parseHTMLTimestamp parses a stamp string with any of the known layouts, +// returning the time in UTC. Returns the zero value and false on failure. 
+func parseHTMLTimestamp(s string) (time.Time, bool) { + s = strings.TrimSpace(s) + for _, layout := range htmlTimeLayouts { + if t, err := time.ParseInLocation(layout, s, time.UTC); err == nil { + return t.UTC(), true + } + } + return time.Time{}, false +} + +// ParseHTMLThread parses a DYI HTML thread directory and returns a Thread. +func ParseHTMLThread(rootDir, threadDir string) (*Thread, error) { + entries, err := os.ReadDir(threadDir) + if err != nil { + return nil, fmt.Errorf("fbmessenger: read html thread dir: %w", err) + } + var files []string + for _, e := range entries { + if e.IsDir() { + continue + } + name := e.Name() + if strings.HasPrefix(name, "message_") && strings.HasSuffix(name, ".html") { + files = append(files, name) + } + } + if len(files) == 0 { + return nil, fmt.Errorf("fbmessenger: no message_*.html files in %s", threadDir) + } + sort.Strings(files) + + absRoot, err := filepath.Abs(rootDir) + if err != nil { + return nil, fmt.Errorf("fbmessenger: abs root: %w", err) + } + + var thread Thread + thread.Format = "html" + thread.DirName = filepath.Base(threadDir) + + for _, name := range files { + data, err := os.ReadFile(filepath.Join(threadDir, name)) + if err != nil { + return nil, fmt.Errorf("fbmessenger: read %s: %w", name, err) + } + thread.RawBytes = append(thread.RawBytes, data...) + doc, err := html.Parse(strings.NewReader(string(data))) + if err != nil { + return nil, fmt.Errorf("fbmessenger: parse %s: %w", name, err) + } + if title := extractHTMLTitle(doc); title != "" && thread.Title == "" { + thread.Title = title + } + lines, images := collectHTMLLines(doc) + absThreadDir, _ := filepath.Abs(threadDir) + participants, msgs := parseHTMLLines(lines, images, absRoot, absThreadDir) + if len(thread.Participants) == 0 { + thread.Participants = participants + } + base := len(thread.Messages) + for i := range msgs { + msgs[i].Index = base + i + } + thread.Messages = append(thread.Messages, msgs...) 
+ } + + if len(thread.Participants) <= 2 { + thread.ConvType = "direct_chat" + } else { + thread.ConvType = "group_chat" + } + + sort.SliceStable(thread.Messages, func(i, j int) bool { + if thread.Messages[i].SentAt.IsZero() || thread.Messages[j].SentAt.IsZero() { + return thread.Messages[i].Index < thread.Messages[j].Index + } + return thread.Messages[i].SentAt.Before(thread.Messages[j].SentAt) + }) + // Re-number indices after sort so Index remains monotonic. + for i := range thread.Messages { + thread.Messages[i].Index = i + } + return &thread, nil +} + +// extractHTMLTitle finds the document text. +func extractHTMLTitle(n *html.Node) string { + if n.Type == html.ElementNode && n.Data == "title" && n.FirstChild != nil { + return strings.TrimSpace(n.FirstChild.Data) + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + if t := extractHTMLTitle(c); t != "" { + return t + } + } + return "" +} + +// htmlImageRef records an <img> src and the index in the lines array +// where it appears, so parseHTMLLines can associate images with the +// correct message block. +type htmlImageRef struct { + Src string + LineIdx int // index into the lines slice at point of encounter +} + +// collectHTMLLines walks the body of the document and returns a flat list +// of logical text lines plus positioned image references. +// +// A "line" is the concatenated text content of a leaf block-level element +// (div/p/span when it contains no block-level descendants). Whitespace +// runs inside a line are collapsed. +// +// Images are recorded with the line index at the time they are encountered +// in document order, so callers can associate each image with the message +// block whose line range contains it. 
+func collectHTMLLines(doc *html.Node) ([]string, []htmlImageRef) { + var body *html.Node + var find func(n *html.Node) + find = func(n *html.Node) { + if body != nil { + return + } + if n.Type == html.ElementNode && n.Data == "body" { + body = n + return + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + find(c) + } + } + find(doc) + if body == nil { + body = doc + } + + var lines []string + var images []htmlImageRef + + // collectImgs records all <img> elements under n at the current line index. + var collectImgs func(n *html.Node) + collectImgs = func(n *html.Node) { + if n.Type == html.ElementNode && n.Data == "img" { + for _, a := range n.Attr { + if a.Key == "src" { + images = append(images, htmlImageRef{Src: a.Val, LineIdx: len(lines)}) + break + } + } + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + collectImgs(c) + } + } + + var walk func(n *html.Node) + walk = func(n *html.Node) { + // Pick up images that live outside leaf blocks (e.g. between divs). + if n.Type == html.ElementNode && n.Data == "img" { + for _, a := range n.Attr { + if a.Key == "src" { + images = append(images, htmlImageRef{Src: a.Val, LineIdx: len(lines)}) + break + } + } + return + } + if n.Type == html.ElementNode && isLeafBlock(n) { + // Collect images inside the leaf block at the current line position. + collectImgs(n) + text := collapseWhitespace(textContent(n)) + if text != "" { + lines = append(lines, text) + } + return + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + walk(c) + } + } + walk(body) + return lines, images +} + +// isLeafBlock returns true for block-level elements that contain no +// block-level descendants. The DOM walker treats such elements as the +// boundary of one logical text line. 
+func isLeafBlock(n *html.Node) bool { + if n.Type != html.ElementNode { + return false + } + switch n.Data { + case "div", "p", "li", "td", "h1", "h2", "h3", "h4": + // fall through + default: + return false + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + if c.Type == html.ElementNode { + switch c.Data { + case "div", "p", "li", "td", "h1", "h2", "h3", "h4": + return false + } + if hasBlockDescendant(c) { + return false + } + } + } + return true +} + +func hasBlockDescendant(n *html.Node) bool { + for c := n.FirstChild; c != nil; c = c.NextSibling { + if c.Type == html.ElementNode { + switch c.Data { + case "div", "p", "li", "td", "h1", "h2", "h3", "h4": + return true + } + if hasBlockDescendant(c) { + return true + } + } + } + return false +} + +// textContent returns the concatenation of text nodes under n. +func textContent(n *html.Node) string { + if n.Type == html.TextNode { + return n.Data + } + var b strings.Builder + for c := n.FirstChild; c != nil; c = c.NextSibling { + b.WriteString(textContent(c)) + } + return b.String() +} + +var wsRun = regexp.MustCompile(`[ \t\f\v]+`) + +func collapseWhitespace(s string) string { + s = html.UnescapeString(s) + // Collapse horizontal whitespace runs but preserve newlines so body + // text keeps paragraph breaks. + lines := strings.Split(s, "\n") + for i, line := range lines { + lines[i] = strings.TrimSpace(wsRun.ReplaceAllString(line, " ")) + } + // Rejoin and trim leading/trailing blank lines. + joined := strings.Join(lines, "\n") + joined = strings.Trim(joined, "\n") + return joined +} + +// parseHTMLLines scans the flat text-line output of collectHTMLLines, +// extracts the participants header, and walks the remaining lines +// collecting (sender, body, timestamp) triples. Images are assigned to +// the message block whose line range contains the image's DOM position. 
+func parseHTMLLines(lines []string, images []htmlImageRef, absRoot, htmlDir string) ([]Participant, []Message) { + var participants []Participant + participantNames := make(map[string]bool) + // remainingStart is the offset into lines where message blocks begin + // (after the Participants: header). Image LineIdx values are relative + // to the original lines slice, so we track this offset. + remainingStart := 0 + remaining := lines + for i, ln := range lines { + if strings.HasPrefix(ln, "Participants:") { + rest := strings.TrimSpace(strings.TrimPrefix(ln, "Participants:")) + for _, part := range strings.Split(rest, ",") { + name := strings.TrimSpace(part) + if name == "" { + continue + } + if !participantNames[name] { + participantNames[name] = true + participants = append(participants, Participant{Name: name}) + } + } + remaining = lines[i+1:] + remainingStart = i + 1 + break + } + } + + // imagesInRange returns images whose LineIdx falls in [loLine, hiLine). + // LineIdx values are in the original lines coordinate space. + imagesInRange := func(loLine, hiLine int) []htmlImageRef { + var out []htmlImageRef + for _, img := range images { + if img.LineIdx >= loLine && img.LineIdx < hiLine { + out = append(out, img) + } + } + return out + } + + var messages []Message + // Scan for message blocks. A message is a window that starts with a + // participant name line, ends with a timestamp line, and has zero or + // more body lines in between. + i := 0 + for i < len(remaining) { + sender := remaining[i] + if !participantNames[sender] { + i++ + continue + } + // Find the next timestamp line. 
+ end := -1 + nextSender := -1 + for j := i + 1; j < len(remaining) && j < i+1+maxBodyLinesBeforeTimestamp; j++ { + if _, ok := parseHTMLTimestamp(remaining[j]); ok { + end = j + break + } + // If we hit another sender name before a timestamp, this + // block lacks a timestamp; bail out gracefully and resume + // scanning at that candidate rather than advancing one line + // at a time through the failed window. + if participantNames[remaining[j]] { + nextSender = j + break + } + } + if end == -1 { + if nextSender > i { + i = nextSender + } else { + i++ + } + continue + } + bodyLines := remaining[i+1 : end] + body := strings.Join(bodyLines, "\n") + ts, _ := parseHTMLTimestamp(remaining[end]) + + msg := Message{ + SenderName: sender, + SentAt: ts, + Body: body, + } + + // Attach images whose DOM position falls within this message + // block's line range [sender line .. timestamp line]. + blockImages := imagesInRange(remainingStart+i, remainingStart+end+1) + for _, img := range blockImages { + msg.Attachments = append(msg.Attachments, makeHTMLAttachment(absRoot, htmlDir, img.Src)) + } + if len(blockImages) > 0 && body == "" { + msg.Body = "[photo]" + } + + messages = append(messages, msg) + i = end + 1 + } + return participants, messages +} + +func makeHTMLAttachment(absRoot, htmlDir, src string) Attachment { + // HTML attachment src is relative to the HTML file directory. + // We resolve against htmlDir, but require the result to stay inside + // absRoot so a malicious export cannot read arbitrary files. + abs := "" + if !filepath.IsAbs(src) { + cleaned := filepath.Clean(src) + if !strings.HasPrefix(cleaned, ".."+string(filepath.Separator)) && cleaned != ".." 
{ + candidate := filepath.Join(htmlDir, cleaned) + absCandidate, err := filepath.Abs(candidate) + if err == nil && (absCandidate == absRoot || strings.HasPrefix(absCandidate, absRoot+string(filepath.Separator))) { + abs = absCandidate + } + } + } + return Attachment{ + URI: src, + AbsPath: abs, + Kind: "photo", + Filename: filepath.Base(src), + MimeType: guessMime(src), + } +} diff --git a/internal/fbmessenger/html_parser_test.go b/internal/fbmessenger/html_parser_test.go new file mode 100644 index 00000000..a5f6952f --- /dev/null +++ b/internal/fbmessenger/html_parser_test.go @@ -0,0 +1,150 @@ +package fbmessenger + +import ( + "os" + "path/filepath" + "strings" + "testing" + "time" +) + +func TestParseHTMLThread_Simple(t *testing.T) { + root := "testdata/html_simple" + th, err := ParseHTMLThread(root, threadDir(t, root, "inbox", "alice_ABC123")) + if err != nil { + t.Fatalf("parse: %v", err) + } + if len(th.Participants) != 2 { + t.Errorf("participants=%d want 2", len(th.Participants)) + } + if th.ConvType != "direct_chat" { + t.Errorf("conv_type=%q want direct_chat", th.ConvType) + } + if len(th.Messages) != 3 { + t.Fatalf("messages=%d want 3", len(th.Messages)) + } + // HTML exports do not expose reaction metadata, so the HTML parser + // must not fabricate a "[reacted: ...]" suffix. Reaction coverage + // lives in the JSON parser tests + TestImportDYI_ReactionsDualPath. 
+ wantBodies := []string{ + "Hello", + "café time?", + "See you soon", + } + for i, w := range wantBodies { + if th.Messages[i].Body != w { + t.Errorf("messages[%d].Body=%q want %q", i, th.Messages[i].Body, w) + } + } + if th.Title != "Alice Example" { + t.Errorf("title=%q want Alice Example", th.Title) + } +} + +func TestParseHTMLThread_WithMedia(t *testing.T) { + root := "testdata/html_with_media" + th, err := ParseHTMLThread(root, threadDir(t, root, "inbox", "bob_XYZ789")) + if err != nil { + t.Fatalf("parse: %v", err) + } + if len(th.Messages) != 1 { + t.Fatalf("messages=%d want 1: %+v", len(th.Messages), th.Messages) + } + m := th.Messages[0] + if len(m.Attachments) != 1 { + t.Fatalf("attachments=%d want 1", len(m.Attachments)) + } + if _, err := os.Stat(m.Attachments[0].AbsPath); err != nil { + t.Errorf("attachment should exist on disk: %v", err) + } +} + +func TestParseHTMLThread_TimestampLayouts(t *testing.T) { + want := time.Date(2019, 10, 19, 14, 37, 0, 0, time.UTC) + for _, name := range []string{"layout1.html", "layout2.html", "layout3.html"} { + data, err := os.ReadFile(filepath.Join("testdata/html_timestamps", name)) + if err != nil { + t.Fatal(err) + } + // Use parseHTMLLines indirectly through the main parse path by + // writing into a temp thread dir and calling ParseHTMLThread. 
+ tmp := t.TempDir() + threadPath := filepath.Join(tmp, "your_activity_across_facebook", "messages", "inbox", "ts_TEST") + if err := os.MkdirAll(threadPath, 0755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(threadPath, "message_1.html"), data, 0644); err != nil { + t.Fatal(err) + } + th, err := ParseHTMLThread(tmp, threadPath) + if err != nil { + t.Fatalf("%s: parse: %v", name, err) + } + if len(th.Messages) != 1 { + t.Fatalf("%s: messages=%d want 1", name, len(th.Messages)) + } + if !th.Messages[0].SentAt.Equal(want) { + t.Errorf("%s: SentAt=%v want %v", name, th.Messages[0].SentAt, want) + } + if th.Messages[0].SentAt.Location() != time.UTC { + t.Errorf("%s: location=%v want UTC", name, th.Messages[0].SentAt.Location()) + } + } +} + +// TestParseHTMLThread_ImagePositioning verifies that images are attached to +// the message block where they appear in the DOM, not to the first empty or +// attachment-less message. +func TestParseHTMLThread_ImagePositioning(t *testing.T) { + root := "testdata/html_multi_media" + th, err := ParseHTMLThread(root, threadDir(t, root, "inbox", "carol_IMG456")) + if err != nil { + t.Fatalf("parse: %v", err) + } + if len(th.Messages) != 3 { + t.Fatalf("messages=%d want 3", len(th.Messages)) + } + // Message 0: "Hello Carol" — no image, no attachments. + if len(th.Messages[0].Attachments) != 0 { + t.Errorf("messages[0].Attachments=%d want 0 (image should NOT land here)", len(th.Messages[0].Attachments)) + } + // Message 1: "Check out this photo" — the image belongs here. + if len(th.Messages[1].Attachments) != 1 { + t.Errorf("messages[1].Attachments=%d want 1", len(th.Messages[1].Attachments)) + } + // Message 2: "Nice picture" — no attachments. 
+ if len(th.Messages[2].Attachments) != 0 { + t.Errorf("messages[2].Attachments=%d want 0", len(th.Messages[2].Attachments)) + } +} + +func TestParseHTMLThread_StructuralParsing(t *testing.T) { + // Replace known class names with random strings; the parser must + // still find participants, bodies, and timestamps. + data, err := os.ReadFile("testdata/html_simple/your_activity_across_facebook/messages/inbox/alice_ABC123/message_1.html") + if err != nil { + t.Fatal(err) + } + body := string(data) + for _, cls := range []string{"_a706", "_a70e", "_3b0d", "_a6-g", "_a6-p", "_2ph_", "_a6-h", "_a6-i", "_a72d"} { + body = strings.ReplaceAll(body, cls, "zzq"+cls[1:]) + } + tmp := t.TempDir() + threadPath := filepath.Join(tmp, "your_activity_across_facebook", "messages", "inbox", "alice_ABC123") + if err := os.MkdirAll(threadPath, 0755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(threadPath, "message_1.html"), []byte(body), 0644); err != nil { + t.Fatal(err) + } + th, err := ParseHTMLThread(tmp, threadPath) + if err != nil { + t.Fatalf("parse: %v", err) + } + if len(th.Messages) != 3 { + t.Fatalf("messages=%d want 3", len(th.Messages)) + } + if len(th.Participants) != 2 { + t.Errorf("participants=%d want 2", len(th.Participants)) + } +} diff --git a/internal/fbmessenger/importer.go b/internal/fbmessenger/importer.go new file mode 100644 index 00000000..f2dbca0f --- /dev/null +++ b/internal/fbmessenger/importer.go @@ -0,0 +1,860 @@ +package fbmessenger + +import ( + "context" + "crypto/sha256" + "database/sql" + "encoding/json" + "errors" + "fmt" + "io" + "log/slog" + "os" + "path/filepath" + "sort" + "strings" + "time" + "unicode/utf8" + + "github.com/wesm/msgvault/internal/mime" + "github.com/wesm/msgvault/internal/store" +) + +// errLimitReached signals that ImportOptions.Limit tripped mid-thread so +// the caller should stop importing and not advance the per-thread +// checkpoint past a partially-imported thread. 
+var errLimitReached = errors.New("fbmessenger: import limit reached") + +// fbmessengerCheckpoint is the JSON payload stored in +// sync_runs.cursor_before that records progress through a DYI import so +// a subsequent run can skip already-processed threads. RootDir is +// recorded to guard against a resume attempt against a different export +// directory; ThreadIndex and LastMessageIndex describe the most +// recent committed position. +type fbmessengerCheckpoint struct { + RootDir string `json:"root_dir"` + ThreadIndex int `json:"thread_index"` + LastMessageIndex int `json:"last_message_index"` +} + +// ImportOptions configures ImportDYI. +type ImportOptions struct { + // Me is the importer's own identifier, required to be of the form + // "<slug>@facebook.messenger". It is stored as the source identifier + // and used to compute is_from_me. + Me string + // RootDir is the DYI export root directory. + RootDir string + // Format overrides auto-detection. One of "auto", "json", "html", + // "both". Empty is treated as "auto". + Format string + // AttachmentsDir is the content-addressed attachment storage root. + // When empty, attachments are not copied to disk but rows are still + // written with empty storage_path. + AttachmentsDir string + // Limit caps the number of messages imported across the whole DYI + // tree. 0 means no limit. + Limit int + // NoResume disables checkpoint-based resume. + NoResume bool + // CheckpointEvery is the number of messages between checkpoints. + // Defaults to 200 when <= 0. + CheckpointEvery int + // Logger is the slog logger to use; a discard logger is used when nil. + Logger *slog.Logger +} + +// ImportSummary describes the outcome of a run. +type ImportSummary struct { + Duration time.Duration + ThreadsProcessed int + // ThreadsSkipped counts whole-thread skips for unparseable + // message JSON/HTML files that caused the thread to be dropped. + ThreadsSkipped int64 + // FilesSkipped counts unrecognized sibling files (e.g. 
+ // message_final.json) that were skipped without aborting the + // surrounding thread. + FilesSkipped int64 + MessagesProcessed int64 + MessagesAdded int64 + MessagesSkipped int64 + ParticipantsResolved int + AttachmentsFound int + AttachmentsStored int + ReactionsAdded int + FromMeCount int64 + Errors int + HardErrors bool + WasResumed bool +} + +// ImportDYI imports a Facebook Messenger DYI export into the store. +func ImportDYI(ctx context.Context, st *store.Store, opts ImportOptions) (*ImportSummary, error) { + start := time.Now() + summary := &ImportSummary{} + logger := opts.Logger + if logger == nil { + logger = slog.New(slog.NewTextHandler(io.Discard, nil)) + } + if opts.CheckpointEvery <= 0 { + opts.CheckpointEvery = 200 + } + format := strings.ToLower(opts.Format) + if format == "" { + format = "auto" + } + switch format { + case "auto", "json", "html", "both": + // valid + default: + return nil, fmt.Errorf("fbmessenger: unknown --format %q (valid: auto, json, html, both)", format) + } + + // Validate --me. + if opts.Me == "" { + return nil, fmt.Errorf("fbmessenger: --me is required") + } + if !strings.HasSuffix(opts.Me, "@"+Domain) { + return nil, fmt.Errorf("fbmessenger: --me must be a <slug>@%s address, got %q", Domain, opts.Me) + } + + // Check root dir exists early. + if info, err := os.Stat(opts.RootDir); err != nil { + return nil, fmt.Errorf("fbmessenger: root dir: %w", err) + } else if !info.IsDir() { + return nil, fmt.Errorf("fbmessenger: root is not a directory: %s", opts.RootDir) + } + + // Resolve absolute root so the checkpoint is comparable across + // relative invocations. + absRoot, err := filepath.Abs(opts.RootDir) + if err != nil { + return nil, fmt.Errorf("fbmessenger: abs root: %w", err) + } + + // Create source and start sync run. 
+ source, err := st.GetOrCreateSource("facebook_messenger", opts.Me) + if err != nil { + return nil, fmt.Errorf("fbmessenger: source: %w", err) + } + + // Read any existing active-run checkpoint before calling + // StartSync (which marks active runs failed). This mirrors the + // pattern used by emlx_import / mbox_import. + var ( + startThreadIdx int + cp store.Checkpoint + ) + if !opts.NoResume { + // Look for a resumable sync run. Try active (running) first, + // then fall back to the latest checkpointed run (which includes + // failed/interrupted runs whose checkpoint is still valid). + prev, err := st.GetActiveSync(source.ID) + if err != nil { + return nil, fmt.Errorf("fbmessenger: check active sync: %w", err) + } + if prev == nil || !prev.CursorBefore.Valid || prev.CursorBefore.String == "" { + prev, err = st.GetLatestCheckpointedSync(source.ID) + if err != nil { + return nil, fmt.Errorf("fbmessenger: check checkpointed sync: %w", err) + } + } + if prev != nil && prev.CursorBefore.Valid && prev.CursorBefore.String != "" { + var prior fbmessengerCheckpoint + if err := json.Unmarshal([]byte(prev.CursorBefore.String), &prior); err == nil { + if prior.RootDir != "" && prior.RootDir != absRoot { + return nil, fmt.Errorf( + "fbmessenger: active import is for a different root (%q), not %q; rerun with --no-resume to start fresh", + prior.RootDir, absRoot, + ) + } + // Treat any well-formed checkpoint as resumable. A + // checkpoint saved mid-way through the first thread + // has ThreadIndex == 0; the outer loop still starts + // at threadIdx=0 either way, but resuming lets us + // carry cumulative counters forward and emit the + // "resuming" log so a user-visible interrupt during + // thread 0 is reflected in the summary. 
+ startThreadIdx = prior.ThreadIndex + summary.WasResumed = true + cp.MessagesProcessed = prev.MessagesProcessed + cp.MessagesAdded = prev.MessagesAdded + cp.MessagesUpdated = prev.MessagesUpdated + cp.ErrorsCount = prev.ErrorsCount + summary.MessagesProcessed = prev.MessagesProcessed + summary.MessagesAdded = prev.MessagesAdded + logger.Info("fbmessenger: resuming import", + "root", absRoot, + "thread_index", startThreadIdx, + "processed", cp.MessagesProcessed, + ) + } + } + } + + syncID, err := st.StartSync(source.ID, "import-messenger") + if err != nil { + return nil, fmt.Errorf("fbmessenger: start sync: %w", err) + } + var syncErr error + defer func() { + if syncErr != nil { + _ = st.FailSync(syncID, syncErr.Error()) + } else { + _ = st.CompleteSync(syncID, "") + } + }() + + // Pre-create label taxonomy. + labelIDs := make(map[string]int64) + parentLabelID, err := st.EnsureLabel(source.ID, "messenger", "Messenger", "folder") + if err != nil { + syncErr = err + return nil, fmt.Errorf("fbmessenger: ensure parent label: %w", err) + } + labelIDs["Messenger"] = parentLabelID + for _, pair := range sectionLabelNames() { + lid, err := st.EnsureLabel(source.ID, pair.id, pair.name, "folder") + if err != nil { + syncErr = err + return nil, fmt.Errorf("fbmessenger: ensure label %s: %w", pair.name, err) + } + labelIDs[pair.name] = lid + } + + // Seed the self participant so even an empty import leaves a trace. + meAddr := mime.Address{Name: "", Email: opts.Me, Domain: Domain} + if _, err := st.EnsureParticipantsBatch([]mime.Address{meAddr}); err != nil { + syncErr = err + return nil, fmt.Errorf("fbmessenger: seed self participant: %w", err) + } + meLocal := StripDomain(opts.Me) + + // Discover threads. + threads, err := Discover(absRoot) + if err != nil { + syncErr = err + return nil, fmt.Errorf("fbmessenger: discover: %w", err) + } + + // Guard against a checkpoint that points past the current list + // (e.g. 
export shrunk); fall through to a full scan in that + // case since the source_message_id upsert will dedupe. + if startThreadIdx > len(threads) { + logger.Warn("fbmessenger: checkpoint thread_index out of range; restarting full scan", + "checkpoint", startThreadIdx, "threads", len(threads)) + startThreadIdx = 0 + summary.WasResumed = false + } + + for threadIdx, td := range threads { + if threadIdx < startThreadIdx { + continue + } + if ctx.Err() != nil { + syncErr = ctx.Err() + return summary, ctx.Err() + } + if opts.Limit > 0 && summary.MessagesAdded >= int64(opts.Limit) { + break + } + effective := td.Format + // E2EE threads bypass the format filter entirely. + if effective != "e2ee_json" { + switch format { + case "json": + if effective == "html" { + continue + } + effective = "json" + case "html": + if effective == "json" { + continue + } + effective = "html" + case "both": + // Keep as-is; "both" threads get both parsed. + case "auto": + if effective == "both" { + effective = "json" + } + } + } + + summary.ThreadsProcessed++ + err := importThread(ctx, st, source.ID, td, effective, format, opts, labelIDs, meLocal, logger, summary, syncID, absRoot, threadIdx, &cp) + if err != nil { + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + syncErr = err + return summary, err + } + if errors.Is(err, errLimitReached) { + // Limit tripped mid-thread: do not advance the + // checkpoint past this thread, so a subsequent + // non-limited run re-scans it and picks up the + // remaining messages via source_message_id dedup. + break + } + summary.Errors++ + summary.HardErrors = true + logger.Warn("fbmessenger: thread failed", "thread", td.Name, "err", err) + // Don't advance the checkpoint on a hard thread error. + // source_message_id dedup makes retry safe on the next + // run, so a transient failure (DB lock, I/O blip) self- + // heals. If the error is persistent the user will see + // HardErrors=true and can rerun with --no-resume. 
+ continue + } + // Persist per-thread checkpoint so resume can skip fully + // committed threads. Advance ThreadIndex to threadIdx+1 + // because this thread is now fully processed. + if !opts.NoResume { + _ = saveFbmessengerCheckpoint(st, syncID, absRoot, threadIdx+1, 0, &cp, summary) + } + } + + if err := st.RecomputeConversationStats(source.ID); err != nil { + logger.Warn("fbmessenger: recompute stats", "err", err) + } + summary.Duration = time.Since(start) + return summary, nil +} + +// sectionLabelNames returns the section label name mapping. +type sectionLabel struct { + section, id, name string +} + +func sectionLabelNames() []sectionLabel { + return []sectionLabel{ + {"inbox", "messenger:inbox", "Messenger / Inbox"}, + {"archived_threads", "messenger:archived", "Messenger / Archived"}, + {"filtered_threads", "messenger:filtered", "Messenger / Filtered"}, + {"message_requests", "messenger:requests", "Messenger / Requests"}, + {"e2ee_cutover", "messenger:e2ee", "Messenger / E2EE"}, + } +} + +func sectionLabelName(section string) string { + for _, p := range sectionLabelNames() { + if p.section == section { + return p.name + } + } + return "Messenger / Other" +} + +// importThread imports one ThreadDir per effective format. +func importThread( + ctx context.Context, + st *store.Store, + sourceID int64, + td ThreadDir, + effective, requested string, + opts ImportOptions, + labelIDs map[string]int64, + meLocal string, + logger *slog.Logger, + summary *ImportSummary, + syncID int64, + absRoot string, + threadIdx int, + cp *store.Checkpoint, +) error { + // Decide which parsers to run. 
+ type parsedPair struct { + thread *Thread + prefix string // "" for json, "html_" for html under "both" + } + var toImport []parsedPair + + runJSON := func(prefix string) error { + th, err := ParseJSONThread(opts.RootDir, td.Path) + if err != nil { + if errors.Is(err, ErrCorruptJSON) { + summary.ThreadsSkipped++ + logger.Warn("fbmessenger: corrupt json, skipping", "thread", td.Name, "err", err) + return nil + } + return err + } + if len(th.BadSiblings) > 0 { + summary.FilesSkipped += int64(len(th.BadSiblings)) + logger.Warn("fbmessenger: skipping unrecognized sibling files", + "thread", td.Name, "files", th.BadSiblings) + } + th.Section = td.Section + toImport = append(toImport, parsedPair{thread: th, prefix: prefix}) + return nil + } + runHTML := func(prefix string) error { + th, err := ParseHTMLThread(opts.RootDir, td.Path) + if err != nil { + return err + } + th.Section = td.Section + toImport = append(toImport, parsedPair{thread: th, prefix: prefix}) + return nil + } + + runE2EE := func() error { + th, err := ParseE2EEJSONFile(opts.RootDir, td.FilePath) + if err != nil { + if errors.Is(err, ErrCorruptJSON) { + summary.ThreadsSkipped++ + logger.Warn("fbmessenger: corrupt e2ee json, skipping", "thread", td.Name, "err", err) + return nil + } + if errors.Is(err, ErrNotE2EEThread) { + // Not a thread (e.g. a DYI metadata file Facebook + // added that isn't in the allowlist). Silently skip. 
+ return nil + } + return err + } + th.Section = td.Section + toImport = append(toImport, parsedPair{thread: th, prefix: ""}) + return nil + } + + switch { + case td.Format == "e2ee_json": + if err := runE2EE(); err != nil { + return err + } + case requested == "both" && td.Format == "both": + if err := runJSON(""); err != nil { + return err + } + if err := runHTML("html_"); err != nil { + return err + } + case effective == "json": + if err := runJSON(""); err != nil { + return err + } + case effective == "html": + if err := runHTML(""); err != nil { + return err + } + } + + for _, pair := range toImport { + if err := writeThreadToStore(ctx, st, sourceID, td, pair.thread, pair.prefix, opts, labelIDs, meLocal, logger, summary, syncID, absRoot, threadIdx, cp); err != nil { + return err + } + } + return nil +} + +func writeThreadToStore( + ctx context.Context, + st *store.Store, + sourceID int64, + td ThreadDir, + thread *Thread, + prefix string, + opts ImportOptions, + labelIDs map[string]int64, + meLocal string, + logger *slog.Logger, + summary *ImportSummary, + syncID int64, + absRoot string, + threadIdx int, + cp *store.Checkpoint, +) error { + // Ensure conversation. Use section-qualified name so threads with + // the same basename in different sections (e.g. inbox vs archived) + // don't collide. + threadKey := td.Section + "/" + td.Name + convID, err := st.EnsureConversationWithType(sourceID, threadKey, thread.ConvType, thread.Title) + if err != nil { + return fmt.Errorf("ensure conversation: %w", err) + } + + // Build participant address map for this thread. 
+ addrs := make([]mime.Address, 0, len(thread.Participants)+1) + addrs = append(addrs, mime.Address{Name: "", Email: opts.Me, Domain: Domain}) + for _, p := range thread.Participants { + addrs = append(addrs, Address(p.Name)) + } + partIDs, err := st.EnsureParticipantsBatch(addrs) + if err != nil { + return fmt.Errorf("ensure participants: %w", err) + } + summary.ParticipantsResolved += len(partIDs) + + // Map display name → participant ID and email for this thread. + nameToID := make(map[string]int64) + nameToEmail := make(map[string]string) + for _, p := range thread.Participants { + addr := Address(p.Name) + nameToID[p.Name] = partIDs[addr.Email] + nameToEmail[p.Name] = addr.Email + } + meID := partIDs[opts.Me] + // Ensure all participants (incl. self) are linked to the conversation. + _ = st.EnsureConversationParticipant(convID, meID, "member") + for _, pid := range nameToID { + _ = st.EnsureConversationParticipant(convID, pid, "member") + } + + // Determine label IDs for this thread. + parentLabelID := labelIDs["Messenger"] + sectionLabelID := labelIDs[sectionLabelName(td.Section)] + + // Raw bytes format tag. + rawFormat := "fbmessenger_" + thread.Format + + // Iterate messages. + for mi, m := range thread.Messages { + if ctx.Err() != nil { + return ctx.Err() + } + if opts.Limit > 0 && summary.MessagesAdded >= int64(opts.Limit) { + return errLimitReached + } + summary.MessagesProcessed++ + + // Resolve sender. + senderName := m.SenderName + senderID := sql.NullInt64{} + isFromMe := false + if senderName != "" { + if id, ok := nameToID[senderName]; ok { + senderID = sql.NullInt64{Int64: id, Valid: true} + } else { + // Participant not in the thread's participants list; + // synthesize. 
+ addr := Address(senderName) + m2, err := st.EnsureParticipantsBatch([]mime.Address{addr}) + if err == nil { + if id, ok := m2[addr.Email]; ok { + senderID = sql.NullInt64{Int64: id, Valid: true} + nameToID[senderName] = id + nameToEmail[senderName] = addr.Email + _ = st.EnsureConversationParticipant(convID, id, "member") + } + } + } + } + if senderName != "" { + if Slug(senderName) == meLocal { + isFromMe = true + senderID = sql.NullInt64{Int64: meID, Valid: true} + summary.FromMeCount++ + } + } + + // Build the source_message_id. Section-qualified to avoid + // collisions across sections with the same thread basename. + srcMsgID := fmt.Sprintf("%s__%s%d", threadKey, prefix, m.Index) + + // If we couldn't resolve a sender this run but a previous import + // of the same message already recorded one, preserve it — the + // UpsertMessage UPDATE path unconditionally overwrites sender_id + // with the incoming value, which would nullify valid prior data. + // Also rehydrate the sender's display name and email so + // downstream writes to message_recipients("from") and FTS don't + // clobber the prior display-name / from-address with empty + // strings derived from an unresolved current-run senderName. + if !senderID.Valid { + var priorID sql.NullInt64 + var priorName, priorEmail sql.NullString + var priorIsFromMe bool + // COALESCE onto message_recipients.display_name because the + // seeded self (--me) participant is created with an empty + // display_name; falling back to what we wrote on the prior + // import preserves the sender label for self-authored rows. + err := st.DB().QueryRow( + `SELECT m.sender_id, m.is_from_me, + COALESCE(NULLIF(p.display_name, ''), + (SELECT mr.display_name FROM message_recipients mr + WHERE mr.message_id = m.id AND mr.recipient_type = 'from' + LIMIT 1)), + p.email_address + FROM messages m + LEFT JOIN participants p ON p.id = m.sender_id + WHERE m.source_id = ? 
AND m.source_message_id = ?`, + sourceID, srcMsgID, + ).Scan(&priorID, &priorIsFromMe, &priorName, &priorEmail) + if err == nil && priorID.Valid { + senderID = priorID + isFromMe = priorIsFromMe + if priorIsFromMe { + summary.FromMeCount++ + } + if priorName.Valid && priorName.String != "" { + senderName = priorName.String + nameToID[senderName] = priorID.Int64 + if priorEmail.Valid { + nameToEmail[senderName] = priorEmail.String + } + } + // Re-link the rehydrated participant into the + // conversation. Older DBs affected by the + // pre-fix synthesized-sender bug have a valid + // messages.sender_id but no matching + // conversation_participants row; this repairs + // them on re-import. + _ = st.EnsureConversationParticipant(convID, priorID.Int64, "member") + } + } + + snippet := buildSnippet(m.Body) + msgRow := &store.Message{ + ConversationID: convID, + SourceID: sourceID, + SourceMessageID: srcMsgID, + MessageType: "fbmessenger", + SentAt: sql.NullTime{Time: m.SentAt, Valid: !m.SentAt.IsZero()}, + ReceivedAt: sql.NullTime{Time: m.SentAt, Valid: !m.SentAt.IsZero()}, + SenderID: senderID, + IsFromMe: isFromMe, + Snippet: sql.NullString{String: snippet, Valid: snippet != ""}, + SizeEstimate: int64(len(m.Body)), + HasAttachments: len(m.Attachments) > 0, + AttachmentCount: len(m.Attachments), + } + messageID, err := st.UpsertMessage(msgRow) + if err != nil { + summary.Errors++ + summary.MessagesSkipped++ + logger.Warn("fbmessenger: upsert message", "err", err) + continue + } + summary.MessagesAdded++ + + // Body. + bodyText := sql.NullString{String: m.Body, Valid: m.Body != ""} + if err := st.UpsertMessageBody(messageID, bodyText, sql.NullString{}); err != nil { + summary.Errors++ + logger.Warn("fbmessenger: upsert body", "err", err) + } + + // Raw bytes — store once per thread (first message only) to avoid bloat. 
+ if mi == 0 && len(thread.RawBytes) > 0 { + if err := st.UpsertMessageRawWithFormat(messageID, thread.RawBytes, rawFormat); err != nil { + logger.Warn("fbmessenger: upsert raw", "err", err) + } + } + + // Recipients: from = sender, to = other participants. + // Only rewrite "from" when we have a valid sender — otherwise on + // re-import we would clobber a previously-recorded sender with an + // empty row. + if senderID.Valid { + fromIDs := []int64{senderID.Int64} + fromNames := []string{senderName} + if err := st.ReplaceMessageRecipients(messageID, "from", fromIDs, fromNames); err != nil { + logger.Warn("fbmessenger: replace from recipients", "err", err) + } + } + + var toIDs []int64 + var toNames []string + seenPID := make(map[int64]bool) + if senderID.Valid { + seenPID[senderID.Int64] = true + } + sortedNames := make([]string, 0, len(nameToID)) + for name := range nameToID { + sortedNames = append(sortedNames, name) + } + sort.Strings(sortedNames) + for _, name := range sortedNames { + pid := nameToID[name] + if seenPID[pid] { + continue + } + seenPID[pid] = true + toIDs = append(toIDs, pid) + toNames = append(toNames, name) + } + // If message is from someone else, ensure self is in "to". + if !isFromMe && !seenPID[meID] { + toIDs = append(toIDs, meID) + toNames = append(toNames, "") + seenPID[meID] = true + } + _ = st.ReplaceMessageRecipients(messageID, "to", toIDs, toNames) + + // Labels. + if parentLabelID != 0 { + _ = st.LinkMessageLabel(messageID, parentLabelID) + } + if sectionLabelID != 0 { + _ = st.LinkMessageLabel(messageID, sectionLabelID) + } + + // Attachments. 
+ for _, att := range m.Attachments { + summary.AttachmentsFound++ + storagePath, contentHash, size := handleAttachment(att, opts.AttachmentsDir) + if storagePath != "" { + summary.AttachmentsStored++ + } + if storagePath != "" || contentHash != "" || att.AbsPath != "" { + if err := st.UpsertAttachment(messageID, att.Filename, att.MimeType, storagePath, contentHash, size); err != nil { + logger.Warn("fbmessenger: upsert attachment", "err", err) + } + } else { + // Empty row so the user sees a trace that something was referenced. + if err := st.UpsertAttachment(messageID, att.Filename, att.MimeType, "", "", 0); err != nil { + logger.Warn("fbmessenger: upsert attachment (empty)", "err", err) + } + } + } + + // Reactions: first-class rows and body-append already done. + for _, r := range m.Reactions { + actorAddr := Address(r.Actor) + m2, err := st.EnsureParticipantsBatch([]mime.Address{actorAddr}) + if err != nil { + continue + } + pid := m2[actorAddr.Email] + if pid == 0 { + continue + } + if err := st.UpsertReaction(messageID, pid, "emoji", r.Reaction, m.SentAt); err == nil { + summary.ReactionsAdded++ + } + } + + // FTS indexing. + fromAddr := "" + if senderID.Valid { + fromAddr = nameToEmail[senderName] + if isFromMe { + fromAddr = opts.Me + } + } + toAddr := strings.Join(nameToEmailsList(nameToEmail, opts.Me, senderName), " ") + if err := st.UpsertFTS(messageID, thread.Title, m.Body, fromAddr, toAddr, ""); err != nil { + logger.Warn("fbmessenger: upsert fts", "err", err) + } + + // Checkpoint every N messages within a long thread. We save + // the current thread index (not threadIdx+1) because this + // thread is still in progress; we also record the last + // message index so an observer can see progress, though + // resume skips at thread granularity only. 
+		if !opts.NoResume && summary.MessagesAdded > 0 && summary.MessagesAdded%int64(opts.CheckpointEvery) == 0 {
+			_ = saveFbmessengerCheckpoint(st, syncID, absRoot, threadIdx, m.Index, cp, summary)
+		}
+	}
+	return nil
+}
+
+// saveFbmessengerCheckpoint marshals a fbmessengerCheckpoint JSON blob
+// into sync_runs.cursor_before along with the counter fields. Errors
+// are returned to the caller; current call sites discard them, since a
+// missed checkpoint is repaired by the next successful one and the
+// source_message_id upsert makes re-scanning a thread safe.
+func saveFbmessengerCheckpoint(
+	st *store.Store, syncID int64,
+	absRoot string, threadIdx int, lastMsgIdx int,
+	cp *store.Checkpoint, summary *ImportSummary,
+) error {
+	b, err := json.Marshal(fbmessengerCheckpoint{
+		RootDir:          absRoot,
+		ThreadIndex:      threadIdx,
+		LastMessageIndex: lastMsgIdx,
+	})
+	if err != nil {
+		return fmt.Errorf("marshal fbmessenger checkpoint: %w", err)
+	}
+	cp.PageToken = string(b)
+	cp.MessagesProcessed = summary.MessagesProcessed
+	cp.MessagesAdded = summary.MessagesAdded
+	cp.ErrorsCount = int64(summary.Errors)
+	return st.UpdateSyncCheckpoint(syncID, cp)
+}
+
+// nameToEmailsList returns the email addresses in m, excluding the
+// entry keyed by skipName (the sender), sorted for deterministic
+// output, with me appended last. The result is joined into the FTS
+// "to" field; without sorting, Go's random map iteration order would
+// make the indexed text differ between otherwise-identical imports.
+func nameToEmailsList(m map[string]string, me, skipName string) []string {
+	out := make([]string, 0, len(m)+1)
+	for name, email := range m {
+		if name == skipName {
+			continue
+		}
+		out = append(out, email)
+	}
+	sort.Strings(out)
+	out = append(out, me)
+	return out
+}
+
+// buildSnippet trims surrounding whitespace from body and truncates the
+// result to at most 200 runes (not bytes, so multi-byte UTF-8 text is
+// never cut mid-character) for the messages.snippet column.
+func buildSnippet(body string) string {
+	s := strings.TrimSpace(body)
+	if utf8.RuneCountInString(s) > 200 {
+		s = string([]rune(s)[:200])
+	}
+	return s
+}
+
+// handleAttachment copies an attachment file into content-addressed
+// storage and returns (storagePath, contentHash, size). All zero values
+// when the file is missing, unreadable, or no AttachmentsDir is configured.
+// Symlinks and non-regular files (devices, sockets, named pipes) are
+// rejected: a malicious DYI export could plant a symlink pointing at a
+// sensitive local file (e.g. ~/.ssh/id_rsa) and we would otherwise copy
+// the target into the attachment store. 
+func handleAttachment(att Attachment, attachmentsDir string) (string, string, int) {
+	if attachmentsDir == "" || att.AbsPath == "" {
+		return "", "", 0
+	}
+	// Lstat (not Stat) so a symlink is observed as a symlink and
+	// rejected by the regularity check below rather than followed.
+	linfo, err := os.Lstat(att.AbsPath)
+	if err != nil {
+		return "", "", 0
+	}
+	if !linfo.Mode().IsRegular() {
+		return "", "", 0
+	}
+	f, err := os.Open(att.AbsPath)
+	if err != nil {
+		return "", "", 0
+	}
+	defer func() { _ = f.Close() }()
+
+	// Re-check regularity through the open handle to narrow the
+	// Lstat-then-Open window against a swapped-in special file.
+	info, err := f.Stat()
+	if err != nil {
+		return "", "", 0
+	}
+	if !info.Mode().IsRegular() {
+		return "", "", 0
+	}
+
+	h := sha256.New()
+	if _, err := io.Copy(h, f); err != nil {
+		return "", "", 0
+	}
+	contentHash := fmt.Sprintf("%x", h.Sum(nil))
+	// Content-addressed layout: <hash[:2]>/<hash> relative to attachmentsDir.
+	rel := filepath.Join(contentHash[:2], contentHash)
+	absStorage := filepath.Join(attachmentsDir, rel)
+
+	// Already stored (content-hash dedup): report success without rewriting.
+	if _, err := os.Stat(absStorage); err == nil {
+		return rel, contentHash, int(info.Size())
+	}
+	if err := os.MkdirAll(filepath.Dir(absStorage), 0750); err != nil {
+		return "", contentHash, 0
+	}
+	// Rewind: the hashing pass above consumed the reader.
+	if _, err := f.Seek(0, io.SeekStart); err != nil {
+		return "", contentHash, 0
+	}
+	// O_EXCL makes concurrent creation race-safe: losing the race means
+	// another writer stored identical content, which counts as success.
+	dst, err := os.OpenFile(absStorage, os.O_CREATE|os.O_WRONLY|os.O_EXCL, 0600)
+	if err != nil {
+		if os.IsExist(err) {
+			return rel, contentHash, int(info.Size())
+		}
+		return "", contentHash, 0
+	}
+	// On any copy/close failure, remove the partial file so a truncated
+	// blob is never mistaken for the full content on a later dedup hit.
+	if _, err := io.Copy(dst, f); err != nil {
+		_ = dst.Close()
+		_ = os.Remove(absStorage)
+		return "", contentHash, 0
+	}
+	if err := dst.Close(); err != nil {
+		_ = os.Remove(absStorage)
+		return "", contentHash, 0
+	}
+	return rel, contentHash, int(info.Size())
+}
diff --git a/internal/fbmessenger/importer_fts_test.go b/internal/fbmessenger/importer_fts_test.go
new file mode 100644
index 00000000..f453ee95
--- /dev/null
+++ b/internal/fbmessenger/importer_fts_test.go
@@ -0,0 +1,89 @@
+//go:build fts5
+
+package fbmessenger
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/wesm/msgvault/internal/testutil"
+)
+
+// TestImportDYI_MojibakeFTSIndexed verifies that mojibake-repaired body
+// text (e.g. 
"café") lands in message_bodies AND is indexed by FTS5 so a +// direct MATCH query returns a hit. Gated on the fts5 build tag so the +// FTS assertion is always active under the project's canonical +// `go test -tags fts5 ./...` invocation. +func TestImportDYI_MojibakeFTSIndexed(t *testing.T) { + st := testutil.NewTestStore(t) + _ = importFixture(t, st, "testdata/json_simple") + if !st.FTS5Available() { + t.Fatal("FTS5 build tag set but FTS5 not available in this binary") + } + + // The body stored in message_bodies must contain literal "café". + var body string + if err := st.DB().QueryRow( + `SELECT body_text FROM message_bodies WHERE body_text LIKE '%café%'`, + ).Scan(&body); err != nil { + t.Fatalf("body query: %v", err) + } + if !strings.Contains(body, "café") { + t.Errorf("body=%q", body) + } + + var count int + if err := st.DB().QueryRow( + "SELECT COUNT(*) FROM messages_fts WHERE messages_fts MATCH ?", "café", + ).Scan(&count); err != nil { + t.Fatalf("fts query: %v", err) + } + if count < 1 { + t.Errorf("fts match for café: got %d want >=1", count) + } +} + +// TestImportDYI_ReactionsDualPath verifies that reactions land both as +// first-class rows in the reactions table and as an appended +// "[reacted: ...]" suffix in body_text that FTS5 can match. Gated on +// the fts5 build tag; the FTS MATCH assertion is unconditional. +func TestImportDYI_ReactionsDualPath(t *testing.T) { + st := testutil.NewTestStore(t) + _ = importFixture(t, st, "testdata/json_simple") + if !st.FTS5Available() { + t.Fatal("FTS5 build tag set but FTS5 not available in this binary") + } + + // Count reactions on the message that contains café. + var n int + if err := st.DB().QueryRow(` + SELECT COUNT(*) FROM reactions r + JOIN message_bodies b ON b.message_id = r.message_id + WHERE b.body_text LIKE '%café%' + `).Scan(&n); err != nil { + t.Fatal(err) + } + if n != 2 { + t.Errorf("reactions=%d want 2", n) + } + + // Body text must contain the appended [reacted: ...] summary. 
+ var bodyCount int + if err := st.DB().QueryRow( + `SELECT COUNT(*) FROM message_bodies WHERE body_text LIKE '%[reacted:%'`, + ).Scan(&bodyCount); err != nil { + t.Fatal(err) + } + if bodyCount < 1 { + t.Errorf("body with [reacted: suffix: got %d want >=1", bodyCount) + } + + if err := st.DB().QueryRow( + "SELECT COUNT(*) FROM messages_fts WHERE messages_fts MATCH ?", "reacted", + ).Scan(&n); err != nil { + t.Fatal(err) + } + if n < 1 { + t.Errorf("fts match reacted: %d want >=1", n) + } +} diff --git a/internal/fbmessenger/importer_test.go b/internal/fbmessenger/importer_test.go new file mode 100644 index 00000000..11e2672b --- /dev/null +++ b/internal/fbmessenger/importer_test.go @@ -0,0 +1,1549 @@ +package fbmessenger + +import ( + "context" + "crypto/sha256" + "database/sql" + "encoding/json" + "fmt" + "os" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/wesm/msgvault/internal/store" + "github.com/wesm/msgvault/internal/testutil" +) + +func importFixture(t *testing.T, st *store.Store, rootDir string, extra ...func(*ImportOptions)) *ImportSummary { + t.Helper() + opts := ImportOptions{ + Me: "test.user@facebook.messenger", + RootDir: rootDir, + Format: "auto", + AttachmentsDir: t.TempDir(), + } + for _, f := range extra { + f(&opts) + } + summary, err := ImportDYI(context.Background(), st, opts) + if err != nil { + t.Fatalf("ImportDYI(%s): %v", rootDir, err) + } + return summary +} + +func countMessages(t *testing.T, st *store.Store, where string, args ...any) int { + t.Helper() + var n int + q := "SELECT COUNT(*) FROM messages" + if where != "" { + q += " WHERE " + where + } + if err := st.DB().QueryRow(q, args...).Scan(&n); err != nil { + t.Fatalf("count query: %v", err) + } + return n +} + +func TestImportDYI_JSONSimple(t *testing.T) { + st := testutil.NewTestStore(t) + summary := importFixture(t, st, "testdata/json_simple") + // json_simple has 1 inbox thread (3 messages) + 1 archived thread (1 message) = 4 + if summary.MessagesAdded != 4 
{ + t.Errorf("MessagesAdded=%d want 4", summary.MessagesAdded) + } + if summary.HardErrors { + t.Errorf("HardErrors=true") + } + if got := countMessages(t, st, "message_type='fbmessenger'"); got != 4 { + t.Errorf("messages count=%d want 4", got) + } + if got := countMessages(t, st, "message_type='fbmessenger' AND sent_at IS NOT NULL"); got != 4 { + t.Errorf("sent_at NULL rows exist: got %d want 4", got) + } + // Exactly one message_type present. + rows, err := st.DB().Query("SELECT DISTINCT message_type FROM messages") + if err != nil { + t.Fatal(err) + } + defer func() { _ = rows.Close() }() + var types []string + for rows.Next() { + var s string + _ = rows.Scan(&s) + types = append(types, s) + } + if len(types) != 1 || types[0] != "fbmessenger" { + t.Errorf("types=%v want [fbmessenger]", types) + } +} + +// TestImportDYI_MojibakeRepaired verifies mojibake repair on the body +// stored in message_bodies independently of FTS5. The FTS5 MATCH +// assertion lives in importer_fts_test.go gated on the fts5 build tag. 
+func TestImportDYI_MojibakeRepaired(t *testing.T) { + st := testutil.NewTestStore(t) + _ = importFixture(t, st, "testdata/json_simple") + var body string + if err := st.DB().QueryRow( + `SELECT body_text FROM message_bodies WHERE body_text LIKE '%café%'`, + ).Scan(&body); err != nil { + t.Fatalf("body query: %v", err) + } + if !strings.Contains(body, "café") { + t.Errorf("body=%q", body) + } +} + +func TestImportDYI_DirectChat(t *testing.T) { + st := testutil.NewTestStore(t) + _ = importFixture(t, st, "testdata/json_simple") + var ct string + if err := st.DB().QueryRow( + "SELECT conversation_type FROM conversations WHERE source_conversation_id='inbox/alice_ABC123'", + ).Scan(&ct); err != nil { + t.Fatal(err) + } + if ct != "direct_chat" { + t.Errorf("conv type=%q want direct_chat", ct) + } +} + +func TestImportDYI_GroupChat(t *testing.T) { + st := testutil.NewTestStore(t) + _ = importFixture(t, st, "testdata/json_group") + var ct string + if err := st.DB().QueryRow( + "SELECT conversation_type FROM conversations WHERE source_conversation_id='inbox/crew_GRP123'", + ).Scan(&ct); err != nil { + t.Fatal(err) + } + if ct != "group_chat" { + t.Errorf("conv type=%q want group_chat", ct) + } + // Three facebook.messenger participants (Taylor/Alice/Bob) plus the + // self seed. The self seed and the slug-derived sender address match + // ("test.user@facebook.messenger"), so they collapse to one row. + var n int + if err := st.DB().QueryRow( + "SELECT COUNT(*) FROM participants WHERE domain='facebook.messenger'", + ).Scan(&n); err != nil { + t.Fatal(err) + } + if n < 3 { + t.Errorf("participants(fb)=%d want >=3", n) + } + // Every message has at least one 'to' recipient. 
+ var badMsgs int + if err := st.DB().QueryRow(` + SELECT COUNT(*) FROM messages m + WHERE m.conversation_id = (SELECT id FROM conversations WHERE source_conversation_id='inbox/crew_GRP123') + AND NOT EXISTS (SELECT 1 FROM message_recipients r WHERE r.message_id = m.id AND r.recipient_type='to') + `).Scan(&badMsgs); err != nil { + t.Fatal(err) + } + if badMsgs != 0 { + t.Errorf("messages without 'to' recipients: %d", badMsgs) + } +} + +func TestImportDYI_MultifileNumericSort(t *testing.T) { + st := testutil.NewTestStore(t) + _ = importFixture(t, st, "testdata/json_multifile") + rows, err := st.DB().Query(` + SELECT source_message_id, sent_at FROM messages + WHERE source_id = (SELECT id FROM sources WHERE source_type='facebook_messenger') + ORDER BY sent_at ASC + `) + if err != nil { + t.Fatal(err) + } + defer func() { _ = rows.Close() }() + var ids []string + var lastTime time.Time + for rows.Next() { + var id string + var sentAt sql.NullTime + if err := rows.Scan(&id, &sentAt); err != nil { + t.Fatal(err) + } + if sentAt.Valid { + if !sentAt.Time.After(lastTime) { + t.Errorf("non-monotonic sent_at at %s", id) + } + lastTime = sentAt.Time + } + ids = append(ids, id) + } + if len(ids) != 4 { + t.Fatalf("rows=%d want 4", len(ids)) + } + // All source_message_id values must be prefixed dave_MULTI__ and + // have monotonic index suffixes. 
+ for i, id := range ids { + want := fmt.Sprintf("inbox/dave_MULTI__%d", i) + if id != want { + t.Errorf("source_message_id[%d]=%q want %q", i, id, want) + } + } +} + +func TestImportDYI_Idempotent(t *testing.T) { + st := testutil.NewTestStore(t) + _ = importFixture(t, st, "testdata/json_simple") + before := snapshotRowCounts(t, st) + _ = importFixture(t, st, "testdata/json_simple") + after := snapshotRowCounts(t, st) + for k, v := range before { + if after[k] != v { + t.Errorf("%s: before=%d after=%d", k, v, after[k]) + } + } +} + +func snapshotRowCounts(t *testing.T, st *store.Store) map[string]int { + t.Helper() + out := make(map[string]int) + for _, tbl := range []string{"messages", "participants", "message_recipients", "attachments", "reactions", "conversations", "labels"} { + var n int + if err := st.DB().QueryRow("SELECT COUNT(*) FROM " + tbl).Scan(&n); err != nil { + t.Fatalf("count %s: %v", tbl, err) + } + out[tbl] = n + } + return out +} + +// A thread containing both a valid numbered file and an unrecognized +// sibling (e.g. message_final.json) must import the valid file and +// report the bad sibling via MessagesSkipped rather than aborting the +// entire thread. 
+func TestImportDYI_UnnumberedSiblingSkipped(t *testing.T) { + st := testutil.NewTestStore(t) + tmp := t.TempDir() + threadPath := filepath.Join(tmp, "your_activity_across_facebook", "messages", "inbox", "mixnames_OK") + if err := os.MkdirAll(threadPath, 0755); err != nil { + t.Fatal(err) + } + good := `{"participants":[{"name":"A"},{"name":"B"}],"messages":[ +{"sender_name":"A","timestamp_ms":1600000000000,"type":"Generic","content":"good message"} +],"title":"mix"}` + if err := os.WriteFile(filepath.Join(threadPath, "message_1.json"), []byte(good), 0644); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(threadPath, "message_final.json"), []byte(`not json`), 0644); err != nil { + t.Fatal(err) + } + summary := importFixture(t, st, tmp) + if summary.HardErrors { + t.Errorf("HardErrors=true") + } + if summary.MessagesAdded != 1 { + t.Errorf("MessagesAdded=%d want 1", summary.MessagesAdded) + } + if summary.FilesSkipped < 1 { + t.Errorf("FilesSkipped=%d want >=1 (bad sibling)", summary.FilesSkipped) + } + if summary.MessagesSkipped != 0 { + t.Errorf("MessagesSkipped=%d want 0 (no message was rejected)", summary.MessagesSkipped) + } + if summary.ThreadsSkipped != 0 { + t.Errorf("ThreadsSkipped=%d want 0", summary.ThreadsSkipped) + } + // Valid file must be imported. 
+ var n int + if err := st.DB().QueryRow( + "SELECT COUNT(*) FROM conversations WHERE source_conversation_id='inbox/mixnames_OK'", + ).Scan(&n); err != nil { + t.Fatal(err) + } + if n != 1 { + t.Errorf("conversation not imported: n=%d", n) + } +} + +func TestImportDYI_CorruptSkipped(t *testing.T) { + st := testutil.NewTestStore(t) + summary := importFixture(t, st, "testdata/corrupt") + if summary.HardErrors { + t.Errorf("HardErrors=true") + } + if summary.ThreadsSkipped < 1 { + t.Errorf("ThreadsSkipped=%d want >=1 (corrupt thread)", summary.ThreadsSkipped) + } + if summary.MessagesSkipped != 0 { + t.Errorf("MessagesSkipped=%d want 0 (only whole-thread skip)", summary.MessagesSkipped) + } + // Good sibling message must still be imported. + var n int + if err := st.DB().QueryRow( + "SELECT COUNT(*) FROM conversations WHERE source_conversation_id='inbox/goodsibling_OK'", + ).Scan(&n); err != nil { + t.Fatal(err) + } + if n != 1 { + t.Errorf("good sibling not imported: n=%d", n) + } +} + +func TestImportDYI_AttachmentStorage(t *testing.T) { + st := testutil.NewTestStore(t) + attachDir := t.TempDir() + opts := ImportOptions{ + Me: "test.user@facebook.messenger", + RootDir: "testdata/json_with_media", + Format: "auto", + AttachmentsDir: attachDir, + } + if _, err := ImportDYI(context.Background(), st, opts); err != nil { + t.Fatal(err) + } + // Compute expected hash from fixture. 
+ png, err := os.ReadFile("testdata/json_with_media/your_activity_across_facebook/messages/inbox/bob_XYZ789/photos/tiny.png") + if err != nil { + t.Fatal(err) + } + wantHash := fmt.Sprintf("%x", sha256.Sum256(png)) + + var contentHash, storagePath string + var size int64 + if err := st.DB().QueryRow( + "SELECT content_hash, storage_path, size FROM attachments LIMIT 1", + ).Scan(&contentHash, &storagePath, &size); err != nil { + t.Fatal(err) + } + if contentHash != wantHash { + t.Errorf("content_hash=%q want %q", contentHash, wantHash) + } + if storagePath == "" { + t.Error("storage_path empty") + } + absStorage := filepath.Join(attachDir, storagePath) + got, err := os.ReadFile(absStorage) + if err != nil { + t.Fatalf("stored file: %v", err) + } + if string(got) != string(png) { + t.Errorf("stored bytes differ") + } + if size != int64(len(png)) { + t.Errorf("size=%d want %d", size, len(png)) + } +} + +func TestImportDYI_AttachmentPathEscapeRejected(t *testing.T) { + st := testutil.NewTestStore(t) + tmp := t.TempDir() + // Build a fixture whose JSON references ../../etc/passwd. + threadPath := filepath.Join(tmp, "your_activity_across_facebook", "messages", "inbox", "evil_ESC") + if err := os.MkdirAll(threadPath, 0755); err != nil { + t.Fatal(err) + } + body := `{"participants":[{"name":"A"},{"name":"B"}],"messages":[ +{"sender_name":"A","timestamp_ms":1600000000000,"type":"Generic","photos":[{"uri":"../../etc/passwd"}]} +],"title":"x"}` + if err := os.WriteFile(filepath.Join(threadPath, "message_1.json"), []byte(body), 0644); err != nil { + t.Fatal(err) + } + + summary, err := ImportDYI(context.Background(), st, ImportOptions{ + Me: "test.user@facebook.messenger", + RootDir: tmp, + AttachmentsDir: t.TempDir(), + }) + if err != nil { + t.Fatal(err) + } + if summary.HardErrors { + t.Errorf("HardErrors=true") + } + // Exactly one attachment row, with empty storage_path and content_hash. 
+ var sp, ch string + if err := st.DB().QueryRow("SELECT storage_path, content_hash FROM attachments LIMIT 1").Scan(&sp, &ch); err != nil { + t.Fatal(err) + } + if sp != "" || ch != "" { + t.Errorf("path escape not rejected: storage_path=%q content_hash=%q", sp, ch) + } +} + +// TestImportDYI_AttachmentSymlinkRejected verifies that an attachment URI +// pointing at a symlink (e.g. a malicious DYI export that planted a +// symlink to a sensitive local file) is not followed: handleAttachment +// returns no storage_path/content_hash, so the symlink target is never +// copied into the attachment store. +func TestImportDYI_AttachmentSymlinkRejected(t *testing.T) { + st := testutil.NewTestStore(t) + tmp := t.TempDir() + threadPath := filepath.Join(tmp, "your_activity_across_facebook", "messages", "inbox", "evil_LNK") + photosDir := filepath.Join(threadPath, "photos") + if err := os.MkdirAll(photosDir, 0755); err != nil { + t.Fatal(err) + } + // Create a "secret" file outside the attachment URI tree and a + // symlink at the URI path that points at it. The URI itself stays + // inside the export root, so the path-escape guard does not catch + // it; only the symlink check does. 
+ secret := filepath.Join(t.TempDir(), "secret.txt") + if err := os.WriteFile(secret, []byte("password=hunter2"), 0600); err != nil { + t.Fatal(err) + } + link := filepath.Join(photosDir, "innocent.png") + if err := os.Symlink(secret, link); err != nil { + t.Skipf("symlink not supported: %v", err) + } + + body := `{"participants":[{"name":"A"},{"name":"B"}],"messages":[ +{"sender_name":"A","timestamp_ms":1600000000000,"type":"Generic","photos":[{"uri":"messages/inbox/evil_LNK/photos/innocent.png"}]} +],"title":"x"}` + if err := os.WriteFile(filepath.Join(threadPath, "message_1.json"), []byte(body), 0644); err != nil { + t.Fatal(err) + } + + attachmentsDir := t.TempDir() + summary, err := ImportDYI(context.Background(), st, ImportOptions{ + Me: "test.user@facebook.messenger", + RootDir: tmp, + AttachmentsDir: attachmentsDir, + }) + if err != nil { + t.Fatal(err) + } + if summary.HardErrors { + t.Errorf("HardErrors=true") + } + var sp, ch string + if err := st.DB().QueryRow("SELECT storage_path, content_hash FROM attachments LIMIT 1").Scan(&sp, &ch); err != nil { + t.Fatal(err) + } + if sp != "" || ch != "" { + t.Errorf("symlinked attachment not rejected: storage_path=%q content_hash=%q", sp, ch) + } + // Defense in depth: assert nothing under attachmentsDir contains the + // secret bytes, so even a future copy regression would be caught. 
+ _ = filepath.Walk(attachmentsDir, func(p string, info os.FileInfo, err error) error { + if err != nil || info.IsDir() { + return nil + } + data, _ := os.ReadFile(p) + if strings.Contains(string(data), "hunter2") { + t.Errorf("symlink target leaked into attachments store at %s", p) + } + return nil + }) +} + +func TestImportDYI_MissingAttachment(t *testing.T) { + st := testutil.NewTestStore(t) + tmp := t.TempDir() + threadPath := filepath.Join(tmp, "your_activity_across_facebook", "messages", "inbox", "missing_MIS") + if err := os.MkdirAll(threadPath, 0755); err != nil { + t.Fatal(err) + } + body := `{"participants":[{"name":"A"},{"name":"B"}],"messages":[ +{"sender_name":"A","timestamp_ms":1600000000000,"type":"Generic","photos":[{"uri":"messages/inbox/missing_MIS/photos/gone.png"}]} +],"title":"x"}` + if err := os.WriteFile(filepath.Join(threadPath, "message_1.json"), []byte(body), 0644); err != nil { + t.Fatal(err) + } + summary, err := ImportDYI(context.Background(), st, ImportOptions{ + Me: "test.user@facebook.messenger", + RootDir: tmp, + AttachmentsDir: t.TempDir(), + }) + if err != nil { + t.Fatal(err) + } + if summary.HardErrors { + t.Errorf("HardErrors=true") + } + var sp, ch string + if err := st.DB().QueryRow("SELECT storage_path, content_hash FROM attachments LIMIT 1").Scan(&sp, &ch); err != nil { + t.Fatal(err) + } + if sp != "" || ch != "" { + t.Errorf("missing attachment should have empty storage_path: got sp=%q ch=%q", sp, ch) + } +} + +// TestImportDYI_ReactionsFirstClass verifies reaction rows and the +// "[reacted: ...]" body-append independently of FTS5. The FTS5 MATCH +// half of the dual-path lives in importer_fts_test.go. 
+func TestImportDYI_ReactionsFirstClass(t *testing.T) { + st := testutil.NewTestStore(t) + _ = importFixture(t, st, "testdata/json_simple") + var n int + if err := st.DB().QueryRow(` + SELECT COUNT(*) FROM reactions r + JOIN message_bodies b ON b.message_id = r.message_id + WHERE b.body_text LIKE '%café%' + `).Scan(&n); err != nil { + t.Fatal(err) + } + if n != 2 { + t.Errorf("reactions=%d want 2", n) + } + var bodyCount int + if err := st.DB().QueryRow( + `SELECT COUNT(*) FROM message_bodies WHERE body_text LIKE '%[reacted:%'`, + ).Scan(&bodyCount); err != nil { + t.Fatal(err) + } + if bodyCount < 1 { + t.Errorf("body with [reacted: suffix: got %d want >=1", bodyCount) + } +} + +func TestImportDYI_NonTextMessageBodies(t *testing.T) { + st := testutil.NewTestStore(t) + _ = importFixture(t, st, "testdata/json_nontext") + want := map[string]string{ + "inbox/sam_NONTXT__0": "[system] Sam left the chat", + "inbox/sam_NONTXT__1": "[shared link] https://example.com/article\nExample share text", + "inbox/sam_NONTXT__2": "[call: missed, 0s]", + "inbox/sam_NONTXT__3": "[call: 3m 12s]", + "inbox/sam_NONTXT__4": "[photo]", + "inbox/sam_NONTXT__5": "[sticker]", + } + for id, wantBody := range want { + var body string + if err := st.DB().QueryRow(` + SELECT b.body_text FROM message_bodies b + JOIN messages m ON m.id = b.message_id + WHERE m.source_message_id = ?`, id).Scan(&body); err != nil { + t.Errorf("%s: %v", id, err) + continue + } + if body != wantBody { + t.Errorf("%s: body=%q want %q", id, body, wantBody) + } + } +} + +func TestImportDYI_MixedFormatJSONWins(t *testing.T) { + st := testutil.NewTestStore(t) + _ = importFixture(t, st, "testdata/mixed") + // Exactly one conversation. + var n int + if err := st.DB().QueryRow("SELECT COUNT(*) FROM conversations WHERE source_conversation_id='inbox/eve_MIX'").Scan(&n); err != nil { + t.Fatal(err) + } + if n != 1 { + t.Errorf("conversations=%d want 1", n) + } + // 2 messages, no __html_ prefix. 
+ if err := st.DB().QueryRow("SELECT COUNT(*) FROM messages").Scan(&n); err != nil { + t.Fatal(err) + } + if n != 2 { + t.Errorf("messages=%d want 2", n) + } + if err := st.DB().QueryRow("SELECT COUNT(*) FROM messages WHERE source_message_id LIKE '%html_%'").Scan(&n); err != nil { + t.Fatal(err) + } + if n != 0 { + t.Errorf("html_ prefixed rows=%d want 0", n) + } +} + +func TestImportDYI_FormatBoth(t *testing.T) { + st := testutil.NewTestStore(t) + summary, err := ImportDYI(context.Background(), st, ImportOptions{ + Me: "test.user@facebook.messenger", + RootDir: "testdata/mixed", + Format: "both", + AttachmentsDir: t.TempDir(), + }) + if err != nil { + t.Fatal(err) + } + if summary.HardErrors { + t.Error("HardErrors=true") + } + var n int + if err := st.DB().QueryRow("SELECT COUNT(*) FROM messages").Scan(&n); err != nil { + t.Fatal(err) + } + if n != 4 { + t.Errorf("messages=%d want 4", n) + } + if err := st.DB().QueryRow("SELECT COUNT(*) FROM messages WHERE source_message_id LIKE '%__html_%'").Scan(&n); err != nil { + t.Fatal(err) + } + if n != 2 { + t.Errorf("html rows=%d want 2", n) + } + // One conversation row, not two. + if err := st.DB().QueryRow("SELECT COUNT(*) FROM conversations WHERE source_conversation_id='inbox/eve_MIX'").Scan(&n); err != nil { + t.Fatal(err) + } + if n != 1 { + t.Errorf("conversations=%d want 1", n) + } +} + +func TestImportDYI_IsFromMe(t *testing.T) { + st := testutil.NewTestStore(t) + _, err := ImportDYI(context.Background(), st, ImportOptions{ + Me: "test.user@facebook.messenger", + RootDir: "testdata/json_simple", + Format: "auto", + AttachmentsDir: t.TempDir(), + }) + if err != nil { + t.Fatal(err) + } + var ident string + if err := st.DB().QueryRow( + "SELECT identifier FROM sources WHERE source_type='facebook_messenger'", + ).Scan(&ident); err != nil { + t.Fatal(err) + } + if ident != "test.user@facebook.messenger" { + t.Errorf("identifier=%q", ident) + } + // Messages authored by Test User should have is_from_me=1. 
+	var meFromMe, aliceNotFromMe int
+	if err := st.DB().QueryRow(`
+		SELECT COUNT(*) FROM messages m
+		WHERE m.is_from_me = 1 AND m.source_message_id LIKE 'inbox/alice_ABC123__%'
+	`).Scan(&meFromMe); err != nil {
+		t.Fatal(err)
+	}
+	if meFromMe < 1 {
+		t.Errorf("me is_from_me rows=%d want >=1", meFromMe)
+	}
+	if err := st.DB().QueryRow(`
+		SELECT COUNT(*) FROM messages m
+		WHERE m.is_from_me = 0 AND m.source_message_id LIKE 'inbox/alice_ABC123__%'
+	`).Scan(&aliceNotFromMe); err != nil {
+		t.Fatal(err)
+	}
+	if aliceNotFromMe < 1 {
+		t.Errorf("alice is_from_me=0 rows=%d want >=1", aliceNotFromMe)
+	}
+}
+
+func TestImportDYI_LabelTaxonomy(t *testing.T) {
+	st := testutil.NewTestStore(t)
+	_ = importFixture(t, st, "testdata/json_simple")
+	// Messenger and Messenger / Inbox and Messenger / Archived must exist.
+	for _, name := range []string{"Messenger", "Messenger / Inbox", "Messenger / Archived"} {
+		var n int
+		if err := st.DB().QueryRow("SELECT COUNT(*) FROM labels WHERE name = ?", name).Scan(&n); err != nil {
+			t.Fatal(err)
+		}
+		if n != 1 {
+			t.Errorf("label %q count=%d want 1", name, n)
+		}
+	}
+	// Every inbox message has both Messenger and Messenger / Inbox labels.
+ var n int + if err := st.DB().QueryRow(` + SELECT COUNT(*) FROM message_labels ml + JOIN labels l ON l.id = ml.label_id + JOIN messages m ON m.id = ml.message_id + WHERE l.name = 'Messenger / Inbox' + AND m.source_message_id LIKE 'inbox/alice_ABC123__%' + `).Scan(&n); err != nil { + t.Fatal(err) + } + if n != 3 { + t.Errorf("inbox labels on alice msgs: %d want 3", n) + } + if err := st.DB().QueryRow(` + SELECT COUNT(*) FROM message_labels ml + JOIN labels l ON l.id = ml.label_id + JOIN messages m ON m.id = ml.message_id + WHERE l.name = 'Messenger / Archived' + AND m.source_message_id LIKE 'archived_threads/zoe_ARCH__%' + `).Scan(&n); err != nil { + t.Fatal(err) + } + if n != 1 { + t.Errorf("archived labels on zoe msgs: %d want 1", n) + } +} + +func TestImportDYI_SelfParticipantSeeded(t *testing.T) { + st := testutil.NewTestStore(t) + tmp := t.TempDir() + // Empty DYI tree with just messages/inbox/. + if err := os.MkdirAll(filepath.Join(tmp, "your_activity_across_facebook", "messages", "inbox"), 0755); err != nil { + t.Fatal(err) + } + summary, err := ImportDYI(context.Background(), st, ImportOptions{ + Me: "test.user@facebook.messenger", + RootDir: tmp, + AttachmentsDir: t.TempDir(), + }) + if err != nil { + t.Fatal(err) + } + if summary.MessagesProcessed != 0 { + t.Errorf("MessagesProcessed=%d want 0", summary.MessagesProcessed) + } + if summary.HardErrors { + t.Error("HardErrors=true") + } + var n int + if err := st.DB().QueryRow( + "SELECT COUNT(*) FROM participants WHERE email_address = ? 
AND domain = 'facebook.messenger'",
+		"test.user@facebook.messenger",
+	).Scan(&n); err != nil {
+		t.Fatal(err)
+	}
+	if n != 1 {
+		t.Errorf("self participant count=%d want 1", n)
+	}
+}
+
+func TestImportDYI_MeDomainValidation(t *testing.T) {
+	st := testutil.NewTestStore(t)
+	_, err := ImportDYI(context.Background(), st, ImportOptions{
+		Me:             "test.user@gmail.com",
+		RootDir:        "testdata/json_simple",
+		AttachmentsDir: t.TempDir(),
+	})
+	if err == nil {
+		t.Fatal("expected error")
+	}
+	if !strings.Contains(err.Error(), "facebook.messenger") {
+		t.Errorf("error should mention facebook.messenger, got %v", err)
+	}
+	var n int
+	if err := st.DB().QueryRow(
+		"SELECT COUNT(*) FROM sources WHERE source_type='facebook_messenger'",
+	).Scan(&n); err != nil {
+		t.Fatal(err)
+	}
+	if n != 0 {
+		t.Errorf("sources=%d want 0", n)
+	}
+}
+
+// largeFixtureSize is the number of messages in the timing-tripwire
+// fixture. Sized to be fast enough to always run (including under
+// `go test -short`) while still catching catastrophic regressions.
+const largeFixtureSize = 150
+
+// Procedurally-generated fixture for the timing tripwire.
+func writeLargeFixture(t *testing.T) string { + t.Helper() + tmp := t.TempDir() + threadPath := filepath.Join(tmp, "your_activity_across_facebook", "messages", "inbox", "big_BIG") + if err := os.MkdirAll(threadPath, 0755); err != nil { + t.Fatal(err) + } + type rawMsg struct { + SenderName string `json:"sender_name"` + TimestampMs int64 `json:"timestamp_ms"` + Content string `json:"content"` + Type string `json:"type"` + } + type rawPart struct { + Name string `json:"name"` + } + type rawExport struct { + Participants []rawPart `json:"participants"` + Messages []rawMsg `json:"messages"` + Title string `json:"title"` + } + exp := rawExport{ + Participants: []rawPart{{Name: "Test User"}, {Name: "Big Friend"}}, + Title: "Big Friend", + } + for i := 0; i < largeFixtureSize; i++ { + sender := "Test User" + if i%2 == 1 { + sender = "Big Friend" + } + exp.Messages = append(exp.Messages, rawMsg{ + SenderName: sender, + TimestampMs: 1600000000000 + int64(i)*60000, + Content: fmt.Sprintf("Message %d", i), + Type: "Generic", + }) + } + data, err := json.Marshal(exp) + if err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(threadPath, "message_1.json"), data, 0644); err != nil { + t.Fatal(err) + } + return tmp +} + +// writeMultiThreadFixture lays out `n` inbox threads under a DYI root +// at tmp, each with a single short message. Threads are named +// "thread_{i}_OK" so that Discover sorts them deterministically. 
+func writeMultiThreadFixture(t *testing.T, n int) string { + t.Helper() + tmp := t.TempDir() + for i := 0; i < n; i++ { + name := fmt.Sprintf("thread_%02d_OK", i) + threadPath := filepath.Join(tmp, "your_activity_across_facebook", "messages", "inbox", name) + if err := os.MkdirAll(threadPath, 0755); err != nil { + t.Fatal(err) + } + body := fmt.Sprintf( + `{"participants":[{"name":"Test User"},{"name":"Friend %d"}],"messages":[`+ + `{"sender_name":"Friend %d","timestamp_ms":%d,"type":"Generic","content":"hello from %d"}`+ + `],"title":"Friend %d"}`, + i, i, 1600000000000+int64(i)*60000, i, i, + ) + if err := os.WriteFile(filepath.Join(threadPath, "message_1.json"), []byte(body), 0644); err != nil { + t.Fatal(err) + } + } + return tmp +} + +// TestImportDYI_ResumeFromCheckpoint seeds an active sync with a prior +// fbmessengerCheckpoint pointing past the first thread, then runs +// ImportDYI and verifies that (a) WasResumed is true and (b) the +// already-processed thread is skipped on the second run (while still +// present in the store from the first run so idempotence holds). +func TestImportDYI_ResumeFromCheckpoint(t *testing.T) { + st := testutil.NewTestStore(t) + root := writeMultiThreadFixture(t, 3) + + // First run: import everything. This populates the DB and leaves + // no checkpoint (CompleteSync clears cursor_before in practice + // because we only read when there's an *active* run; a completed + // run is fine to coexist). + first := importFixture(t, st, root) + if first.WasResumed { + t.Errorf("first run WasResumed=true, want false") + } + if first.MessagesAdded != 3 { + t.Fatalf("first run MessagesAdded=%d want 3", first.MessagesAdded) + } + if first.ThreadsProcessed != 3 { + t.Fatalf("first run ThreadsProcessed=%d want 3", first.ThreadsProcessed) + } + + // Simulate an in-progress run: create a new running sync_run for + // the facebook_messenger source and write a fbmessengerCheckpoint + // whose ThreadIndex == 2 (two threads already done). 
+ src, err := st.GetOrCreateSource("facebook_messenger", "test.user@facebook.messenger") + if err != nil { + t.Fatal(err) + } + syncID, err := st.StartSync(src.ID, "import-messenger") + if err != nil { + t.Fatal(err) + } + absRoot, err := filepath.Abs(root) + if err != nil { + t.Fatal(err) + } + cpJSON, err := json.Marshal(fbmessengerCheckpoint{ + RootDir: absRoot, + ThreadIndex: 2, + LastMessageIndex: 0, + }) + if err != nil { + t.Fatal(err) + } + if err := st.UpdateSyncCheckpoint(syncID, &store.Checkpoint{ + PageToken: string(cpJSON), + MessagesProcessed: 2, + MessagesAdded: 2, + }); err != nil { + t.Fatal(err) + } + + // Second run: should detect the active checkpoint and resume, + // processing only the 3rd thread. + before := snapshotRowCounts(t, st) + second := importFixture(t, st, root) + after := snapshotRowCounts(t, st) + + if !second.WasResumed { + t.Errorf("second run WasResumed=false, want true") + } + if second.ThreadsProcessed != 1 { + t.Errorf("second run ThreadsProcessed=%d want 1 (only last thread)", second.ThreadsProcessed) + } + // Idempotence: row counts must not change (source_message_id + // dedupes the one re-imported thread if it were processed; but + // here the resume skip means it is not touched at all). + for k, v := range before { + if after[k] != v { + t.Errorf("%s: before=%d after=%d", k, v, after[k]) + } + } + // All three threads must still be present. + var n int + if err := st.DB().QueryRow( + `SELECT COUNT(*) FROM conversations WHERE source_conversation_id LIKE 'inbox/thread_%_OK'`, + ).Scan(&n); err != nil { + t.Fatal(err) + } + if n != 3 { + t.Errorf("conversations=%d want 3", n) + } +} + +// TestImportDYI_ResumeWrongRootRejected verifies that a prior +// checkpoint for a different RootDir is rejected. 
+func TestImportDYI_ResumeWrongRootRejected(t *testing.T) { + st := testutil.NewTestStore(t) + root := writeMultiThreadFixture(t, 2) + + src, err := st.GetOrCreateSource("facebook_messenger", "test.user@facebook.messenger") + if err != nil { + t.Fatal(err) + } + syncID, err := st.StartSync(src.ID, "import-messenger") + if err != nil { + t.Fatal(err) + } + cpJSON, err := json.Marshal(fbmessengerCheckpoint{ + RootDir: "/some/other/dir", + ThreadIndex: 1, + }) + if err != nil { + t.Fatal(err) + } + if err := st.UpdateSyncCheckpoint(syncID, &store.Checkpoint{ + PageToken: string(cpJSON), + }); err != nil { + t.Fatal(err) + } + + _, err = ImportDYI(context.Background(), st, ImportOptions{ + Me: "test.user@facebook.messenger", + RootDir: root, + AttachmentsDir: t.TempDir(), + }) + if err == nil { + t.Fatal("expected error for wrong root, got nil") + } + if !strings.Contains(err.Error(), "different root") { + t.Errorf("error=%v want mention of 'different root'", err) + } +} + +// TestImportDYI_ResumeFromFailedSync verifies that a checkpoint saved +// before FailSync is still found on the next run, so interrupted imports +// can resume instead of restarting from scratch. +func TestImportDYI_ResumeFromFailedSync(t *testing.T) { + st := testutil.NewTestStore(t) + root := writeMultiThreadFixture(t, 3) + + // First run: import everything. + first := importFixture(t, st, root) + if first.MessagesAdded != 3 { + t.Fatalf("first run MessagesAdded=%d want 3", first.MessagesAdded) + } + + // Simulate a failed (interrupted) sync: create a sync run, save a + // checkpoint, then mark it failed — mimicking what happens when the + // user hits Ctrl-C. 
+ src, err := st.GetOrCreateSource("facebook_messenger", "test.user@facebook.messenger") + if err != nil { + t.Fatal(err) + } + syncID, err := st.StartSync(src.ID, "import-messenger") + if err != nil { + t.Fatal(err) + } + absRoot, err := filepath.Abs(root) + if err != nil { + t.Fatal(err) + } + cpJSON, err := json.Marshal(fbmessengerCheckpoint{ + RootDir: absRoot, + ThreadIndex: 2, + }) + if err != nil { + t.Fatal(err) + } + if err := st.UpdateSyncCheckpoint(syncID, &store.Checkpoint{ + PageToken: string(cpJSON), + MessagesProcessed: 2, + MessagesAdded: 2, + }); err != nil { + t.Fatal(err) + } + // Mark the sync as failed, simulating a graceful interrupt. + if err := st.FailSync(syncID, "context canceled"); err != nil { + t.Fatal(err) + } + + // The next run must find the failed sync's checkpoint and resume. + second := importFixture(t, st, root) + if !second.WasResumed { + t.Errorf("second run WasResumed=false, want true") + } + if second.ThreadsProcessed != 1 { + t.Errorf("second run ThreadsProcessed=%d want 1 (only last thread)", second.ThreadsProcessed) + } +} + +// TestImportDYI_ResumeFromFirstThreadCheckpoint verifies that a +// checkpoint saved while still processing the first thread +// (ThreadIndex == 0) is treated as resumable rather than ignored. +// source_message_id dedup covers the data path; this test asserts the +// UX-visible state (WasResumed true and cumulative counters carried +// forward) so a user-visible interrupt during thread 0 is reflected in +// the next run's summary. +func TestImportDYI_ResumeFromFirstThreadCheckpoint(t *testing.T) { + st := testutil.NewTestStore(t) + root := writeMultiThreadFixture(t, 2) + + // Seed a failed sync whose checkpoint is mid-first-thread. 
+ src, err := st.GetOrCreateSource("facebook_messenger", "test.user@facebook.messenger") + if err != nil { + t.Fatal(err) + } + syncID, err := st.StartSync(src.ID, "import-messenger") + if err != nil { + t.Fatal(err) + } + absRoot, err := filepath.Abs(root) + if err != nil { + t.Fatal(err) + } + cpJSON, err := json.Marshal(fbmessengerCheckpoint{ + RootDir: absRoot, + ThreadIndex: 0, + LastMessageIndex: 0, + }) + if err != nil { + t.Fatal(err) + } + if err := st.UpdateSyncCheckpoint(syncID, &store.Checkpoint{ + PageToken: string(cpJSON), + MessagesProcessed: 1, + MessagesAdded: 1, + }); err != nil { + t.Fatal(err) + } + if err := st.FailSync(syncID, "context canceled"); err != nil { + t.Fatal(err) + } + + summary := importFixture(t, st, root) + if !summary.WasResumed { + t.Errorf("WasResumed=false, want true for first-thread checkpoint") + } + // Cumulative counters must carry over from the prior run. + if summary.MessagesProcessed < 1 { + t.Errorf("MessagesProcessed=%d, want carry-over from prior run (>=1)", summary.MessagesProcessed) + } +} + +func TestImportDYI_InvalidFormatRejected(t *testing.T) { + st := testutil.NewTestStore(t) + root := writeMultiThreadFixture(t, 1) + _, err := ImportDYI(context.Background(), st, ImportOptions{ + Me: "test.user@facebook.messenger", + RootDir: root, + AttachmentsDir: t.TempDir(), + Format: "jsno", + }) + if err == nil { + t.Fatal("expected error for invalid format, got nil") + } + if !strings.Contains(err.Error(), "unknown --format") { + t.Errorf("error=%v want mention of 'unknown --format'", err) + } +} + +// TestImportDYI_StaleFailedCheckpointIgnoredAfterCompletion verifies that a +// failed run's checkpoint is not used to resume a later import once a +// successful run has occurred since. 
Without the fix,
// GetLatestCheckpointedSync would still return the older failed run because
// its status filter (running/failed) excluded the more recent completed
// run, and a re-import would silently resume from the stale checkpoint
// and skip threads already covered by the successful run.
func TestImportDYI_StaleFailedCheckpointIgnoredAfterCompletion(t *testing.T) {
	st := testutil.NewTestStore(t)
	root := writeMultiThreadFixture(t, 3)

	// Seed a failed sync with a checkpoint pointing past thread 0.
	src, err := st.GetOrCreateSource("facebook_messenger", "test.user@facebook.messenger")
	if err != nil {
		t.Fatal(err)
	}
	failID, err := st.StartSync(src.ID, "import-messenger")
	if err != nil {
		t.Fatal(err)
	}
	absRoot, err := filepath.Abs(root)
	if err != nil {
		t.Fatal(err)
	}
	// The checkpoint payload mirrors what the importer persists: the
	// absolute export root plus the index of the next thread to process.
	cpJSON, err := json.Marshal(fbmessengerCheckpoint{RootDir: absRoot, ThreadIndex: 2})
	if err != nil {
		t.Fatal(err)
	}
	if err := st.UpdateSyncCheckpoint(failID, &store.Checkpoint{
		PageToken: string(cpJSON), MessagesProcessed: 2, MessagesAdded: 2,
	}); err != nil {
		t.Fatal(err)
	}
	if err := st.FailSync(failID, "context canceled"); err != nil {
		t.Fatal(err)
	}

	// Run a successful import after the failed run. This becomes the
	// latest sync, so a future re-import must NOT resume from the older
	// failed checkpoint.
	first := importFixture(t, st, root)
	if first.MessagesAdded != 3 {
		t.Fatalf("first run MessagesAdded=%d want 3", first.MessagesAdded)
	}

	second := importFixture(t, st, root)
	if second.WasResumed {
		t.Errorf("WasResumed=true: stale failed checkpoint resumed despite later completed run")
	}
	if second.ThreadsProcessed != 3 {
		t.Errorf("ThreadsProcessed=%d want 3 (full re-scan)", second.ThreadsProcessed)
	}
}

// TestImportDYI_ReimportPicksUpNewMessages verifies that re-importing a root
// after a successful import picks up new messages added to an existing thread,
// rather than treating the completed run as resumable and skipping threads.
// Regression test for: GetLatestCheckpointedSync matching completed runs.
func TestImportDYI_ReimportPicksUpNewMessages(t *testing.T) {
	// Copy json_simple fixture to a temp dir so we can mutate it.
	root := t.TempDir()
	cpDir(t, "testdata/json_simple", root)

	st := testutil.NewTestStore(t)

	// First import: 4 messages (3 inbox + 1 archived).
	s1 := importFixture(t, st, root)
	if s1.MessagesAdded != 4 {
		t.Fatalf("first import: MessagesAdded=%d want 4", s1.MessagesAdded)
	}
	before := countMessages(t, st, "message_type='fbmessenger'")
	if before != 4 {
		t.Fatalf("messages after first import=%d want 4", before)
	}

	// Add a new message to the existing alice thread.
	threadFile := filepath.Join(root, "your_activity_across_facebook/messages/inbox/alice_ABC123/message_1.json")
	raw, err := os.ReadFile(threadFile)
	if err != nil {
		t.Fatal(err)
	}
	var thread map[string]any
	if err := json.Unmarshal(raw, &thread); err != nil {
		t.Fatal(err)
	}
	msgs := thread["messages"].([]any)
	// DYI exports list messages newest-first, so the new message is
	// prepended rather than appended.
	newMsg := map[string]any{
		"sender_name":  "Alice Example",
		"timestamp_ms": float64(1600000200000),
		"content":      "New message after first import",
		"type":         "Generic",
	}
	thread["messages"] = append([]any{newMsg}, msgs...)
	updated, err := json.MarshalIndent(thread, "", " ")
	if err != nil {
		t.Fatal(err)
	}
	if err := os.WriteFile(threadFile, updated, 0o644); err != nil {
		t.Fatal(err)
	}

	// Re-import the same root. The new message must be picked up.
	s2 := importFixture(t, st, root)
	after := countMessages(t, st, "message_type='fbmessenger'")
	if after != before+1 {
		t.Errorf("messages after re-import=%d want %d (added=%d)", after, before+1, s2.MessagesAdded)
	}
}

// cpDir recursively copies src into dst.
// NOTE(review): file modes are fixed at 0o644/0o755 and symlinks are not
// handled — sufficient for test fixtures, not a general-purpose copy.
func cpDir(t *testing.T, src, dst string) {
	t.Helper()
	entries, err := os.ReadDir(src)
	if err != nil {
		t.Fatal(err)
	}
	for _, e := range entries {
		sp := filepath.Join(src, e.Name())
		dp := filepath.Join(dst, e.Name())
		if e.IsDir() {
			if err := os.MkdirAll(dp, 0o755); err != nil {
				t.Fatal(err)
			}
			cpDir(t, sp, dp)
		} else {
			data, err := os.ReadFile(sp)
			if err != nil {
				t.Fatal(err)
			}
			if err := os.WriteFile(dp, data, 0o644); err != nil {
				t.Fatal(err)
			}
		}
	}
}

// TestImportDYI_SynthesizedSenderLinkedToConversation verifies that when a
// message's sender_name is not in the thread's participants array (a
// system/orphan sender), the synthesized participant is still linked to the
// conversation via conversation_participants. Regression for the case where
// senderID was recorded on the message but not joined to the conversation,
// skewing participant-based analytics.
func TestImportDYI_SynthesizedSenderLinkedToConversation(t *testing.T) {
	st := testutil.NewTestStore(t)
	tmp := t.TempDir()
	threadDir := filepath.Join(tmp, "your_activity_across_facebook", "messages", "inbox", "alice_ORPH")
	if err := os.MkdirAll(threadDir, 0o755); err != nil {
		t.Fatal(err)
	}
	fixture := map[string]any{
		"participants": []map[string]any{
			{"name": "Test User"},
			{"name": "Alice Example"},
		},
		"messages": []map[string]any{
			{
				"sender_name":  "Alice Example",
				"timestamp_ms": 1600000000000,
				"content":      "hi",
				"type":         "Generic",
			},
			{
				// Orphan sender: not in participants.
				"sender_name":  "Facebook User",
				"timestamp_ms": 1600000001000,
				"content":      "system message",
				"type":         "Generic",
			},
		},
		"title":                "Alice Example",
		"is_still_participant": true,
		"thread_type":          "Regular",
		"thread_path":          "inbox/alice_ORPH",
	}
	data, err := json.Marshal(fixture)
	if err != nil {
		t.Fatal(err)
	}
	if err := os.WriteFile(filepath.Join(threadDir, "message_1.json"), data, 0o644); err != nil {
		t.Fatal(err)
	}
	_, err = ImportDYI(context.Background(), st, ImportOptions{
		Me:             "test.user@facebook.messenger",
		RootDir:        tmp,
		AttachmentsDir: t.TempDir(),
	})
	if err != nil {
		t.Fatal(err)
	}

	// The synthesized "Facebook User" sender must be linked to the
	// conversation via conversation_participants, not just present as
	// sender_id on its message. The email below is the slug address the
	// importer synthesizes for the name "Facebook User".
	var n int
	if err := st.DB().QueryRow(`
		SELECT COUNT(*) FROM conversation_participants cp
		JOIN participants p ON p.id = cp.participant_id
		WHERE cp.conversation_id = (
			SELECT id FROM conversations WHERE source_conversation_id = 'inbox/alice_ORPH'
		)
		AND p.email_address = 'facebook.user@facebook.messenger'
	`).Scan(&n); err != nil {
		t.Fatal(err)
	}
	if n != 1 {
		t.Errorf("orphan sender not linked to conversation: got %d want 1", n)
	}
}

// TestImportDYI_SenderIDPreservedOnReimport verifies that re-importing a
// thread whose message sender no longer resolves (e.g. the participant
// was renamed or the second import uses a different fixture) does not
// null-out the previously-recorded messages.sender_id, display name, or
// is_from_me flag. The importer reads any existing sender data and reuses
// it when the current run can't produce one.
func TestImportDYI_SenderIDPreservedOnReimport(t *testing.T) {
	st := testutil.NewTestStore(t)
	tmp := t.TempDir()
	threadDir := filepath.Join(tmp, "your_activity_across_facebook", "messages", "inbox", "alice_PRES")
	if err := os.MkdirAll(threadDir, 0o755); err != nil {
		t.Fatal(err)
	}

	// write materializes a two-message fixture with configurable sender
	// names so the second import can strip them.
	write := func(msg0Sender, msg1Sender string) {
		fixture := map[string]any{
			"participants": []map[string]any{
				{"name": "Test User"},
				{"name": "Alice Example"},
			},
			"messages": []map[string]any{
				{
					"sender_name":  msg0Sender,
					"timestamp_ms": 1600000000000,
					"content":      "from alice",
					"type":         "Generic",
				},
				{
					"sender_name":  msg1Sender,
					"timestamp_ms": 1600000001000,
					"content":      "from me",
					"type":         "Generic",
				},
			},
			"title":                "Alice Example",
			"is_still_participant": true,
			"thread_type":          "Regular",
			"thread_path":          "inbox/alice_PRES",
		}
		data, err := json.Marshal(fixture)
		if err != nil {
			t.Fatal(err)
		}
		if err := os.WriteFile(filepath.Join(threadDir, "message_1.json"), data, 0o644); err != nil {
			t.Fatal(err)
		}
	}

	// First import: message 0 from Alice, message 1 from Test User (self).
	write("Alice Example", "Test User")
	if _, err := ImportDYI(context.Background(), st, ImportOptions{
		Me:             "test.user@facebook.messenger",
		RootDir:        tmp,
		AttachmentsDir: t.TempDir(),
	}); err != nil {
		t.Fatal(err)
	}

	// snap captures the sender-related columns for one message so the
	// before/after states can be compared field by field.
	type snap struct {
		senderID sql.NullInt64
		isFromMe bool
		fromName sql.NullString
		fromPID  sql.NullInt64
	}
	capture := func(srcMsgID string) snap {
		t.Helper()
		var s snap
		if err := st.DB().QueryRow(
			`SELECT sender_id, is_from_me FROM messages WHERE source_message_id = ?`,
			srcMsgID,
		).Scan(&s.senderID, &s.isFromMe); err != nil {
			t.Fatalf("messages row for %s: %v", srcMsgID, err)
		}
		if err := st.DB().QueryRow(`
			SELECT mr.display_name, mr.participant_id
			FROM message_recipients mr
			JOIN messages m ON m.id = mr.message_id
			WHERE m.source_message_id = ? AND mr.recipient_type = 'from'
		`, srcMsgID).Scan(&s.fromName, &s.fromPID); err != nil {
			t.Fatalf("from recipient for %s: %v", srcMsgID, err)
		}
		return s
	}

	aliceBefore := capture("inbox/alice_PRES__0")
	selfBefore := capture("inbox/alice_PRES__1")
	if !aliceBefore.senderID.Valid || aliceBefore.isFromMe {
		t.Fatalf("alice msg setup: senderID=%v isFromMe=%v", aliceBefore.senderID, aliceBefore.isFromMe)
	}
	if !selfBefore.senderID.Valid || !selfBefore.isFromMe {
		t.Fatalf("self msg setup: senderID=%v isFromMe=%v", selfBefore.senderID, selfBefore.isFromMe)
	}

	// Second import: both sender_names stripped so the current run can't
	// resolve them. The importer must preserve prior sender_id, is_from_me,
	// and message_recipients for both messages.
	write("", "")
	summary, err := ImportDYI(context.Background(), st, ImportOptions{
		Me:             "test.user@facebook.messenger",
		RootDir:        tmp,
		AttachmentsDir: t.TempDir(),
	})
	if err != nil {
		t.Fatal(err)
	}
	// Rehydrated self-authored messages must still count toward
	// FromMeCount so the CLI doesn't warn about a --me mismatch.
	if summary.FromMeCount < 1 {
		t.Errorf("FromMeCount=%d want >=1 on rehydration", summary.FromMeCount)
	}

	aliceAfter := capture("inbox/alice_PRES__0")
	selfAfter := capture("inbox/alice_PRES__1")

	if !aliceAfter.senderID.Valid || aliceAfter.senderID.Int64 != aliceBefore.senderID.Int64 {
		t.Errorf("alice sender_id not preserved: before=%v after=%v", aliceBefore.senderID, aliceAfter.senderID)
	}
	if aliceAfter.isFromMe {
		t.Errorf("alice is_from_me flipped to true")
	}
	if !aliceAfter.fromName.Valid || aliceAfter.fromName.String != "Alice Example" {
		t.Errorf("alice from display_name=%q want Alice Example", aliceAfter.fromName.String)
	}
	if !aliceAfter.fromPID.Valid || aliceAfter.fromPID.Int64 != aliceBefore.senderID.Int64 {
		t.Errorf("alice from participant_id not preserved: got %v want %d", aliceAfter.fromPID, aliceBefore.senderID.Int64)
	}

	if !selfAfter.senderID.Valid || selfAfter.senderID.Int64 != selfBefore.senderID.Int64 {
		t.Errorf("self sender_id not preserved: before=%v after=%v", selfBefore.senderID, selfAfter.senderID)
	}
	if !selfAfter.isFromMe {
		t.Errorf("self is_from_me not preserved (flipped to false)")
	}
	// The self participant is seeded with an empty participants.display_name,
	// so rehydration must fall back to the prior message_recipients display
	// name rather than clobbering it with "".
	if !selfAfter.fromName.Valid || selfAfter.fromName.String != "Test User" {
		t.Errorf("self from display_name=%q want Test User", selfAfter.fromName.String)
	}

	// The account owner must NOT appear in "to" for the self-authored
	// message — otherwise the dropped is_from_me flag would inflate
	// participant analytics and cause self-to-self recipient rows.
	var selfInTo int
	if err := st.DB().QueryRow(`
		SELECT COUNT(*) FROM message_recipients mr
		JOIN messages m ON m.id = mr.message_id
		WHERE m.source_message_id = 'inbox/alice_PRES__1'
		AND mr.recipient_type = 'to'
		AND mr.participant_id = ?
	`, selfBefore.senderID.Int64).Scan(&selfInTo); err != nil {
		t.Fatal(err)
	}
	if selfInTo != 0 {
		t.Errorf("self participant appeared in 'to' for self-authored message: count=%d", selfInTo)
	}
}

// TestImportDYI_ReimportRepairsConversationParticipant verifies that a
// conversation missing a participant row (the pre-fix state for databases
// imported before synthesized senders were linked) gets re-linked on a
// subsequent import via the sender_id-preservation rehydration path.
func TestImportDYI_ReimportRepairsConversationParticipant(t *testing.T) {
	st := testutil.NewTestStore(t)
	tmp := t.TempDir()
	threadDir := filepath.Join(tmp, "your_activity_across_facebook", "messages", "inbox", "alice_REPAIR")
	if err := os.MkdirAll(threadDir, 0o755); err != nil {
		t.Fatal(err)
	}
	// The sender "Facebook User" is intentionally NOT in the participants
	// list, so the message goes through the synthesized-sender path. Only
	// the rehydration branch (not the thread-participants loop) would
	// re-link this participant on a later re-import.
	fixture := map[string]any{
		"participants": []map[string]any{
			{"name": "Test User"},
			{"name": "Alice Example"},
		},
		"messages": []map[string]any{
			{
				"sender_name":  "Facebook User",
				"timestamp_ms": 1600000000000,
				"content":      "system message",
				"type":         "Generic",
			},
		},
		"title":                "Alice Example",
		"is_still_participant": true,
		"thread_type":          "Regular",
		"thread_path":          "inbox/alice_REPAIR",
	}
	data, err := json.Marshal(fixture)
	if err != nil {
		t.Fatal(err)
	}
	writeFixture := func() {
		if err := os.WriteFile(filepath.Join(threadDir, "message_1.json"), data, 0o644); err != nil {
			t.Fatal(err)
		}
	}

	writeFixture()
	if _, err := ImportDYI(context.Background(), st, ImportOptions{
		Me:             "test.user@facebook.messenger",
		RootDir:        tmp,
		AttachmentsDir: t.TempDir(),
	}); err != nil {
		t.Fatal(err)
	}
	var orphanID int64
	if err := st.DB().QueryRow(
		`SELECT sender_id FROM messages WHERE source_message_id = 'inbox/alice_REPAIR__0'`,
	).Scan(&orphanID); err != nil {
		t.Fatal(err)
	}

	// Simulate the pre-fix DB state: delete the synthesized sender's
	// conversation_participants row while leaving the message sender_id
	// intact. This is the scenario a database imported before synthesized
	// senders were linked would end up in.
	var convID int64
	if err := st.DB().QueryRow(
		`SELECT id FROM conversations WHERE source_conversation_id = 'inbox/alice_REPAIR'`,
	).Scan(&convID); err != nil {
		t.Fatal(err)
	}
	if _, err := st.DB().Exec(
		`DELETE FROM conversation_participants WHERE conversation_id = ? AND participant_id = ?`,
		convID, orphanID,
	); err != nil {
		t.Fatal(err)
	}

	// Re-import with sender_name stripped so the current run can't
	// synthesize the orphan — rehydration is the only path that can
	// recover the participant link.
	fixture["messages"].([]map[string]any)[0]["sender_name"] = ""
	data, err = json.Marshal(fixture)
	if err != nil {
		t.Fatal(err)
	}
	writeFixture()
	if _, err := ImportDYI(context.Background(), st, ImportOptions{
		Me:             "test.user@facebook.messenger",
		RootDir:        tmp,
		AttachmentsDir: t.TempDir(),
	}); err != nil {
		t.Fatal(err)
	}

	var n int
	if err := st.DB().QueryRow(
		`SELECT COUNT(*) FROM conversation_participants WHERE conversation_id = ? AND participant_id = ?`,
		convID, orphanID,
	).Scan(&n); err != nil {
		t.Fatal(err)
	}
	if n != 1 {
		t.Errorf("conversation_participants not repaired on re-import: got %d want 1", n)
	}
}

// TestImportDYI_TimingTripwire imports a large synthetic fixture and fails
// if the import takes pathologically long — a coarse guard against
// accidental O(n²) regressions in the import path.
func TestImportDYI_TimingTripwire(t *testing.T) {
	st := testutil.NewTestStore(t)
	root := writeLargeFixture(t)
	start := time.Now()
	summary, err := ImportDYI(context.Background(), st, ImportOptions{
		Me:             "test.user@facebook.messenger",
		RootDir:        root,
		AttachmentsDir: t.TempDir(),
	})
	if err != nil {
		t.Fatal(err)
	}
	elapsed := time.Since(start)
	// 30s is a generous ceiling; the budget is wide to avoid flakes on
	// slow CI machines while still catching quadratic blowups.
	if elapsed > 30*time.Second {
		t.Errorf("import took %v, want < 30s", elapsed)
	}
	if summary.MessagesAdded != int64(largeFixtureSize) {
		t.Errorf("MessagesAdded=%d want %d", summary.MessagesAdded, largeFixtureSize)
	}
}
diff --git a/internal/fbmessenger/json_parser.go b/internal/fbmessenger/json_parser.go
new file mode 100644
index 00000000..acff675b
--- /dev/null
+++ b/internal/fbmessenger/json_parser.go
@@ -0,0 +1,414 @@
package fbmessenger

import (
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strconv"
	"strings"
	"time"
)

// rawJSONExport is the shape of a single message_*.json DYI file.
type rawJSONExport struct {
	Participants []rawJSONParticipant `json:"participants"`
	Messages     []rawJSONMessage     `json:"messages"`
	Title        string               `json:"title"`
	// ThreadType and ThreadPath are decoded but not read by
	// ParseJSONThread in this file; presumably other callers use them —
	// TODO confirm before removing.
	ThreadType string `json:"thread_type"`
	ThreadPath string `json:"thread_path"`
}

// rawJSONParticipant is one entry of a thread's participants array.
type rawJSONParticipant struct {
	Name string `json:"name"`
}

// rawJSONPhoto is the shared shape for every media list in a DYI message
// (photos, videos, audio_files, files, gifs): just a URI plus an optional
// creation timestamp.
type rawJSONPhoto struct {
	URI               string `json:"uri"`
	CreationTimestamp int64  `json:"creation_timestamp"`
}

type rawJSONSticker struct {
	URI string `json:"uri"`
}

type rawJSONShare struct {
	Link      string `json:"link"`
	ShareText string `json:"share_text"`
}

type rawJSONReaction struct {
	Reaction string `json:"reaction"`
	Actor    string `json:"actor"`
}

// rawJSONMessage is one message within a message_*.json file. Text fields
// arrive mojibake-encoded (see DecodeMojibake); media lists reuse the
// rawJSONPhoto shape.
type rawJSONMessage struct {
	SenderName   string          `json:"sender_name"`
	TimestampMs  int64           `json:"timestamp_ms"`
	Content      string          `json:"content"`
	Type         string          `json:"type"`
	Photos       []rawJSONPhoto  `json:"photos"`
	Videos       []rawJSONPhoto  `json:"videos"`
	Audio        []rawJSONPhoto  `json:"audio_files"`
	Files        []rawJSONPhoto  `json:"files"`
	Gifs         []rawJSONPhoto  `json:"gifs"`
	Sticker      *rawJSONSticker `json:"sticker"`
	Share        *rawJSONShare   `json:"share"`
	CallDuration *int64          `json:"call_duration"`
	Missed       bool            `json:"missed"`
	Reactions    []rawJSONReaction `json:"reactions"`
}

// reMessageFile matches the numbered per-thread files (message_1.json,
// message_2.json, ...); the capture group is the numeric suffix used for
// ordering.
var reMessageFile = regexp.MustCompile(`^message_(\d+)\.json$`)

// ParseJSONThread parses every message_*.json file in a DYI thread
// directory and returns a populated Thread. rootDir is the DYI export
// root; threadDir is the thread path returned by Discover.
+func ParseJSONThread(rootDir, threadDir string) (*Thread, error) { + entries, err := os.ReadDir(threadDir) + if err != nil { + return nil, fmt.Errorf("fbmessenger: read thread dir: %w", err) + } + + type numbered struct { + name string + num int + } + var files []numbered + var badSiblings []string + for _, e := range entries { + if e.IsDir() { + continue + } + name := e.Name() + if !strings.HasPrefix(name, "message_") || !strings.HasSuffix(name, ".json") { + continue + } + m := reMessageFile.FindStringSubmatch(name) + if m == nil { + // Unrecognized sibling (e.g. message_final.json). Skip it and + // record the bad name so the importer can log + count it + // without aborting the entire thread. + badSiblings = append(badSiblings, name) + continue + } + n, err := strconv.Atoi(m[1]) + if err != nil { + // Regex guarantees \d+, so this is effectively unreachable; + // treat it the same as an unrecognized sibling. + badSiblings = append(badSiblings, name) + continue + } + files = append(files, numbered{name: name, num: n}) + } + if len(files) == 0 { + return nil, fmt.Errorf("fbmessenger: no message_*.json files in %s", threadDir) + } + sort.Slice(files, func(i, j int) bool { return files[i].num < files[j].num }) + + // Decode every file. The DYI format emits each file as a complete + // object with `participants` and `messages`. We take participants + // from the lowest-numbered file, and concatenate all messages. 
+ var thread Thread + thread.Format = "json" + thread.DirName = filepath.Base(threadDir) + thread.BadSiblings = badSiblings + var rawConcat []rawJSONMessage + var title string + var participants []Participant + + for i, f := range files { + full := filepath.Join(threadDir, f.name) + data, err := os.ReadFile(full) + if err != nil { + return nil, fmt.Errorf("fbmessenger: read %s: %w", f.name, err) + } + var decoded rawJSONExport + if err := json.Unmarshal(data, &decoded); err != nil { + return nil, fmt.Errorf("%w: %s: %v", ErrCorruptJSON, f.name, err) + } + if i == 0 { + title = DecodeMojibake(decoded.Title) + for _, p := range decoded.Participants { + participants = append(participants, Participant{Name: DecodeMojibake(p.Name)}) + } + } + rawConcat = append(rawConcat, decoded.Messages...) + } + + // Deduplicate participants (Facebook sometimes repeats). + seen := make(map[string]bool, len(participants)) + uniq := participants[:0] + for _, p := range participants { + if seen[p.Name] { + continue + } + seen[p.Name] = true + uniq = append(uniq, p) + } + participants = uniq + + // Sort concatenated messages chronologically. + sort.SliceStable(rawConcat, func(i, j int) bool { + return rawConcat[i].TimestampMs < rawConcat[j].TimestampMs + }) + + thread.Title = title + thread.Participants = participants + if len(participants) <= 2 { + thread.ConvType = "direct_chat" + } else { + thread.ConvType = "group_chat" + } + + // Store raw bytes: we concatenate the original JSON files into a + // JSON array so the stored raw is a self-contained round-trippable + // payload. Callers that want the exact per-file text can still read + // from disk. + var rawBuf strings.Builder + rawBuf.WriteByte('[') + for i, f := range files { + if i > 0 { + rawBuf.WriteByte(',') + } + data, err := os.ReadFile(filepath.Join(threadDir, f.name)) + if err != nil { + return nil, err + } + rawBuf.Write(data) + } + rawBuf.WriteByte(']') + thread.RawBytes = []byte(rawBuf.String()) + + // Render each message. 
+ absRoot, err := filepath.Abs(rootDir) + if err != nil { + return nil, fmt.Errorf("fbmessenger: abs root: %w", err) + } + thread.Messages = make([]Message, 0, len(rawConcat)) + for idx, m := range rawConcat { + msg := Message{ + Index: idx, + SenderName: DecodeMojibake(m.SenderName), + Type: m.Type, + } + if m.TimestampMs > 0 { + msg.SentAt = time.UnixMilli(m.TimestampMs).UTC() + } + msg.Body = renderJSONBody(m) + msg.Attachments = resolveAttachments(absRoot, m) + for _, r := range m.Reactions { + msg.Reactions = append(msg.Reactions, Reaction{ + Actor: DecodeMojibake(r.Actor), + Reaction: DecodeMojibake(r.Reaction), + }) + } + if len(msg.Reactions) > 0 { + msg.Body = appendReactionSummary(msg.Body, msg.Reactions) + } + thread.Messages = append(thread.Messages, msg) + } + return &thread, nil +} + +// renderJSONBody computes the body string for one DYI message, following +// the placeholder and rendering rules from plan decisions D9/D10. +func renderJSONBody(m rawJSONMessage) string { + content := DecodeMojibake(m.Content) + switch strings.ToLower(m.Type) { + case "call": + return renderCallBody(m) + case "share": + return renderShareBody(m, content) + case "unsubscribe": + if content == "" { + return "[system]" + } + return "[system] " + content + } + if content != "" { + return content + } + if m.Sticker != nil && m.Sticker.URI != "" { + return "[sticker]" + } + if len(m.Photos) > 0 { + return "[photo]" + } + if len(m.Videos) > 0 { + return "[video]" + } + if len(m.Audio) > 0 { + return "[audio]" + } + if len(m.Gifs) > 0 { + return "[gif]" + } + if len(m.Files) > 0 { + return "[file]" + } + return "" +} + +func renderCallBody(m rawJSONMessage) string { + dur := int64(0) + if m.CallDuration != nil { + dur = *m.CallDuration + } + if m.Missed || dur == 0 { + return "[call: missed, 0s]" + } + minutes := dur / 60 + seconds := dur % 60 + if minutes > 0 { + return fmt.Sprintf("[call: %dm %ds]", minutes, seconds) + } + return fmt.Sprintf("[call: %ds]", seconds) +} + 
+func renderShareBody(m rawJSONMessage, content string) string { + var b strings.Builder + b.WriteString("[shared link] ") + if m.Share != nil { + b.WriteString(m.Share.Link) + text := DecodeMojibake(m.Share.ShareText) + if text != "" { + b.WriteByte('\n') + b.WriteString(text) + } + } + if content != "" { + b.WriteByte('\n') + b.WriteString(content) + } + return b.String() +} + +// reReactionSuffix matches the "\n\n[reacted: ...]" summary appended by +// appendReactionSummary. It is anchored to the end of the string because +// callers append the suffix last. +var reReactionSuffix = regexp.MustCompile(`(?s)\n\n\[reacted: .*\]\z`) + +// stripReactionSuffix removes the trailing reaction summary if present, +// returning the raw body text that the parser extracted before the +// suffix was appended. HTML exports do not carry this suffix, so this +// helper is what the convergence test uses to compare JSON and HTML on +// their common ground (the message body proper). +func stripReactionSuffix(body string) string { + return reReactionSuffix.ReplaceAllString(body, "") +} + +func appendReactionSummary(body string, rs []Reaction) string { + parts := make([]string, 0, len(rs)) + for _, r := range rs { + parts = append(parts, fmt.Sprintf("%s (%s)", r.Reaction, r.Actor)) + } + suffix := "\n\n[reacted: " + strings.Join(parts, ", ") + "]" + if body == "" { + return suffix + } + return body + suffix +} + +// resolveAttachments resolves every attachment URI for one DYI message, +// guarding against paths that escape rootDir. 
func resolveAttachments(absRoot string, m rawJSONMessage) []Attachment {
	var out []Attachment
	// add appends one Attachment per non-empty URI, tagging each with the
	// given kind and a best-effort MIME type guessed from the extension.
	add := func(kind string, items []rawJSONPhoto) {
		for _, it := range items {
			if it.URI == "" {
				continue
			}
			abs := resolveAttachmentURI(absRoot, it.URI)
			out = append(out, Attachment{
				URI:      it.URI,
				AbsPath:  abs,
				Kind:     kind,
				Filename: filepath.Base(it.URI),
				MimeType: guessMime(it.URI),
			})
		}
	}
	add("photo", m.Photos)
	add("video", m.Videos)
	add("audio", m.Audio)
	add("file", m.Files)
	add("gif", m.Gifs)
	// Sticker is a single optional object, not a list, so it is handled
	// outside the add helper.
	if m.Sticker != nil && m.Sticker.URI != "" {
		abs := resolveAttachmentURI(absRoot, m.Sticker.URI)
		out = append(out, Attachment{
			URI:      m.Sticker.URI,
			AbsPath:  abs,
			Kind:     "sticker",
			Filename: filepath.Base(m.Sticker.URI),
			MimeType: guessMime(m.Sticker.URI),
		})
	}
	return out
}

// resolveAttachmentURI returns the absolute path for a DYI attachment URI
// relative to absRoot. DYI URIs are typically rooted at the "messages/"
// parent, which in the post-2024 layout lives at
// absRoot/your_activity_across_facebook. We try both candidates and
// return the first that stays inside absRoot and refers to an existing
// file. Empty string if the path escapes the root or nothing matches.
func resolveAttachmentURI(absRoot, uri string) string {
	cleaned := filepath.Clean(uri)
	if filepath.IsAbs(cleaned) {
		return ""
	}
	// Reject early if the cleaned path tries to escape. Interior ".."
	// segments (e.g. "a/../../x") are collapsed by Clean into a leading
	// ".." and caught here; anything that survives is also re-checked
	// against absRoot below.
	if strings.HasPrefix(cleaned, ".."+string(filepath.Separator)) || cleaned == ".." {
		return ""
	}
	candidates := []string{
		filepath.Join(absRoot, "your_activity_across_facebook", cleaned),
		filepath.Join(absRoot, "your_facebook_activity", cleaned),
		filepath.Join(absRoot, cleaned),
	}
	var firstInside string
	for _, full := range candidates {
		absFull, err := filepath.Abs(full)
		if err != nil {
			continue
		}
		// Containment check: absFull must be absRoot itself or live
		// under it (separator-suffixed prefix avoids /root matching
		// /rootlike).
		if !strings.HasPrefix(absFull, absRoot+string(filepath.Separator)) && absFull != absRoot {
			continue
		}
		if firstInside == "" {
			firstInside = absFull
		}
		if _, err := os.Stat(absFull); err == nil {
			return absFull
		}
	}
	// No candidate existed on disk; return the first inside-root candidate
	// so callers can still record the (missing) row with the intended path.
	return firstInside
}

// guessMime maps a small, fixed set of file extensions to MIME types;
// unknown extensions yield "".
// NOTE(review): stdlib mime.TypeByExtension covers a superset of these,
// but its results vary by platform registry — the hand-rolled table keeps
// output deterministic.
func guessMime(uri string) string {
	ext := strings.ToLower(filepath.Ext(uri))
	switch ext {
	case ".png":
		return "image/png"
	case ".jpg", ".jpeg":
		return "image/jpeg"
	case ".gif":
		return "image/gif"
	case ".mp4":
		return "video/mp4"
	case ".mp3":
		return "audio/mpeg"
	case ".wav":
		return "audio/wav"
	case ".pdf":
		return "application/pdf"
	case ".webp":
		return "image/webp"
	}
	return ""
}
diff --git a/internal/fbmessenger/json_parser_test.go b/internal/fbmessenger/json_parser_test.go
new file mode 100644
index 00000000..dc0940e9
--- /dev/null
+++ b/internal/fbmessenger/json_parser_test.go
@@ -0,0 +1,262 @@
package fbmessenger

import (
	"errors"
	"os"
	"path/filepath"
	"strings"
	"testing"
)

// threadDir builds the absolute path of one fixture thread directory under
// the post-2024 "your_activity_across_facebook" layout.
func threadDir(t *testing.T, root, section, name string) string {
	t.Helper()
	abs, err := filepath.Abs(root)
	if err != nil {
		t.Fatal(err)
	}
	return filepath.Join(abs, "your_activity_across_facebook", "messages", section, name)
}

func TestParseJSONThread_Simple(t *testing.T) {
	root := "testdata/json_simple"
	th, err := ParseJSONThread(root, threadDir(t, root, "inbox", "alice_ABC123"))
	if err != nil {
		t.Fatalf("parse: %v", err)
	}
	if th.ConvType != "direct_chat" {
		t.Errorf("conv_type=%q want direct_chat", th.ConvType)
	}
	if len(th.Participants) != 2 {
		t.Errorf("participants=%d want 2", len(th.Participants))
	}
	if len(th.Messages) != 3 {
		t.Fatalf("messages=%d want 3", len(th.Messages))
	}
	// Messages must be chronological ascending.
	for i := 1; i < len(th.Messages); i++ {
		if th.Messages[i-1].SentAt.After(th.Messages[i].SentAt) {
			t.Errorf("messages out of order at %d", i)
		}
	}
	// Mojibake repair: message 1 body must contain "café".
	if !strings.Contains(th.Messages[1].Body, "café") {
		t.Errorf("mojibake not repaired: body=%q", th.Messages[1].Body)
	}
	// Reactions appended to body.
	if !strings.Contains(th.Messages[1].Body, "[reacted:") {
		t.Errorf("reactions not appended: body=%q", th.Messages[1].Body)
	}
	if len(th.Messages[1].Reactions) != 2 {
		t.Errorf("reactions=%d want 2", len(th.Messages[1].Reactions))
	}
	// Index monotonic.
	for i, m := range th.Messages {
		if m.Index != i {
			t.Errorf("index[%d]=%d want %d", i, m.Index, i)
		}
	}
}

func TestParseJSONThread_Group(t *testing.T) {
	root := "testdata/json_group"
	th, err := ParseJSONThread(root, threadDir(t, root, "inbox", "crew_GRP123"))
	if err != nil {
		t.Fatalf("parse: %v", err)
	}
	// Three participants => classified as a group chat.
	if th.ConvType != "group_chat" {
		t.Errorf("conv_type=%q want group_chat", th.ConvType)
	}
	if len(th.Participants) != 3 {
		t.Errorf("participants=%d want 3", len(th.Participants))
	}
}

func TestParseJSONThread_Multifile_NumericSort(t *testing.T) {
	root := "testdata/json_multifile"
	th, err := ParseJSONThread(root, threadDir(t, root, "inbox", "dave_MULTI"))
	if err != nil {
		t.Fatalf("parse: %v", err)
	}
	if len(th.Messages) != 4 {
		t.Fatalf("messages=%d want 4", len(th.Messages))
	}
	// Bodies, in chronological order, must be A,B,C,D.
	wantBodies := []string{
		"Message A (from file 1, oldest)",
		"Message B (from file 1, newer)",
		"Message C (from file 2)",
		"Message D (from file 10, newest)",
	}
	for i, w := range wantBodies {
		if th.Messages[i].Body != w {
			t.Errorf("messages[%d].Body=%q want %q", i, th.Messages[i].Body, w)
		}
	}
}

func TestParseJSONThread_Corrupt(t *testing.T) {
	root := "testdata/corrupt"
	_, err := ParseJSONThread(root, threadDir(t, root, "inbox", "broken_BAD"))
	if err == nil {
		t.Fatal("expected error, got nil")
	}
	// The sentinel must survive the wrapping so callers can errors.Is it.
	if !errors.Is(err, ErrCorruptJSON) {
		t.Errorf("expected ErrCorruptJSON, got %v", err)
	}
}

func TestParseJSONThread_Attachments(t *testing.T) {
	root := "testdata/json_with_media"
	th, err := ParseJSONThread(root, threadDir(t, root, "inbox", "bob_XYZ789"))
	if err != nil {
		t.Fatalf("parse: %v", err)
	}
	if len(th.Messages) != 1 {
		t.Fatalf("messages=%d want 1", len(th.Messages))
	}
	m := th.Messages[0]
	if len(m.Attachments) != 1 {
		t.Fatalf("attachments=%d want 1", len(m.Attachments))
	}
	if m.Attachments[0].Kind != "photo" {
		t.Errorf("kind=%q want photo", m.Attachments[0].Kind)
	}
	if _, err := os.Stat(m.Attachments[0].AbsPath); err != nil {
		t.Errorf("attachment file should exist on disk: %v", err)
	}
	if m.Attachments[0].MimeType != "image/png" {
		t.Errorf("mime=%q want image/png", m.Attachments[0].MimeType)
	}
}

// The alt layout uses the older "your_facebook_activity" directory name;
// attachment URIs must still resolve through the candidate list.
func TestParseJSONThread_Attachments_AltLayout(t *testing.T) {
	root := "testdata/json_with_media_alt"
	absRoot, err := filepath.Abs(root)
	if err != nil {
		t.Fatal(err)
	}
	td := filepath.Join(absRoot, "your_facebook_activity", "messages", "inbox", "carol_ALT456")
	th, err := ParseJSONThread(root, td)
	if err != nil {
		t.Fatalf("parse: %v", err)
	}
	if len(th.Messages) != 1 {
		t.Fatalf("messages=%d want 1", len(th.Messages))
	}
	m := th.Messages[0]
	if len(m.Attachments) != 1 {
		t.Fatalf("attachments=%d want 1", len(m.Attachments))
	}
	if m.Attachments[0].Kind != "photo" {
		t.Errorf("kind=%q want photo", m.Attachments[0].Kind)
	}
	if _, err := os.Stat(m.Attachments[0].AbsPath); err != nil {
		t.Errorf("attachment file should exist on disk: %v", err)
	}
}

func TestParseJSONThread_NonTextBodies(t *testing.T) {
	root := "testdata/json_nontext"
	th, err := ParseJSONThread(root, threadDir(t, root, "inbox", "sam_NONTXT"))
	if err != nil {
		t.Fatalf("parse: %v", err)
	}
	// Ordered chronologically ascending: unsubscribe, share, missed call, call, photo, sticker.
	wantBodies := []string{
		"[system] Sam left the chat",
		"[shared link] https://example.com/article\nExample share text",
		"[call: missed, 0s]",
		"[call: 3m 12s]",
		"[photo]",
		"[sticker]",
	}
	if len(th.Messages) != len(wantBodies) {
		t.Fatalf("messages=%d want %d", len(th.Messages), len(wantBodies))
	}
	for i, w := range wantBodies {
		if th.Messages[i].Body != w {
			t.Errorf("messages[%d].Body=%q want %q", i, th.Messages[i].Body, w)
		}
	}
}

func TestParseJSONThread_PathEscapeRejected(t *testing.T) {
	tmp := t.TempDir()
	threadPath := filepath.Join(tmp, "your_activity_across_facebook", "messages", "inbox", "evil_ESC")
	if err := os.MkdirAll(threadPath, 0755); err != nil {
		t.Fatal(err)
	}
	// The "../../etc/passwd" URI must resolve to an empty AbsPath, never
	// to a file outside the export root.
	body := `{"participants":[{"name":"A"},{"name":"B"}],"messages":[
{"sender_name":"A","timestamp_ms":1600000000000,"type":"Generic","photos":[{"uri":"../../etc/passwd"}]}
],"title":"x"}`
	if err := os.WriteFile(filepath.Join(threadPath, "message_1.json"), []byte(body), 0644); err != nil {
		t.Fatal(err)
	}
	th, err := ParseJSONThread(tmp, threadPath)
	if err != nil {
		t.Fatalf("parse: %v", err)
	}
	if len(th.Messages) != 1 {
		t.Fatalf("messages=%d want 1", len(th.Messages))
	}
	att := th.Messages[0].Attachments
	if len(att) != 1 {
		t.Fatalf("attachments=%d want 1", len(att))
	}
	if att[0].AbsPath != "" {
		t.Errorf("path escape not rejected: AbsPath=%q", att[0].AbsPath)
	}
}

// When a thread dir has no valid numbered message files at all, the
// parser returns an error because there is nothing to import.
func TestParseJSONThread_OnlyUnnumberedFiles(t *testing.T) {
	tmp := t.TempDir()
	threadPath := filepath.Join(tmp, "your_activity_across_facebook", "messages", "inbox", "bad_NAME")
	if err := os.MkdirAll(threadPath, 0755); err != nil {
		t.Fatal(err)
	}
	if err := os.WriteFile(filepath.Join(threadPath, "message_final.json"), []byte(`{}`), 0644); err != nil {
		t.Fatal(err)
	}
	_, err := ParseJSONThread(tmp, threadPath)
	if err == nil {
		t.Fatal("expected error when no valid numbered files present")
	}
}

// When a thread dir contains BOTH valid numbered files and a sibling
// whose name doesn't match the `^message_(\d+)\.json$` pattern, the
// parser must import the valid file(s) and report the bad sibling via
// Thread.BadSiblings rather than aborting the entire thread.
func TestParseJSONThread_SkipsUnnumberedSibling(t *testing.T) {
	tmp := t.TempDir()
	threadPath := filepath.Join(tmp, "your_activity_across_facebook", "messages", "inbox", "mix_MIXED")
	if err := os.MkdirAll(threadPath, 0755); err != nil {
		t.Fatal(err)
	}
	good := `{"participants":[{"name":"A"},{"name":"B"}],"messages":[
{"sender_name":"A","timestamp_ms":1600000000000,"type":"Generic","content":"hi from A"}
],"title":"mix"}`
	if err := os.WriteFile(filepath.Join(threadPath, "message_1.json"), []byte(good), 0644); err != nil {
		t.Fatal(err)
	}
	// Facebook sometimes writes a human-named sibling; content doesn't
	// matter because we skip it by name before attempting to parse it.
	if err := os.WriteFile(filepath.Join(threadPath, "message_final.json"), []byte(`not even valid json`), 0644); err != nil {
		t.Fatal(err)
	}
	th, err := ParseJSONThread(tmp, threadPath)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(th.Messages) != 1 {
		t.Fatalf("messages=%d want 1", len(th.Messages))
	}
	if th.Messages[0].Body != "hi from A" {
		t.Errorf("body=%q want %q", th.Messages[0].Body, "hi from A")
	}
	if len(th.BadSiblings) != 1 || th.BadSiblings[0] != "message_final.json" {
		t.Errorf("BadSiblings=%v want [message_final.json]", th.BadSiblings)
	}
}
diff --git a/internal/fbmessenger/slug.go b/internal/fbmessenger/slug.go
new file mode 100644
index 00000000..c50e312c
--- /dev/null
+++ b/internal/fbmessenger/slug.go
@@ -0,0 +1,97 @@
package fbmessenger

import (
	"crypto/sha1"
	"encoding/hex"
	"strings"
	"unicode"
	"unicode/utf8"

	"github.com/wesm/msgvault/internal/mime"
	"golang.org/x/text/runes"
	"golang.org/x/text/transform"
	"golang.org/x/text/unicode/norm"
)

// Domain is the synthetic domain used for all Facebook Messenger participants.
const Domain = "facebook.messenger"

// Slug normalizes a Facebook display name into a deterministic ASCII slug
// usable as the local-part of a synthetic email address. Diacritics are
// stripped via NFKD + combining-mark removal; runs of non-alphanumerics
// collapse to a single dot; leading and trailing dots are trimmed.
// Returns an empty string when no ASCII-foldable letters or digits remain.
+func Slug(name string) string { + t := transform.Chain(norm.NFKD, runes.Remove(runes.In(unicode.Mn)), norm.NFC) + folded, _, err := transform.String(t, name) + if err != nil { + folded = name + } + var b strings.Builder + prevDot := true + for _, r := range folded { + switch { + case r >= 'A' && r <= 'Z': + b.WriteRune(r + 32) + prevDot = false + case (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9'): + b.WriteRune(r) + prevDot = false + default: + if !prevDot { + b.WriteByte('.') + prevDot = true + } + } + } + return strings.Trim(b.String(), ".") +} + +// Address returns a deterministic synthetic mime.Address for the given +// Facebook display name. The name is preserved verbatim; the email +// local-part is Slug(name), falling back to "user.<8-hex-sha1>" when the +// slug is empty so callers never see an address without a local-part. +func Address(name string) mime.Address { + local := Slug(name) + if local == "" { + sum := sha1.Sum([]byte(name)) + local = "user." + hex.EncodeToString(sum[:4]) + } + return mime.Address{ + Name: name, + Email: local + "@" + Domain, + Domain: Domain, + } +} + +// DecodeMojibake reverses Facebook DYI JSON's well-known Latin-1-over-UTF-8 +// encoding. Facebook's JSON exporter writes UTF-8 bytes as if they were +// Latin-1 code points, so "café" (UTF-8: 0x63 0x61 0x66 0xC3 0xA9) becomes +// the JSON string "caf\u00c3\u00a9". This function re-interprets those +// Latin-1 code points as raw bytes and checks whether the result is valid +// UTF-8. If not (or if the input contains runes above U+00FF), the original +// string is returned unchanged. +func DecodeMojibake(s string) string { + buf := make([]byte, 0, len(s)) + for _, r := range s { + if r > 0xFF { + return s + } + buf = append(buf, byte(r)) + } + decoded := string(buf) + if !utf8.ValidString(decoded) { + return s + } + return decoded +} + +// StripDomain returns the local part of a synthetic "<slug>@facebook.messenger" +// email. 
If the input lacks the expected domain suffix, it is returned as-is. +func StripDomain(email string) string { + suffix := "@" + Domain + if strings.HasSuffix(email, suffix) { + return strings.TrimSuffix(email, suffix) + } + return email +} diff --git a/internal/fbmessenger/slug_test.go b/internal/fbmessenger/slug_test.go new file mode 100644 index 00000000..62a28045 --- /dev/null +++ b/internal/fbmessenger/slug_test.go @@ -0,0 +1,96 @@ +package fbmessenger + +import ( + "strings" + "testing" +) + +func TestSlug(t *testing.T) { + cases := []struct { + in, want string + }{ + {"Test User", "test.user"}, + {"Marie-Ève Côté", "marie.eve.cote"}, + {" Alice ", "alice"}, + {"alice@example.com", "alice.example.com"}, + {"小明", ""}, + {"", ""}, + } + for _, c := range cases { + if got := Slug(c.in); got != c.want { + t.Errorf("Slug(%q)=%q want %q", c.in, got, c.want) + } + } +} + +func TestAddressFallbackForEmptySlug(t *testing.T) { + a := Address("小明") + if !strings.HasPrefix(a.Email, "user.") || !strings.HasSuffix(a.Email, "@facebook.messenger") { + t.Fatalf("unexpected fallback email: %q", a.Email) + } + if Address("小明").Email != a.Email { + t.Fatal("fallback must be deterministic across calls") + } + if a.Domain != "facebook.messenger" { + t.Fatalf("domain=%q", a.Domain) + } + if a.Name != "小明" { + t.Fatalf("display name must be preserved unaltered, got %q", a.Name) + } +} + +func TestAddressRegular(t *testing.T) { + a := Address("Test User") + if a.Email != "test.user@facebook.messenger" { + t.Fatalf("email=%q", a.Email) + } + if a.Name != "Test User" { + t.Fatalf("name=%q", a.Name) + } + if a.Domain != "facebook.messenger" { + t.Fatalf("domain=%q", a.Domain) + } +} + +func TestDecodeMojibake(t *testing.T) { + // "é" (U+00E9) encoded as UTF-8 is bytes 0xC3 0xA9. Interpreted as + // Latin-1, those are runes U+00C3 U+00A9, which Facebook then emits + // as JSON. DecodeMojibake must reverse that. 
+ in := "caf\u00c3\u00a9" + want := "café" + if got := DecodeMojibake(in); got != want { + t.Fatalf("DecodeMojibake(%q)=%q want %q", in, got, want) + } + // Non-Latin-1 input must round-trip unchanged. + if got := DecodeMojibake("正常"); got != "正常" { + t.Fatalf("non-Latin-1 round-trip failed: got %q", got) + } + // ASCII round-trips unchanged. + if got := DecodeMojibake("hello"); got != "hello" { + t.Fatalf("ascii round-trip: got %q", got) + } + // Already-valid UTF-8 with Latin-1-range code points must be preserved. + // "café" has é = U+00E9, which is <= 0xFF, so the old code would + // convert it to the single byte 0xE9 (invalid UTF-8). The fix detects + // that the converted result is not valid UTF-8 and returns the original. + if got := DecodeMojibake("café"); got != "café" { + t.Fatalf("valid UTF-8 café corrupted: got %q", got) + } + // "naïve" has ï = U+00EF, same risk. + if got := DecodeMojibake("naïve"); got != "naïve" { + t.Fatalf("valid UTF-8 naïve corrupted: got %q", got) + } + // "über" has ü = U+00FC. 
+ if got := DecodeMojibake("über"); got != "über" { + t.Fatalf("valid UTF-8 über corrupted: got %q", got) + } +} + +func TestStripDomain(t *testing.T) { + if got := StripDomain("test.user@facebook.messenger"); got != "test.user" { + t.Fatalf("got %q", got) + } + if got := StripDomain("test.user"); got != "test.user" { + t.Fatalf("got %q", got) + } +} diff --git a/internal/fbmessenger/testdata/corrupt/your_activity_across_facebook/messages/inbox/broken_BAD/message_1.json b/internal/fbmessenger/testdata/corrupt/your_activity_across_facebook/messages/inbox/broken_BAD/message_1.json new file mode 100644 index 00000000..5753c1fd --- /dev/null +++ b/internal/fbmessenger/testdata/corrupt/your_activity_across_facebook/messages/inbox/broken_BAD/message_1.json @@ -0,0 +1 @@ +{this is not: valid json, missing quotes and structure diff --git a/internal/fbmessenger/testdata/corrupt/your_activity_across_facebook/messages/inbox/goodsibling_OK/message_1.json b/internal/fbmessenger/testdata/corrupt/your_activity_across_facebook/messages/inbox/goodsibling_OK/message_1.json new file mode 100644 index 00000000..b9b39931 --- /dev/null +++ b/internal/fbmessenger/testdata/corrupt/your_activity_across_facebook/messages/inbox/goodsibling_OK/message_1.json @@ -0,0 +1,16 @@ +{ + "participants": [ + {"name": "Test User"}, + {"name": "Good Sibling"} + ], + "messages": [ + { + "sender_name": "Good Sibling", + "timestamp_ms": 1600000000000, + "content": "Still valid", + "type": "Generic" + } + ], + "title": "Good Sibling", + "thread_path": "inbox/goodsibling_OK" +} diff --git a/internal/fbmessenger/testdata/e2ee_simple/your_activity_across_facebook/messages/alice_1.json b/internal/fbmessenger/testdata/e2ee_simple/your_activity_across_facebook/messages/alice_1.json new file mode 100644 index 00000000..5f9b47ad --- /dev/null +++ b/internal/fbmessenger/testdata/e2ee_simple/your_activity_across_facebook/messages/alice_1.json @@ -0,0 +1,44 @@ +{ + "participants": ["Test User", "Alice Example"], + 
"threadName": "Alice Example", + "messages": [ + { + "senderName": "Test User", + "text": "Hello", + "timestamp": 1600000000000, + "type": "Generic", + "isUnsent": false, + "media": [], + "reactions": [] + }, + { + "senderName": "Alice Example", + "text": "caf\u00c3\u00a9 time?", + "timestamp": 1600000060000, + "type": "Generic", + "isUnsent": false, + "media": [], + "reactions": [ + {"actor": "Test User", "reaction": "\u00e2\u009d\u00a4"} + ] + }, + { + "senderName": "Test User", + "text": "See you soon", + "timestamp": 1600000120000, + "type": "Generic", + "isUnsent": false, + "media": [], + "reactions": [] + }, + { + "senderName": "Test User", + "text": "oops", + "timestamp": 1600000180000, + "type": "Generic", + "isUnsent": true, + "media": [], + "reactions": [] + } + ] +} diff --git a/internal/fbmessenger/testdata/e2ee_simple/your_activity_across_facebook/messages/group_2.json b/internal/fbmessenger/testdata/e2ee_simple/your_activity_across_facebook/messages/group_2.json new file mode 100644 index 00000000..e44aedfe --- /dev/null +++ b/internal/fbmessenger/testdata/e2ee_simple/your_activity_across_facebook/messages/group_2.json @@ -0,0 +1,24 @@ +{ + "participants": ["Test User", "Alice Example", "Bob Builder"], + "threadName": "Weekend crew", + "messages": [ + { + "senderName": "Alice Example", + "text": "Plans?", + "timestamp": 1600000000000, + "type": "Generic", + "isUnsent": false, + "media": [], + "reactions": [] + }, + { + "senderName": "Bob Builder", + "text": "", + "timestamp": 1600000060000, + "type": "Generic", + "isUnsent": false, + "media": [{"uri": "./media/photo.jpg"}], + "reactions": [] + } + ] +} diff --git a/internal/fbmessenger/testdata/e2ee_simple/your_activity_across_facebook/messages/media/photo.jpg b/internal/fbmessenger/testdata/e2ee_simple/your_activity_across_facebook/messages/media/photo.jpg new file mode 100644 index 00000000..c6596fe6 --- /dev/null +++ 
b/internal/fbmessenger/testdata/e2ee_simple/your_activity_across_facebook/messages/media/photo.jpg @@ -0,0 +1 @@ +fake-jpeg \ No newline at end of file diff --git a/internal/fbmessenger/testdata/html_multi_media/your_activity_across_facebook/messages/inbox/carol_IMG456/message_1.html b/internal/fbmessenger/testdata/html_multi_media/your_activity_across_facebook/messages/inbox/carol_IMG456/message_1.html new file mode 100644 index 00000000..233b0899 --- /dev/null +++ b/internal/fbmessenger/testdata/html_multi_media/your_activity_across_facebook/messages/inbox/carol_IMG456/message_1.html @@ -0,0 +1,25 @@ +<!DOCTYPE html> +<html lang="en"> +<head><meta charset="utf-8"><title>Carol Photoshare + +
+
Participants: Test User, Carol Photoshare
+
+
Test User
+
Hello Carol
+
Sep 13, 2020, 12:00 PM
+
+
+
Carol Photoshare
+
Check out this photo
+
+
Sep 13, 2020, 12:05 PM
+
+
+
Test User
+
Nice picture
+
Sep 13, 2020, 12:10 PM
+
+
+ + diff --git a/internal/fbmessenger/testdata/html_multi_media/your_activity_across_facebook/messages/inbox/carol_IMG456/photos/photo1.png b/internal/fbmessenger/testdata/html_multi_media/your_activity_across_facebook/messages/inbox/carol_IMG456/photos/photo1.png new file mode 100644 index 00000000..62a5f8f4 Binary files /dev/null and b/internal/fbmessenger/testdata/html_multi_media/your_activity_across_facebook/messages/inbox/carol_IMG456/photos/photo1.png differ diff --git a/internal/fbmessenger/testdata/html_simple/your_activity_across_facebook/messages/inbox/alice_ABC123/message_1.html b/internal/fbmessenger/testdata/html_simple/your_activity_across_facebook/messages/inbox/alice_ABC123/message_1.html new file mode 100644 index 00000000..98f83082 --- /dev/null +++ b/internal/fbmessenger/testdata/html_simple/your_activity_across_facebook/messages/inbox/alice_ABC123/message_1.html @@ -0,0 +1,28 @@ + + +Alice Example + +
+
+
Participants: Test User, Alice Example
+
+
+
+
Test User
+
Hello
+
Sep 13, 2020, 12:26 PM
+
+
+
Alice Example
+
café time?
+
Sep 13, 2020, 12:27 PM
+
+
+
Test User
+
See you soon
+
Sep 13, 2020, 12:28 PM
+
+
+
+ + diff --git a/internal/fbmessenger/testdata/html_timestamps/layout1.html b/internal/fbmessenger/testdata/html_timestamps/layout1.html new file mode 100644 index 00000000..a33d8fa2 --- /dev/null +++ b/internal/fbmessenger/testdata/html_timestamps/layout1.html @@ -0,0 +1,6 @@ + +TS Test + +
Participants: Test User, TS Test
+
Test User
ping
Oct 19, 2019, 2:37 PM
+ diff --git a/internal/fbmessenger/testdata/html_timestamps/layout2.html b/internal/fbmessenger/testdata/html_timestamps/layout2.html new file mode 100644 index 00000000..9ceb2a03 --- /dev/null +++ b/internal/fbmessenger/testdata/html_timestamps/layout2.html @@ -0,0 +1,6 @@ + +TS Test + +
Participants: Test User, TS Test
+
Test User
ping
Oct 19, 2019 at 2:37 PM
+ diff --git a/internal/fbmessenger/testdata/html_timestamps/layout3.html b/internal/fbmessenger/testdata/html_timestamps/layout3.html new file mode 100644 index 00000000..b41f1715 --- /dev/null +++ b/internal/fbmessenger/testdata/html_timestamps/layout3.html @@ -0,0 +1,6 @@ + +TS Test + +
Participants: Test User, TS Test
+
Test User
ping
19 Oct 2019, 14:37
+ diff --git a/internal/fbmessenger/testdata/html_with_media/your_activity_across_facebook/messages/inbox/bob_XYZ789/message_1.html b/internal/fbmessenger/testdata/html_with_media/your_activity_across_facebook/messages/inbox/bob_XYZ789/message_1.html new file mode 100644 index 00000000..0d25deb6 --- /dev/null +++ b/internal/fbmessenger/testdata/html_with_media/your_activity_across_facebook/messages/inbox/bob_XYZ789/message_1.html @@ -0,0 +1,15 @@ + + +Bob Mediafile + +
+
Participants: Test User, Bob Mediafile
+
+
Bob Mediafile
+
Here's a photo
+
+
Sep 13, 2020, 12:26 PM
+
+
+ + diff --git a/internal/fbmessenger/testdata/html_with_media/your_activity_across_facebook/messages/inbox/bob_XYZ789/photos/tiny.png b/internal/fbmessenger/testdata/html_with_media/your_activity_across_facebook/messages/inbox/bob_XYZ789/photos/tiny.png new file mode 100644 index 00000000..62a5f8f4 Binary files /dev/null and b/internal/fbmessenger/testdata/html_with_media/your_activity_across_facebook/messages/inbox/bob_XYZ789/photos/tiny.png differ diff --git a/internal/fbmessenger/testdata/json_group/your_activity_across_facebook/messages/inbox/crew_GRP123/message_1.json b/internal/fbmessenger/testdata/json_group/your_activity_across_facebook/messages/inbox/crew_GRP123/message_1.json new file mode 100644 index 00000000..f409fe6a --- /dev/null +++ b/internal/fbmessenger/testdata/json_group/your_activity_across_facebook/messages/inbox/crew_GRP123/message_1.json @@ -0,0 +1,29 @@ +{ + "participants": [ + {"name": "Test User"}, + {"name": "Alice Example"}, + {"name": "Bob Example"} + ], + "messages": [ + { + "sender_name": "Test User", + "timestamp_ms": 1600000000000, + "content": "Hey crew", + "type": "Generic" + }, + { + "sender_name": "Alice Example", + "timestamp_ms": 1600000060000, + "content": "Hi!", + "type": "Generic" + }, + { + "sender_name": "Bob Example", + "timestamp_ms": 1600000120000, + "content": "Hello.", + "type": "Generic" + } + ], + "title": "Crew", + "thread_path": "inbox/crew_GRP123" +} diff --git a/internal/fbmessenger/testdata/json_multifile/your_activity_across_facebook/messages/inbox/dave_MULTI/message_1.json b/internal/fbmessenger/testdata/json_multifile/your_activity_across_facebook/messages/inbox/dave_MULTI/message_1.json new file mode 100644 index 00000000..92a0ea03 --- /dev/null +++ b/internal/fbmessenger/testdata/json_multifile/your_activity_across_facebook/messages/inbox/dave_MULTI/message_1.json @@ -0,0 +1,22 @@ +{ + "participants": [ + {"name": "Test User"}, + {"name": "Dave Multi"} + ], + "messages": [ + { + "sender_name": "Test 
User", + "timestamp_ms": 1700000060000, + "content": "Message B (from file 1, newer)", + "type": "Generic" + }, + { + "sender_name": "Dave Multi", + "timestamp_ms": 1700000000000, + "content": "Message A (from file 1, oldest)", + "type": "Generic" + } + ], + "title": "Dave Multi", + "thread_path": "inbox/dave_MULTI" +} diff --git a/internal/fbmessenger/testdata/json_multifile/your_activity_across_facebook/messages/inbox/dave_MULTI/message_10.json b/internal/fbmessenger/testdata/json_multifile/your_activity_across_facebook/messages/inbox/dave_MULTI/message_10.json new file mode 100644 index 00000000..7d13484a --- /dev/null +++ b/internal/fbmessenger/testdata/json_multifile/your_activity_across_facebook/messages/inbox/dave_MULTI/message_10.json @@ -0,0 +1,16 @@ +{ + "participants": [ + {"name": "Test User"}, + {"name": "Dave Multi"} + ], + "messages": [ + { + "sender_name": "Test User", + "timestamp_ms": 1700000180000, + "content": "Message D (from file 10, newest)", + "type": "Generic" + } + ], + "title": "Dave Multi", + "thread_path": "inbox/dave_MULTI" +} diff --git a/internal/fbmessenger/testdata/json_multifile/your_activity_across_facebook/messages/inbox/dave_MULTI/message_2.json b/internal/fbmessenger/testdata/json_multifile/your_activity_across_facebook/messages/inbox/dave_MULTI/message_2.json new file mode 100644 index 00000000..f5e8f19f --- /dev/null +++ b/internal/fbmessenger/testdata/json_multifile/your_activity_across_facebook/messages/inbox/dave_MULTI/message_2.json @@ -0,0 +1,16 @@ +{ + "participants": [ + {"name": "Test User"}, + {"name": "Dave Multi"} + ], + "messages": [ + { + "sender_name": "Dave Multi", + "timestamp_ms": 1700000120000, + "content": "Message C (from file 2)", + "type": "Generic" + } + ], + "title": "Dave Multi", + "thread_path": "inbox/dave_MULTI" +} diff --git a/internal/fbmessenger/testdata/json_nontext/your_activity_across_facebook/messages/inbox/sam_NONTXT/message_1.json 
b/internal/fbmessenger/testdata/json_nontext/your_activity_across_facebook/messages/inbox/sam_NONTXT/message_1.json new file mode 100644 index 00000000..b0def154 --- /dev/null +++ b/internal/fbmessenger/testdata/json_nontext/your_activity_across_facebook/messages/inbox/sam_NONTXT/message_1.json @@ -0,0 +1,52 @@ +{ + "participants": [ + {"name": "Test User"}, + {"name": "Sam NonText"} + ], + "messages": [ + { + "sender_name": "Test User", + "timestamp_ms": 1600000600000, + "type": "Generic", + "sticker": {"uri": "messages/stickers_used/1.png"} + }, + { + "sender_name": "Sam NonText", + "timestamp_ms": 1600000500000, + "type": "Generic", + "photos": [ + {"uri": "messages/inbox/sam_NONTXT/photos/missing.jpg", "creation_timestamp": 1600000500} + ] + }, + { + "sender_name": "Test User", + "timestamp_ms": 1600000400000, + "type": "Call", + "call_duration": 192 + }, + { + "sender_name": "Sam NonText", + "timestamp_ms": 1600000300000, + "type": "Call", + "call_duration": 0, + "missed": true + }, + { + "sender_name": "Test User", + "timestamp_ms": 1600000200000, + "type": "Share", + "share": { + "link": "https://example.com/article", + "share_text": "Example share text" + } + }, + { + "sender_name": "Sam NonText", + "timestamp_ms": 1600000100000, + "content": "Sam left the chat", + "type": "Unsubscribe" + } + ], + "title": "Sam NonText", + "thread_path": "inbox/sam_NONTXT" +} diff --git a/internal/fbmessenger/testdata/json_simple/your_activity_across_facebook/messages/archived_threads/zoe_ARCH/message_1.json b/internal/fbmessenger/testdata/json_simple/your_activity_across_facebook/messages/archived_threads/zoe_ARCH/message_1.json new file mode 100644 index 00000000..b3be6fd6 --- /dev/null +++ b/internal/fbmessenger/testdata/json_simple/your_activity_across_facebook/messages/archived_threads/zoe_ARCH/message_1.json @@ -0,0 +1,16 @@ +{ + "participants": [ + {"name": "Test User"}, + {"name": "Zoe Archive"} + ], + "messages": [ + { + "sender_name": "Zoe Archive", + 
"timestamp_ms": 1500000000000, + "content": "Old message in archive", + "type": "Generic" + } + ], + "title": "Zoe Archive", + "thread_path": "archived_threads/zoe_ARCH" +} diff --git a/internal/fbmessenger/testdata/json_simple/your_activity_across_facebook/messages/inbox/alice_ABC123/message_1.json b/internal/fbmessenger/testdata/json_simple/your_activity_across_facebook/messages/inbox/alice_ABC123/message_1.json new file mode 100644 index 00000000..02323b95 --- /dev/null +++ b/internal/fbmessenger/testdata/json_simple/your_activity_across_facebook/messages/inbox/alice_ABC123/message_1.json @@ -0,0 +1,34 @@ +{ + "participants": [ + {"name": "Test User"}, + {"name": "Alice Example"} + ], + "messages": [ + { + "sender_name": "Test User", + "timestamp_ms": 1600000120000, + "content": "See you soon", + "type": "Generic" + }, + { + "sender_name": "Alice Example", + "timestamp_ms": 1600000060000, + "content": "caf\u00c3\u00a9 time?", + "type": "Generic", + "reactions": [ + {"reaction": "\u00e2\u009d\u00a4", "actor": "Test User"}, + {"reaction": "\u00f0\u009f\u0091\u008d", "actor": "Alice Example"} + ] + }, + { + "sender_name": "Test User", + "timestamp_ms": 1600000000000, + "content": "Hello", + "type": "Generic" + } + ], + "title": "Alice Example", + "is_still_participant": true, + "thread_type": "Regular", + "thread_path": "inbox/alice_ABC123" +} diff --git a/internal/fbmessenger/testdata/json_with_media/your_activity_across_facebook/messages/inbox/bob_XYZ789/message_1.json b/internal/fbmessenger/testdata/json_with_media/your_activity_across_facebook/messages/inbox/bob_XYZ789/message_1.json new file mode 100644 index 00000000..25a9485b --- /dev/null +++ b/internal/fbmessenger/testdata/json_with_media/your_activity_across_facebook/messages/inbox/bob_XYZ789/message_1.json @@ -0,0 +1,19 @@ +{ + "participants": [ + {"name": "Test User"}, + {"name": "Bob Mediafile"} + ], + "messages": [ + { + "sender_name": "Bob Mediafile", + "timestamp_ms": 1600000000000, + "type": 
"Generic", + "content": "Here's a photo", + "photos": [ + {"uri": "messages/inbox/bob_XYZ789/photos/tiny.png", "creation_timestamp": 1600000000} + ] + } + ], + "title": "Bob Mediafile", + "thread_path": "inbox/bob_XYZ789" +} diff --git a/internal/fbmessenger/testdata/json_with_media/your_activity_across_facebook/messages/inbox/bob_XYZ789/photos/tiny.png b/internal/fbmessenger/testdata/json_with_media/your_activity_across_facebook/messages/inbox/bob_XYZ789/photos/tiny.png new file mode 100644 index 00000000..62a5f8f4 Binary files /dev/null and b/internal/fbmessenger/testdata/json_with_media/your_activity_across_facebook/messages/inbox/bob_XYZ789/photos/tiny.png differ diff --git a/internal/fbmessenger/testdata/json_with_media_alt/your_facebook_activity/messages/inbox/carol_ALT456/message_1.json b/internal/fbmessenger/testdata/json_with_media_alt/your_facebook_activity/messages/inbox/carol_ALT456/message_1.json new file mode 100644 index 00000000..875cec80 --- /dev/null +++ b/internal/fbmessenger/testdata/json_with_media_alt/your_facebook_activity/messages/inbox/carol_ALT456/message_1.json @@ -0,0 +1,19 @@ +{ + "participants": [ + {"name": "Test User"}, + {"name": "Carol Altlayout"} + ], + "messages": [ + { + "sender_name": "Carol Altlayout", + "timestamp_ms": 1700000000000, + "type": "Generic", + "content": "Photo from alt layout", + "photos": [ + {"uri": "messages/inbox/carol_ALT456/photos/tiny.png", "creation_timestamp": 1700000000} + ] + } + ], + "title": "Carol Altlayout", + "thread_path": "inbox/carol_ALT456" +} diff --git a/internal/fbmessenger/testdata/json_with_media_alt/your_facebook_activity/messages/inbox/carol_ALT456/photos/tiny.png b/internal/fbmessenger/testdata/json_with_media_alt/your_facebook_activity/messages/inbox/carol_ALT456/photos/tiny.png new file mode 100644 index 00000000..62a5f8f4 Binary files /dev/null and b/internal/fbmessenger/testdata/json_with_media_alt/your_facebook_activity/messages/inbox/carol_ALT456/photos/tiny.png differ diff 
--git a/internal/fbmessenger/testdata/mixed/your_activity_across_facebook/messages/inbox/eve_MIX/message_1.html b/internal/fbmessenger/testdata/mixed/your_activity_across_facebook/messages/inbox/eve_MIX/message_1.html new file mode 100644 index 00000000..dc198565 --- /dev/null +++ b/internal/fbmessenger/testdata/mixed/your_activity_across_facebook/messages/inbox/eve_MIX/message_1.html @@ -0,0 +1,15 @@ + +Eve Mixer + +
Participants: Test User, Eve Mixer
+
+
Test User
+
Mixed hello
+
Sep 13, 2020, 12:26 PM
+
+
+
Eve Mixer
+
Mixed reply
+
Sep 13, 2020, 12:27 PM
+
+ diff --git a/internal/fbmessenger/testdata/mixed/your_activity_across_facebook/messages/inbox/eve_MIX/message_1.json b/internal/fbmessenger/testdata/mixed/your_activity_across_facebook/messages/inbox/eve_MIX/message_1.json new file mode 100644 index 00000000..2de86639 --- /dev/null +++ b/internal/fbmessenger/testdata/mixed/your_activity_across_facebook/messages/inbox/eve_MIX/message_1.json @@ -0,0 +1,22 @@ +{ + "participants": [ + {"name": "Test User"}, + {"name": "Eve Mixer"} + ], + "messages": [ + { + "sender_name": "Eve Mixer", + "timestamp_ms": 1600000060000, + "content": "Mixed reply", + "type": "Generic" + }, + { + "sender_name": "Test User", + "timestamp_ms": 1600000000000, + "content": "Mixed hello", + "type": "Generic" + } + ], + "title": "Eve Mixer", + "thread_path": "inbox/eve_MIX" +} diff --git a/internal/fbmessenger/types.go b/internal/fbmessenger/types.go new file mode 100644 index 00000000..f1b83494 --- /dev/null +++ b/internal/fbmessenger/types.go @@ -0,0 +1,96 @@ +package fbmessenger + +import ( + "errors" + "time" +) + +// ErrCorruptJSON is returned by the JSON parser when a message_*.json file +// cannot be parsed as valid JSON. Callers should log and skip the thread. +var ErrCorruptJSON = errors.New("fbmessenger: corrupt json") + +// ErrNotE2EEThread is returned by ParseE2EEJSONFile when a candidate +// flat-export JSON file parses successfully but is not an E2EE thread +// (e.g. a DYI metadata file). Callers should silently skip — the +// discoverer does not pre-filter by shape, so the parser is responsible +// for rejecting non-thread JSON. +var ErrNotE2EEThread = errors.New("fbmessenger: not an e2ee thread") + +// Thread is the parsed form of one DYI thread directory, ready for import. +type Thread struct { + // DirName is the directory name (e.g. "testuser_ABC123XYZ"). This + // is used verbatim as source_conversation_id. + DirName string + // Section is the DYI section ("inbox", "archived_threads", ...). 
+ // Populated by the importer from ThreadDir.Section. + Section string + // Title is the human-readable thread title (may be empty). + Title string + // ConvType is "direct_chat" for 2 participants or "group_chat" for 3+. + ConvType string + // Participants are every participant in the thread (including the + // importing user). + Participants []Participant + // Messages are sorted chronologically (ascending). + Messages []Message + // Format is "json" or "html". + Format string + // RawBytes is the original source bytes for round-tripping into + // message_raw. For multi-file JSON threads we concatenate files into + // a single JSON array for storage; HTML threads store the single file. + RawBytes []byte + // BadSiblings holds the names of sibling files in the thread directory + // that looked like `message_*.json` but did not match the expected + // `message_.json` pattern. The parser skips them and records them + // here so the importer can log and count them without aborting the + // rest of the thread. + BadSiblings []string +} + +// Participant is a thread participant from the DYI export. +type Participant struct { + Name string +} + +// Attachment represents a single file referenced by a message. +type Attachment struct { + // URI is the relative path from the DYI export root. + URI string + // AbsPath is the absolute filesystem path after resolution; empty + // when the file is missing or the URI escapes the root. + AbsPath string + // Kind is "photo", "video", "audio", "file", "gif", or "sticker". + Kind string + // MimeType is a best-guess content type; may be empty. + MimeType string + // Filename is the base file name from the URI. + Filename string +} + +// Reaction is a reaction attached to a message. +type Reaction struct { + Actor string + Reaction string +} + +// Message is one message in a parsed DYI thread. +type Message struct { + // Index is a monotonic per-thread index (0-based) used to construct + // a stable source_message_id. 
+ Index int + // SenderName is the raw display name reported by DYI. Empty for + // unknown / system messages. + SenderName string + // SentAt is the message timestamp in UTC. Zero when we could not parse. + SentAt time.Time + // Body is the rendered message body. Placeholders like "[sticker]" or + // "[call: 3m 12s]" are produced for non-text messages per plan D10. + Body string + // Attachments are any files referenced from this message. + Attachments []Attachment + // Reactions are reactions applied to this message. + Reactions []Reaction + // Type is the DYI "type" string (e.g. "Generic", "Share", "Call", + // "Unsubscribe"). Empty for HTML imports. + Type string +} diff --git a/internal/query/duckdb.go b/internal/query/duckdb.go index 5c0e6ff3..22744cfe 100644 --- a/internal/query/duckdb.go +++ b/internal/query/duckdb.go @@ -650,10 +650,6 @@ func (e *DuckDBEngine) buildWhereClause(opts AggregateOptions, keyColumns ...str var conditions []string var args []interface{} - // Exclude text messages from email-mode queries. - // message_type IS NULL and '' handle old data without the column. - conditions = append(conditions, "(msg.message_type = 'email' OR msg.message_type IS NULL OR msg.message_type = '')") - conditions = append(conditions, store.LiveMessagesWhere("msg", opts.HideDeletedFromSource)) conditions, args = appendSourceFilter(conditions, args, "msg.", opts.SourceID, opts.SourceIDs) @@ -851,10 +847,6 @@ func (e *DuckDBEngine) buildFilterConditions(filter MessageFilter) (string, []in var conditions []string var args []interface{} - // Exclude text messages from email-mode queries. - // message_type IS NULL and '' handle old data without the column. 
- conditions = append(conditions, "(msg.message_type = 'email' OR msg.message_type IS NULL OR msg.message_type = '')") - conditions = append(conditions, store.LiveMessagesWhere("msg", filter.HideDeletedFromSource)) conditions, args = appendSourceFilter(conditions, args, "msg.", filter.SourceID, filter.SourceIDs) diff --git a/internal/query/sqlite.go b/internal/query/sqlite.go index 18e3a8ba..410cf9e4 100644 --- a/internal/query/sqlite.go +++ b/internal/query/sqlite.go @@ -186,10 +186,6 @@ func optsToFilterConditions(opts AggregateOptions, prefix string) ([]string, []i var conditions []string var args []interface{} - // Exclude text messages from email-mode queries. - // message_type IS NULL and '' handle old data without the column. - conditions = append(conditions, "("+prefix+"message_type = 'email' OR "+prefix+"message_type IS NULL OR "+prefix+"message_type = '')") - // Always exclude rows soft-deleted by deduplicate; gate // source-deleted on opts.HideDeletedFromSource via the helper. conditions = append(conditions, store.LiveMessagesWhere(strings.TrimSuffix(prefix, "."), opts.HideDeletedFromSource)) @@ -258,10 +254,6 @@ func buildFilterJoinsAndConditions(filter MessageFilter, tableAlias string) (str // Include all messages (deleted messages shown with indicator in TUI) - // Exclude text messages from email-mode queries. - // message_type IS NULL and '' handle old data without the column. - conditions = append(conditions, "("+prefix+"message_type = 'email' OR "+prefix+"message_type IS NULL OR "+prefix+"message_type = '')") - // Always exclude rows soft-deleted by deduplicate; gate // source-deleted on filter.HideDeletedFromSource via the helper. 
conditions = append(conditions, store.LiveMessagesWhere(strings.TrimSuffix(prefix, "."), filter.HideDeletedFromSource)) diff --git a/internal/store/schema.sql b/internal/store/schema.sql index ff1c2b50..83722d6a 100644 --- a/internal/store/schema.sql +++ b/internal/store/schema.sql @@ -113,7 +113,7 @@ CREATE TABLE IF NOT EXISTS messages ( rfc822_message_id TEXT, -- Message classification - message_type TEXT NOT NULL, -- 'email', 'imessage', 'sms', 'mms', 'rcs', 'whatsapp' + message_type TEXT NOT NULL, -- 'email', 'imessage', 'sms', 'mms', 'rcs', 'whatsapp', 'fbmessenger' -- Timestamps (sent_at is canonical, others platform-specific) sent_at DATETIME, diff --git a/internal/store/sources.go b/internal/store/sources.go index 258368c5..d641e42d 100644 --- a/internal/store/sources.go +++ b/internal/store/sources.go @@ -163,7 +163,7 @@ func (s *Store) RemoveSourceSerialized( var count int if err := conn.QueryRowContext(ctx, - `SELECT COUNT(*) FROM sync_runs WHERE status = 'running'`, + s.dialect.Rebind(`SELECT COUNT(*) FROM sync_runs WHERE status = 'running'`), ).Scan(&count); err != nil { return false, fmt.Errorf("check active syncs: %w", err) } @@ -178,7 +178,7 @@ func (s *Store) RemoveSourceSerialized( } res, err := conn.ExecContext( - ctx, `DELETE FROM sources WHERE id = ?`, sourceID, + ctx, s.dialect.Rebind(`DELETE FROM sources WHERE id = ?`), sourceID, ) if err != nil { return hadActiveSync, fmt.Errorf("delete source: %w", err) diff --git a/internal/store/sync.go b/internal/store/sync.go index 1daff77f..b3f7e243 100644 --- a/internal/store/sync.go +++ b/internal/store/sync.go @@ -237,6 +237,30 @@ func (s *Store) GetActiveSync(sourceID int64) (*SyncRun, error) { return run, err } +// GetLatestCheckpointedSync returns the most recent sync run for a source if +// (and only if) that latest run is running or failed and has a non-empty +// cursor_before. 
A completed run after a failed one means the failed run's +// checkpoint is stale: re-importing must re-scan all threads, so we return +// no row in that case. +func (s *Store) GetLatestCheckpointedSync(sourceID int64) (*SyncRun, error) { + row := s.db.QueryRow(` + SELECT id, source_id, started_at, completed_at, status, + messages_processed, messages_added, messages_updated, errors_count, + error_message, cursor_before, cursor_after + FROM sync_runs + WHERE source_id = ? + AND id = (SELECT MAX(id) FROM sync_runs WHERE source_id = ?) + AND status IN ('running', 'failed') + AND cursor_before IS NOT NULL AND cursor_before != '' + `, sourceID, sourceID) + + run, err := scanSyncRun(row) + if err == sql.ErrNoRows { + return nil, nil + } + return run, err +} + // HasAnyActiveSync returns true if any source currently has a running sync. // Use this as a safety gate before performing destructive file operations that // could race with concurrent attachment ingestion.