diff --git a/docs/PG_STATUS.md b/docs/PG_STATUS.md index ed6329e3..cdb39986 100644 --- a/docs/PG_STATUS.md +++ b/docs/PG_STATUS.md @@ -108,9 +108,20 @@ branch: ## Remaining for PR4 -- **FTS weight differences**: PostgreSQL applies `setweight('A')` to the - subject and `'B'` to the sender; SQLite FTS5 has no weighting. Ranking - results will still differ between backends. +- **FTS rank ordering (partially resolved)**: + `SQLiteDialect.FTSSearchClause()` now orders by + `bm25(messages_fts, 1.0, 10.0, 1.0, 4.0, 1.0, 1.0)` — weights are + positional over every declared FTS5 column (the leading 1.0 is the + slot for `message_id UNINDEXED`; the rest map to subject, body, + from, to, cc). The 10:4:1 ratio across subject/sender/body mirrors + PostgreSQL's `setweight 'A'=1.0 / 'B'=0.4 / 'D'=0.1`, so + subject-only matches outrank sender-only, which outrank body-only + on both backends. Verified by `TestFTSRankWeightsAcrossBackends`. + Note: bm25 (Okapi BM25) and `ts_rank` (cover-density) remain + different scorer functions, so intra-class tie-breaking (e.g. two + body-only matches) can still diverge. Top-N ordering is consistent + for most queries; expect occasional reorderings deep in the result + list. - **Deletion execution path on PostgreSQL**: end-to-end testing of staged-deletion → Gmail delete → archive update. - **Attachment storage paths** under PostgreSQL — content-hash dedup diff --git a/internal/store/dialect_sqlite.go b/internal/store/dialect_sqlite.go index f540f432..1eb2b171 100644 --- a/internal/store/dialect_sqlite.go +++ b/internal/store/dialect_sqlite.go @@ -93,11 +93,23 @@ func (d *SQLiteDialect) FTSUpsert(q querier, doc FTSDoc) error { } // FTSSearchClause returns SQL fragments for FTS5 full-text search. -// "rank" is an implicit FTS5 column, so orderArgCount is 0. +// The bm25 weights mirror the PostgreSQL dialect's setweight scheme so the +// two backends order results consistently. Weights are positional over +// every column declared in messages_fts — UNINDEXED columns count too even +// though they cannot match — so the leading 1.0 is the placeholder for +// `message_id UNINDEXED`. The remaining slots map to (subject, body, +// from_addr, to_addr, cc_addr). PostgreSQL applies setweight 'A'=1.0 to +// subject and 'B'=0.4 to sender, leaving body and other recipients at +// default 'D'=0.1 — a 10:4:1 ratio. bm25 multiplies per-column scores by +// these weights, so 10/1/4/1/1 across (subject, body, from, to, cc) +// reproduces that ordering: subject-only > sender-only > body/recipient- +// only matches. bm25 returns lower (more negative) scores for more +// relevant rows, so callers ORDER BY this expression ascending (the +// default). func (d *SQLiteDialect) FTSSearchClause() (join, where, orderBy string, orderArgCount int) { - return "JOIN messages_fts fts ON fts.rowid = m.id", + return "JOIN messages_fts ON messages_fts.rowid = m.id", "messages_fts MATCH ?", - "rank", + "bm25(messages_fts, 1.0, 10.0, 1.0, 4.0, 1.0, 1.0)", 0 } diff --git a/internal/store/fts_rank_parity_test.go b/internal/store/fts_rank_parity_test.go new file mode 100644 index 00000000..b2d5f696 --- /dev/null +++ b/internal/store/fts_rank_parity_test.go @@ -0,0 +1,162 @@ +package store_test + +import ( + "os" + "strings" + "testing" + + "go.kenn.io/msgvault/internal/store" + "go.kenn.io/msgvault/internal/testutil" +) + +func currentBackend() string { + db := os.Getenv("MSGVAULT_TEST_DB") + if strings.HasPrefix(db, "postgres://") || strings.HasPrefix(db, "postgresql://") { + return "postgres" + } + return "sqlite" +} + +// TestFTSRankParityFixture seeds a fixed corpus and runs a handful of +// representative queries, printing the resulting message-ID order with +// t.Log. Run it under SQLite and under PostgreSQL (MSGVAULT_TEST_DB=...) +// and diff the captured logs: identical top-N ordering is the parity +// claim. The test asserts only the invariants that must hold on both +// backends (subject hit ranks above body hit), so it does not fail on +// the residual scorer-math differences documented in PG_STATUS.md. +func TestFTSRankParityFixture(t *testing.T) { + st := testutil.NewTestStore(t) + + src, err := st.GetOrCreateSource("gmail", "fixture@example.com") + if err != nil { + t.Fatalf("GetOrCreateSource: %v", err) + } + convID, err := st.EnsureConversation(src.ID, "fixture-thread", "Fixture") + if err != nil { + t.Fatalf("EnsureConversation: %v", err) + } + + mk := func(label, subject, body, fromAddr string) int64 { + id, err := st.UpsertMessage(&store.Message{ + ConversationID: convID, + SourceID: src.ID, + SourceMessageID: label, + MessageType: "email", + Subject: nullString(subject), + Snippet: nullString(body), + SizeEstimate: 100, + }) + if err != nil { + t.Fatalf("UpsertMessage(%q): %v", label, err) + } + if err := st.UpsertFTS(id, subject, body, fromAddr, "", ""); err != nil { + t.Fatalf("UpsertFTS(%q): %v", label, err) + } + return id + } + + // Each row is { sourceMessageID, subject, body, fromAddr }. + // Tokens chosen so a query for any token below appears in known + // columns of known rows. + rows := []struct { + label, subject, body, from string + }{ + {"r1", "invoice march filler", + "alpha beta gamma delta epsilon", + "billing acme noreply@example.com"}, + {"r2", "weekly newsletter filler filler", + "invoice mentioned in body once filler", + "news editor@example.com"}, + {"r3", "team standup notes filler", + "march meeting prep filler filler", + "invoice billing@example.com"}, + {"r4", "march madness alpha filler", + "invoice invoice invoice filler filler", + "sender alpha@example.com"}, + {"r5", "alpha alpha alpha filler", + "delta filler filler filler filler", + "editor news@example.com"}, + {"r6", "delta filler filler filler", + "alpha alpha filler filler filler", + "noreply billing@example.com"}, + } + ids := make(map[string]int64, len(rows)) + for _, r := range rows { + ids[r.label] = mk(r.label, r.subject, r.body, r.from) + } + + // Reverse map for human-readable logging. + labelOf := make(map[int64]string, len(ids)) + for label, id := range ids { + labelOf[id] = label + } + + queries := []string{ + "invoice", // appears in subject(r1), body(r2,r4), from(r3) + "march", // subject(r4), body(r3), subject(r1) + "alpha", // subject(r4,r5), body(r6,r1), from(r4) + "delta", // subject(r6), body(r5,r1) + "billing", // from(r3,r6), from(r1) + } + + backend := currentBackend() + + for _, q := range queries { + t.Run(q, func(t *testing.T) { + results, total, err := st.SearchMessages(q, 0, 20) + if err != nil { + t.Fatalf("SearchMessages(%q): %v", q, err) + } + order := make([]string, 0, len(results)) + for _, r := range results { + order = append(order, labelOf[r.ID]) + } + t.Logf("backend=%s query=%q total=%d order=%s", + backend, q, total, strings.Join(order, ",")) + + // Sanity: rows where the token appears only in subject + // must rank above rows where it appears only in body. + // This is the invariant both backends must honor. + invariants := invariantsFor(q) + for _, inv := range invariants { + higher, lower := inv[0], inv[1] + hPos := indexOfLabel(order, higher) + lPos := indexOfLabel(order, lower) + if hPos == -1 || lPos == -1 { + continue // not both present, skip + } + if hPos >= lPos { + t.Errorf("query %q: row %q (pos %d) should rank above row %q (pos %d) — full order: %v", + q, higher, hPos, lower, lPos, order) + } + } + }) + } +} + +// invariantsFor returns (higher, lower) pairs that must hold for a query. +// Built from the fixture in TestFTSRankParityFixture: when one row has +// the term only in subject and another only in body, subject wins. +func invariantsFor(q string) [][2]string { + switch q { + case "invoice": + // r1 has invoice in subject only; r2 has it in body only. + return [][2]string{{"r1", "r2"}} + case "march": + // r1 and r4 have march in subject; r3 has it only in body. + return [][2]string{{"r1", "r3"}, {"r4", "r3"}} + case "delta": + // r6 has delta in subject only; r5 has it in body only. + return [][2]string{{"r6", "r5"}} + } + return nil +} + +func indexOfLabel(order []string, label string) int { + for i, l := range order { + if l == label { + return i + } + } + return -1 +} diff --git a/internal/store/fts_rank_test.go b/internal/store/fts_rank_test.go new file mode 100644 index 00000000..a6406634 --- /dev/null +++ b/internal/store/fts_rank_test.go @@ -0,0 +1,91 @@ +package store_test + +import ( + "database/sql" + "testing" + + "go.kenn.io/msgvault/internal/store" + "go.kenn.io/msgvault/internal/testutil" +) + +func nullString(s string) sql.NullString { + return sql.NullString{String: s, Valid: true} +} + +// TestFTSRankWeightsAcrossBackends verifies that subject matches outrank +// sender matches, which outrank body-only matches. The same ordering must +// hold on SQLite (bm25 with column weights) and PostgreSQL (ts_rank over +// setweight'd tsvector). Each message is seeded with the search token +// "zappa" in exactly one FTS column so the rank attribution is unambiguous. +func TestFTSRankWeightsAcrossBackends(t *testing.T) { + st := testutil.NewTestStore(t) + + src, err := st.GetOrCreateSource("gmail", "rank@example.com") + if err != nil { + t.Fatalf("GetOrCreateSource: %v", err) + } + convID, err := st.EnsureConversation(src.ID, "rank-thread", "Rank Thread") + if err != nil { + t.Fatalf("EnsureConversation: %v", err) + } + + // Seed three rows with the rare token in different columns. Subject + // and body get filler distinct text so document length is comparable; + // bm25 docs of vastly different lengths skew the score. + mkMsg := func(srcMsgID, subject, snippet string) int64 { + id, err := st.UpsertMessage(&store.Message{ + ConversationID: convID, + SourceID: src.ID, + SourceMessageID: srcMsgID, + MessageType: "email", + Subject: nullString(subject), + Snippet: nullString(snippet), + SizeEstimate: 100, + }) + if err != nil { + t.Fatalf("UpsertMessage(%q): %v", srcMsgID, err) + } + return id + } + + subjID := mkMsg("rank-subj", "zappa filler filler filler", "alpha beta gamma") + fromID := mkMsg("rank-from", "alpha beta gamma filler", "alpha beta gamma") + bodyID := mkMsg("rank-body", "alpha beta gamma filler", "zappa filler filler") + + // Push FTS docs explicitly so the test does not depend on the full + // sync pipeline; FTSUpsert maps each field to the column the dialect + // then weights. + if err := st.UpsertFTS(subjID, "zappa filler filler filler", "alpha beta gamma", "noreply@example.com", "", ""); err != nil { + t.Fatalf("UpsertFTS subj: %v", err) + } + // "zappa noreply@example.com" — keep zappa as its own whitespace- + // delimited token so PG's text parser (which treats `user@host.tld` + // as one email token) still matches it; SQLite's unicode61 tokenizer + // splits on `@` and would match either way. + if err := st.UpsertFTS(fromID, "alpha beta gamma filler", "alpha beta gamma", "zappa noreply@example.com", "", ""); err != nil { + t.Fatalf("UpsertFTS from: %v", err) + } + if err := st.UpsertFTS(bodyID, "alpha beta gamma filler", "zappa filler filler", "noreply@example.com", "", ""); err != nil { + t.Fatalf("UpsertFTS body: %v", err) + } + + results, total, err := st.SearchMessages("zappa", 0, 10) + if err != nil { + t.Fatalf("SearchMessages: %v", err) + } + if total != 3 { + t.Fatalf("total = %d, want 3 (subj/from/body all match)", total) + } + if len(results) != 3 { + t.Fatalf("len(results) = %d, want 3", len(results)) + } + + gotOrder := []int64{results[0].ID, results[1].ID, results[2].ID} + wantOrder := []int64{subjID, fromID, bodyID} + for i := range wantOrder { + if gotOrder[i] != wantOrder[i] { + t.Errorf("rank position %d: got message %d, want %d (full order got=%v want=%v: subj=%d from=%d body=%d)", + i, gotOrder[i], wantOrder[i], gotOrder, wantOrder, subjID, fromID, bodyID) + } + } +}