Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions docs/PG_STATUS.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,20 @@ branch:

## Remaining for PR4

- **FTS weight differences**: PostgreSQL applies `setweight('A')` to the
subject and `'B'` to the sender; SQLite FTS5 has no weighting. Ranking
results will still differ between backends.
- **FTS rank ordering (partially resolved)**:
`SQLiteDialect.FTSSearchClause()` now orders by
`bm25(messages_fts, 1.0, 10.0, 1.0, 4.0, 1.0, 1.0)` — weights are
positional over every declared FTS5 column (the leading 1.0 is the
slot for `message_id UNINDEXED`; the rest map to subject, body,
from, to, cc). The 10:4:1 ratio across subject/sender/body mirrors
PostgreSQL's `setweight 'A'=1.0 / 'B'=0.4 / 'D'=0.1`, so
subject-only matches outrank sender-only, which outrank body-only
on both backends. Verified by `TestFTSRankWeightsAcrossBackends`.
Note: bm25 (Okapi BM25) and `ts_rank` (cover-density) remain
different scorer functions, so intra-class tie-breaking (e.g. two
body-only matches) can still diverge. Top-N ordering is consistent
for most queries; expect occasional reorderings deep in the result
list.
- **Deletion execution path on PostgreSQL**: end-to-end testing of
staged-deletion → Gmail delete → archive update.
- **Attachment storage paths** under PostgreSQL — content-hash dedup
Expand Down
18 changes: 15 additions & 3 deletions internal/store/dialect_sqlite.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,11 +93,23 @@ func (d *SQLiteDialect) FTSUpsert(q querier, doc FTSDoc) error {
}

// FTSSearchClause returns SQL fragments for FTS5 full-text search.
// "rank" is an implicit FTS5 column, so orderArgCount is 0.
// The bm25 weights mirror the PostgreSQL dialect's setweight scheme so the
// two backends order results consistently. Weights are positional over
// every column declared in messages_fts — UNINDEXED columns count too even
// though they cannot match — so the leading 1.0 is the placeholder for
// `message_id UNINDEXED`. The remaining slots map to (subject, body,
// from_addr, to_addr, cc_addr). PostgreSQL applies setweight 'A'=1.0 to
// subject and 'B'=0.4 to sender, leaving body and other recipients at
// default 'D'=0.1 — a 10:4:1 ratio. bm25 multiplies per-column scores by
// these weights, so 10/1/4/1/1 across (subject, body, from, to, cc)
// reproduces that ordering: subject-only > sender-only > body/recipient-
// only matches. bm25 returns lower (more negative) scores for more
// relevant rows, so callers ORDER BY this expression ascending (the
// default).
func (d *SQLiteDialect) FTSSearchClause() (join, where, orderBy string, orderArgCount int) {
return "JOIN messages_fts fts ON fts.rowid = m.id",
return "JOIN messages_fts ON messages_fts.rowid = m.id",
"messages_fts MATCH ?",
"rank",
"bm25(messages_fts, 1.0, 10.0, 1.0, 4.0, 1.0, 1.0)",
0
}

Expand Down
162 changes: 162 additions & 0 deletions internal/store/fts_rank_parity_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
package store_test

import (
"os"
"strings"
"testing"

"go.kenn.io/msgvault/internal/store"
"go.kenn.io/msgvault/internal/testutil"
)

func currentBackend() string {
db := os.Getenv("MSGVAULT_TEST_DB")
if strings.HasPrefix(db, "postgres://") || strings.HasPrefix(db, "postgresql://") {
return "postgres"
}
return "sqlite"
}

// TestFTSRankParityFixture seeds a fixed corpus and runs a handful of
// representative queries, printing the resulting message-ID order with
// t.Log. Run it under SQLite and under PostgreSQL (MSGVAULT_TEST_DB=...)
// and diff the captured logs: identical top-N ordering is the parity
// claim. The test asserts only the invariants that must hold on both
// backends (subject hit ranks above body hit), so it does not fail on
// the residual scorer-math differences documented in PG_STATUS.md.
func TestFTSRankParityFixture(t *testing.T) {
st := testutil.NewTestStore(t)

src, err := st.GetOrCreateSource("gmail", "fixture@example.com")
if err != nil {
t.Fatalf("GetOrCreateSource: %v", err)
}
convID, err := st.EnsureConversation(src.ID, "fixture-thread", "Fixture")
if err != nil {
t.Fatalf("EnsureConversation: %v", err)
}

mk := func(label, subject, body, fromAddr string) int64 {
id, err := st.UpsertMessage(&store.Message{
ConversationID: convID,
SourceID: src.ID,
SourceMessageID: label,
MessageType: "email",
Subject: nullString(subject),
Snippet: nullString(body),
SizeEstimate: 100,
})
if err != nil {
t.Fatalf("UpsertMessage(%q): %v", label, err)
}
if err := st.UpsertFTS(id, subject, body, fromAddr, "", ""); err != nil {
t.Fatalf("UpsertFTS(%q): %v", label, err)
}
return id
}

// Each row is { sourceMessageID, subject, body, fromAddr }.
// Tokens chosen so a query for any token below appears in known
// columns of known rows.
rows := []struct {
label, subject, body, from string
}{
{"r1", "invoice march filler",
"alpha beta gamma delta epsilon",
"billing acme noreply@example.com"},
{"r2", "weekly newsletter filler filler",
"invoice mentioned in body once filler",
"news editor@example.com"},
{"r3", "team standup notes filler",
"march meeting prep filler filler",
"invoice billing@example.com"},
{"r4", "march madness alpha filler",
"invoice invoice invoice filler filler",
"sender alpha@example.com"},
{"r5", "alpha alpha alpha filler",
"delta filler filler filler filler",
"editor news@example.com"},
{"r6", "delta filler filler filler",
"alpha alpha filler filler filler",
"noreply billing@example.com"},
}
ids := make(map[string]int64, len(rows))
for _, r := range rows {
ids[r.label] = mk(r.label, r.subject, r.body, r.from)
}

// Reverse map for human-readable logging.
labelOf := make(map[int64]string, len(ids))
for label, id := range ids {
labelOf[id] = label
}

queries := []string{
"invoice", // appears in subject(r1), body(r2,r4), from(r3)
"march", // subject(r4), body(r3), subject(r1)
"alpha", // subject(r4,r5), body(r6,r1), from(r4)
"delta", // subject(r6), body(r5,r1)
"billing", // from(r3,r6), from(r1)
}

backend := currentBackend()

for _, q := range queries {
t.Run(q, func(t *testing.T) {
results, total, err := st.SearchMessages(q, 0, 20)
if err != nil {
t.Fatalf("SearchMessages(%q): %v", q, err)
}
order := make([]string, 0, len(results))
for _, r := range results {
order = append(order, labelOf[r.ID])
}
t.Logf("backend=%s query=%q total=%d order=%s",
backend, q, total, strings.Join(order, ","))

// Sanity: rows where the token appears only in subject
// must rank above rows where it appears only in body.
// This is the invariant both backends must honor.
invariants := invariantsFor(q)
for _, inv := range invariants {
higher, lower := inv[0], inv[1]
hPos := indexOfLabel(order, higher)
lPos := indexOfLabel(order, lower)
if hPos == -1 || lPos == -1 {
continue // not both present, skip
}
if hPos >= lPos {
t.Errorf("query %q: row %q (pos %d) should rank above row %q (pos %d) — full order: %v",
q, higher, hPos, lower, lPos, order)
}
}
})
}
}

// invariantsFor returns (higher, lower) pairs that must hold for a query.
// Built from the fixture in TestFTSRankParityFixture: when one row has
// the term only in subject and another only in body, subject wins.
func invariantsFor(q string) [][2]string {
switch q {
case "invoice":
// r1 has invoice in subject only; r2 has it in body only.
return [][2]string{{"r1", "r2"}}
case "march":
// r1 and r4 have march in subject; r3 has it only in body.
return [][2]string{{"r1", "r3"}, {"r4", "r3"}}
case "delta":
// r6 has delta in subject only; r5 has it in body only.
return [][2]string{{"r6", "r5"}}
}
return nil
}

func indexOfLabel(order []string, label string) int {
for i, l := range order {
if l == label {
return i
}
}
return -1
}
91 changes: 91 additions & 0 deletions internal/store/fts_rank_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
package store_test

import (
"database/sql"
"testing"

"go.kenn.io/msgvault/internal/store"
"go.kenn.io/msgvault/internal/testutil"
)

func nullString(s string) sql.NullString {
return sql.NullString{String: s, Valid: true}
}

// TestFTSRankWeightsAcrossBackends verifies that subject matches outrank
// sender matches, which outrank body-only matches. The same ordering must
// hold on SQLite (bm25 with column weights) and PostgreSQL (ts_rank over
// setweight'd tsvector). Each message is seeded with the search token
// "zappa" in exactly one FTS column so the rank attribution is unambiguous.
func TestFTSRankWeightsAcrossBackends(t *testing.T) {
st := testutil.NewTestStore(t)

src, err := st.GetOrCreateSource("gmail", "rank@example.com")
if err != nil {
t.Fatalf("GetOrCreateSource: %v", err)
}
convID, err := st.EnsureConversation(src.ID, "rank-thread", "Rank Thread")
if err != nil {
t.Fatalf("EnsureConversation: %v", err)
}

// Seed three rows with the rare token in different columns. Subject
// and body get filler distinct text so document length is comparable;
// bm25 docs of vastly different lengths skew the score.
mkMsg := func(srcMsgID, subject, snippet string) int64 {
id, err := st.UpsertMessage(&store.Message{
ConversationID: convID,
SourceID: src.ID,
SourceMessageID: srcMsgID,
MessageType: "email",
Subject: nullString(subject),
Snippet: nullString(snippet),
SizeEstimate: 100,
})
if err != nil {
t.Fatalf("UpsertMessage(%q): %v", srcMsgID, err)
}
return id
}

subjID := mkMsg("rank-subj", "zappa filler filler filler", "alpha beta gamma")
fromID := mkMsg("rank-from", "alpha beta gamma filler", "alpha beta gamma")
bodyID := mkMsg("rank-body", "alpha beta gamma filler", "zappa filler filler")

// Push FTS docs explicitly so the test does not depend on the full
// sync pipeline; FTSUpsert maps each field to the column the dialect
// then weights.
if err := st.UpsertFTS(subjID, "zappa filler filler filler", "alpha beta gamma", "noreply@example.com", "", ""); err != nil {
t.Fatalf("UpsertFTS subj: %v", err)
}
// "zappa noreply@example.com" — keep zappa as its own whitespace-
// delimited token so PG's text parser (which treats `user@host.tld`
// as one email token) still matches it; SQLite's unicode61 tokenizer
// splits on `@` and would match either way.
if err := st.UpsertFTS(fromID, "alpha beta gamma filler", "alpha beta gamma", "zappa noreply@example.com", "", ""); err != nil {
t.Fatalf("UpsertFTS from: %v", err)
}
if err := st.UpsertFTS(bodyID, "alpha beta gamma filler", "zappa filler filler", "noreply@example.com", "", ""); err != nil {
t.Fatalf("UpsertFTS body: %v", err)
}

results, total, err := st.SearchMessages("zappa", 0, 10)
if err != nil {
t.Fatalf("SearchMessages: %v", err)
}
if total != 3 {
t.Fatalf("total = %d, want 3 (subj/from/body all match)", total)
}
if len(results) != 3 {
t.Fatalf("len(results) = %d, want 3", len(results))
}

gotOrder := []int64{results[0].ID, results[1].ID, results[2].ID}
wantOrder := []int64{subjID, fromID, bodyID}
for i := range wantOrder {
if gotOrder[i] != wantOrder[i] {
t.Errorf("rank position %d: got message %d, want %d (full order got=%v want=%v: subj=%d from=%d body=%d)",
i, gotOrder[i], wantOrder[i], gotOrder, wantOrder, subjID, fromID, bodyID)
}
}
}
Loading