From d4fed5fb20efc54711fe3cc242d49628ccb20fdf Mon Sep 17 00:00:00 2001 From: Kunal Lanjewar <5488221+kunallanjewar@users.noreply.github.com> Date: Tue, 28 Apr 2026 08:51:30 -0700 Subject: [PATCH 1/5] fix(audit): replace em dashes with hyphens/punctuation Eliminates three U+2014 em-dash characters in source comments and test strings per the no-em-dash style constraint. --- pkg/lore/ingest/heuristic/rules.go | 4 ++-- pkg/lore/retrieve/hybrid/hybrid_test.go | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/lore/ingest/heuristic/rules.go b/pkg/lore/ingest/heuristic/rules.go index 6d98a73..75b87c0 100644 --- a/pkg/lore/ingest/heuristic/rules.go +++ b/pkg/lore/ingest/heuristic/rules.go @@ -37,7 +37,7 @@ type Rule struct { // - *.md in project root reference (catch-wide-root) func DefaultRules() []Rule { return []Rule{ - // ADRs — architectural decision records. + // ADRs (architectural decision records). {PathGlob: "docs/adr/*.md", Kind: lore.KindDecision, Tags: []string{"adr"}}, {PathGlob: "docs/adr/*.markdown", Kind: lore.KindDecision, Tags: []string{"adr"}}, {PathGlob: "docs/decisions/*.md", Kind: lore.KindDecision, Tags: []string{"adr"}}, @@ -71,7 +71,7 @@ func DefaultRules() []Rule { {PathGlob: "CONTRIBUTING.md", Kind: lore.KindProcedure, Tags: []string{"contributing"}}, {PathGlob: "CONTRIBUTING.markdown", Kind: lore.KindProcedure, Tags: []string{"contributing"}}, - // Changelog — records of what changed. + // Changelog: records of what changed. 
{PathGlob: "CHANGELOG.md", Kind: lore.KindObservation, Tags: []string{"changelog"}}, {PathGlob: "CHANGELOG.markdown", Kind: lore.KindObservation, Tags: []string{"changelog"}}, } diff --git a/pkg/lore/retrieve/hybrid/hybrid_test.go b/pkg/lore/retrieve/hybrid/hybrid_test.go index 1ad7cba..04ff022 100644 --- a/pkg/lore/retrieve/hybrid/hybrid_test.go +++ b/pkg/lore/retrieve/hybrid/hybrid_test.go @@ -258,7 +258,7 @@ func TestHybrid_BothBeats(t *testing.T) { } // Both found: this is the "hybrid beats single-mode" condition. if !foundMigration || !foundDeployment { - t.Logf("partial coverage (migration=%v, deployment=%v) — acceptable with small corpus", foundMigration, foundDeployment) + t.Logf("partial coverage (migration=%v, deployment=%v); acceptable with small corpus", foundMigration, foundDeployment) } } From b8741daf94d30beca6ba7d0c7f2d371f5e98f92a Mon Sep 17 00:00:00 2001 From: Kunal Lanjewar <5488221+kunallanjewar@users.noreply.github.com> Date: Tue, 28 Apr 2026 08:53:17 -0700 Subject: [PATCH 2/5] docs(readme): rewrite for v0.1.1 release Full release-ready README: quickstart, interface overview, Path A/B distinction, K8s production deployment patterns, reference impl table, configuration guide, stability disclaimer, attribution. Removes any internal path references and restructures per public-reader requirements. --- README.md | 531 ++++++++++++++++++++---------------------------------- 1 file changed, 196 insertions(+), 335 deletions(-) diff --git a/README.md b/README.md index 1e889c6..dd2b75c 100644 --- a/README.md +++ b/README.md @@ -1,392 +1,253 @@ # lore -A structured knowledge primitive for AI agents. Apache 2.0 OSS Go library. - -Lore stores classified knowledge entries (decisions, principles, procedures, -references, explanations, observations, research, ideas) and the typed edges -that connect them, then serves them back to retrieval pipelines that combine -lexical and semantic ranking. 
It ships as a Go library, not a service: callers -compose it into their own MCP servers, HTTP services, ingestion pipelines, or -CLI tools. - -The library is built around three pluggable interfaces (`Store`, `Embedder`, -`VectorStore`) plus a composing `Retriever` and an optional `Ingester`. Each -interface ships with an in-process reference implementation (modernc.org/sqlite, -BGE int8, sqlite-vec) so a single binary can run against a local SQLite file -out of the box. Swap any of the three for Postgres, a remote embedding API, -pgvector, or anything else by implementing the interface. +Lore is a structured knowledge library for AI agents. It stores classified +entries (decisions, principles, procedures, references, explanations, +observations, research, ideas) and the edges between them, then serves +them to retrieval pipelines that combine lexical and semantic ranking. + +Lore ships as a Go library, not a service. Callers compose it into their own +MCP servers, HTTP services, ingestion pipelines, or CLI tools. Three pluggable +interfaces (`Store`, `Embedder`, `VectorStore`) each have an in-process +reference implementation that runs against a local SQLite file out of the box. +Swap any of the three for Postgres, a remote embedding API, or a purpose-built +vector database by satisfying the interface. ## Install ``` -go get github.com/mathomhaus/lore@latest +go get github.com/mathomhaus/lore@v0.1.1 ``` Requires Go 1.23 or newer. -## Usage +## Quickstart -### Store: persist and retrieve entries - -The `store.Store` interface is the primary write and read surface. Open a -`*sql.DB` with the `"sqlite"` driver (registered by `modernc.org/sqlite`), -pass it to `sqlite.New`, and the constructor runs schema migrations -automatically. +The example below wires the three reference implementations together, inscribes +an entry, embeds it into the vector store, and runs a hybrid search. 
```go -import ( - "context" - "database/sql" - "fmt" - - _ "modernc.org/sqlite" - - "github.com/mathomhaus/lore/pkg/lore" - "github.com/mathomhaus/lore/pkg/lore/store/sqlite" -) - -func main() { - dsn := "lore.db" + - "?_pragma=journal_mode(WAL)" + - "&_pragma=busy_timeout(5000)" + - "&_pragma=synchronous(NORMAL)" + - "&_pragma=foreign_keys(ON)" - - db, err := sql.Open("sqlite", dsn) - if err != nil { - panic(err) - } - defer db.Close() - - st, err := sqlite.New(db) - if err != nil { - panic(err) - } - defer st.Close(context.Background()) - - // Persist a decision. - id, err := st.Inscribe(context.Background(), lore.Entry{ - Project: "myproject", - Kind: lore.KindDecision, - Title: "Use SQLite for local persistence", - Body: "Chosen for zero-dependency deployment and strong ACID guarantees.", - Tags: []string{"adr", "storage"}, - }) - if err != nil { - panic(err) - } - fmt.Println("inscribed", id) - - // Retrieve it. - entry, err := st.Get(context.Background(), id) - if err != nil { - panic(err) - } - fmt.Println(entry.Title) - - // Full-text search. - hits, err := st.SearchText(context.Background(), "SQLite persistence", lore.SearchOpts{Limit: 5}) - if err != nil { - panic(err) - } - for _, h := range hits { - fmt.Printf("%.3f %s\n", h.Score, h.Entry.Title) - } -} -``` +package main -The `store.Store` interface is backend-agnostic. Swap `sqlite.New` for any -implementation that satisfies the interface to use a different storage engine -without changing callers. - -### Path B: document ingestion - -Path B ingests existing Markdown document trees into lore entries. The -ingester is a pure functional transform: it returns entries and the caller -writes them to a Store. 
- -```go import ( - "context" - "log" - - "github.com/mathomhaus/lore/pkg/lore/ingest/heuristic" + "context" + "database/sql" + "errors" + "fmt" + "log" + + _ "modernc.org/sqlite" + + "github.com/mathomhaus/lore/pkg/lore" + "github.com/mathomhaus/lore/pkg/lore/embed" + "github.com/mathomhaus/lore/pkg/lore/embed/bge" + "github.com/mathomhaus/lore/pkg/lore/retrieve/hybrid" + "github.com/mathomhaus/lore/pkg/lore/store/sqlite" + "github.com/mathomhaus/lore/pkg/lore/vector/sqlitevec" ) func main() { - ing := heuristic.NewIngester() - - result, err := ing.Process(context.Background(), "/workspace/docs") - if err != nil { - log.Fatal(err) - } - - for _, fe := range result.Errors { - log.Printf("warn: %v", fe) - } - - for _, e := range result.Entries { - log.Printf("entry kind=%s title=%q source=%s", e.Kind, e.Title, e.Source) - // write e to your store here - } + ctx := context.Background() + + // Open a single SQLite file. All three backends share the same DB. + dsn := "knowledge.db" + + "?_pragma=journal_mode(WAL)" + + "&_pragma=busy_timeout(5000)" + + "&_pragma=synchronous(NORMAL)" + + "&_pragma=foreign_keys(ON)" + + db, err := sql.Open("sqlite", dsn) + if err != nil { + log.Fatal(err) + } + defer db.Close() + + // Store: handles entry persistence and BM25 full-text search. + st, err := sqlite.New(db) + if err != nil { + log.Fatal(err) + } + defer st.Close(ctx) + + // VectorStore: stores and queries float32 vectors (384-dim for BGE-small). + vs, err := sqlitevec.New(db, 384) + if err != nil { + log.Fatal(err) + } + defer vs.Close(ctx) + + // Embedder: in-process BGE int8 model. Falls back gracefully on platforms + // without ONNX Runtime. + var emb embed.Embedder + emb, err = bge.New() + if err != nil { + if !errors.Is(err, embed.ErrUnsupported) { + log.Fatal(err) + } + log.Print("embedder unavailable; using BM25-only retrieval") + emb = nil + } + if emb != nil { + defer emb.Close(ctx) + } + + // Inscribe a decision entry. 
+ id, err := st.Inscribe(ctx, lore.Entry{ + Project: "decisionLog", + Kind: lore.KindDecision, + Title: "Use SQLite for local persistence", + Body: "Chosen for zero-dependency deployment and strong ACID guarantees under single-writer workloads.", + Tags: []string{"adr", "storage"}, + }) + if err != nil { + log.Fatal(err) + } + fmt.Printf("inscribed entry id=%d\n", id) + + // Embed and store the vector for the new entry if an embedder is available. + if emb != nil { + entry, _ := st.Get(ctx, id) + vecs, err := emb.Embed(ctx, []string{entry.Title + " " + entry.Body}) + if err == nil { + _ = vs.Upsert(ctx, id, vecs[0]) + } + } + + // Build a hybrid retriever that fuses BM25 + vector via RRF. + r := hybrid.New(st, emb, vs, + hybrid.WithRRFK(60), + hybrid.WithCandidatePoolSize(50), + ) + + // Search. + hits, err := r.Search(ctx, "SQLite persistence decision", lore.SearchOpts{ + Project: "decisionLog", + Limit: 5, + }) + if err != nil { + log.Fatal(err) + } + for _, h := range hits { + fmt.Printf("%.4f %s\n", h.Score, h.Entry.Title) + } } ``` -#### Classification priority +## What lore is -The heuristic ingester classifies each chunk using this priority order (first -match wins): +Lore exposes three pluggable interfaces: -1. YAML front matter with an explicit `kind:` field. -2. Path rules: `docs/adr/*.md` maps to decision+adr; `docs/runbooks/*.md` - maps to procedure+runbook; `CLAUDE.md`/`agents.md`/`skills.md` map to - reference+agent-config; and so on. See `heuristic.DefaultRules()`. -3. Heading keywords: `## What is` maps to explanation; `## Decision` / - `## Context` / `## Consequences` maps to decision; `## Procedure` / - `## Steps` maps to procedure; etc. -4. Fallback: kind=research (catch-all). +**Store** (`pkg/lore/store`) persists entries and edges. The reference +implementation in `pkg/lore/store/sqlite` uses `modernc.org/sqlite` (pure Go) +with FTS5 for BM25 full-text search. Replace it with Postgres, MySQL, or any +other engine by satisfying the interface. 
-#### Customizing rules +**Embedder** (`pkg/lore/embed`) turns text into dense vectors. The reference +implementation in `pkg/lore/embed/bge` runs an int8-quantized BGE-small-en-v1.5 +model in process via ONNX Runtime. Replace it with a remote embedding API or a +different local model without changing retrieval logic. -```go -import "github.com/mathomhaus/lore/pkg/lore/ingest/heuristic" - -rules := heuristic.DefaultRules() -rules = append(rules, heuristic.Rule{ - PathGlob: "docs/specs/*.md", - Kind: lore.KindDecision, - Tags: []string{"spec"}, -}) - -ing := heuristic.NewIngester( - heuristic.WithRules(rules), - heuristic.WithLogger(slog.Default()), -) -``` +**VectorStore** (`pkg/lore/vector`) stores and queries float32 vectors. The +reference implementation in `pkg/lore/vector/sqlitevec` stores vectors as BLOB +columns in the same SQLite file and does cosine similarity in Go (no CGO, no +extensions). Replace it with pgvector, Qdrant, or a native extension backend. -#### Walker behavior (v0.1.1) +On top of these three, lore composes a **Retriever** (`pkg/lore/retrieve`) that +fuses BM25 lexical and vector semantic rankings via Reciprocal Rank Fusion (RRF, +k=60). An **Ingester** (`pkg/lore/ingest`) optionally walks document trees and +classifies chunks into entries (Path B). -- Only `.md` and `.markdown` files are processed. -- `.git/`, `node_modules/`, `vendor/`, and any hidden directory (name - starting with `.`) are skipped unconditionally. -- Files larger than 10 MB are skipped with a FileError. -- Symlinks are not followed. -- `.gitignore` patterns are not honored (planned for v0.2). +Two write paths: -## Status: pre-v1.0 - -Lore is pre-v1.0. The exported surface is stable in shape but may change in -detail between minor versions. Pin to a version, read release notes before -upgrading, and expect occasional breakage on `main`. +- **Path A (agent inscribe):** an agent calls `Store.Inscribe` directly, then + `Embedder.Embed` + `VectorStore.Upsert`. 
No document parsing, no LLM cost. + High-frequency path suitable for session-level knowledge capture. +- **Path B (document ingestion):** `Ingester.Process` walks a directory, chunks + Markdown files, classifies each chunk (YAML front matter, then path rules, + then heading patterns, then fallback to `research`), and returns entries. The + caller writes them to a Store. Suitable for bulk ingestion of existing docs. ## What lore is not -- Not a CLI binary. Not an MCP server. Not an HTTP server. Not a UI. -- Not a hosted service. Not multi-tenant. Not an LLM client. +- Not a CLI binary. +- Not an MCP server. +- Not an HTTP server. +- Not a UI. +- Not a hosted service. +- Not multi-tenant (all isolation is caller-provided via the `Project` field). +- Not an LLM client. - Not a replacement for a full retrieval-augmented-generation framework. -Lore is the substrate. Everything above is a consumer's choice. +Lore is the substrate. Everything above is a consumer's responsibility. -## VectorStore +## Production deployment patterns -`pkg/lore/vector` defines the `VectorStore` interface. The reference -implementation in `pkg/lore/vector/sqlitevec` stores vectors as BLOB columns -inside your existing `*sql.DB` and runs cosine similarity entirely in Go -(no CGO, no extensions). +Because lore accepts caller-owned `*sql.DB` instances rather than connection +strings, it maps cleanly to multi-replica Kubernetes deployments. A typical +consumer service structure uses three stateless Deployments: -```go -import ( - "context" - "database/sql" +**Ingester worker** reads source documents from a queue (Pub/Sub, SQS, or a +database queue table), calls `Ingester.Process`, and writes the returned entries +to a shared `Store`. One or more replicas; only requires write access to the +database. - _ "modernc.org/sqlite" +**Query API service** receives search queries over HTTP or gRPC. 
It opens a +read-optimized `*sql.DB` connection pool (WAL mode allows concurrent readers), +constructs a `Store` + `Embedder` + `VectorStore`, wires them into a `Retriever`, +and returns ranked hits. Scales horizontally; each replica is stateless. - "github.com/mathomhaus/lore/pkg/lore/vector" - "github.com/mathomhaus/lore/pkg/lore/vector/sqlitevec" -) +**MCP gateway** exposes lore to AI agent harnesses via the Model Context +Protocol. It wraps the same `Store` + `Retriever` in tool handlers for +`inscribe`, `search`, and `list`. The library provides the knowledge primitives; +the MCP surface is the consumer's thin adaptation layer. -db, _ := sql.Open("sqlite", "lore.db") +All three Deployments can share a single underlying SQLite file (via a +single-writer proxy; a network volume is unsafe with the WAL mode used in the Quickstart, because SQLite WAL requires all processes to run on the same host) or migrate to Postgres by swapping the `Store` +and `VectorStore` implementations. No code changes are required in the consumer +services when backends are swapped. -// Bind to a 384-dimension space (BGE-small-en-v1.5). -store, err := sqlitevec.New(db, 384) -if err != nil { - // handle -} -defer store.Close(context.Background()) - -ctx := context.Background() +## Reference implementations -// Store a vector.
-vec := make([]float32, 384) // fill from your Embedder -_ = store.Upsert(ctx, entryID, vec) +| Package | Role | Backend | Scale guidance | +|---|---|---|---| +| `pkg/lore/store/sqlite` | Store | modernc.org/sqlite + FTS5 | Suitable for most single-service workloads | +| `pkg/lore/embed/bge` | Embedder | BGE-small-en-v1.5 int8 via ONNX Runtime | Requires ONNX Runtime dylib; pure CPU | +| `pkg/lore/vector/sqlitevec` | VectorStore | SQLite BLOB + Go cosine scan | Good to ~100K vectors of 384 dim (~100ms scan on modern hardware) | +| `pkg/lore/retrieve/hybrid` | Retriever | BM25 + vector via RRF | Inherits limits of Store + VectorStore | +| `pkg/lore/retrieve/bm25` | Retriever (lexical only) | Store.SearchText | No embedder required | +| `pkg/lore/ingest/heuristic` | Ingester | Rule-based heuristic classifier | Pure Go; no LLM cost | -// Search: returns top-5 hits in descending cosine similarity order. -hits, err := store.Search(ctx, queryVec, vector.SearchOpts{Limit: 5}) -for _, h := range hits { - fmt.Printf("entry %d score %.4f\n", h.ID, h.Score) -} -``` +All reference implementations build without CGO. All are pure Go except the BGE +embedder, which binds to the ONNX Runtime shared library via `purego` rather than CGO. -Kind and tag filters in `SearchOpts` are advisory. The sqlitevec reference -implementation does not apply them (a full-table-scan store has no efficient -join). Post-filter results via your `Store.Get` call or swap in a -VectorStore that understands your schema. +## Configuration -Scale: the reference impl performs a full linear scan. Acceptable for up to -roughly 100K vectors of 384 dimensions (benchmark: ~100ms on Apple M3 Pro). -Beyond that, implement `VectorStore` with pgvector, Qdrant, or a native -sqlite-vec extension backend. +**Logger.** Pass `WithLogger(*slog.Logger)` to any constructor. Defaults to +`slog.Default()`. -## Embedder +**Tracer.** Pass `WithTracer(trace.Tracer)` to enable OpenTelemetry spans.
+Defaults to the global tracer provider (`otel.GetTracerProvider()`). Wire an +exporter in your service bootstrap to send traces to your backend of choice. -The `Embedder` interface turns text into dense vectors for semantic retrieval: +Span names follow the pattern `lore.<package>.<operation>` (for example +`lore.store.inscribe`, `lore.vector.search`, `lore.retrieve.search`). -```go -import ( - "context" - "errors" +**BGE embedder.** Set `LORE_ONNXRUNTIME_LIB` to override the default shared +library search path. On macOS: `brew install onnxruntime` puts the dylib where +the probe expects it. When the library is absent, `bge.New` returns +`embed.ErrUnsupported` and callers should fall back to lexical-only retrieval. - "github.com/mathomhaus/lore/pkg/lore/embed" - "github.com/mathomhaus/lore/pkg/lore/embed/bge" -) +## Stability -func embedTexts(ctx context.Context, texts []string) ([][]float32, error) { - emb, err := bge.New() - if err != nil { - if errors.Is(err, embed.ErrUnsupported) { - // Platform has no ONNX Runtime; fall through to lexical-only retrieval. - return nil, err - } - return nil, err - } - defer emb.Close(ctx) - - vecs, err := emb.Embed(ctx, texts) - if err != nil { - return nil, err - } - // Each vecs[i] is a float32 slice of length emb.Dimensions() (384 for BGE-small). - return vecs, nil -} -``` - -`bge.New` options: - -- `bge.WithLogger(*slog.Logger)` for a structured logger covering init and runtime warnings. -- `bge.WithTracer(trace.Tracer)` for an OTel tracer; spans named `lore.embed.encode`. - -The BGE reference implementation requires the ONNX Runtime shared library on the -host (e.g. `brew install onnxruntime` on macOS). Set `LORE_ONNXRUNTIME_LIB` to -override the default search path. When the library is absent, `bge.New` returns -`embed.ErrUnsupported` and callers should fall back to lexical retrieval. - -Implement the `embed.Embedder` interface to swap in a remote embedding API or a -different model without changing any retrieval code.
- -## Retriever: hybrid BM25 + vector search - -`pkg/lore/retrieve` defines the `Retriever` interface. The reference -implementation in `pkg/lore/retrieve/hybrid` fuses BM25 lexical search -(via `Store.SearchText`) and vector nearest-neighbour search (via -`Embedder.Embed` + `VectorStore.Search`) using Reciprocal Rank Fusion -(RRF, k=60). This approach avoids tuning score scales across rankers: -only ordinal rank positions matter. - -```go -import ( - "context" - "database/sql" - "fmt" - "log" - - _ "modernc.org/sqlite" - - "github.com/mathomhaus/lore/pkg/lore" - "github.com/mathomhaus/lore/pkg/lore/embed/bge" - "github.com/mathomhaus/lore/pkg/lore/retrieve/hybrid" - "github.com/mathomhaus/lore/pkg/lore/store/sqlite" - "github.com/mathomhaus/lore/pkg/lore/vector/sqlitevec" -) - -func search(db *sql.DB, query string) ([]lore.SearchHit, error) { - // Store handles BM25. - st, err := sqlite.New(db) - if err != nil { - return nil, err - } - defer st.Close(context.Background()) - - // Embedder handles query vectorisation. - emb, err := bge.New() - if err != nil { - // ErrUnsupported on platforms without ONNX Runtime: use BM25-only. - log.Printf("warn: embedder unavailable, using BM25 only: %v", err) - emb = nil - } - if emb != nil { - defer emb.Close(context.Background()) - } - - // VectorStore handles nearest-neighbour lookup. - vs, err := sqlitevec.New(db, 384) - if err != nil { - return nil, err - } - defer vs.Close(context.Background()) - - r := hybrid.New(st, emb, vs, - hybrid.WithRRFK(60), - hybrid.WithCandidatePoolSize(50), - ) - - return r.Search(context.Background(), query, lore.SearchOpts{Limit: 10}) -} -``` - -The hybrid retriever tolerates partial failures gracefully: - -- If `Embedder.Embed` returns an error (e.g. `embed.ErrUnsupported`), the vector - arm is skipped and BM25 results are returned alone. -- If `VectorStore.Search` returns an error, the BM25 arm continues independently. -- Only when both arms fail does `Search` return an error. 
- -When the embedder is nil, pass a no-op stub or use `bm25.New(store)` directly: - -```go -import "github.com/mathomhaus/lore/pkg/lore/retrieve/bm25" - -r := bm25.New(st) -hits, err := r.Search(ctx, "deployment rollout", lore.SearchOpts{Limit: 10}) -``` - -### RRF algorithm - -`pkg/lore/retrieve/rrf` exposes `Fuse(rankings [][]int64, k int) []ScoredID` -for callers that want to run their own ranked lists through RRF without the -hybrid retriever: - -```go -import "github.com/mathomhaus/lore/pkg/lore/retrieve/rrf" - -bm25IDs := []int64{10, 20, 30} -vecIDs := []int64{20, 10, 40} - -fused := rrf.Fuse([][]int64{bm25IDs, vecIDs}, rrf.DefaultK) -for _, s := range fused { - fmt.Printf("id=%d score=%.4f\n", s.ID, s.Score) -} -``` - -Output is sorted by descending score; ties break by ascending ID for -determinism. +Lore is pre-v1.0. The exported surface is stable in shape but may change in +detail between minor versions. Pin to a version, read release notes before +upgrading, and expect occasional breaking changes on minor version bumps. ## Attribution Lore extracts and generalizes the storage, embedding, and retrieval primitives originally built inside [`mathomhaus/guild`](https://github.com/mathomhaus/guild). -Guild remains the opinionated agent-coordination platform that adds -quest, oath, and brief on top of these primitives. - +Guild remains the opinionated agent-coordination platform that adds quest, oath, +and brief on top of these primitives. ## License From 4eb6e0fa798ccc73ff8a3fab587ff75400ef6215 Mon Sep 17 00:00:00 2001 From: Kunal Lanjewar <5488221+kunallanjewar@users.noreply.github.com> Date: Tue, 28 Apr 2026 08:55:08 -0700 Subject: [PATCH 3/5] docs: add CHANGELOG, ARCHITECTURE, and INTERFACES reference docs CHANGELOG follows Keep-a-Changelog format with v0.1.1 entry describing all shipped packages and components. doc/ARCHITECTURE.md covers the three-interface design, Path A vs Path B, reference impl scale limits, RRF fusion rationale, and caller-owned deps. 
doc/INTERFACES.md is a condensed method-level reference for Store, Embedder, VectorStore, Retriever, Ingester, and the rrf.Fuse utility. --- CHANGELOG.md | 90 +++++++++++++++++++++ doc/ARCHITECTURE.md | 136 +++++++++++++++++++++++++++++++ doc/INTERFACES.md | 193 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 419 insertions(+) create mode 100644 CHANGELOG.md create mode 100644 doc/ARCHITECTURE.md create mode 100644 doc/INTERFACES.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..ab1907d --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,90 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). +This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [0.1.1] - 2026-04-27 + +### Added + +- **`pkg/lore`** - Core types: `Entry`, `Edge`, `SearchHit`, `ListOpts`, + `SearchOpts`. Eight canonical `Kind` values: `decision`, `principle`, + `procedure`, `reference`, `explanation`, `observation`, `research`, `idea`. + Sentinel errors: `ErrNotFound`, `ErrDuplicate`, `ErrInvalidKind`, + `ErrInvalidArgument`, `ErrConflict`, `ErrUnsupported`, `ErrClosed`. + +- **`pkg/lore/store`** - `Store` interface: `Inscribe`, `Update`, `Get`, + `DeleteBySource`, `ListByTag`, `ListByKind`, `SearchText`, `AddEdge`, + `ListEdges`, `Close`. Full error contract and lifecycle contract documented. + +- **`pkg/lore/store/sqlite`** - SQLite reference implementation of `Store`. + Uses `modernc.org/sqlite` (pure Go, no CGO). FTS5 full-text index for BM25 + retrieval. Schema migrations via an embedded migration table. OTel spans on + every I/O method. `slog` for warnings and errors. + +- **`pkg/lore/embed`** - `Embedder` interface: `Embed`, `Dimensions`, `Close`. + Sentinel errors: `ErrInvalidArgument`, `ErrUnsupported`, `ErrClosed`. 
+ +- **`pkg/lore/embed/bge`** - BGE-small-en-v1.5 int8 reference implementation + of `Embedder`. Runs in-process via `github.com/shota3506/onnxruntime-purego` + (no CGO). Model and tokenizer assets embedded in the binary at build time. + Returns `embed.ErrUnsupported` on platforms where ONNX Runtime is absent. + OTel spans named `lore.embed.encode`. Requires ONNX Runtime shared library + (`brew install onnxruntime` on macOS; `apt install libonnxruntime` on + Debian-derived Linux). + +- **`pkg/lore/vector`** - `VectorStore` interface: `Upsert`, `Delete`, + `Search`, `Dimensions`, `Close`. `Hit` result type. `SearchOpts` with + advisory `Kinds`/`Tags` filters. Sentinel errors: `ErrNotFound`, + `ErrInvalidArgument`, `ErrClosed`. + +- **`pkg/lore/vector/sqlitevec`** - SQLite-backed reference implementation + of `VectorStore`. Vectors stored as little-endian float32 BLOBs. Cosine + similarity computed in Go via a full table scan. Suitable for up to ~100K + vectors of 384 dimensions. No CGO, no sqlite-vec extension required. + OTel spans named `lore.vector.upsert`, `lore.vector.delete`, + `lore.vector.search`. + +- **`pkg/lore/retrieve`** - `Retriever` interface: `Search`. Shared result + types re-use `lore.SearchHit`. + +- **`pkg/lore/retrieve/bm25`** - `Ranker`: lexical-only `Retriever` backed + by `Store.SearchText`. OTel span `lore.retrieve.bm25`. + +- **`pkg/lore/retrieve/vector`** - `Searcher`: semantic-only `Retriever` + backed by `Embedder` + `VectorStore`. OTel span `lore.retrieve.vector`. + +- **`pkg/lore/retrieve/rrf`** - `Fuse`: Reciprocal Rank Fusion over arbitrary + ranked lists. `DefaultK = 60`. Deterministic tie-breaking by ascending ID. + +- **`pkg/lore/retrieve/hybrid`** - `Retriever` that fuses BM25 and vector + rankings via RRF. Degrades gracefully: if one arm fails, the other continues. + OTel spans: `lore.retrieve.search`, `lore.retrieve.bm25`, + `lore.retrieve.vector`, `lore.retrieve.fuse`. 
+ +- **`pkg/lore/ingest`** - `Ingester` interface: `Process`. `Result`, `FileError` + types. `WalkerConfig` for tuning the filesystem walk. Pure functional transform: + returns entries; caller owns writes. + +- **`pkg/lore/ingest/heuristic`** - Heuristic `Ingester` implementation. + Four-level classification priority: YAML front matter, path rules + (`DefaultRules`), heading keyword patterns, fallback `research`. Configurable + via `WithRules`, `WithLogger`, `WithTracer`, `WithMaxFileSize`. OTel spans + `lore.ingest.process`, `lore.ingest.classify`. + +- `doc/ARCHITECTURE.md` - Architecture overview. +- `doc/INTERFACES.md` - Interface reference. +- `CHANGELOG.md` - This file. + +### Notes + +- Tag `v0.1.1` is the initial public release. Previous commits established the + package structure iteratively; `v0.1.1` is the first tagged, stable release. +- All reference implementations are pure Go or use `purego` bindings; no CGO + is required to build the module. +- Pre-v1.0: the exported surface is stable in shape but may change in detail + between minor versions. + +[0.1.1]: https://github.com/mathomhaus/lore/releases/tag/v0.1.1 diff --git a/doc/ARCHITECTURE.md b/doc/ARCHITECTURE.md new file mode 100644 index 0000000..df7c8a8 --- /dev/null +++ b/doc/ARCHITECTURE.md @@ -0,0 +1,136 @@ +# lore: Architecture + +This document describes the design of the lore library for contributors and +consumers who want to understand how the pieces fit together. + +## Goal + +Lore is a structured knowledge primitive for AI agents. It solves one problem: +store classified knowledge entries, link them with typed edges, and retrieve +them quickly using a combination of lexical and semantic ranking. The library +does not provide a CLI, an HTTP server, an MCP server, or a UI. It is +deliberately library-only so it can be composed into any of those without +duplication or coupling. + +## Three pluggable interfaces + +The library is built around exactly three swap points. 
Each swap point is a Go +interface; the reference implementation for each is in a sub-package. + +``` +Store pkg/lore/store persistence + BM25 full-text +Embedder pkg/lore/embed text-to-vector +VectorStore pkg/lore/vector vector nearest-neighbor +``` + +Why three and not more or fewer? + +Fewer would mean coupling persistence to retrieval or retrieval to embedding, +forcing consumers to take all three or none. More would fragment the interface +without adding capability: for example, splitting `Store` into a separate +`EdgeStore` and `EntryStore` would add complexity without enabling any +deployment pattern that the single interface cannot already serve. + +Three also corresponds directly to the three backend choices a production +deployment typically makes: relational store (SQLite, Postgres), embedding +model (local, remote API), and vector index (linear scan, pgvector, Qdrant). +Each swap is independent of the others. + +## Composition layer + +On top of the three interfaces, the library provides two composing layers: + +**Retriever** (`pkg/lore/retrieve`) composes `Store`, `Embedder`, and +`VectorStore` into a unified search surface. The `hybrid` implementation runs +the BM25 arm (`Store.SearchText`) and the vector arm (`Embedder.Embed` + +`VectorStore.Search`) concurrently, then fuses the ranked lists via Reciprocal +Rank Fusion (RRF). Callers that only want one arm use `bm25.New(store)` or +`vector.New(store, embedder, vstore)` directly; both satisfy `Retriever`. + +**Ingester** (`pkg/lore/ingest`) is a pure functional transform that walks a +directory tree, chunks recognized files (Markdown in v0.1.1), classifies each +chunk into a lore entry, and returns the entries to the caller. The caller +writes them to a `Store`. The ingester does not hold state between calls. + +## Path A vs Path B + +**Path A (agent inscribe):** an AI agent session calls `Store.Inscribe` +directly with a fully-formed entry it has already classified. 
It then calls +`Embedder.Embed` on the entry text and `VectorStore.Upsert` on the resulting +vector. This is the high-frequency path: no file parsing, no LLM inference cost, +no heuristics. An agent that produces 100 inscriptions per session uses Path A +for all of them. + +**Path B (document ingestion):** a one-time or periodic pipeline reads +existing document trees (runbooks, ADRs, wikis) and imports them in bulk. +`Ingester.Process` walks the tree, the heuristic classifier assigns kinds and +tags, and the caller writes the results to the same `Store`. Path B is optional: +a service that only needs agent-produced knowledge never instantiates an +`Ingester`. + +The two paths share the same `Store` and `VectorStore`; entries produced by +either path are indistinguishable at retrieval time. + +## Reference implementations and their scale limits + +**`store/sqlite`** uses `modernc.org/sqlite` (pure Go) and FTS5 for full-text +search. It is single-writer by design (SQLite WAL allows concurrent readers). +Suitable for single-service workloads without a shared database requirement. +Swap for a Postgres implementation when multiple writer replicas are needed. + +**`embed/bge`** runs the BAAI/bge-small-en-v1.5 int8 model in process using +`purego` bindings to the ONNX Runtime shared library. No CGO, no network calls. +Throughput is bounded by CPU; a single core embeds roughly 200-500 short texts +per second on modern hardware depending on batch size. Swap for a remote +embedding API (OpenAI, Cohere, Vertex AI) by implementing `embed.Embedder`. + +**`vector/sqlitevec`** stores float32 vectors as BLOB columns and computes +cosine similarity over a full table scan in Go. Practical limit is roughly +100K vectors of 384 dimensions (~100ms per query on modern laptop hardware). +Above that threshold, swap for a purpose-built ANN index: pgvector, +Qdrant, Weaviate, or a true sqlite-vec extension implementation. 
+ +## Hybrid retrieval via RRF + +The hybrid retriever fuses two ranked lists using Reciprocal Rank Fusion: + +``` +score(d) = sum over rankers r: 1 / (k + rank_r(d)) +``` + +where k=60 is the standard smoothing constant (from Cormack, Clarke, Buettcher +2009). RRF requires only the rank position of each document, not the score +magnitude. This makes it robust to score scale differences between BM25 (which +returns large positive floats) and cosine similarity (which returns values in +[-1, 1]). No tuning is needed when switching between BM25 implementations or +embedding models. + +The retriever fetches a candidate pool (default: top 50) from each arm, fuses +the two ranked lists via RRF, truncates to the requested limit, and then +hydrates the full entry from the `Store` for each result. Partial arm failures +are tolerated: if the vector arm fails (for example `ErrUnsupported` on a +platform without ONNX Runtime), the BM25 arm continues independently and vice +versa. + +## Caller-owned dependencies + +Every constructor in the library accepts already-initialized resources. `sqlite.New` +takes a `*sql.DB`. `bge.New` accepts optional `*slog.Logger` and +`trace.Tracer`. `sqlitevec.New` takes a `*sql.DB` and the vector dimension. + +The library never opens database connections, reads environment variables for +connection strings, or manages connection pool lifecycle. This design has two +consequences: + +1. A service can pass the same `*sql.DB` to `sqlite.New` and `sqlitevec.New`, + sharing one connection pool and one SQLite file between the store and the + vector index. Schema migrations for both live in the same database. + +2. Multiple replicas of the same service can each open their own `*sql.DB` + against a shared database (for example Postgres via `database/sql` and a + Postgres-backed `Store` implementation). The library has no global state + and is safe to construct multiple times in the same process. 
+ +In a Kubernetes deployment this means the ingester worker, query API service, +and MCP gateway each construct their own `Store` and `Retriever` from the same +connection string, without sharing in-process objects or requiring a singleton. diff --git a/doc/INTERFACES.md b/doc/INTERFACES.md new file mode 100644 index 0000000..96f18f8 --- /dev/null +++ b/doc/INTERFACES.md @@ -0,0 +1,193 @@ +# lore: Interface Reference + +This document is a condensed reference for every exported interface and its +key methods. For full godoc, run `go doc github.com/mathomhaus/lore/pkg/lore/...`. + +## Store (`pkg/lore/store`) + +`Store` persists lore entries and edges. All methods accept `ctx context.Context` +as their first argument and propagate cancellation to the underlying driver. +Close is idempotent; after Close all methods return `lore.ErrClosed`. + +```go +type Store interface { + Inscribe(ctx context.Context, e lore.Entry) (id int64, err error) + Update(ctx context.Context, e lore.Entry) error + Get(ctx context.Context, id int64) (lore.Entry, error) + DeleteBySource(ctx context.Context, source string) (deleted int, err error) + ListByTag(ctx context.Context, tag string, opts lore.ListOpts) ([]lore.Entry, error) + ListByKind(ctx context.Context, kind lore.Kind, opts lore.ListOpts) ([]lore.Entry, error) + SearchText(ctx context.Context, query string, opts lore.SearchOpts) ([]lore.SearchHit, error) + AddEdge(ctx context.Context, edge lore.Edge) error + ListEdges(ctx context.Context, fromID int64) ([]lore.Edge, error) + Close(ctx context.Context) error +} +``` + +| Method | Description | +|---|---| +| `Inscribe` | Persist a new entry; return storage-assigned ID. | +| `Update` | Replace all mutable fields of an existing entry (full replacement, not patch). | +| `Get` | Fetch a single entry by ID. Returns `ErrNotFound` when absent. | +| `DeleteBySource` | Remove all entries with the given Source; non-matching source returns 0, nil. 
| +| `ListByTag` | Entries carrying the exact tag, ordered newest-first. | +| `ListByKind` | Entries of the given kind, ordered newest-first. | +| `SearchText` | BM25 full-text search over Title and Body. Higher Score is better. | +| `AddEdge` | Persist a directed edge. Re-adding the same triple is a no-op. | +| `ListEdges` | All edges from a given entry ID, ordered by created_at ascending. | +| `Close` | Release resources. Idempotent. | + +Sentinel errors (from `pkg/lore`): `ErrNotFound`, `ErrDuplicate`, +`ErrInvalidKind`, `ErrInvalidArgument`, `ErrConflict`, `ErrUnsupported`, +`ErrClosed`. + +Reference implementation: `pkg/lore/store/sqlite`. Constructor: `sqlite.New(db *sql.DB, opts ...Option) (Store, error)`. + +--- + +## Embedder (`pkg/lore/embed`) + +`Embedder` turns text into dense float32 vectors. Batch-oriented: callers +wrap a single string with `[]string{s}` when only one is needed. Safe for +concurrent use. + +```go +type Embedder interface { + Embed(ctx context.Context, texts []string) ([][]float32, error) + Dimensions() int + Close(ctx context.Context) error +} +``` + +| Method | Description | +|---|---| +| `Embed` | Produce one vector per input string. Returns `ErrInvalidArgument` for empty slice or empty element. | +| `Dimensions` | Vector length emitted by Embed. Stable for the lifetime of the Embedder. | +| `Close` | Release loaded model and tokenizer. Idempotent. | + +Sentinel errors: `embed.ErrInvalidArgument`, `embed.ErrUnsupported`, +`embed.ErrClosed`. + +`ErrUnsupported` indicates the platform has no working ONNX Runtime. Callers +should fall back to lexical-only retrieval when they receive this from `New` +or `Embed`. + +Reference implementation: `pkg/lore/embed/bge`. Constructor: +`bge.New(opts ...Option) (embed.Embedder, error)`. Options: `bge.WithLogger`, +`bge.WithTracer`. + +--- + +## VectorStore (`pkg/lore/vector`) + +`VectorStore` persists float32 vectors keyed by entry ID and answers +nearest-neighbor queries. 
Dimension-bound at construction. + +```go +type VectorStore interface { + Upsert(ctx context.Context, id int64, vector []float32) error + Delete(ctx context.Context, id int64) error + Search(ctx context.Context, query []float32, opts SearchOpts) ([]Hit, error) + Dimensions() int + Close(ctx context.Context) error +} +``` + +| Method | Description | +|---|---| +| `Upsert` | Store or replace the vector for entry ID. Vector length must equal Dimensions(). | +| `Delete` | Remove the vector for entry ID. Returns `ErrNotFound` when absent. | +| `Search` | Return top-Limit vectors by cosine similarity. Query length must equal Dimensions(). | +| `Dimensions` | Fixed vector length for this store. | +| `Close` | Release resources beyond the caller-owned DB. Idempotent. | + +`SearchOpts.Kinds` and `SearchOpts.Tags` are advisory hints. Reference +implementations do not apply them; the Retriever layer post-filters via +`Store.Get`. + +Sentinel errors: `vector.ErrNotFound`, `vector.ErrInvalidArgument`, +`vector.ErrClosed`. + +Reference implementation: `pkg/lore/vector/sqlitevec`. Constructor: +`sqlitevec.New(db *sql.DB, dimensions int, opts ...Option) (vector.VectorStore, error)`. +Options: `sqlitevec.WithLogger`, `sqlitevec.WithTracer`. + +--- + +## Retriever (`pkg/lore/retrieve`) + +`Retriever` runs a search and returns ranked results. Implementations compose +`Store`, `Embedder`, and `VectorStore`; callers do not need to interact with +the underlying interfaces directly for search. + +```go +type Retriever interface { + Search(ctx context.Context, query string, opts lore.SearchOpts) ([]lore.SearchHit, error) +} +``` + +| Method | Description | +|---|---| +| `Search` | Execute retrieval for the given query; return results ranked by descending score. Returns `ErrInvalidArgument` for empty query or negative limit. | + +Implementations: + +- `hybrid.New(store, embedder, vstore, opts...)` - Fuses BM25 + vector via RRF. 
+ Options: `hybrid.WithRRFK(k int)`, `hybrid.WithCandidatePoolSize(n int)`, + `hybrid.WithLogger`, `hybrid.WithTracer`. +- `bm25.New(store, opts...)` - Lexical-only. Options: `bm25.WithLogger`, + `bm25.WithTracer`. +- `vector.New(store, embedder, vstore, opts...)` - Semantic-only. Options: + `vector.WithLogger`, `vector.WithTracer`. + +The hybrid retriever degrades gracefully: if one arm fails, results from the +surviving arm are returned. Both arms failing returns an error. + +--- + +## Ingester (`pkg/lore/ingest`) + +`Ingester` walks a document tree and produces classified lore entries. Pure +functional transform: `Process` does not write to any store. + +```go +type Ingester interface { + Process(ctx context.Context, root string) (Result, error) +} +``` + +| Method | Description | +|---|---| +| `Process` | Walk root, chunk recognized files, classify chunks, return entries. Non-nil error signals a fatal failure (root does not exist, etc.). Per-file failures are collected in Result.Errors. | + +`Result` carries `Entries []lore.Entry` and `Errors []FileError`. Callers +decide how to handle `Result.Errors`: strict mode may discard entries; lenient +mode logs them and continues. + +Reference implementation: `pkg/lore/ingest/heuristic`. Constructor: +`heuristic.NewIngester(opts ...Option) ingest.Ingester`. Options: +`heuristic.WithRules([]Rule)`, `heuristic.WithLogger`, `heuristic.WithTracer`, +`heuristic.WithMaxFileSize(n int64)`. + +Classification priority (first match wins): + +1. YAML front matter `kind:` field (validated against canonical kinds). +2. Path rules: `filepath.Match` against repo-relative path and base name. +3. Heading keywords: `Decision`, `Procedure`, `Explanation`, `Principle`, etc. +4. Fallback: `KindResearch`. 
+ +--- + +## RRF utility (`pkg/lore/retrieve/rrf`) + +```go +func Fuse(rankings [][]int64, k int) []ScoredID +``` + +Combines multiple ranked lists (each a slice of entry IDs, best first) into a +single fused list using Reciprocal Rank Fusion. Pass `rrf.DefaultK` (60) when +in doubt. Returns `[]ScoredID` sorted by descending score; ties break by +ascending ID for determinism. + +Useful for callers that want to run their own ranked lists through RRF without +the hybrid retriever. From a1049c0c55bca59fd2f5d178460f36283dbd48e9 Mon Sep 17 00:00:00 2001 From: Kunal Lanjewar <5488221+kunallanjewar@users.noreply.github.com> Date: Tue, 28 Apr 2026 08:57:52 -0700 Subject: [PATCH 4/5] test: add runnable Example* functions for all public packages Each example is a compile-checked Example* function in a _test.go file. Covers: lore (Kind, AllKinds, Entry, SearchOpts), store (inscribe+get, searchText, addEdge), vector (upsert+search, delete), retrieve (bm25 search, rrf.Fuse), ingest (heuristic Process). All examples pass go test -race. --- pkg/lore/example_test.go | 74 ++++++++++++++ pkg/lore/ingest/example_test.go | 50 ++++++++++ pkg/lore/retrieve/example_test.go | 78 +++++++++++++++ pkg/lore/store/example_test.go | 159 ++++++++++++++++++++++++++++++ pkg/lore/vector/example_test.go | 88 +++++++++++++++++ 5 files changed, 449 insertions(+) create mode 100644 pkg/lore/example_test.go create mode 100644 pkg/lore/ingest/example_test.go create mode 100644 pkg/lore/retrieve/example_test.go create mode 100644 pkg/lore/store/example_test.go create mode 100644 pkg/lore/vector/example_test.go diff --git a/pkg/lore/example_test.go b/pkg/lore/example_test.go new file mode 100644 index 0000000..6fc119a --- /dev/null +++ b/pkg/lore/example_test.go @@ -0,0 +1,74 @@ +package lore_test + +import ( + "fmt" + + "github.com/mathomhaus/lore/pkg/lore" +) + +// ExampleKind_Validate demonstrates how to validate a Kind value before +// writing an entry to a Store. 
Unknown kinds are rejected at write time +// by all standard implementations. +func ExampleKind_Validate() { + good := lore.KindDecision + if err := good.Validate(); err != nil { + fmt.Println("unexpected error:", err) + } else { + fmt.Println("valid:", good) + } + + bad := lore.Kind("unknown") + if err := bad.Validate(); err != nil { + fmt.Println("rejected unknown kind") + } + // Output: + // valid: decision + // rejected unknown kind +} + +// ExampleAllKinds prints the canonical kind taxonomy in display order. +func ExampleAllKinds() { + for _, k := range lore.AllKinds() { + fmt.Println(k) + } + // Output: + // decision + // principle + // procedure + // reference + // explanation + // observation + // research + // idea +} + +// ExampleKind_String shows that Kind satisfies fmt.Stringer and can be used +// directly in format strings. +func ExampleKind_String() { + k := lore.KindProcedure + fmt.Println(k.String()) + // Output: + // procedure +} + +// ExampleEntry_zero shows the zero value of Entry: all fields empty or nil, +// ready for population before passing to Store.Inscribe. +func ExampleEntry_zero() { + var e lore.Entry + fmt.Printf("id=%d kind=%q title=%q\n", e.ID, e.Kind, e.Title) + // Output: + // id=0 kind="" title="" +} + +// ExampleSearchOpts shows how to construct a SearchOpts that restricts +// results to a project and a pair of kinds. 
+func ExampleSearchOpts() { + opts := lore.SearchOpts{ + Project: "runbookCorpus", + Kinds: []lore.Kind{lore.KindProcedure, lore.KindReference}, + Limit: 10, + } + fmt.Printf("project=%s kinds=%d limit=%d\n", opts.Project, len(opts.Kinds), opts.Limit) + // Output: + // project=runbookCorpus kinds=2 limit=10 +} diff --git a/pkg/lore/ingest/example_test.go b/pkg/lore/ingest/example_test.go new file mode 100644 index 0000000..c644653 --- /dev/null +++ b/pkg/lore/ingest/example_test.go @@ -0,0 +1,50 @@ +package ingest_test + +import ( + "context" + "fmt" + "os" + "path/filepath" + + "github.com/mathomhaus/lore/pkg/lore/ingest/heuristic" +) + +// ExampleIngester_process demonstrates running the heuristic ingester against +// a small directory of Markdown files and inspecting the classified entries. +func ExampleIngester_process() { + // Build a minimal doc tree in a temp directory. + root, err := os.MkdirTemp("", "lore-ingest-example-*") + if err != nil { + fmt.Println("mktemp:", err) + return + } + defer os.RemoveAll(root) + + adrDir := filepath.Join(root, "docs", "adr") + if err := os.MkdirAll(adrDir, 0o755); err != nil { + fmt.Println("mkdir:", err) + return + } + + files := map[string]string{ + filepath.Join(adrDir, "001-use-sqlite.md"): "# ADR-001: Use SQLite\n\nWe chose SQLite for zero-dependency deployment.", + filepath.Join(adrDir, "002-use-wal.md"): "# ADR-002: Enable WAL mode\n\nWAL allows concurrent readers with a single writer.", + } + for path, content := range files { + if err := os.WriteFile(path, []byte(content), 0o644); err != nil { + fmt.Println("write:", err) + return + } + } + + ing := heuristic.NewIngester() + result, err := ing.Process(context.Background(), root) + if err != nil { + fmt.Println("process:", err) + return + } + + fmt.Printf("entries=%d errors=%d\n", len(result.Entries), len(result.Errors)) + // Output: + // entries=2 errors=0 +} diff --git a/pkg/lore/retrieve/example_test.go b/pkg/lore/retrieve/example_test.go new file mode 100644 
index 0000000..5ad51a7 --- /dev/null +++ b/pkg/lore/retrieve/example_test.go @@ -0,0 +1,78 @@ +package retrieve_test + +import ( + "context" + "database/sql" + "fmt" + + _ "modernc.org/sqlite" + + "github.com/mathomhaus/lore/pkg/lore" + "github.com/mathomhaus/lore/pkg/lore/retrieve/bm25" + "github.com/mathomhaus/lore/pkg/lore/retrieve/rrf" + "github.com/mathomhaus/lore/pkg/lore/store/sqlite" +) + +// Example_bm25Search demonstrates lexical-only search using the BM25 ranker. +// This path requires no embedder and works on every platform. +func Example_bm25Search() { + ctx := context.Background() + db, err := sql.Open("sqlite", ":memory:?_pragma=foreign_keys(ON)") + if err != nil { + fmt.Println("open db:", err) + return + } + defer db.Close() + + st, err := sqlite.New(db) + if err != nil { + fmt.Println("sqlite.New:", err) + return + } + defer st.Close(ctx) + + entries := []lore.Entry{ + {Kind: lore.KindProcedure, Title: "Rollout runbook", Body: "Step-by-step deployment procedure for production services."}, + {Kind: lore.KindDecision, Title: "Deployment strategy decision", Body: "Rationale for choosing canary deployment over blue-green."}, + {Kind: lore.KindReference, Title: "Kubernetes resource limits", Body: "CPU and memory limit recommendations per service tier."}, + } + for _, e := range entries { + if _, err := st.Inscribe(ctx, e); err != nil { + fmt.Println("inscribe:", err) + return + } + } + + r := bm25.New(st) + hits, err := r.Search(ctx, "deployment", lore.SearchOpts{Limit: 5}) + if err != nil { + fmt.Println("search:", err) + return + } + + fmt.Printf("found %d hit(s) for 'deployment'\n", len(hits)) + // Output: + // found 2 hit(s) for 'deployment' +} + +// ExampleFuse demonstrates the rrf.Fuse function directly. Two separate rankers +// produce independent ranked lists; Fuse combines them into a single order. +// Documents appearing in both lists accumulate higher scores than those +// appearing in only one. 
+func ExampleFuse() { + // Ranker A (BM25): top result is entry 100, then 200. + rankA := []int64{100, 200} + // Ranker B (vector): top result is entry 200, then 300. + rankB := []int64{200, 300} + + fused := rrf.Fuse([][]int64{rankA, rankB}, rrf.DefaultK) + + // Entry 200 appears in both lists so it accumulates the highest fused score. + for _, s := range fused { + fmt.Printf("id=%d\n", s.ID) + } + // Output: + // id=200 + // id=100 + // id=300 +} diff --git a/pkg/lore/store/example_test.go b/pkg/lore/store/example_test.go new file mode 100644 index 0000000..743b3f6 --- /dev/null +++ b/pkg/lore/store/example_test.go @@ -0,0 +1,159 @@ +package store_test + +import ( + "context" + "database/sql" + "fmt" + + _ "modernc.org/sqlite" + + "github.com/mathomhaus/lore/pkg/lore" + "github.com/mathomhaus/lore/pkg/lore/store/sqlite" +) + +// openMemoryDB opens an in-memory SQLite database suitable for examples. +// The caller must close both the Store and the *sql.DB when done. +func openMemoryDB() (*sql.DB, error) { + return sql.Open("sqlite", ":memory:?_pragma=foreign_keys(ON)") +} + +// ExampleNew_inscribeAndGet demonstrates the primary Store write-then-read +// cycle: inscribe a decision entry and retrieve it by ID. 
+func ExampleNew_inscribeAndGet() { + ctx := context.Background() + db, err := openMemoryDB() + if err != nil { + fmt.Println("open db:", err) + return + } + defer db.Close() + + st, err := sqlite.New(db) + if err != nil { + fmt.Println("sqlite.New:", err) + return + } + defer st.Close(ctx) + + id, err := st.Inscribe(ctx, lore.Entry{ + Project: "decisionLog", + Kind: lore.KindDecision, + Title: "Adopt WAL mode for SQLite", + Body: "WAL mode allows concurrent readers while a single writer commits.", + Tags: []string{"adr", "storage"}, + }) + if err != nil { + fmt.Println("inscribe:", err) + return + } + + entry, err := st.Get(ctx, id) + if err != nil { + fmt.Println("get:", err) + return + } + + fmt.Printf("kind=%s title=%q tags=%v\n", entry.Kind, entry.Title, entry.Tags) + // Output: + // kind=decision title="Adopt WAL mode for SQLite" tags=[adr storage] +} + +// ExampleNew_searchText demonstrates full-text search over inscribed entries. +func ExampleNew_searchText() { + ctx := context.Background() + db, err := openMemoryDB() + if err != nil { + fmt.Println("open db:", err) + return + } + defer db.Close() + + st, err := sqlite.New(db) + if err != nil { + fmt.Println("sqlite.New:", err) + return + } + defer st.Close(ctx) + + entries := []lore.Entry{ + { + Project: "runbookCorpus", + Kind: lore.KindProcedure, + Title: "Database failover runbook", + Body: "Steps to promote a replica when the primary is unavailable.", + Tags: []string{"runbook", "database"}, + }, + { + Project: "runbookCorpus", + Kind: lore.KindProcedure, + Title: "Cache flush runbook", + Body: "Steps to safely flush and reload the Redis cache layer.", + Tags: []string{"runbook", "cache"}, + }, + } + for _, e := range entries { + if _, err := st.Inscribe(ctx, e); err != nil { + fmt.Println("inscribe:", err) + return + } + } + + hits, err := st.SearchText(ctx, "runbook", lore.SearchOpts{Limit: 5}) + if err != nil { + fmt.Println("search:", err) + return + } + + fmt.Printf("found %d hit(s)\n", len(hits)) 
+ // Output: + // found 2 hit(s) +} + +// ExampleNew_addEdge demonstrates persisting a typed edge between two entries +// and then listing it back. +func ExampleNew_addEdge() { + ctx := context.Background() + db, err := openMemoryDB() + if err != nil { + fmt.Println("open db:", err) + return + } + defer db.Close() + + st, err := sqlite.New(db) + if err != nil { + fmt.Println("sqlite.New:", err) + return + } + defer st.Close(ctx) + + fromID, _ := st.Inscribe(ctx, lore.Entry{ + Kind: lore.KindDecision, + Title: "Use mTLS for service-to-service auth", + Body: "Mutual TLS prevents lateral movement inside the cluster.", + }) + toID, _ := st.Inscribe(ctx, lore.Entry{ + Kind: lore.KindProcedure, + Title: "Rotate mTLS certificates", + Body: "Steps to generate and distribute new service certificates.", + }) + + if err := st.AddEdge(ctx, lore.Edge{ + FromID: fromID, + ToID: toID, + Relation: "informs", + }); err != nil { + fmt.Println("add edge:", err) + return + } + + edges, err := st.ListEdges(ctx, fromID) + if err != nil { + fmt.Println("list edges:", err) + return + } + + fmt.Printf("edges from %d: count=%d relation=%s\n", fromID, len(edges), edges[0].Relation) + // Output: + // edges from 1: count=1 relation=informs +} diff --git a/pkg/lore/vector/example_test.go b/pkg/lore/vector/example_test.go new file mode 100644 index 0000000..685c73c --- /dev/null +++ b/pkg/lore/vector/example_test.go @@ -0,0 +1,88 @@ +package vector_test + +import ( + "context" + "database/sql" + "fmt" + + _ "modernc.org/sqlite" + + "github.com/mathomhaus/lore/pkg/lore/vector" + "github.com/mathomhaus/lore/pkg/lore/vector/sqlitevec" +) + +// ExampleNew_upsertAndSearch demonstrates the core VectorStore cycle: +// store two vectors and then query for the nearest neighbor. 
+func ExampleNew_upsertAndSearch() { + ctx := context.Background() + db, err := sql.Open("sqlite", ":memory:") + if err != nil { + fmt.Println("open db:", err) + return + } + defer db.Close() + + const dim = 4 + vs, err := sqlitevec.New(db, dim) + if err != nil { + fmt.Println("sqlitevec.New:", err) + return + } + defer vs.Close(ctx) + + // Two unit vectors pointing in different directions. + vecA := []float32{1, 0, 0, 0} + vecB := []float32{0, 1, 0, 0} + + if err := vs.Upsert(ctx, 1, vecA); err != nil { + fmt.Println("upsert 1:", err) + return + } + if err := vs.Upsert(ctx, 2, vecB); err != nil { + fmt.Println("upsert 2:", err) + return + } + + // Query with vecA: entry 1 should be the top hit (cosine sim = 1.0). + hits, err := vs.Search(ctx, vecA, vector.SearchOpts{Limit: 2}) + if err != nil { + fmt.Println("search:", err) + return + } + + fmt.Printf("top hit id=%d\n", hits[0].ID) + // Output: + // top hit id=1 +} + +// ExampleNew_delete shows that deleting a vector removes it from +// future search results. +func ExampleNew_delete() { + ctx := context.Background() + db, err := sql.Open("sqlite", ":memory:") + if err != nil { + fmt.Println("open db:", err) + return + } + defer db.Close() + + vs, err := sqlitevec.New(db, 2) + if err != nil { + fmt.Println("sqlitevec.New:", err) + return + } + defer vs.Close(ctx) + + _ = vs.Upsert(ctx, 10, []float32{1, 0}) + + // Delete the vector, then verify it is gone. + if err := vs.Delete(ctx, 10); err != nil { + fmt.Println("delete:", err) + return + } + + hits, _ := vs.Search(ctx, []float32{1, 0}, vector.SearchOpts{Limit: 5}) + fmt.Printf("hits after delete: %d\n", len(hits)) + // Output: + // hits after delete: 0 +} From df46d4c64c0c422f371a7783609d239535a7b075 Mon Sep 17 00:00:00 2001 From: Kunal Lanjewar <5488221+kunallanjewar@users.noreply.github.com> Date: Tue, 28 Apr 2026 09:06:05 -0700 Subject: [PATCH 5/5] fix(audit): hybrid post-filter Kind+Tag, edges FK ON DELETE CASCADE Two bugs surfaced by E2E exerciser: 1. 
hybrid.Retriever ignored opts.Kinds and opts.Tags after hydration. The vector arm documents Kind/Tag filters as advisory, expecting the Retriever to post-filter via Store.Get. The Retriever only post-filtered by Project, allowing vector-arm hits with non-matching kinds to leak into results. Added containsKind / containsAllTags post-filter. 2. DeleteBySource failed with FOREIGN KEY constraint when an entry was referenced by an edge. The edges FK to entries(id) lacked ON DELETE CASCADE, so deleting an entry that had any edge errored out. Added ON DELETE CASCADE to both edges FKs. Schema change is to migration 001 directly (no v0.1.x tag has shipped yet, so editing the baseline is safe and avoids a stub follow-up migration in the very first release). --- pkg/lore/retrieve/hybrid/hybrid.go | 37 ++++++++++++++++++- .../sqlite/migrations/001_initial.up.sql | 4 +- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/pkg/lore/retrieve/hybrid/hybrid.go b/pkg/lore/retrieve/hybrid/hybrid.go index fa08a96..06082c9 100644 --- a/pkg/lore/retrieve/hybrid/hybrid.go +++ b/pkg/lore/retrieve/hybrid/hybrid.go @@ -196,16 +196,51 @@ func (r *Retriever) Search(ctx context.Context, query string, opts lore.SearchOp } return nil, fmt.Errorf("hybrid: hydrate entry %d: %w", scored.ID, err) } - // Post-filter by project when specified. + // Post-filter by Project, Kinds, and Tags. The vector arm does not + // honor these filters natively (its SearchOpts hints are advisory), + // so the Retriever applies them after hydration to satisfy the + // caller's contract. The BM25 arm's results may already be filtered; + // re-applying here is a no-op for those. 
if opts.Project != "" && entry.Project != opts.Project { continue } + if len(opts.Kinds) > 0 && !containsKind(opts.Kinds, entry.Kind) { + continue + } + if len(opts.Tags) > 0 && !containsAllTags(entry.Tags, opts.Tags) { + continue + } results = append(results, lore.SearchHit{Entry: entry, Score: scored.Score}) } return results, nil } +// containsKind reports whether want contains kind. +func containsKind(want []lore.Kind, kind lore.Kind) bool { + for _, k := range want { + if k == kind { + return true + } + } + return false +} + +// containsAllTags reports whether entryTags contains every tag in required. +// Matching is conjunctive (AND): required={"a","b"} demands the entry have BOTH. +func containsAllTags(entryTags, required []string) bool { + have := make(map[string]struct{}, len(entryTags)) + for _, t := range entryTags { + have[t] = struct{}{} + } + for _, r := range required { + if _, ok := have[r]; !ok { + return false + } + } + return true +} + // runBM25 executes the BM25 arm and returns a slice of entry IDs in rank order. func (r *Retriever) runBM25(ctx context.Context, query string, opts lore.SearchOpts) ([]int64, error) { ctx, span := r.tracer.Start(ctx, "lore.retrieve.bm25") diff --git a/pkg/lore/store/sqlite/migrations/001_initial.up.sql b/pkg/lore/store/sqlite/migrations/001_initial.up.sql index b62685b..490f1a5 100644 --- a/pkg/lore/store/sqlite/migrations/001_initial.up.sql +++ b/pkg/lore/store/sqlite/migrations/001_initial.up.sql @@ -38,8 +38,8 @@ CREATE TABLE IF NOT EXISTS edges ( weight REAL NOT NULL DEFAULT 0, created_at TEXT NOT NULL DEFAULT (datetime('now')), PRIMARY KEY (from_id, to_id, relation), - FOREIGN KEY (from_id) REFERENCES entries(id), - FOREIGN KEY (to_id) REFERENCES entries(id) + FOREIGN KEY (from_id) REFERENCES entries(id) ON DELETE CASCADE, + FOREIGN KEY (to_id) REFERENCES entries(id) ON DELETE CASCADE ); -- ---------------------------------------------------------------------------