diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..ab1907d --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,90 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). +This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [0.1.1] - 2026-04-27 + +### Added + +- **`pkg/lore`** - Core types: `Entry`, `Edge`, `SearchHit`, `ListOpts`, + `SearchOpts`. Eight canonical `Kind` values: `decision`, `principle`, + `procedure`, `reference`, `explanation`, `observation`, `research`, `idea`. + Sentinel errors: `ErrNotFound`, `ErrDuplicate`, `ErrInvalidKind`, + `ErrInvalidArgument`, `ErrConflict`, `ErrUnsupported`, `ErrClosed`. + +- **`pkg/lore/store`** - `Store` interface: `Inscribe`, `Update`, `Get`, + `DeleteBySource`, `ListByTag`, `ListByKind`, `SearchText`, `AddEdge`, + `ListEdges`, `Close`. Full error contract and lifecycle contract documented. + +- **`pkg/lore/store/sqlite`** - SQLite reference implementation of `Store`. + Uses `modernc.org/sqlite` (pure Go, no CGO). FTS5 full-text index for BM25 + retrieval. Schema migrations via an embedded migration table. OTel spans on + every I/O method. `slog` for warnings and errors. + +- **`pkg/lore/embed`** - `Embedder` interface: `Embed`, `Dimensions`, `Close`. + Sentinel errors: `ErrInvalidArgument`, `ErrUnsupported`, `ErrClosed`. + +- **`pkg/lore/embed/bge`** - BGE-small-en-v1.5 int8 reference implementation + of `Embedder`. Runs in-process via `github.com/shota3506/onnxruntime-purego` + (no CGO). Model and tokenizer assets embedded in the binary at build time. + Returns `embed.ErrUnsupported` on platforms where ONNX Runtime is absent. + OTel spans named `lore.embed.encode`. Requires ONNX Runtime shared library + (`brew install onnxruntime` on macOS; `apt install libonnxruntime` on + Debian-derived Linux). 
+ +- **`pkg/lore/vector`** - `VectorStore` interface: `Upsert`, `Delete`, + `Search`, `Dimensions`, `Close`. `Hit` result type. `SearchOpts` with + advisory `Kinds`/`Tags` filters. Sentinel errors: `ErrNotFound`, + `ErrInvalidArgument`, `ErrClosed`. + +- **`pkg/lore/vector/sqlitevec`** - SQLite-backed reference implementation + of `VectorStore`. Vectors stored as little-endian float32 BLOBs. Cosine + similarity computed in Go via a full table scan. Suitable for up to ~100K + vectors of 384 dimensions. No CGO, no sqlite-vec extension required. + OTel spans named `lore.vector.upsert`, `lore.vector.delete`, + `lore.vector.search`. + +- **`pkg/lore/retrieve`** - `Retriever` interface: `Search`. Shared result + types re-use `lore.SearchHit`. + +- **`pkg/lore/retrieve/bm25`** - `Ranker`: lexical-only `Retriever` backed + by `Store.SearchText`. OTel span `lore.retrieve.bm25`. + +- **`pkg/lore/retrieve/vector`** - `Searcher`: semantic-only `Retriever` + backed by `Embedder` + `VectorStore`. OTel span `lore.retrieve.vector`. + +- **`pkg/lore/retrieve/rrf`** - `Fuse`: Reciprocal Rank Fusion over arbitrary + ranked lists. `DefaultK = 60`. Deterministic tie-breaking by ascending ID. + +- **`pkg/lore/retrieve/hybrid`** - `Retriever` that fuses BM25 and vector + rankings via RRF. Degrades gracefully: if one arm fails, the other continues. + OTel spans: `lore.retrieve.search`, `lore.retrieve.bm25`, + `lore.retrieve.vector`, `lore.retrieve.fuse`. + +- **`pkg/lore/ingest`** - `Ingester` interface: `Process`. `Result`, `FileError` + types. `WalkerConfig` for tuning the filesystem walk. Pure functional transform: + returns entries; caller owns writes. + +- **`pkg/lore/ingest/heuristic`** - Heuristic `Ingester` implementation. + Four-level classification priority: YAML front matter, path rules + (`DefaultRules`), heading keyword patterns, fallback `research`. Configurable + via `WithRules`, `WithLogger`, `WithTracer`, `WithMaxFileSize`. 
OTel spans + `lore.ingest.process`, `lore.ingest.classify`. + +- `doc/ARCHITECTURE.md` - Architecture overview. +- `doc/INTERFACES.md` - Interface reference. +- `CHANGELOG.md` - This file. + +### Notes + +- Tag `v0.1.1` is the initial public release. Previous commits established the + package structure iteratively; `v0.1.1` is the first tagged, stable release. +- All reference implementations are pure Go or use `purego` bindings; no CGO + is required to build the module. +- Pre-v1.0: the exported surface is stable in shape but may change in detail + between minor versions. + +[0.1.1]: https://github.com/mathomhaus/lore/releases/tag/v0.1.1 diff --git a/README.md b/README.md index 1e889c6..dd2b75c 100644 --- a/README.md +++ b/README.md @@ -1,392 +1,253 @@ # lore -A structured knowledge primitive for AI agents. Apache 2.0 OSS Go library. - -Lore stores classified knowledge entries (decisions, principles, procedures, -references, explanations, observations, research, ideas) and the typed edges -that connect them, then serves them back to retrieval pipelines that combine -lexical and semantic ranking. It ships as a Go library, not a service: callers -compose it into their own MCP servers, HTTP services, ingestion pipelines, or -CLI tools. - -The library is built around three pluggable interfaces (`Store`, `Embedder`, -`VectorStore`) plus a composing `Retriever` and an optional `Ingester`. Each -interface ships with an in-process reference implementation (modernc.org/sqlite, -BGE int8, sqlite-vec) so a single binary can run against a local SQLite file -out of the box. Swap any of the three for Postgres, a remote embedding API, -pgvector, or anything else by implementing the interface. +Lore is a structured knowledge library for AI agents. It stores classified +entries (decisions, principles, procedures, references, explanations, +observations, research, ideas) and the edges between them, then serves +them to retrieval pipelines that combine lexical and semantic ranking. 
+ +Lore ships as a Go library, not a service. Callers compose it into their own +MCP servers, HTTP services, ingestion pipelines, or CLI tools. Three pluggable +interfaces (`Store`, `Embedder`, `VectorStore`) each have an in-process +reference implementation that runs against a local SQLite file out of the box. +Swap any of the three for Postgres, a remote embedding API, or a purpose-built +vector database by satisfying the interface. ## Install ``` -go get github.com/mathomhaus/lore@latest +go get github.com/mathomhaus/lore@v0.1.1 ``` Requires Go 1.23 or newer. -## Usage +## Quickstart -### Store: persist and retrieve entries - -The `store.Store` interface is the primary write and read surface. Open a -`*sql.DB` with the `"sqlite"` driver (registered by `modernc.org/sqlite`), -pass it to `sqlite.New`, and the constructor runs schema migrations -automatically. +The example below wires the three reference implementations together, inscribes +an entry, embeds it into the vector store, and runs a hybrid search. ```go -import ( - "context" - "database/sql" - "fmt" - - _ "modernc.org/sqlite" - - "github.com/mathomhaus/lore/pkg/lore" - "github.com/mathomhaus/lore/pkg/lore/store/sqlite" -) - -func main() { - dsn := "lore.db" + - "?_pragma=journal_mode(WAL)" + - "&_pragma=busy_timeout(5000)" + - "&_pragma=synchronous(NORMAL)" + - "&_pragma=foreign_keys(ON)" - - db, err := sql.Open("sqlite", dsn) - if err != nil { - panic(err) - } - defer db.Close() - - st, err := sqlite.New(db) - if err != nil { - panic(err) - } - defer st.Close(context.Background()) - - // Persist a decision. - id, err := st.Inscribe(context.Background(), lore.Entry{ - Project: "myproject", - Kind: lore.KindDecision, - Title: "Use SQLite for local persistence", - Body: "Chosen for zero-dependency deployment and strong ACID guarantees.", - Tags: []string{"adr", "storage"}, - }) - if err != nil { - panic(err) - } - fmt.Println("inscribed", id) - - // Retrieve it. 
- entry, err := st.Get(context.Background(), id) - if err != nil { - panic(err) - } - fmt.Println(entry.Title) - - // Full-text search. - hits, err := st.SearchText(context.Background(), "SQLite persistence", lore.SearchOpts{Limit: 5}) - if err != nil { - panic(err) - } - for _, h := range hits { - fmt.Printf("%.3f %s\n", h.Score, h.Entry.Title) - } -} -``` +package main -The `store.Store` interface is backend-agnostic. Swap `sqlite.New` for any -implementation that satisfies the interface to use a different storage engine -without changing callers. - -### Path B: document ingestion - -Path B ingests existing Markdown document trees into lore entries. The -ingester is a pure functional transform: it returns entries and the caller -writes them to a Store. - -```go import ( - "context" - "log" - - "github.com/mathomhaus/lore/pkg/lore/ingest/heuristic" + "context" + "database/sql" + "errors" + "fmt" + "log" + + _ "modernc.org/sqlite" + + "github.com/mathomhaus/lore/pkg/lore" + "github.com/mathomhaus/lore/pkg/lore/embed" + "github.com/mathomhaus/lore/pkg/lore/embed/bge" + "github.com/mathomhaus/lore/pkg/lore/retrieve/hybrid" + "github.com/mathomhaus/lore/pkg/lore/store/sqlite" + "github.com/mathomhaus/lore/pkg/lore/vector/sqlitevec" ) func main() { - ing := heuristic.NewIngester() - - result, err := ing.Process(context.Background(), "/workspace/docs") - if err != nil { - log.Fatal(err) - } - - for _, fe := range result.Errors { - log.Printf("warn: %v", fe) - } - - for _, e := range result.Entries { - log.Printf("entry kind=%s title=%q source=%s", e.Kind, e.Title, e.Source) - // write e to your store here - } + ctx := context.Background() + + // Open a single SQLite file. All three backends share the same DB. 
+ dsn := "knowledge.db" + + "?_pragma=journal_mode(WAL)" + + "&_pragma=busy_timeout(5000)" + + "&_pragma=synchronous(NORMAL)" + + "&_pragma=foreign_keys(ON)" + + db, err := sql.Open("sqlite", dsn) + if err != nil { + log.Fatal(err) + } + defer db.Close() + + // Store: handles entry persistence and BM25 full-text search. + st, err := sqlite.New(db) + if err != nil { + log.Fatal(err) + } + defer st.Close(ctx) + + // VectorStore: stores and queries float32 vectors (384-dim for BGE-small). + vs, err := sqlitevec.New(db, 384) + if err != nil { + log.Fatal(err) + } + defer vs.Close(ctx) + + // Embedder: in-process BGE int8 model. Falls back gracefully on platforms + // without ONNX Runtime. + var emb embed.Embedder + emb, err = bge.New() + if err != nil { + if !errors.Is(err, embed.ErrUnsupported) { + log.Fatal(err) + } + log.Print("embedder unavailable; using BM25-only retrieval") + emb = nil + } + if emb != nil { + defer emb.Close(ctx) + } + + // Inscribe a decision entry. + id, err := st.Inscribe(ctx, lore.Entry{ + Project: "decisionLog", + Kind: lore.KindDecision, + Title: "Use SQLite for local persistence", + Body: "Chosen for zero-dependency deployment and strong ACID guarantees under single-writer workloads.", + Tags: []string{"adr", "storage"}, + }) + if err != nil { + log.Fatal(err) + } + fmt.Printf("inscribed entry id=%d\n", id) + + // Embed and store the vector for the new entry if an embedder is available. + if emb != nil { + entry, _ := st.Get(ctx, id) + vecs, err := emb.Embed(ctx, []string{entry.Title + " " + entry.Body}) + if err == nil { + _ = vs.Upsert(ctx, id, vecs[0]) + } + } + + // Build a hybrid retriever that fuses BM25 + vector via RRF. + r := hybrid.New(st, emb, vs, + hybrid.WithRRFK(60), + hybrid.WithCandidatePoolSize(50), + ) + + // Search. 
+ hits, err := r.Search(ctx, "SQLite persistence decision", lore.SearchOpts{ + Project: "decisionLog", + Limit: 5, + }) + if err != nil { + log.Fatal(err) + } + for _, h := range hits { + fmt.Printf("%.4f %s\n", h.Score, h.Entry.Title) + } } ``` -#### Classification priority +## What lore is -The heuristic ingester classifies each chunk using this priority order (first -match wins): +Lore exposes three pluggable interfaces: -1. YAML front matter with an explicit `kind:` field. -2. Path rules: `docs/adr/*.md` maps to decision+adr; `docs/runbooks/*.md` - maps to procedure+runbook; `CLAUDE.md`/`agents.md`/`skills.md` map to - reference+agent-config; and so on. See `heuristic.DefaultRules()`. -3. Heading keywords: `## What is` maps to explanation; `## Decision` / - `## Context` / `## Consequences` maps to decision; `## Procedure` / - `## Steps` maps to procedure; etc. -4. Fallback: kind=research (catch-all). +**Store** (`pkg/lore/store`) persists entries and edges. The reference +implementation in `pkg/lore/store/sqlite` uses `modernc.org/sqlite` (pure Go) +with FTS5 for BM25 full-text search. Replace it with Postgres, MySQL, or any +other engine by satisfying the interface. -#### Customizing rules +**Embedder** (`pkg/lore/embed`) turns text into dense vectors. The reference +implementation in `pkg/lore/embed/bge` runs an int8-quantized BGE-small-en-v1.5 +model in process via ONNX Runtime. Replace it with a remote embedding API or a +different local model without changing retrieval logic. -```go -import "github.com/mathomhaus/lore/pkg/lore/ingest/heuristic" - -rules := heuristic.DefaultRules() -rules = append(rules, heuristic.Rule{ - PathGlob: "docs/specs/*.md", - Kind: lore.KindDecision, - Tags: []string{"spec"}, -}) - -ing := heuristic.NewIngester( - heuristic.WithRules(rules), - heuristic.WithLogger(slog.Default()), -) -``` +**VectorStore** (`pkg/lore/vector`) stores and queries float32 vectors. 
The +reference implementation in `pkg/lore/vector/sqlitevec` stores vectors as BLOB +columns in the same SQLite file and does cosine similarity in Go (no CGO, no +extensions). Replace it with pgvector, Qdrant, or a native extension backend. -#### Walker behavior (v0.1.1) +On top of these three, lore composes a **Retriever** (`pkg/lore/retrieve`) that +fuses BM25 lexical and vector semantic rankings via Reciprocal Rank Fusion (RRF, +k=60). An **Ingester** (`pkg/lore/ingest`) optionally walks document trees and +classifies chunks into entries (Path B). -- Only `.md` and `.markdown` files are processed. -- `.git/`, `node_modules/`, `vendor/`, and any hidden directory (name - starting with `.`) are skipped unconditionally. -- Files larger than 10 MB are skipped with a FileError. -- Symlinks are not followed. -- `.gitignore` patterns are not honored (planned for v0.2). +Two write paths: -## Status: pre-v1.0 - -Lore is pre-v1.0. The exported surface is stable in shape but may change in -detail between minor versions. Pin to a version, read release notes before -upgrading, and expect occasional breakage on `main`. +- **Path A (agent inscribe):** an agent calls `Store.Inscribe` directly, then + `Embedder.Embed` + `VectorStore.Upsert`. No document parsing, no LLM cost. + High-frequency path suitable for session-level knowledge capture. +- **Path B (document ingestion):** `Ingester.Process` walks a directory, chunks + Markdown files, classifies each chunk (YAML front matter, then path rules, + then heading patterns, then fallback to `research`), and returns entries. The + caller writes them to a Store. Suitable for bulk ingestion of existing docs. ## What lore is not -- Not a CLI binary. Not an MCP server. Not an HTTP server. Not a UI. -- Not a hosted service. Not multi-tenant. Not an LLM client. +- Not a CLI binary. +- Not an MCP server. +- Not an HTTP server. +- Not a UI. +- Not a hosted service. 
+- Not multi-tenant (all isolation is caller-provided via the `Project` field). +- Not an LLM client. - Not a replacement for a full retrieval-augmented-generation framework. -Lore is the substrate. Everything above is a consumer's choice. +Lore is the substrate. Everything above is a consumer's responsibility. -## VectorStore +## Production deployment patterns -`pkg/lore/vector` defines the `VectorStore` interface. The reference -implementation in `pkg/lore/vector/sqlitevec` stores vectors as BLOB columns -inside your existing `*sql.DB` and runs cosine similarity entirely in Go -(no CGO, no extensions). +Because lore accepts caller-owned `*sql.DB` instances rather than connection +strings, it maps cleanly to multi-replica Kubernetes deployments. A typical +consumer service structure uses three stateless Deployments: -```go -import ( - "context" - "database/sql" +**Ingester worker** reads source documents from a queue (Pub/Sub, SQS, or a +database queue table), calls `Ingester.Process`, and writes the returned entries +to a shared `Store`. One or more replicas; only requires write access to the +database. - _ "modernc.org/sqlite" +**Query API service** receives search queries over HTTP or gRPC. It opens a +read-optimized `*sql.DB` connection pool (WAL mode allows concurrent readers), +constructs a `Store` + `Embedder` + `VectorStore`, wires them into a `Retriever`, +and returns ranked hits. Scales horizontally; each replica is stateless. - "github.com/mathomhaus/lore/pkg/lore/vector" - "github.com/mathomhaus/lore/pkg/lore/vector/sqlitevec" -) +**MCP gateway** exposes lore to AI agent harnesses via the Model Context +Protocol. It wraps the same `Store` + `Retriever` in tool handlers for +`inscribe`, `search`, and `list`. The library provides the knowledge primitives; +the MCP surface is the consumer's thin adaptation layer. 
-db, _ := sql.Open("sqlite", "lore.db") +All three Deployments can share a single underlying SQLite file (via a single-writer +proxy, or a network volume only with WAL mode disabled, since WAL requires all processes to run on one host) +or migrate to Postgres by swapping the `Store` and `VectorStore` implementations. Only the +construction code changes when backends are swapped; code written against the interfaces does not. -// Bind to a 384-dimension space (BGE-small-en-v1.5). -store, err := sqlitevec.New(db, 384) -if err != nil { - // handle -} -defer store.Close(context.Background()) - -ctx := context.Background() +## Reference implementations -// Store a vector. -vec := make([]float32, 384) // fill from your Embedder -_ = store.Upsert(ctx, entryID, vec) +| Package | Role | Backend | Scale guidance | +|---|---|---|---| +| `pkg/lore/store/sqlite` | Store | modernc.org/sqlite + FTS5 | Suitable for most single-service workloads | +| `pkg/lore/embed/bge` | Embedder | BGE-small-en-v1.5 int8 via ONNX Runtime | Requires ONNX Runtime dylib; pure CPU | +| `pkg/lore/vector/sqlitevec` | VectorStore | SQLite BLOB + Go cosine scan | Good to ~100K vectors of 384 dim (~100ms scan on modern hardware) | +| `pkg/lore/retrieve/hybrid` | Retriever | BM25 + vector via RRF | Inherits limits of Store + VectorStore | +| `pkg/lore/retrieve/bm25` | Retriever (lexical only) | Store.SearchText | No embedder required | +| `pkg/lore/ingest/heuristic` | Ingester | Rule-based heuristic classifier | Pure Go; no LLM cost | -// Search: returns top-5 hits in descending cosine similarity order. -hits, err := store.Search(ctx, queryVec, vector.SearchOpts{Limit: 5}) -for _, h := range hits { - fmt.Printf("entry %d score %.4f\n", h.ID, h.Score) -} -``` +All reference implementations build without CGO. The BGE embedder is not pure Go at run time: it +binds to the ONNX Runtime shared library through `purego` rather than CGO. -Kind and tag filters in `SearchOpts` are advisory. The sqlitevec reference -implementation does not apply them (a full-table-scan store has no efficient -join). 
Post-filter results via your `Store.Get` call or swap in a -VectorStore that understands your schema. +## Configuration -Scale: the reference impl performs a full linear scan. Acceptable for up to -roughly 100K vectors of 384 dimensions (benchmark: ~100ms on Apple M3 Pro). -Beyond that, implement `VectorStore` with pgvector, Qdrant, or a native -sqlite-vec extension backend. +**Logger.** Pass `WithLogger(*slog.Logger)` to any constructor. Defaults to +`slog.Default()`. -## Embedder +**Tracer.** Pass `WithTracer(trace.Tracer)` to enable OpenTelemetry spans. +Defaults to the global tracer provider (`otel.GetTracerProvider()`). Wire an +exporter in your service bootstrap to send traces to your backend of choice. -The `Embedder` interface turns text into dense vectors for semantic retrieval: +Span names follow the pattern `lore.<package>.<operation>` (for example +`lore.store.inscribe`, `lore.vector.search`, `lore.retrieve.search`). -```go -import ( - "context" - "errors" +**BGE embedder.** Set `LORE_ONNXRUNTIME_LIB` to override the default shared +library search path. On macOS: `brew install onnxruntime` puts the dylib where +the probe expects it. When the library is absent, `bge.New` returns +`embed.ErrUnsupported` and callers should fall back to lexical-only retrieval. - "github.com/mathomhaus/lore/pkg/lore/embed" - "github.com/mathomhaus/lore/pkg/lore/embed/bge" -) +## Stability -func embedTexts(ctx context.Context, texts []string) ([][]float32, error) { - emb, err := bge.New() - if err != nil { - if errors.Is(err, embed.ErrUnsupported) { - // Platform has no ONNX Runtime; fall through to lexical-only retrieval. - return nil, err - } - return nil, err - } - defer emb.Close(ctx) - - vecs, err := emb.Embed(ctx, texts) - if err != nil { - return nil, err - } - // Each vecs[i] is a float32 slice of length emb.Dimensions() (384 for BGE-small). - return vecs, nil -} -``` - -`bge.New` options: - -- `bge.WithLogger(*slog.Logger)` for a structured logger covering init and runtime warnings. 
-- `bge.WithTracer(trace.Tracer)` for an OTel tracer; spans named `lore.embed.encode`. - -The BGE reference implementation requires the ONNX Runtime shared library on the -host (e.g. `brew install onnxruntime` on macOS). Set `LORE_ONNXRUNTIME_LIB` to -override the default search path. When the library is absent, `bge.New` returns -`embed.ErrUnsupported` and callers should fall back to lexical retrieval. - -Implement the `embed.Embedder` interface to swap in a remote embedding API or a -different model without changing any retrieval code. - -## Retriever: hybrid BM25 + vector search - -`pkg/lore/retrieve` defines the `Retriever` interface. The reference -implementation in `pkg/lore/retrieve/hybrid` fuses BM25 lexical search -(via `Store.SearchText`) and vector nearest-neighbour search (via -`Embedder.Embed` + `VectorStore.Search`) using Reciprocal Rank Fusion -(RRF, k=60). This approach avoids tuning score scales across rankers: -only ordinal rank positions matter. - -```go -import ( - "context" - "database/sql" - "fmt" - "log" - - _ "modernc.org/sqlite" - - "github.com/mathomhaus/lore/pkg/lore" - "github.com/mathomhaus/lore/pkg/lore/embed/bge" - "github.com/mathomhaus/lore/pkg/lore/retrieve/hybrid" - "github.com/mathomhaus/lore/pkg/lore/store/sqlite" - "github.com/mathomhaus/lore/pkg/lore/vector/sqlitevec" -) - -func search(db *sql.DB, query string) ([]lore.SearchHit, error) { - // Store handles BM25. - st, err := sqlite.New(db) - if err != nil { - return nil, err - } - defer st.Close(context.Background()) - - // Embedder handles query vectorisation. - emb, err := bge.New() - if err != nil { - // ErrUnsupported on platforms without ONNX Runtime: use BM25-only. - log.Printf("warn: embedder unavailable, using BM25 only: %v", err) - emb = nil - } - if emb != nil { - defer emb.Close(context.Background()) - } - - // VectorStore handles nearest-neighbour lookup. 
- vs, err := sqlitevec.New(db, 384) - if err != nil { - return nil, err - } - defer vs.Close(context.Background()) - - r := hybrid.New(st, emb, vs, - hybrid.WithRRFK(60), - hybrid.WithCandidatePoolSize(50), - ) - - return r.Search(context.Background(), query, lore.SearchOpts{Limit: 10}) -} -``` - -The hybrid retriever tolerates partial failures gracefully: - -- If `Embedder.Embed` returns an error (e.g. `embed.ErrUnsupported`), the vector - arm is skipped and BM25 results are returned alone. -- If `VectorStore.Search` returns an error, the BM25 arm continues independently. -- Only when both arms fail does `Search` return an error. - -When the embedder is nil, pass a no-op stub or use `bm25.New(store)` directly: - -```go -import "github.com/mathomhaus/lore/pkg/lore/retrieve/bm25" - -r := bm25.New(st) -hits, err := r.Search(ctx, "deployment rollout", lore.SearchOpts{Limit: 10}) -``` - -### RRF algorithm - -`pkg/lore/retrieve/rrf` exposes `Fuse(rankings [][]int64, k int) []ScoredID` -for callers that want to run their own ranked lists through RRF without the -hybrid retriever: - -```go -import "github.com/mathomhaus/lore/pkg/lore/retrieve/rrf" - -bm25IDs := []int64{10, 20, 30} -vecIDs := []int64{20, 10, 40} - -fused := rrf.Fuse([][]int64{bm25IDs, vecIDs}, rrf.DefaultK) -for _, s := range fused { - fmt.Printf("id=%d score=%.4f\n", s.ID, s.Score) -} -``` - -Output is sorted by descending score; ties break by ascending ID for -determinism. +Lore is pre-v1.0. The exported surface is stable in shape but may change in +detail between minor versions. Pin to a version, read release notes before +upgrading, and expect occasional breaking changes on minor version bumps. ## Attribution Lore extracts and generalizes the storage, embedding, and retrieval primitives originally built inside [`mathomhaus/guild`](https://github.com/mathomhaus/guild). -Guild remains the opinionated agent-coordination platform that adds -quest, oath, and brief on top of these primitives. 
- +Guild remains the opinionated agent-coordination platform that adds quest, oath, +and brief on top of these primitives. ## License diff --git a/doc/ARCHITECTURE.md b/doc/ARCHITECTURE.md new file mode 100644 index 0000000..df7c8a8 --- /dev/null +++ b/doc/ARCHITECTURE.md @@ -0,0 +1,136 @@ +# lore: Architecture + +This document describes the design of the lore library for contributors and +consumers who want to understand how the pieces fit together. + +## Goal + +Lore is a structured knowledge primitive for AI agents. It solves one problem: +store classified knowledge entries, link them with typed edges, and retrieve +them quickly using a combination of lexical and semantic ranking. The library +does not provide a CLI, an HTTP server, an MCP server, or a UI. It is +deliberately library-only so it can be composed into any of those without +duplication or coupling. + +## Three pluggable interfaces + +The library is built around exactly three swap points. Each swap point is a Go +interface; the reference implementation for each is in a sub-package. + +``` +Store pkg/lore/store persistence + BM25 full-text +Embedder pkg/lore/embed text-to-vector +VectorStore pkg/lore/vector vector nearest-neighbor +``` + +Why three and not more or fewer? + +Fewer would mean coupling persistence to retrieval or retrieval to embedding, +forcing consumers to take all three or none. More would fragment the interface +without adding capability: for example, splitting `Store` into a separate +`EdgeStore` and `EntryStore` would add complexity without enabling any +deployment pattern that the single interface cannot already serve. + +Three also corresponds directly to the three backend choices a production +deployment typically makes: relational store (SQLite, Postgres), embedding +model (local, remote API), and vector index (linear scan, pgvector, Qdrant). +Each swap is independent of the others. 
+ +## Composition layer + +On top of the three interfaces, the library provides two composing layers: + +**Retriever** (`pkg/lore/retrieve`) composes `Store`, `Embedder`, and +`VectorStore` into a unified search surface. The `hybrid` implementation runs +the BM25 arm (`Store.SearchText`) and the vector arm (`Embedder.Embed` + +`VectorStore.Search`) concurrently, then fuses the ranked lists via Reciprocal +Rank Fusion (RRF). Callers that only want one arm use `bm25.New(store)` or +`vector.New(store, embedder, vstore)` (the retriever in `pkg/lore/retrieve/vector`, not the `pkg/lore/vector` store package) directly; both satisfy `Retriever`. + +**Ingester** (`pkg/lore/ingest`) is a pure functional transform that walks a +directory tree, chunks recognized files (Markdown in v0.1.1), classifies each +chunk into a lore entry, and returns the entries to the caller. The caller +writes them to a `Store`. The ingester does not hold state between calls. + +## Path A vs Path B + +**Path A (agent inscribe):** an AI agent session calls `Store.Inscribe` +directly with a fully-formed entry it has already classified. It then calls +`Embedder.Embed` on the entry text and `VectorStore.Upsert` on the resulting +vector. This is the high-frequency path: no file parsing, no LLM inference cost, +no heuristics. An agent that produces 100 inscriptions per session uses Path A +for all of them. + +**Path B (document ingestion):** a one-time or periodic pipeline reads +existing document trees (runbooks, ADRs, wikis) and imports them in bulk. +`Ingester.Process` walks the tree, the heuristic classifier assigns kinds and +tags, and the caller writes the results to the same `Store`. Path B is optional: +a service that only needs agent-produced knowledge never instantiates an +`Ingester`. + +The two paths share the same `Store` and `VectorStore`; provided the caller also embeds Path B entries and upserts their vectors, entries produced by +either path are indistinguishable at retrieval time. + +## Reference implementations and their scale limits + +**`store/sqlite`** uses `modernc.org/sqlite` (pure Go) and FTS5 for full-text +search. 
It is single-writer by design (SQLite WAL allows concurrent readers). +Suitable for single-service workloads without a shared database requirement. +Swap for a Postgres implementation when multiple writer replicas are needed. + +**`embed/bge`** runs the BAAI/bge-small-en-v1.5 int8 model in process using +`purego` bindings to the ONNX Runtime shared library. No CGO, no network calls. +Throughput is bounded by CPU; a single core embeds roughly 200-500 short texts +per second on modern hardware depending on batch size. Swap for a remote +embedding API (OpenAI, Cohere, Vertex AI) by implementing `embed.Embedder`. + +**`vector/sqlitevec`** stores float32 vectors as BLOB columns and computes +cosine similarity over a full table scan in Go. Practical limit is roughly +100K vectors of 384 dimensions (~100ms per query on modern laptop hardware). +Above that threshold, swap for a purpose-built ANN index: pgvector, +Qdrant, Weaviate, or a true sqlite-vec extension implementation. + +## Hybrid retrieval via RRF + +The hybrid retriever fuses two ranked lists using Reciprocal Rank Fusion: + +``` +score(d) = sum over rankers r: 1 / (k + rank_r(d)) +``` + +where k=60 is the standard smoothing constant (from Cormack, Clarke, Buettcher +2009). RRF requires only the rank position of each document, not the score +magnitude. This makes it robust to score scale differences between BM25 (which +returns large positive floats) and cosine similarity (which returns values in +[-1, 1]). No tuning is needed when switching between BM25 implementations or +embedding models. + +The retriever fetches a candidate pool (default: top 50) from each arm, fuses +the two ranked lists via RRF, truncates to the requested limit, and then +hydrates the full entry from the `Store` for each result. Partial arm failures +are tolerated: if the vector arm fails (for example `ErrUnsupported` on a +platform without ONNX Runtime), the BM25 arm continues independently and vice +versa. 
+ +## Caller-owned dependencies + +Every constructor in the library accepts already-initialized resources. `sqlite.New` +takes a `*sql.DB`. `bge.New` accepts optional `*slog.Logger` and +`trace.Tracer`. `sqlitevec.New` takes a `*sql.DB` and the vector dimension. + +The library never opens database connections, reads environment variables for +connection strings, or manages connection pool lifecycle. This design has two +consequences: + +1. A service can pass the same `*sql.DB` to `sqlite.New` and `sqlitevec.New`, + sharing one connection pool and one SQLite file between the store and the + vector index. Schema migrations for both live in the same database. + +2. Multiple replicas of the same service can each open their own `*sql.DB` + against a shared database (for example Postgres via `database/sql` and a + Postgres-backed `Store` implementation). The library has no global state + and is safe to construct multiple times in the same process. + +In a Kubernetes deployment this means the ingester worker, query API service, +and MCP gateway each construct their own `Store` and `Retriever` from the same +connection string, without sharing in-process objects or requiring a singleton. diff --git a/doc/INTERFACES.md b/doc/INTERFACES.md new file mode 100644 index 0000000..96f18f8 --- /dev/null +++ b/doc/INTERFACES.md @@ -0,0 +1,193 @@ +# lore: Interface Reference + +This document is a condensed reference for every exported interface and its +key methods. For full godoc, run `go doc github.com/mathomhaus/lore/pkg/lore/...`. + +## Store (`pkg/lore/store`) + +`Store` persists lore entries and edges. All methods accept `ctx context.Context` +as their first argument and propagate cancellation to the underlying driver. +Close is idempotent; after Close all methods return `lore.ErrClosed`. 
+ +```go +type Store interface { + Inscribe(ctx context.Context, e lore.Entry) (id int64, err error) + Update(ctx context.Context, e lore.Entry) error + Get(ctx context.Context, id int64) (lore.Entry, error) + DeleteBySource(ctx context.Context, source string) (deleted int, err error) + ListByTag(ctx context.Context, tag string, opts lore.ListOpts) ([]lore.Entry, error) + ListByKind(ctx context.Context, kind lore.Kind, opts lore.ListOpts) ([]lore.Entry, error) + SearchText(ctx context.Context, query string, opts lore.SearchOpts) ([]lore.SearchHit, error) + AddEdge(ctx context.Context, edge lore.Edge) error + ListEdges(ctx context.Context, fromID int64) ([]lore.Edge, error) + Close(ctx context.Context) error +} +``` + +| Method | Description | +|---|---| +| `Inscribe` | Persist a new entry; return storage-assigned ID. | +| `Update` | Replace all mutable fields of an existing entry (full replacement, not patch). | +| `Get` | Fetch a single entry by ID. Returns `ErrNotFound` when absent. | +| `DeleteBySource` | Remove all entries with the given Source; non-matching source returns 0, nil. | +| `ListByTag` | Entries carrying the exact tag, ordered newest-first. | +| `ListByKind` | Entries of the given kind, ordered newest-first. | +| `SearchText` | BM25 full-text search over Title and Body. Higher Score is better. | +| `AddEdge` | Persist a directed edge. Re-adding the same triple is a no-op. | +| `ListEdges` | All edges from a given entry ID, ordered by created_at ascending. | +| `Close` | Release resources. Idempotent. | + +Sentinel errors (from `pkg/lore`): `ErrNotFound`, `ErrDuplicate`, +`ErrInvalidKind`, `ErrInvalidArgument`, `ErrConflict`, `ErrUnsupported`, +`ErrClosed`. + +Reference implementation: `pkg/lore/store/sqlite`. Constructor: `sqlite.New(db *sql.DB, opts ...Option) (Store, error)`. + +--- + +## Embedder (`pkg/lore/embed`) + +`Embedder` turns text into dense float32 vectors. 
Batch-oriented: callers +wrap a single string with `[]string{s}` when only one is needed. Safe for +concurrent use. + +```go +type Embedder interface { + Embed(ctx context.Context, texts []string) ([][]float32, error) + Dimensions() int + Close(ctx context.Context) error +} +``` + +| Method | Description | +|---|---| +| `Embed` | Produce one vector per input string. Returns `ErrInvalidArgument` for empty slice or empty element. | +| `Dimensions` | Vector length emitted by Embed. Stable for the lifetime of the Embedder. | +| `Close` | Release loaded model and tokenizer. Idempotent. | + +Sentinel errors: `embed.ErrInvalidArgument`, `embed.ErrUnsupported`, +`embed.ErrClosed`. + +`ErrUnsupported` indicates the platform has no working ONNX Runtime. Callers +should fall back to lexical-only retrieval when they receive this from `New` +or `Embed`. + +Reference implementation: `pkg/lore/embed/bge`. Constructor: +`bge.New(opts ...Option) (embed.Embedder, error)`. Options: `bge.WithLogger`, +`bge.WithTracer`. + +--- + +## VectorStore (`pkg/lore/vector`) + +`VectorStore` persists float32 vectors keyed by entry ID and answers +nearest-neighbor queries. Dimension-bound at construction. + +```go +type VectorStore interface { + Upsert(ctx context.Context, id int64, vector []float32) error + Delete(ctx context.Context, id int64) error + Search(ctx context.Context, query []float32, opts SearchOpts) ([]Hit, error) + Dimensions() int + Close(ctx context.Context) error +} +``` + +| Method | Description | +|---|---| +| `Upsert` | Store or replace the vector for entry ID. Vector length must equal Dimensions(). | +| `Delete` | Remove the vector for entry ID. Returns `ErrNotFound` when absent. | +| `Search` | Return top-Limit vectors by cosine similarity. Query length must equal Dimensions(). | +| `Dimensions` | Fixed vector length for this store. | +| `Close` | Release resources beyond the caller-owned DB. Idempotent. | + +`SearchOpts.Kinds` and `SearchOpts.Tags` are advisory hints. 
Reference +implementations do not apply them; the Retriever layer post-filters via +`Store.Get`. + +Sentinel errors: `vector.ErrNotFound`, `vector.ErrInvalidArgument`, +`vector.ErrClosed`. + +Reference implementation: `pkg/lore/vector/sqlitevec`. Constructor: +`sqlitevec.New(db *sql.DB, dimensions int, opts ...Option) (vector.VectorStore, error)`. +Options: `sqlitevec.WithLogger`, `sqlitevec.WithTracer`. + +--- + +## Retriever (`pkg/lore/retrieve`) + +`Retriever` runs a search and returns ranked results. Implementations compose +`Store`, `Embedder`, and `VectorStore`; callers do not need to interact with +the underlying interfaces directly for search. + +```go +type Retriever interface { + Search(ctx context.Context, query string, opts lore.SearchOpts) ([]lore.SearchHit, error) +} +``` + +| Method | Description | +|---|---| +| `Search` | Execute retrieval for the given query; return results ranked by descending score. Returns `ErrInvalidArgument` for empty query or negative limit. | + +Implementations: + +- `hybrid.New(store, embedder, vstore, opts...)` - Fuses BM25 + vector via RRF. + Options: `hybrid.WithRRFK(k int)`, `hybrid.WithCandidatePoolSize(n int)`, + `hybrid.WithLogger`, `hybrid.WithTracer`. +- `bm25.New(store, opts...)` - Lexical-only. Options: `bm25.WithLogger`, + `bm25.WithTracer`. +- `vector.New(store, embedder, vstore, opts...)` - Semantic-only. Options: + `vector.WithLogger`, `vector.WithTracer`. + +The hybrid retriever degrades gracefully: if one arm fails, results from the +surviving arm are returned. Both arms failing returns an error. + +--- + +## Ingester (`pkg/lore/ingest`) + +`Ingester` walks a document tree and produces classified lore entries. Pure +functional transform: `Process` does not write to any store. + +```go +type Ingester interface { + Process(ctx context.Context, root string) (Result, error) +} +``` + +| Method | Description | +|---|---| +| `Process` | Walk root, chunk recognized files, classify chunks, return entries. 
Non-nil error signals a fatal failure (root does not exist, etc.). Per-file failures are collected in Result.Errors. | + +`Result` carries `Entries []lore.Entry` and `Errors []FileError`. Callers +decide how to handle `Result.Errors`: strict mode may discard entries; lenient +mode logs them and continues. + +Reference implementation: `pkg/lore/ingest/heuristic`. Constructor: +`heuristic.NewIngester(opts ...Option) ingest.Ingester`. Options: +`heuristic.WithRules([]Rule)`, `heuristic.WithLogger`, `heuristic.WithTracer`, +`heuristic.WithMaxFileSize(n int64)`. + +Classification priority (first match wins): + +1. YAML front matter `kind:` field (validated against canonical kinds). +2. Path rules: `filepath.Match` against repo-relative path and base name. +3. Heading keywords: `Decision`, `Procedure`, `Explanation`, `Principle`, etc. +4. Fallback: `KindResearch`. + +--- + +## RRF utility (`pkg/lore/retrieve/rrf`) + +```go +func Fuse(rankings [][]int64, k int) []ScoredID +``` + +Combines multiple ranked lists (each a slice of entry IDs, best first) into a +single fused list using Reciprocal Rank Fusion. Pass `rrf.DefaultK` (60) when +in doubt. Returns `[]ScoredID` sorted by descending score; ties break by +ascending ID for determinism. + +Useful for callers that want to run their own ranked lists through RRF without +the hybrid retriever. diff --git a/pkg/lore/example_test.go b/pkg/lore/example_test.go new file mode 100644 index 0000000..6fc119a --- /dev/null +++ b/pkg/lore/example_test.go @@ -0,0 +1,74 @@ +package lore_test + +import ( + "fmt" + + "github.com/mathomhaus/lore/pkg/lore" +) + +// ExampleKind_Validate demonstrates how to validate a Kind value before +// writing an entry to a Store. Unknown kinds are rejected at write time +// by all standard implementations. 
+func ExampleKind_Validate() { + good := lore.KindDecision + if err := good.Validate(); err != nil { + fmt.Println("unexpected error:", err) + } else { + fmt.Println("valid:", good) + } + + bad := lore.Kind("unknown") + if err := bad.Validate(); err != nil { + fmt.Println("rejected unknown kind") + } + // Output: + // valid: decision + // rejected unknown kind +} + +// ExampleAllKinds prints the canonical kind taxonomy in display order. +func ExampleAllKinds() { + for _, k := range lore.AllKinds() { + fmt.Println(k) + } + // Output: + // decision + // principle + // procedure + // reference + // explanation + // observation + // research + // idea +} + +// ExampleKind_String shows that Kind satisfies fmt.Stringer and can be used +// directly in format strings. +func ExampleKind_String() { + k := lore.KindProcedure + fmt.Println(k.String()) + // Output: + // procedure +} + +// ExampleEntry_zero shows the zero value of Entry: all fields empty or nil, +// ready for population before passing to Store.Inscribe. +func ExampleEntry_zero() { + var e lore.Entry + fmt.Printf("id=%d kind=%q title=%q\n", e.ID, e.Kind, e.Title) + // Output: + // id=0 kind="" title="" +} + +// ExampleSearchOpts shows how to construct a SearchOpts that restricts +// results to a project and a pair of kinds. 
+func ExampleSearchOpts() { + opts := lore.SearchOpts{ + Project: "runbookCorpus", + Kinds: []lore.Kind{lore.KindProcedure, lore.KindReference}, + Limit: 10, + } + fmt.Printf("project=%s kinds=%d limit=%d\n", opts.Project, len(opts.Kinds), opts.Limit) + // Output: + // project=runbookCorpus kinds=2 limit=10 +} diff --git a/pkg/lore/ingest/example_test.go b/pkg/lore/ingest/example_test.go new file mode 100644 index 0000000..c644653 --- /dev/null +++ b/pkg/lore/ingest/example_test.go @@ -0,0 +1,50 @@ +package ingest_test + +import ( + "context" + "fmt" + "os" + "path/filepath" + + "github.com/mathomhaus/lore/pkg/lore/ingest/heuristic" +) + +// ExampleIngester_process demonstrates running the heuristic ingester against +// a small directory of Markdown files and inspecting the classified entries. +func ExampleIngester_process() { + // Build a minimal doc tree in a temp directory. + root, err := os.MkdirTemp("", "lore-ingest-example-*") + if err != nil { + fmt.Println("mktemp:", err) + return + } + defer os.RemoveAll(root) + + adrDir := filepath.Join(root, "docs", "adr") + if err := os.MkdirAll(adrDir, 0o755); err != nil { + fmt.Println("mkdir:", err) + return + } + + files := map[string]string{ + filepath.Join(adrDir, "001-use-sqlite.md"): "# ADR-001: Use SQLite\n\nWe chose SQLite for zero-dependency deployment.", + filepath.Join(adrDir, "002-use-wal.md"): "# ADR-002: Enable WAL mode\n\nWAL allows concurrent readers with a single writer.", + } + for path, content := range files { + if err := os.WriteFile(path, []byte(content), 0o644); err != nil { + fmt.Println("write:", err) + return + } + } + + ing := heuristic.NewIngester() + result, err := ing.Process(context.Background(), root) + if err != nil { + fmt.Println("process:", err) + return + } + + fmt.Printf("entries=%d errors=%d\n", len(result.Entries), len(result.Errors)) + // Output: + // entries=2 errors=0 +} diff --git a/pkg/lore/ingest/heuristic/rules.go b/pkg/lore/ingest/heuristic/rules.go index 
6d98a73..75b87c0 100644 --- a/pkg/lore/ingest/heuristic/rules.go +++ b/pkg/lore/ingest/heuristic/rules.go @@ -37,7 +37,7 @@ type Rule struct { // - *.md in project root reference (catch-wide-root) func DefaultRules() []Rule { return []Rule{ - // ADRs — architectural decision records. + // ADRs (architectural decision records). {PathGlob: "docs/adr/*.md", Kind: lore.KindDecision, Tags: []string{"adr"}}, {PathGlob: "docs/adr/*.markdown", Kind: lore.KindDecision, Tags: []string{"adr"}}, {PathGlob: "docs/decisions/*.md", Kind: lore.KindDecision, Tags: []string{"adr"}}, @@ -71,7 +71,7 @@ func DefaultRules() []Rule { {PathGlob: "CONTRIBUTING.md", Kind: lore.KindProcedure, Tags: []string{"contributing"}}, {PathGlob: "CONTRIBUTING.markdown", Kind: lore.KindProcedure, Tags: []string{"contributing"}}, - // Changelog — records of what changed. + // Changelog: records of what changed. {PathGlob: "CHANGELOG.md", Kind: lore.KindObservation, Tags: []string{"changelog"}}, {PathGlob: "CHANGELOG.markdown", Kind: lore.KindObservation, Tags: []string{"changelog"}}, } diff --git a/pkg/lore/retrieve/example_test.go b/pkg/lore/retrieve/example_test.go new file mode 100644 index 0000000..5ad51a7 --- /dev/null +++ b/pkg/lore/retrieve/example_test.go @@ -0,0 +1,78 @@ +package retrieve_test + +import ( + "context" + "database/sql" + "fmt" + + _ "modernc.org/sqlite" + + "github.com/mathomhaus/lore/pkg/lore" + "github.com/mathomhaus/lore/pkg/lore/retrieve/bm25" + "github.com/mathomhaus/lore/pkg/lore/retrieve/rrf" + "github.com/mathomhaus/lore/pkg/lore/store/sqlite" +) + +// Example_bm25Search demonstrates lexical-only search using the BM25 ranker. +// This path requires no embedder and works on every platform. 
+func Example_bm25Search() { + ctx := context.Background() + db, err := sql.Open("sqlite", ":memory:?_pragma=foreign_keys(ON)") + if err != nil { + fmt.Println("open db:", err) + return + } + defer db.Close() + + st, err := sqlite.New(db) + if err != nil { + fmt.Println("sqlite.New:", err) + return + } + defer st.Close(ctx) + + entries := []lore.Entry{ + {Kind: lore.KindProcedure, Title: "Rollout runbook", Body: "Step-by-step deployment procedure for production services."}, + {Kind: lore.KindDecision, Title: "Deployment strategy decision", Body: "Rationale for choosing canary deployment over blue-green."}, + {Kind: lore.KindReference, Title: "Kubernetes resource limits", Body: "CPU and memory limit recommendations per service tier."}, + } + for _, e := range entries { + if _, err := st.Inscribe(ctx, e); err != nil { + fmt.Println("inscribe:", err) + return + } + } + + r := bm25.New(st) + hits, err := r.Search(ctx, "deployment", lore.SearchOpts{Limit: 5}) + if err != nil { + fmt.Println("search:", err) + return + } + + fmt.Printf("found %d hit(s) for 'deployment'\n", len(hits)) + // Output: + // found 2 hit(s) for 'deployment' +} + +// ExampleFuse demonstrates the rrf.Fuse function directly. Two separate rankers +// produce independent ranked lists; Fuse combines them into a single order. +// Documents appearing in both lists accumulate higher scores than those +// appearing in only one. +func ExampleFuse() { + // Ranker A (BM25): top result is entry 100, then 200. + rankA := []int64{100, 200} + // Ranker B (vector): top result is entry 200, then 300. + rankB := []int64{200, 300} + + fused := rrf.Fuse([][]int64{rankA, rankB}, rrf.DefaultK) + + // Entry 200 appears in both lists so it accumulates the highest fused score. 
+ for _, s := range fused { + fmt.Printf("id=%d\n", s.ID) + } + // Output: + // id=200 + // id=100 + // id=300 +} diff --git a/pkg/lore/retrieve/hybrid/hybrid.go b/pkg/lore/retrieve/hybrid/hybrid.go index fa08a96..06082c9 100644 --- a/pkg/lore/retrieve/hybrid/hybrid.go +++ b/pkg/lore/retrieve/hybrid/hybrid.go @@ -196,16 +196,51 @@ func (r *Retriever) Search(ctx context.Context, query string, opts lore.SearchOp } return nil, fmt.Errorf("hybrid: hydrate entry %d: %w", scored.ID, err) } - // Post-filter by project when specified. + // Post-filter by Project, Kinds, and Tags. The vector arm does not + // honor these filters natively (its SearchOpts hints are advisory), + // so the Retriever applies them after hydration to satisfy the + // caller's contract. The BM25 arm's results may already be filtered; + // re-applying here is a no-op for those. if opts.Project != "" && entry.Project != opts.Project { continue } + if len(opts.Kinds) > 0 && !containsKind(opts.Kinds, entry.Kind) { + continue + } + if len(opts.Tags) > 0 && !containsAllTags(entry.Tags, opts.Tags) { + continue + } results = append(results, lore.SearchHit{Entry: entry, Score: scored.Score}) } return results, nil } +// containsKind reports whether want contains kind. +func containsKind(want []lore.Kind, kind lore.Kind) bool { + for _, k := range want { + if k == kind { + return true + } + } + return false +} + +// containsAllTags reports whether entryTags contains every tag in required. +// Membership is intersection: required={"a","b"} demands the entry have BOTH. +func containsAllTags(entryTags, required []string) bool { + have := make(map[string]struct{}, len(entryTags)) + for _, t := range entryTags { + have[t] = struct{}{} + } + for _, r := range required { + if _, ok := have[r]; !ok { + return false + } + } + return true +} + // runBM25 executes the BM25 arm and returns a slice of entry IDs in rank order. 
func (r *Retriever) runBM25(ctx context.Context, query string, opts lore.SearchOpts) ([]int64, error) { ctx, span := r.tracer.Start(ctx, "lore.retrieve.bm25") diff --git a/pkg/lore/retrieve/hybrid/hybrid_test.go b/pkg/lore/retrieve/hybrid/hybrid_test.go index 1ad7cba..04ff022 100644 --- a/pkg/lore/retrieve/hybrid/hybrid_test.go +++ b/pkg/lore/retrieve/hybrid/hybrid_test.go @@ -258,7 +258,7 @@ func TestHybrid_BothBeats(t *testing.T) { } // Both found: this is the "hybrid beats single-mode" condition. if !foundMigration || !foundDeployment { - t.Logf("partial coverage (migration=%v, deployment=%v) — acceptable with small corpus", foundMigration, foundDeployment) + t.Logf("partial coverage (migration=%v, deployment=%v); acceptable with small corpus", foundMigration, foundDeployment) } } diff --git a/pkg/lore/store/example_test.go b/pkg/lore/store/example_test.go new file mode 100644 index 0000000..743b3f6 --- /dev/null +++ b/pkg/lore/store/example_test.go @@ -0,0 +1,159 @@ +package store_test + +import ( + "context" + "database/sql" + "fmt" + + _ "modernc.org/sqlite" + + "github.com/mathomhaus/lore/pkg/lore" + "github.com/mathomhaus/lore/pkg/lore/store/sqlite" +) + +// openMemoryDB opens an in-memory SQLite database suitable for examples. +// The caller must close both the Store and the *sql.DB when done. +func openMemoryDB() (*sql.DB, error) { + return sql.Open("sqlite", ":memory:?_pragma=foreign_keys(ON)") +} + +// ExampleNew_inscribeAndGet demonstrates the primary Store write-then-read +// cycle: inscribe a decision entry and retrieve it by ID. 
+func ExampleNew_inscribeAndGet() { + ctx := context.Background() + db, err := openMemoryDB() + if err != nil { + fmt.Println("open db:", err) + return + } + defer db.Close() + + st, err := sqlite.New(db) + if err != nil { + fmt.Println("sqlite.New:", err) + return + } + defer st.Close(ctx) + + id, err := st.Inscribe(ctx, lore.Entry{ + Project: "decisionLog", + Kind: lore.KindDecision, + Title: "Adopt WAL mode for SQLite", + Body: "WAL mode allows concurrent readers while a single writer commits.", + Tags: []string{"adr", "storage"}, + }) + if err != nil { + fmt.Println("inscribe:", err) + return + } + + entry, err := st.Get(ctx, id) + if err != nil { + fmt.Println("get:", err) + return + } + + fmt.Printf("kind=%s title=%q tags=%v\n", entry.Kind, entry.Title, entry.Tags) + // Output: + // kind=decision title="Adopt WAL mode for SQLite" tags=[adr storage] +} + +// ExampleNew_searchText demonstrates full-text search over inscribed entries. +func ExampleNew_searchText() { + ctx := context.Background() + db, err := openMemoryDB() + if err != nil { + fmt.Println("open db:", err) + return + } + defer db.Close() + + st, err := sqlite.New(db) + if err != nil { + fmt.Println("sqlite.New:", err) + return + } + defer st.Close(ctx) + + entries := []lore.Entry{ + { + Project: "runbookCorpus", + Kind: lore.KindProcedure, + Title: "Database failover runbook", + Body: "Steps to promote a replica when the primary is unavailable.", + Tags: []string{"runbook", "database"}, + }, + { + Project: "runbookCorpus", + Kind: lore.KindProcedure, + Title: "Cache flush runbook", + Body: "Steps to safely flush and reload the Redis cache layer.", + Tags: []string{"runbook", "cache"}, + }, + } + for _, e := range entries { + if _, err := st.Inscribe(ctx, e); err != nil { + fmt.Println("inscribe:", err) + return + } + } + + hits, err := st.SearchText(ctx, "runbook", lore.SearchOpts{Limit: 5}) + if err != nil { + fmt.Println("search:", err) + return + } + + fmt.Printf("found %d hit(s)\n", len(hits)) 
+ // Output: + // found 2 hit(s) +} + +// ExampleNew_addEdge demonstrates persisting a typed edge between two entries +// and then listing it back. +func ExampleNew_addEdge() { + ctx := context.Background() + db, err := openMemoryDB() + if err != nil { + fmt.Println("open db:", err) + return + } + defer db.Close() + + st, err := sqlite.New(db) + if err != nil { + fmt.Println("sqlite.New:", err) + return + } + defer st.Close(ctx) + + fromID, _ := st.Inscribe(ctx, lore.Entry{ + Kind: lore.KindDecision, + Title: "Use mTLS for service-to-service auth", + Body: "Mutual TLS prevents lateral movement inside the cluster.", + }) + toID, _ := st.Inscribe(ctx, lore.Entry{ + Kind: lore.KindProcedure, + Title: "Rotate mTLS certificates", + Body: "Steps to generate and distribute new service certificates.", + }) + + if err := st.AddEdge(ctx, lore.Edge{ + FromID: fromID, + ToID: toID, + Relation: "informs", + }); err != nil { + fmt.Println("add edge:", err) + return + } + + edges, err := st.ListEdges(ctx, fromID) + if err != nil { + fmt.Println("list edges:", err) + return + } + + fmt.Printf("edges from %d: count=%d relation=%s\n", fromID, len(edges), edges[0].Relation) + // Output: + // edges from 1: count=1 relation=informs +} diff --git a/pkg/lore/store/sqlite/migrations/001_initial.up.sql b/pkg/lore/store/sqlite/migrations/001_initial.up.sql index b62685b..490f1a5 100644 --- a/pkg/lore/store/sqlite/migrations/001_initial.up.sql +++ b/pkg/lore/store/sqlite/migrations/001_initial.up.sql @@ -38,8 +38,8 @@ CREATE TABLE IF NOT EXISTS edges ( weight REAL NOT NULL DEFAULT 0, created_at TEXT NOT NULL DEFAULT (datetime('now')), PRIMARY KEY (from_id, to_id, relation), - FOREIGN KEY (from_id) REFERENCES entries(id), - FOREIGN KEY (to_id) REFERENCES entries(id) + FOREIGN KEY (from_id) REFERENCES entries(id) ON DELETE CASCADE, + FOREIGN KEY (to_id) REFERENCES entries(id) ON DELETE CASCADE ); -- --------------------------------------------------------------------------- diff --git 
a/pkg/lore/vector/example_test.go b/pkg/lore/vector/example_test.go
new file mode 100644
index 0000000..685c73c
--- /dev/null
+++ b/pkg/lore/vector/example_test.go
@@ -0,0 +1,102 @@
+package vector_test
+
+import (
+	"context"
+	"database/sql"
+	"fmt"
+
+	_ "modernc.org/sqlite"
+
+	"github.com/mathomhaus/lore/pkg/lore/vector"
+	"github.com/mathomhaus/lore/pkg/lore/vector/sqlitevec"
+)
+
+// ExampleNew_upsertAndSearch demonstrates the core VectorStore cycle:
+// store two vectors and then query for the nearest neighbor.
+func ExampleNew_upsertAndSearch() {
+	ctx := context.Background()
+	db, err := sql.Open("sqlite", ":memory:")
+	if err != nil {
+		fmt.Println("open db:", err)
+		return
+	}
+	defer db.Close()
+
+	const dim = 4
+	vs, err := sqlitevec.New(db, dim)
+	if err != nil {
+		fmt.Println("sqlitevec.New:", err)
+		return
+	}
+	defer vs.Close(ctx)
+
+	// Two unit vectors pointing in different directions.
+	vecA := []float32{1, 0, 0, 0}
+	vecB := []float32{0, 1, 0, 0}
+
+	if err := vs.Upsert(ctx, 1, vecA); err != nil {
+		fmt.Println("upsert 1:", err)
+		return
+	}
+	if err := vs.Upsert(ctx, 2, vecB); err != nil {
+		fmt.Println("upsert 2:", err)
+		return
+	}
+
+	// Query with vecA: entry 1 should be the top hit (cosine sim = 1.0).
+	hits, err := vs.Search(ctx, vecA, vector.SearchOpts{Limit: 2})
+	if err != nil {
+		fmt.Println("search:", err)
+		return
+	}
+	// Guard the index below: an empty result should fail the example with a
+	// readable message instead of an index-out-of-range panic.
+	if len(hits) == 0 {
+		fmt.Println("search: no hits")
+		return
+	}
+
+	fmt.Printf("top hit id=%d\n", hits[0].ID)
+	// Output:
+	// top hit id=1
+}
+
+// ExampleNew_delete shows that deleting a vector removes it from
+// future search results.
+func ExampleNew_delete() {
+	ctx := context.Background()
+	db, err := sql.Open("sqlite", ":memory:")
+	if err != nil {
+		fmt.Println("open db:", err)
+		return
+	}
+	defer db.Close()
+
+	vs, err := sqlitevec.New(db, 2)
+	if err != nil {
+		fmt.Println("sqlitevec.New:", err)
+		return
+	}
+	defer vs.Close(ctx)
+
+	if err := vs.Upsert(ctx, 10, []float32{1, 0}); err != nil {
+		fmt.Println("upsert:", err)
+		return
+	}
+
+	// Delete the vector, then verify it is gone.
+	if err := vs.Delete(ctx, 10); err != nil {
+		fmt.Println("delete:", err)
+		return
+	}
+
+	// A failed search must not be mistaken for "zero hits", so report it.
+	hits, err := vs.Search(ctx, []float32{1, 0}, vector.SearchOpts{Limit: 5})
+	if err != nil {
+		fmt.Println("search:", err)
+		return
+	}
+	fmt.Printf("hits after delete: %d\n", len(hits))
+	// Output:
+	// hits after delete: 0
+}