Skip to content

Commit 5348bd4

Browse files
committed
turbopuffer testing
1 parent 00bc109 commit 5348bd4

File tree

4 files changed

+133
-57
lines changed

4 files changed

+133
-57
lines changed

plugins/turbopuffer/tests/test_rag.py

Lines changed: 0 additions & 38 deletions
This file was deleted.
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
"""Tests for TurboPufferRAG."""
2+
3+
import uuid
4+
5+
import pytest
6+
from dotenv import load_dotenv
7+
8+
from vision_agents.core.rag import Document
9+
from vision_agents.plugins.turbopuffer import TurboPufferRAG
10+
11+
load_dotenv()
12+
13+
# Skip blockbuster for all tests in this module (they make real API calls)
14+
pytestmark = [pytest.mark.integration, pytest.mark.skip_blockbuster]
15+
16+
17+
@pytest.fixture
async def rag():
    """Yield a TurboPufferRAG bound to a unique, throwaway namespace.

    The namespace name embeds a random hex suffix so parallel test runs
    never collide.  Teardown clears the namespace and closes the client;
    ``close()`` is wrapped in ``finally`` so the client connection is
    released even when ``clear()`` raises (the original ran the two calls
    unprotected, leaking the client on a failed clear).
    """
    namespace = f"test-rag-{uuid.uuid4().hex[:8]}"
    rag = TurboPufferRAG(namespace=namespace)
    try:
        yield rag
    finally:
        try:
            await rag.clear()
        finally:
            await rag.close()
25+
26+
27+
@pytest.fixture
def unique_doc():
    """Create a document with unique content."""
    marker = uuid.uuid4()
    document = Document(
        text=f"Test document {marker}. Contains quantum computing and AI info.",
        source="test_doc.txt",
    )
    return document, str(marker)
35+
36+
37+
async def test_basic_upload_and_search(rag: TurboPufferRAG, unique_doc):
    """Upload a document and verify it can be found."""
    document, marker = unique_doc
    uploaded = await rag.add_documents([document])

    # At least one chunk must have been indexed, from exactly one file.
    assert uploaded >= 1
    assert len(rag.indexed_files) == 1

    answer = await rag.search(f"document {marker}")
    assert marker in answer
47+
48+
49+
async def test_vector_search_mode(rag: TurboPufferRAG):
    """Test vector-only search finds semantically similar content."""
    await rag.add_documents(
        [Document(text="Neural networks for pattern recognition.", source="ml.txt")]
    )

    # Query shares no exact keywords with the text, so a hit proves
    # semantic (embedding) matching rather than keyword overlap.
    answer = await rag.search("deep learning patterns", mode="vector")
    lowered = answer.lower()
    assert "neural" in lowered or "pattern" in lowered
56+
57+
58+
async def test_bm25_search_mode(rag: TurboPufferRAG):
    """Test BM25 keyword search finds exact matches."""
    # A random SKU guarantees the match can only come from this document.
    sku = f"SKU-{uuid.uuid4().hex[:8].upper()}"
    product = Document(
        text=f"Product code: {sku}. High-quality widget.", source="product.txt"
    )
    await rag.add_documents([product])

    answer = await rag.search(sku, mode="bm25")
    assert sku in answer
66+
67+
68+
async def test_hybrid_search_mode(rag: TurboPufferRAG):
    """Test hybrid search combines vector and BM25."""
    document = Document(
        text="The API endpoint supports real-time data streaming.",
        source="api.txt",
    )
    await rag.add_documents([document])

    # No explicit mode: hybrid is the default search path.
    answer = await rag.search("real-time streaming API")
    lowered = answer.lower()
    assert "streaming" in lowered or "api" in lowered
75+
76+
77+
async def test_batch_upload_multiple_documents(rag: TurboPufferRAG):
    """Test uploading multiple documents in a batch."""
    batch = []
    for topic in ("cats", "dogs", "birds"):
        batch.append(
            Document(
                text=f"Document about {topic}: {uuid.uuid4()}",
                source=f"{topic}.txt",
            )
        )

    total = await rag.add_documents(batch)
    # One or more chunks per document; three distinct source files.
    assert total >= 3
    assert len(rag.indexed_files) == 3
87+
88+
89+
async def test_search_empty_namespace(rag: TurboPufferRAG):
    """Test search returns appropriate message when namespace is empty."""
    # Nothing was indexed, so the fallback message must come back.
    answer = await rag.search("anything")
    assert "No relevant information found" in answer
93+
94+
95+
async def test_clear_removes_all_documents(rag: TurboPufferRAG, unique_doc):
    """Test that clear() removes all indexed documents."""
    document, _ = unique_doc
    await rag.add_documents([document])
    assert len(rag.indexed_files) == 1

    await rag.clear()
    assert len(rag.indexed_files) == 0

    # After clearing, a search must report that nothing was found.
    answer = await rag.search("anything")
    assert "No relevant information found" in answer
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
from .rag import TurboPufferRAG, create_rag
1+
from .turbopuffer_rag import TurboPufferRAG, create_rag
22

33
__all__ = ["TurboPufferRAG", "create_rag"]

plugins/turbopuffer/vision_agents/plugins/turbopuffer/rag.py renamed to plugins/turbopuffer/vision_agents/plugins/turbopuffer/turbopuffer_rag.py

Lines changed: 27 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545

4646
from langchain_google_genai import GoogleGenerativeAIEmbeddings
4747
from langchain_text_splitters import RecursiveCharacterTextSplitter
48-
from turbopuffer import AsyncTurbopuffer
48+
from turbopuffer import AsyncTurbopuffer, NotFoundError
4949

5050
from vision_agents.core.rag import RAG, Document
5151

@@ -247,45 +247,51 @@ async def _vector_search(self, query: str, top_k: int) -> list[tuple[str, float]
247247
)
248248

249249
ns = self._client.namespace(self._namespace_name)
250-
results = await ns.query(
251-
rank_by=("vector", "ANN", query_embedding),
252-
top_k=top_k,
253-
include_attributes=["text", "source"],
254-
)
250+
try:
251+
results = await ns.query(
252+
rank_by=("vector", "ANN", query_embedding),
253+
top_k=top_k,
254+
include_attributes=["text", "source"],
255+
)
256+
except NotFoundError:
257+
return []
255258

256259
ranked = []
257260
for row in results.rows:
258261
doc_id = str(row.id)
259262
# Cache the document for later retrieval
260263
self._doc_cache[doc_id] = {
261-
"text": row.text if row.text else "",
262-
"source": row.source if row.source else "unknown",
264+
"text": row["text"] or "",
265+
"source": row["source"] or "unknown",
263266
}
264267
# Lower distance = better, so we use negative for ranking
265-
dist = row.dist if row.dist else 0
268+
dist = row["$dist"] or 0
266269
ranked.append((doc_id, -dist))
267270

268271
return ranked
269272

270273
async def _bm25_search(self, query: str, top_k: int) -> list[tuple[str, float]]:
    """Run BM25 full-text search."""
    namespace = self._client.namespace(self._namespace_name)
    try:
        response = await namespace.query(
            rank_by=("text", "BM25", query),
            top_k=top_k,
            include_attributes=["text", "source"],
        )
    except NotFoundError:
        # A namespace that has never been written to does not exist yet.
        return []

    scored: list[tuple[str, float]] = []
    for record in response.rows:
        record_id = str(record.id)
        # Keep the document body cached for later retrieval.
        self._doc_cache[record_id] = {
            "text": record["text"] or "",
            "source": record["source"] or "unknown",
        }
        # BM25 relevance score: larger means a better match.
        scored.append((record_id, record["$dist"] or 0))

    return scored
@@ -349,7 +355,10 @@ async def search(
349355
async def clear(self) -> None:
    """Clear all vectors from the namespace."""
    namespace = self._client.namespace(self._namespace_name)
    try:
        await namespace.delete_all()
    except NotFoundError:
        # The namespace was never created — nothing to clear.
        pass
    # Reset local bookkeeping regardless of whether remote data existed.
    self._indexed_files = []
    self._doc_cache.clear()
    logger.info(f"Cleared namespace: {self._namespace_name}")

0 commit comments

Comments
 (0)