
Commit 00bc109

gemini file search testing
1 parent d6121ad commit 00bc109

3 files changed: +210 −19 lines

examples/03_phone_and_rag_example/inbound_phone_and_rag_example.py

Lines changed: 4 additions & 0 deletions
@@ -23,6 +23,7 @@
 from pathlib import Path
 
 import uvicorn
+from uvicorn.middleware.proxy_headers import ProxyHeadersMiddleware
 from dotenv import load_dotenv
 from fastapi import Depends, FastAPI, Request, WebSocket
 from fastapi.responses import JSONResponse
@@ -47,6 +48,8 @@
 rag = None  # For TurboPuffer
 
 app = FastAPI()
+# Trust proxy headers from ngrok so Twilio signature validation works (https vs http)
+app.add_middleware(ProxyHeadersMiddleware, trusted_hosts=["*"])
 call_registry = twilio.TwilioCallRegistry()
 
 
@@ -150,6 +153,7 @@ async def create_agent() -> Agent:
         )
         async def search_knowledge(query: str) -> str:
             return await rag.search(query, top_k=3)
+
     else:
         llm = gemini.LLM("gemini-2.5-flash-lite", file_search_store=file_search_store)
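
Why the middleware addition matters: Twilio signs each webhook against the public https URL, but behind an ngrok tunnel uvicorn terminates plain http, so signature validation fails unless the X-Forwarded-Proto header is honored. A minimal sketch of the effect follows; the /echo endpoint is hypothetical and not part of this commit.

from fastapi import FastAPI, Request
from uvicorn.middleware.proxy_headers import ProxyHeadersMiddleware

app = FastAPI()
# Rewrites request.url.scheme (and the client address) from X-Forwarded-* headers
app.add_middleware(ProxyHeadersMiddleware, trusted_hosts=["*"])

@app.get("/echo")
async def echo(request: Request) -> dict:
    # With the middleware in place, a request forwarded with
    # X-Forwarded-Proto: https reports "https" here, matching the URL
    # Twilio actually signed.
    return {"scheme": request.url.scheme, "url": str(request.url)}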
Lines changed: 131 additions & 0 deletions (new file)
@@ -0,0 +1,131 @@
+"""Tests for GeminiFilesearchRAG."""
+
+import logging
+import uuid
+
+import pytest
+from dotenv import load_dotenv
+
+from vision_agents.core.rag import Document
+from vision_agents.plugins.gemini import GeminiFilesearchRAG
+
+load_dotenv()
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+
+# Skip blockbuster for all tests in this module (they make real API calls)
+pytestmark = [pytest.mark.integration, pytest.mark.skip_blockbuster]
+
+
+@pytest.fixture
+async def rag():
+    """Create a RAG instance for testing, clean up after."""
+    # Use a unique name to avoid conflicts
+    rag = GeminiFilesearchRAG(name=f"test-rag-{uuid.uuid4().hex[:8]}")
+    await rag.create()
+    yield rag
+    await rag.clear()
+
+async def test_basic_upload_and_search(rag: GeminiFilesearchRAG):
+    """Upload a document with a unique ID and verify it can be found."""
+    # Create a unique identifier to verify we find the right document
+    unique_id = f"TEST-{uuid.uuid4()}"
+
+    doc = Document(
+        text=f"This is a test document with unique identifier: {unique_id}. "
+        "It contains information about quantum computing and AI.",
+        source="test_doc.txt",
+    )
+
+    # Upload document
+    count = await rag.add_documents([doc])
+    assert count == 1
+    assert len(rag._uploaded_files) == 1
+
+    # Search for the unique ID
+    result = await rag.search(f"Find the document with identifier {unique_id}")
+    logger.info(f"Search result: {result}")
+
+    # The unique ID should appear in the search result
+    assert unique_id in result or "quantum" in result.lower() or "ai" in result.lower()
+
+
+async def test_deduplication_same_document(rag: GeminiFilesearchRAG):
+    """Verify that uploading the same document twice doesn't create duplicates."""
+    unique_id = f"DEDUP-{uuid.uuid4()}"
+
+    doc = Document(
+        text=f"Unique content for deduplication test: {unique_id}",
+        source="dedup_test.txt",
+    )
+
+    # Upload the same document twice
+    first_count = await rag.add_documents([doc])
+    assert first_count == 1
+    first_hash_count = len(rag.uploaded_hashes)
+
+    # Upload the exact same document again
+    second_count = await rag.add_documents([doc])
+    assert second_count == 0  # Should be skipped as duplicate
+
+    # Hash count should remain the same
+    assert len(rag.uploaded_hashes) == first_hash_count
+
+    # Uploaded files list should only have one entry
+    assert len(rag._uploaded_files) == 1
+
+
+async def test_deduplication_different_source_same_content(rag: GeminiFilesearchRAG):
+    """Verify that same content with different source names is deduplicated."""
+    content = f"Same content for both documents: {uuid.uuid4()}"
+
+    doc1 = Document(text=content, source="source1.txt")
+    doc2 = Document(text=content, source="source2.txt")
+
+    # Upload first document
+    count1 = await rag.add_documents([doc1])
+    assert count1 == 1
+
+    # Upload second document with same content but different source
+    count2 = await rag.add_documents([doc2])
+    assert count2 == 0  # Should be skipped - same content hash
+
+    # Only one hash should be stored
+    assert len(rag.uploaded_hashes) == 1
+
+
+async def test_different_content_not_deduplicated(rag: GeminiFilesearchRAG):
+    """Verify that different content is not incorrectly deduplicated."""
+    doc1 = Document(
+        text=f"First unique document: {uuid.uuid4()}",
+        source="doc1.txt",
+    )
+    doc2 = Document(
+        text=f"Second unique document: {uuid.uuid4()}",
+        source="doc2.txt",
+    )
+
+    # Upload both documents
+    count = await rag.add_documents([doc1, doc2])
+    assert count == 2
+
+    # Both hashes should be stored
+    assert len(rag.uploaded_hashes) == 2
+    assert len(rag._uploaded_files) == 2
+
+
+async def test_batch_upload_with_duplicates(rag: GeminiFilesearchRAG):
+    """Test batch upload correctly handles mixed unique and duplicate documents."""
+    shared_content = f"Shared content: {uuid.uuid4()}"
+
+    docs = [
+        Document(text=f"Unique doc 1: {uuid.uuid4()}", source="unique1.txt"),
+        Document(text=shared_content, source="shared1.txt"),
+        Document(text=f"Unique doc 2: {uuid.uuid4()}", source="unique2.txt"),
+        Document(text=shared_content, source="shared2.txt"),  # Duplicate content
+    ]
+
+    count = await rag.add_documents(docs)
+    assert count == 3  # 2 unique + 1 shared (second shared is duplicate)
+    assert len(rag.uploaded_hashes) == 3
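
A minimal end-to-end sketch of what the tests above exercise, using only the API visible in this commit (create, add_documents, search, clear); the store name is illustrative and a Gemini API key (e.g. GOOGLE_API_KEY) is assumed to be set in the environment.

import asyncio

from vision_agents.core.rag import Document
from vision_agents.plugins.gemini import GeminiFilesearchRAG


async def main() -> None:
    rag = GeminiFilesearchRAG(name="demo-store")  # hypothetical store name
    await rag.create()

    added = await rag.add_documents([
        Document(text="Qubits power quantum computers.", source="a.txt"),
        Document(text="Qubits power quantum computers.", source="b.txt"),  # same content
    ])
    assert added == 1  # the second document is skipped by the content-hash dedup

    answer = await rag.search("What powers quantum computers?")
    print(answer)

    await rag.clear()  # force-deletes the store and its documents


asyncio.run(main())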

plugins/gemini/vision_agents/plugins/gemini/file_search.py

Lines changed: 75 additions & 19 deletions
@@ -7,23 +7,27 @@
 """
 
 import asyncio
+import hashlib
 import logging
 import tempfile
 from pathlib import Path
 
-from google.genai import Client
+from google.genai import Client, types
 from google.genai.types import (
     CreateFileSearchStoreConfig,
-    FileSearch,
     GenerateContentConfig,
-    Tool,
 )
 
 from vision_agents.core.rag import RAG, Document
 
 logger = logging.getLogger(__name__)
 
 
+def _compute_hash(content: str) -> str:
+    """Compute SHA-256 hash of content."""
+    return hashlib.sha256(content.encode()).hexdigest()
+
+
 class GeminiFilesearchRAG(RAG):
     """
     RAG implementation using Gemini's File Search.
@@ -48,7 +52,7 @@ def __init__(
         name: str,
         client: Client | None = None,
         api_key: str | None = None,
-        model: str = "gemini-2.0-flash",
+        model: str = "gemini-2.5-flash",
     ):
         """
         Initialize a GeminiFilesearchRAG.
@@ -62,6 +66,7 @@ def __init__(
         self.name = name
         self._store_name: str | None = None
         self._uploaded_files: list[str] = []
+        self._uploaded_hashes: set[str] = set()
         self._model = model
 
         if client is not None:
@@ -79,6 +84,11 @@ def is_created(self) -> bool:
         """Check if the store has been created."""
         return self._store_name is not None
 
+    @property
+    def uploaded_hashes(self) -> set[str]:
+        """Set of content hashes for uploaded documents."""
+        return self._uploaded_hashes
+
     async def create(self) -> str:
         """
         Create the file search store. Must be called before adding documents.
@@ -105,9 +115,22 @@ async def create(self) -> str:
         return self._store_name
 
     async def _upload_file(
-        self, file_path: str | Path, display_name: str | None = None
-    ) -> None:
-        """Upload a single file to the file search store."""
+        self,
+        file_path: str | Path,
+        display_name: str | None = None,
+        content_hash: str | None = None,
+    ) -> bool:
+        """
+        Upload a single file to the file search store.
+
+        Args:
+            file_path: Path to the file to upload.
+            display_name: Optional display name (defaults to filename).
+            content_hash: Optional hash of file content for deduplication.
+
+        Returns:
+            True if file was uploaded, False if skipped (duplicate).
+        """
         if not self._store_name:
             raise ValueError("Store not created. Call create() first.")
 
@@ -117,6 +140,15 @@ async def _upload_file(
         if not file_path.exists():
             raise FileNotFoundError(f"File not found: {file_path}")
 
+        # Compute hash if not provided
+        if content_hash is None:
+            content_hash = _compute_hash(file_path.read_text())
+
+        # Skip if already uploaded
+        if content_hash in self._uploaded_hashes:
+            logger.info(f"Skipping duplicate: {display_name or file_path.name}")
+            return False
+
         display_name = display_name or file_path.name
 
         loop = asyncio.get_event_loop()
@@ -135,45 +167,56 @@ async def _upload_file(
         while not operation.done:
             await asyncio.sleep(1)
             operation = await loop.run_in_executor(
-                None, lambda: self._client.operations.get(operation)
+                None, lambda op=operation: self._client.operations.get(op)
             )
 
         self._uploaded_files.append(display_name)
+        self._uploaded_hashes.add(content_hash)
         logger.info(f"Uploaded and indexed: {display_name}")
+        return True
 
     async def add_documents(self, documents: list[Document]) -> int:
         """
         Add documents to the RAG index.
 
         Documents are written to temporary files and uploaded to Gemini's
-        File Search store.
+        File Search store. Duplicate documents (same content hash) are skipped.
 
         Args:
             documents: List of documents to index.
 
         Returns:
-            Number of documents indexed.
+            Number of documents indexed (excluding duplicates).
         """
         if not self._store_name:
             raise ValueError("Store not created. Call create() first.")
 
         if not documents:
             return 0
 
+        uploaded_count = 0
+
         # Write documents to temp files and upload
        with tempfile.TemporaryDirectory() as tmpdir:
            tmppath = Path(tmpdir)
            for doc in documents:
+                # Compute hash for deduplication
+                content_hash = _compute_hash(doc.text)
+
                # Use source as filename, default to .txt extension
                filename = doc.source
                if not Path(filename).suffix:
                    filename = f"{filename}.txt"
                filepath = tmppath / filename
                filepath.write_text(doc.text)
-                await self._upload_file(filepath, display_name=doc.source)
 
-        logger.info(f"Indexed {len(documents)} documents")
-        return len(documents)
+                if await self._upload_file(
+                    filepath, display_name=doc.source, content_hash=content_hash
+                ):
+                    uploaded_count += 1
+
+        logger.info(f"Indexed {uploaded_count} documents ({len(documents) - uploaded_count} duplicates skipped)")
+        return uploaded_count
 
     async def add_directory(
         self,
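
The lambda change in the polling loop above (lambda op=operation: ... instead of lambda: ...) guards against Python's late-binding closures: a lambda captures the variable operation, not its value, so a rebinding before the executor runs the callable would be visible inside it. Binding the current value as a default argument freezes it at definition time. A quick self-contained illustration:

# Late binding: every closure reads the loop variable after the loop ends
fns = [lambda: i for i in range(3)]
print([f() for f in fns])  # [2, 2, 2]

# Default-argument binding captures the value at definition time
fns = [lambda i=i: i for i in range(3)]
print([f() for f in fns])  # [0, 1, 2]
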
@@ -224,11 +267,14 @@ async def add_directory(
         )
 
         # Upload files in batches concurrently
+        uploaded_count = 0
         for i in range(0, len(files), batch_size):
             batch = files[i : i + batch_size]
-            await asyncio.gather(*[self._upload_file(f) for f in batch])
+            results = await asyncio.gather(*[self._upload_file(f) for f in batch])
+            uploaded_count += sum(results)
 
-        return len(files)
+        logger.info(f"Indexed {uploaded_count} files ({len(files) - uploaded_count} duplicates skipped)")
+        return uploaded_count
 
     async def search(self, query: str, top_k: int = 3) -> str:
         """
@@ -268,19 +314,25 @@ async def clear(self) -> None:
             return
 
         loop = asyncio.get_event_loop()
+        store_name = self._store_name
+
+        # Delete the store with force=True to also delete all documents
         await loop.run_in_executor(
             None,
-            lambda: self._client.file_search_stores.delete(name=self._store_name),
+            lambda: self._client.file_search_stores.delete(
+                name=store_name, config={"force": True}
+            ),
         )
-        logger.info(f"Deleted GeminiFilesearchRAG: {self._store_name}")
+        logger.info(f"Deleted GeminiFilesearchRAG: {store_name}")
         self._store_name = None
         self._uploaded_files = []
+        self._uploaded_hashes = set()
 
     async def close(self) -> None:
         """Close resources. Note: does not delete the store."""
         pass
 
-    def get_tool(self) -> Tool:
+    def get_tool(self) -> types.Tool:
         """
         Get the File Search tool configuration for use with Gemini LLM.
 
@@ -290,7 +342,11 @@ def get_tool(self) -> Tool:
         if not self._store_name:
             raise ValueError("Store not created. Call create() first.")
 
-        return Tool(file_search=FileSearch(file_search_store_names=[self._store_name]))
+        return types.Tool(
+            file_search=types.FileSearch(
+                file_search_store_names=[self._store_name]
+            )
+        )
 
     def get_tool_config(self) -> dict:
         """