
Commit 99617f4

fix(gemini): Add persistent hash-based deduplication for File Search
- Reuse existing stores with the same display_name instead of creating new ones
- Store content hash (SHA-256) in document custom_metadata for persistence
- Load existing hashes from API on startup to skip duplicate uploads
- Works across app restarts: same content = skipped, regardless of filename
- Update tests to use unique store names to avoid interference
1 parent: 062309a
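
The diff below calls a module-level _compute_hash helper whose definition sits outside these hunks; per the commit message it is a SHA-256 digest of the file's text. A minimal sketch of what that helper could look like, assuming it takes the already-read text and returns a hex digest (matching the _compute_hash(file_path.read_text()) call site):

import hashlib


def _compute_hash(text: str) -> str:
    # Hex SHA-256 of the file's text content. Hashing the content rather
    # than the filename is what lets deduplication skip re-uploads even
    # when the same document arrives under a new name.
    return hashlib.sha256(text.encode("utf-8")).hexdigest()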

File tree

2 files changed: +94 lines, -13 lines


plugins/gemini/tests/test_gemini_file_search.py

Lines changed: 3 additions & 2 deletions

@@ -21,8 +21,9 @@
 @pytest.fixture
 async def rag():
     """Create a RAG instance for testing, clean up after."""
-    # Use unique name to avoid conflicts
-    rag = GeminiFilesearchRAG(name="test-rag-123")
+    # Use unique name to avoid conflicts with store reuse
+    unique_name = f"test-rag-{uuid.uuid4().hex[:8]}"
+    rag = GeminiFilesearchRAG(name=unique_name)
     await rag.create()
     yield rag
     await rag.clear()
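
The fixture now uses a random suffix because stores with the same display_name are reused, so parallel test runs would otherwise share state. Not part of this commit, but the dedup behavior itself could be exercised with a hypothetical test along these lines (names assumed: pytest's built-in tmp_path fixture, the rag fixture above, and the private _upload_file coroutine from the next file, which returns False on a skip):

async def test_same_content_different_name_is_skipped(rag, tmp_path):
    """Two files with identical content should upload once, then skip."""
    doc_a = tmp_path / "a.txt"
    doc_b = tmp_path / "b.txt"  # different filename, same content
    doc_a.write_text("How the API works.")
    doc_b.write_text("How the API works.")

    assert await rag._upload_file(doc_a) is True   # uploaded and indexed
    assert await rag._upload_file(doc_b) is False  # skipped by content hash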

plugins/gemini/vision_agents/plugins/gemini/file_search.py

Lines changed: 91 additions & 11 deletions

@@ -35,10 +35,13 @@ class GeminiFilesearchRAG(RAG):
     File Search imports, chunks, and indexes your data to enable fast retrieval
     of relevant information. Search is performed by Gemini's infrastructure.
 
+    The store automatically reuses existing stores with the same name and skips
+    uploading documents that already exist (based on content hash stored in metadata).
+
     Usage:
         rag = GeminiFilesearchRAG(name="my-knowledge-base")
-        await rag.create()
-        await rag.add_directory("./knowledge")
+        await rag.create()  # Reuses existing store if found
+        await rag.add_directory("./knowledge")
 
         # Search
         results = await rag.search("How does the API work?")
@@ -67,6 +70,8 @@ def __init__(
         self._store_name: str | None = None
         self._uploaded_files: list[str] = []
         self._uploaded_hashes: set[str] = set()
+        # Content hashes of documents already in the store (loaded from the API)
+        self._existing_hashes: set[str] = set()
         self._model = model
 
         if client is not None:
@@ -91,7 +96,10 @@ def uploaded_hashes(self) -> set[str]:
 
     async def create(self) -> str:
         """
-        Create the file search store. Must be called before adding documents.
+        Create or reuse an existing file search store.
+
+        If a store with the same display_name already exists, it will be reused
+        and existing documents will be loaded for deduplication.
 
         Returns:
             The store resource name.
@@ -103,17 +111,68 @@ async def create(self) -> str:
             return self._store_name
 
         loop = asyncio.get_event_loop()
+
+        # Check if a store with this name already exists
+        existing_store = await loop.run_in_executor(
+            None, self._find_existing_store
+        )
+
+        if existing_store:
+            self._store_name = existing_store
+            await self._load_existing_documents()
+            logger.info(
+                f"Reusing existing store '{self.name}': {self._store_name} "
+                f"({len(self._existing_hashes)} documents with hashes)"
+            )
+            return self._store_name
+
+        # Create new store if none exists
         store = await loop.run_in_executor(
             None,
             lambda: self._client.file_search_stores.create(
                 config=CreateFileSearchStoreConfig(display_name=self.name)
             ),
         )
         self._store_name = store.name
-        logger.info(f"Created GeminiFilesearchRAG '{self.name}': {self._store_name}")
+        logger.info(f"Created new store '{self.name}': {self._store_name}")
         assert self._store_name is not None
         return self._store_name
 
+    def _find_existing_store(self) -> str | None:
+        """Find an existing store with the same display_name."""
+        for store in self._client.file_search_stores.list():
+            if store.display_name == self.name:
+                return store.name
+        return None
+
+    async def _load_existing_documents(self) -> None:
+        """Load existing document hashes from the store for deduplication."""
+        if not self._store_name:
+            return
+
+        loop = asyncio.get_event_loop()
+        store_name = self._store_name  # Capture for closure
+
+        def list_docs():
+            return list(
+                self._client.file_search_stores.documents.list(parent=store_name)
+            )
+
+        docs = await loop.run_in_executor(None, list_docs)
+
+        for doc in docs:
+            self._uploaded_files.append(doc.display_name)
+            # Extract content_hash from custom_metadata if present
+            if doc.custom_metadata:
+                for meta in doc.custom_metadata:
+                    if meta.key == "content_hash" and meta.string_value:
+                        self._existing_hashes.add(meta.string_value)
                        break
+
+        logger.debug(
+            f"Loaded {len(docs)} documents, {len(self._existing_hashes)} with hashes"
+        )
+
     async def _upload_file(
         self,
         file_path: str | Path,
@@ -123,6 +182,12 @@ async def _upload_file(
         """
        Upload a single file to the file search store.
 
+        Skips upload if the content hash matches a previously uploaded file
+        (checked against both in-memory session hashes and API-stored hashes).
+
+        The content hash is stored in the document's custom_metadata for
+        persistent deduplication across restarts.
+
         Args:
             file_path: Path to the file to upload.
             display_name: Optional display name (defaults to filename).
@@ -140,26 +205,39 @@ async def _upload_file(
         if not file_path.exists():
             raise FileNotFoundError(f"File not found: {file_path}")
 
+        display_name = display_name or file_path.name
+
         # Compute hash if not provided
         if content_hash is None:
             content_hash = _compute_hash(file_path.read_text())
 
-        # Skip if already uploaded
-        if content_hash in self._uploaded_hashes:
-            logger.info(f"Skipping duplicate: {display_name or file_path.name}")
+        # Check if hash already exists (from API or this session)
+        if content_hash in self._existing_hashes:
+            logger.info(f"Skipping (hash exists in store): {display_name}")
             return False
 
-        display_name = display_name or file_path.name
+        if content_hash in self._uploaded_hashes:
+            logger.info(f"Skipping (duplicate in session): {display_name}")
+            return False
 
         loop = asyncio.get_event_loop()
 
-        # Upload and wait for indexing
+        # Upload with content_hash in custom_metadata for persistent deduplication
+        # Capture variables for lambda closure
+        file_str = str(file_path)
         operation = await loop.run_in_executor(
             None,
             lambda: self._client.file_search_stores.upload_to_file_search_store(
-                file=str(file_path),
+                file=file_str,
                 file_search_store_name=store_name,
-                config={"display_name": display_name},
+                config=types.UploadToFileSearchStoreConfig(
+                    display_name=display_name,
+                    custom_metadata=[
+                        types.CustomMetadata(
+                            key="content_hash", string_value=content_hash
+                        ),
+                    ],
+                ),
             ),
         )
 
@@ -172,6 +250,7 @@ async def _upload_file(
 
         self._uploaded_files.append(display_name)
         self._uploaded_hashes.add(content_hash)
+        self._existing_hashes.add(content_hash)
         logger.info(f"Uploaded and indexed: {display_name}")
         return True
 
@@ -333,6 +412,7 @@ async def clear(self) -> None:
         self._store_name = None
         self._uploaded_files = []
         self._uploaded_hashes = set()
+        self._existing_hashes = set()
 
     async def close(self) -> None:
         """Close resources. Note: does not delete the store."""
