77"""
88
99import asyncio
10+ import hashlib
1011import logging
1112import tempfile
1213from pathlib import Path
1314
14- from google .genai import Client
15+ from google .genai import Client , types
1516from google .genai .types import (
1617 CreateFileSearchStoreConfig ,
17- FileSearch ,
1818 GenerateContentConfig ,
19- Tool ,
2019)
2120
2221from vision_agents .core .rag import RAG , Document
2322
2423logger = logging .getLogger (__name__ )
2524
2625
26+ def _compute_hash (content : str ) -> str :
27+ """Compute SHA-256 hash of content."""
28+ return hashlib .sha256 (content .encode ()).hexdigest ()
29+
30+
2731class GeminiFilesearchRAG (RAG ):
2832 """
2933 RAG implementation using Gemini's File Search.
@@ -48,7 +52,7 @@ def __init__(
4852 name : str ,
4953 client : Client | None = None ,
5054 api_key : str | None = None ,
51- model : str = "gemini-2.0 -flash" ,
55+ model : str = "gemini-2.5 -flash" ,
5256 ):
5357 """
5458 Initialize a GeminiFilesearchRAG.
@@ -62,6 +66,7 @@ def __init__(
6266 self .name = name
6367 self ._store_name : str | None = None
6468 self ._uploaded_files : list [str ] = []
69+ self ._uploaded_hashes : set [str ] = set ()
6570 self ._model = model
6671
6772 if client is not None :
@@ -79,6 +84,11 @@ def is_created(self) -> bool:
7984 """Check if the store has been created."""
8085 return self ._store_name is not None
8186
87+ @property
88+ def uploaded_hashes (self ) -> set [str ]:
89+ """Set of content hashes for uploaded documents."""
90+ return self ._uploaded_hashes
91+
8292 async def create (self ) -> str :
8393 """
8494 Create the file search store. Must be called before adding documents.
@@ -105,9 +115,22 @@ async def create(self) -> str:
105115 return self ._store_name
106116
107117 async def _upload_file (
108- self , file_path : str | Path , display_name : str | None = None
109- ) -> None :
110- """Upload a single file to the file search store."""
118+ self ,
119+ file_path : str | Path ,
120+ display_name : str | None = None ,
121+ content_hash : str | None = None ,
122+ ) -> bool :
123+ """
124+ Upload a single file to the file search store.
125+
126+ Args:
127+ file_path: Path to the file to upload.
128+ display_name: Optional display name (defaults to filename).
129+ content_hash: Optional hash of file content for deduplication.
130+
131+ Returns:
132+ True if file was uploaded, False if skipped (duplicate).
133+ """
111134 if not self ._store_name :
112135 raise ValueError ("Store not created. Call create() first." )
113136
@@ -117,6 +140,15 @@ async def _upload_file(
117140 if not file_path .exists ():
118141 raise FileNotFoundError (f"File not found: { file_path } " )
119142
143+ # Compute hash if not provided
144+ if content_hash is None :
145+ content_hash = _compute_hash (file_path .read_text ())
146+
147+ # Skip if already uploaded
148+ if content_hash in self ._uploaded_hashes :
149+ logger .info (f"Skipping duplicate: { display_name or file_path .name } " )
150+ return False
151+
120152 display_name = display_name or file_path .name
121153
122154 loop = asyncio .get_event_loop ()
@@ -135,45 +167,56 @@ async def _upload_file(
135167 while not operation .done :
136168 await asyncio .sleep (1 )
137169 operation = await loop .run_in_executor (
138- None , lambda : self ._client .operations .get (operation )
170+ None , lambda op = operation : self ._client .operations .get (op )
139171 )
140172
141173 self ._uploaded_files .append (display_name )
174+ self ._uploaded_hashes .add (content_hash )
142175 logger .info (f"Uploaded and indexed: { display_name } " )
176+ return True
143177
144178 async def add_documents (self , documents : list [Document ]) -> int :
145179 """
146180 Add documents to the RAG index.
147181
148182 Documents are written to temporary files and uploaded to Gemini's
149- File Search store.
183+ File Search store. Duplicate documents (same content hash) are skipped.
150184
151185 Args:
152186 documents: List of documents to index.
153187
154188 Returns:
155- Number of documents indexed.
189+ Number of documents indexed (excluding duplicates) .
156190 """
157191 if not self ._store_name :
158192 raise ValueError ("Store not created. Call create() first." )
159193
160194 if not documents :
161195 return 0
162196
197+ uploaded_count = 0
198+
163199 # Write documents to temp files and upload
164200 with tempfile .TemporaryDirectory () as tmpdir :
165201 tmppath = Path (tmpdir )
166202 for doc in documents :
203+ # Compute hash for deduplication
204+ content_hash = _compute_hash (doc .text )
205+
167206 # Use source as filename, default to .txt extension
168207 filename = doc .source
169208 if not Path (filename ).suffix :
170209 filename = f"{ filename } .txt"
171210 filepath = tmppath / filename
172211 filepath .write_text (doc .text )
173- await self ._upload_file (filepath , display_name = doc .source )
174212
175- logger .info (f"Indexed { len (documents )} documents" )
176- return len (documents )
213+ if await self ._upload_file (
214+ filepath , display_name = doc .source , content_hash = content_hash
215+ ):
216+ uploaded_count += 1
217+
218+ logger .info (f"Indexed { uploaded_count } documents ({ len (documents ) - uploaded_count } duplicates skipped)" )
219+ return uploaded_count
177220
178221 async def add_directory (
179222 self ,
@@ -224,11 +267,14 @@ async def add_directory(
224267 )
225268
226269 # Upload files in batches concurrently
270+ uploaded_count = 0
227271 for i in range (0 , len (files ), batch_size ):
228272 batch = files [i : i + batch_size ]
229- await asyncio .gather (* [self ._upload_file (f ) for f in batch ])
273+ results = await asyncio .gather (* [self ._upload_file (f ) for f in batch ])
274+ uploaded_count += sum (results )
230275
231- return len (files )
276+ logger .info (f"Indexed { uploaded_count } files ({ len (files ) - uploaded_count } duplicates skipped)" )
277+ return uploaded_count
232278
233279 async def search (self , query : str , top_k : int = 3 ) -> str :
234280 """
@@ -268,19 +314,25 @@ async def clear(self) -> None:
268314 return
269315
270316 loop = asyncio .get_event_loop ()
317+ store_name = self ._store_name
318+
319+ # Delete the store with force=True to also delete all documents
271320 await loop .run_in_executor (
272321 None ,
273- lambda : self ._client .file_search_stores .delete (name = self ._store_name ),
322+ lambda : self ._client .file_search_stores .delete (
323+ name = store_name , config = {"force" : True }
324+ ),
274325 )
275- logger .info (f"Deleted GeminiFilesearchRAG: { self . _store_name } " )
326+ logger .info (f"Deleted GeminiFilesearchRAG: { store_name } " )
276327 self ._store_name = None
277328 self ._uploaded_files = []
329+ self ._uploaded_hashes = set ()
278330
279331 async def close (self ) -> None :
280332 """Close resources. Note: does not delete the store."""
281333 pass
282334
283- def get_tool (self ) -> Tool :
335+ def get_tool (self ) -> types . Tool :
284336 """
285337 Get the File Search tool configuration for use with Gemini LLM.
286338
@@ -290,7 +342,11 @@ def get_tool(self) -> Tool:
290342 if not self ._store_name :
291343 raise ValueError ("Store not created. Call create() first." )
292344
293- return Tool (file_search = FileSearch (file_search_store_names = [self ._store_name ]))
345+ return types .Tool (
346+ file_search = types .FileSearch (
347+ file_search_store_names = [self ._store_name ]
348+ )
349+ )
294350
295351 def get_tool_config (self ) -> dict :
296352 """
0 commit comments