
Commit 00bc109

gemini file search testing
1 parent d6121ad commit 00bc109

3 files changed: +210 −19 lines

examples/03_phone_and_rag_example/inbound_phone_and_rag_example.py

Lines changed: 4 additions & 0 deletions
@@ -23,6 +23,7 @@
 from pathlib import Path
 
 import uvicorn
+from uvicorn.middleware.proxy_headers import ProxyHeadersMiddleware
 from dotenv import load_dotenv
 from fastapi import Depends, FastAPI, Request, WebSocket
 from fastapi.responses import JSONResponse
@@ -47,6 +48,8 @@
 rag = None  # For TurboPuffer
 
 app = FastAPI()
+# Trust proxy headers from ngrok so Twilio signature validation works (https vs http)
+app.add_middleware(ProxyHeadersMiddleware, trusted_hosts=["*"])
 call_registry = twilio.TwilioCallRegistry()
 
 
@@ -150,6 +153,7 @@ async def create_agent() -> Agent:
         )
         async def search_knowledge(query: str) -> str:
             return await rag.search(query, top_k=3)
+
     else:
         llm = gemini.LLM("gemini-2.5-flash-lite", file_search_store=file_search_store)
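
Why the middleware addition matters: Twilio signs each webhook against the public https URL, but behind an ngrok tunnel uvicorn terminates plain http, so signature validation fails unless the X-Forwarded-Proto header is honored. A minimal sketch of the effect follows; the /echo endpoint is hypothetical and not part of this commit.

from fastapi import FastAPI, Request
from uvicorn.middleware.proxy_headers import ProxyHeadersMiddleware

app = FastAPI()
# Rewrites request.url.scheme (and the client address) from X-Forwarded-* headers
app.add_middleware(ProxyHeadersMiddleware, trusted_hosts=["*"])

@app.get("/echo")
async def echo(request: Request) -> dict:
    # With the middleware in place, a request forwarded with
    # X-Forwarded-Proto: https reports "https" here, matching the URL
    # Twilio actually signed.
    return {"scheme": request.url.scheme, "url": str(request.url)}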
Lines changed: 131 additions & 0 deletions (new file)
@@ -0,0 +1,131 @@
+"""Tests for GeminiFilesearchRAG."""
+
+import logging
+import uuid
+
+import pytest
+from dotenv import load_dotenv
+
+from vision_agents.core.rag import Document
+from vision_agents.plugins.gemini import GeminiFilesearchRAG
+
+load_dotenv()
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+
+# Skip blockbuster for all tests in this module (they make real API calls)
+pytestmark = [pytest.mark.integration, pytest.mark.skip_blockbuster]
+
+
+@pytest.fixture
+async def rag():
+    """Create a RAG instance for testing, clean up after."""
+    # Use a unique name to avoid conflicts
+    rag = GeminiFilesearchRAG(name=f"test-rag-{uuid.uuid4().hex[:8]}")
+    await rag.create()
+    yield rag
+    await rag.clear()
+
+async def test_basic_upload_and_search(rag: GeminiFilesearchRAG):
+    """Upload a document with a unique ID and verify it can be found."""
+    # Create a unique identifier to verify we find the right document
+    unique_id = f"TEST-{uuid.uuid4()}"
+
+    doc = Document(
+        text=f"This is a test document with unique identifier: {unique_id}. "
+        "It contains information about quantum computing and AI.",
+        source="test_doc.txt",
+    )
+
+    # Upload document
+    count = await rag.add_documents([doc])
+    assert count == 1
+    assert len(rag._uploaded_files) == 1
+
+    # Search for the unique ID
+    result = await rag.search(f"Find the document with identifier {unique_id}")
+    logger.info(f"Search result: {result}")
+
+    # The unique ID should appear in the search result
+    assert unique_id in result or "quantum" in result.lower() or "ai" in result.lower()
+
+
+async def test_deduplication_same_document(rag: GeminiFilesearchRAG):
+    """Verify that uploading the same document twice doesn't create duplicates."""
+    unique_id = f"DEDUP-{uuid.uuid4()}"
+
+    doc = Document(
+        text=f"Unique content for deduplication test: {unique_id}",
+        source="dedup_test.txt",
+    )
+
+    # Upload the same document twice
+    first_count = await rag.add_documents([doc])
+    assert first_count == 1
+    first_hash_count = len(rag.uploaded_hashes)
+
+    # Upload the exact same document again
+    second_count = await rag.add_documents([doc])
+    assert second_count == 0  # Should be skipped as duplicate
+
+    # Hash count should remain the same
+    assert len(rag.uploaded_hashes) == first_hash_count
+
+    # Uploaded files list should only have one entry
+    assert len(rag._uploaded_files) == 1
+
+
+async def test_deduplication_different_source_same_content(rag: GeminiFilesearchRAG):
+    """Verify that same content with different source names is deduplicated."""
+    content = f"Same content for both documents: {uuid.uuid4()}"
+
+    doc1 = Document(text=content, source="source1.txt")
+    doc2 = Document(text=content, source="source2.txt")
+
+    # Upload first document
+    count1 = await rag.add_documents([doc1])
+    assert count1 == 1
+
+    # Upload second document with same content but different source
+    count2 = await rag.add_documents([doc2])
+    assert count2 == 0  # Should be skipped - same content hash
+
+    # Only one hash should be stored
+    assert len(rag.uploaded_hashes) == 1
+
+
+async def test_different_content_not_deduplicated(rag: GeminiFilesearchRAG):
+    """Verify that different content is not incorrectly deduplicated."""
+    doc1 = Document(
+        text=f"First unique document: {uuid.uuid4()}",
+        source="doc1.txt",
+    )
+    doc2 = Document(
+        text=f"Second unique document: {uuid.uuid4()}",
+        source="doc2.txt",
+    )
+
+    # Upload both documents
+    count = await rag.add_documents([doc1, doc2])
+    assert count == 2
+
+    # Both hashes should be stored
+    assert len(rag.uploaded_hashes) == 2
+    assert len(rag._uploaded_files) == 2
+
+
+async def test_batch_upload_with_duplicates(rag: GeminiFilesearchRAG):
+    """Test batch upload correctly handles mixed unique and duplicate documents."""
+    shared_content = f"Shared content: {uuid.uuid4()}"
+
+    docs = [
+        Document(text=f"Unique doc 1: {uuid.uuid4()}", source="unique1.txt"),
+        Document(text=shared_content, source="shared1.txt"),
+        Document(text=f"Unique doc 2: {uuid.uuid4()}", source="unique2.txt"),
+        Document(text=shared_content, source="shared2.txt"),  # Duplicate content
+    ]
+
+    count = await rag.add_documents(docs)
+    assert count == 3  # 2 unique + 1 shared (second shared is duplicate)
+    assert len(rag.uploaded_hashes) == 3
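
A minimal end-to-end sketch of what the tests above exercise, using only the API visible in this commit (create, add_documents, search, clear); the store name is illustrative and a Gemini API key (e.g. GOOGLE_API_KEY) is assumed to be set in the environment.

import asyncio

from vision_agents.core.rag import Document
from vision_agents.plugins.gemini import GeminiFilesearchRAG


async def main() -> None:
    rag = GeminiFilesearchRAG(name="demo-store")  # hypothetical store name
    await rag.create()

    added = await rag.add_documents([
        Document(text="Qubits power quantum computers.", source="a.txt"),
        Document(text="Qubits power quantum computers.", source="b.txt"),  # same content
    ])
    assert added == 1  # the second document is skipped by the content-hash dedup

    answer = await rag.search("What powers quantum computers?")
    print(answer)

    await rag.clear()  # force-deletes the store and its documents


asyncio.run(main())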

plugins/gemini/vision_agents/plugins/gemini/file_search.py

Lines changed: 75 additions & 19 deletions
@@ -7,23 +7,27 @@
 """
 
 import asyncio
+import hashlib
 import logging
 import tempfile
 from pathlib import Path
 
-from google.genai import Client
+from google.genai import Client, types
 from google.genai.types import (
     CreateFileSearchStoreConfig,
-    FileSearch,
     GenerateContentConfig,
-    Tool,
 )
 
 from vision_agents.core.rag import RAG, Document
 
 logger = logging.getLogger(__name__)
 
 
+def _compute_hash(content: str) -> str:
+    """Compute SHA-256 hash of content."""
+    return hashlib.sha256(content.encode()).hexdigest()
+
+
 class GeminiFilesearchRAG(RAG):
     """
     RAG implementation using Gemini's File Search.
@@ -48,7 +52,7 @@ def __init__(
         name: str,
         client: Client | None = None,
         api_key: str | None = None,
-        model: str = "gemini-2.0-flash",
+        model: str = "gemini-2.5-flash",
     ):
         """
         Initialize a GeminiFilesearchRAG.
@@ -62,6 +66,7 @@ def __init__(
         self.name = name
         self._store_name: str | None = None
         self._uploaded_files: list[str] = []
+        self._uploaded_hashes: set[str] = set()
         self._model = model
 
         if client is not None:
@@ -79,6 +84,11 @@ def is_created(self) -> bool:
         """Check if the store has been created."""
         return self._store_name is not None
 
+    @property
+    def uploaded_hashes(self) -> set[str]:
+        """Set of content hashes for uploaded documents."""
+        return self._uploaded_hashes
+
     async def create(self) -> str:
         """
         Create the file search store. Must be called before adding documents.
@@ -105,9 +115,22 @@ async def create(self) -> str:
         return self._store_name
 
     async def _upload_file(
-        self, file_path: str | Path, display_name: str | None = None
-    ) -> None:
-        """Upload a single file to the file search store."""
+        self,
+        file_path: str | Path,
+        display_name: str | None = None,
+        content_hash: str | None = None,
+    ) -> bool:
+        """
+        Upload a single file to the file search store.
+
+        Args:
+            file_path: Path to the file to upload.
+            display_name: Optional display name (defaults to filename).
+            content_hash: Optional hash of file content for deduplication.
+
+        Returns:
+            True if file was uploaded, False if skipped (duplicate).
+        """
         if not self._store_name:
             raise ValueError("Store not created. Call create() first.")
 
@@ -117,6 +140,15 @@ async def _upload_file(
         if not file_path.exists():
             raise FileNotFoundError(f"File not found: {file_path}")
 
+        # Compute hash if not provided
+        if content_hash is None:
+            content_hash = _compute_hash(file_path.read_text())
+
+        # Skip if already uploaded
+        if content_hash in self._uploaded_hashes:
+            logger.info(f"Skipping duplicate: {display_name or file_path.name}")
+            return False
+
         display_name = display_name or file_path.name
 
         loop = asyncio.get_event_loop()
@@ -135,45 +167,56 @@ async def _upload_file(
         while not operation.done:
             await asyncio.sleep(1)
             operation = await loop.run_in_executor(
-                None, lambda: self._client.operations.get(operation)
+                None, lambda op=operation: self._client.operations.get(op)
             )
 
         self._uploaded_files.append(display_name)
+        self._uploaded_hashes.add(content_hash)
         logger.info(f"Uploaded and indexed: {display_name}")
+        return True
 
     async def add_documents(self, documents: list[Document]) -> int:
         """
         Add documents to the RAG index.
 
         Documents are written to temporary files and uploaded to Gemini's
-        File Search store.
+        File Search store. Duplicate documents (same content hash) are skipped.
 
         Args:
             documents: List of documents to index.
 
         Returns:
-            Number of documents indexed.
+            Number of documents indexed (excluding duplicates).
         """
         if not self._store_name:
             raise ValueError("Store not created. Call create() first.")
 
         if not documents:
             return 0
 
+        uploaded_count = 0
+
         # Write documents to temp files and upload
        with tempfile.TemporaryDirectory() as tmpdir:
            tmppath = Path(tmpdir)
            for doc in documents:
+                # Compute hash for deduplication
+                content_hash = _compute_hash(doc.text)
+
                # Use source as filename, default to .txt extension
                filename = doc.source
                if not Path(filename).suffix:
                    filename = f"{filename}.txt"
                filepath = tmppath / filename
                filepath.write_text(doc.text)
-                await self._upload_file(filepath, display_name=doc.source)
 
-        logger.info(f"Indexed {len(documents)} documents")
-        return len(documents)
+                if await self._upload_file(
+                    filepath, display_name=doc.source, content_hash=content_hash
+                ):
+                    uploaded_count += 1
+
+        logger.info(f"Indexed {uploaded_count} documents ({len(documents) - uploaded_count} duplicates skipped)")
+        return uploaded_count
 
     async def add_directory(
         self,
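
The lambda change in the polling loop above (lambda op=operation: ... instead of lambda: ...) guards against Python's late-binding closures: a lambda captures the variable operation, not its value, so a rebinding before the executor runs the callable would be visible inside it. Binding the current value as a default argument freezes it at definition time. A quick self-contained illustration:

# Late binding: every closure reads the loop variable after the loop ends
fns = [lambda: i for i in range(3)]
print([f() for f in fns])  # [2, 2, 2]

# Default-argument binding captures the value at definition time
fns = [lambda i=i: i for i in range(3)]
print([f() for f in fns])  # [0, 1, 2]
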
@@ -224,11 +267,14 @@ async def add_directory(
         )
 
         # Upload files in batches concurrently
+        uploaded_count = 0
         for i in range(0, len(files), batch_size):
             batch = files[i : i + batch_size]
-            await asyncio.gather(*[self._upload_file(f) for f in batch])
+            results = await asyncio.gather(*[self._upload_file(f) for f in batch])
+            uploaded_count += sum(results)
 
-        return len(files)
+        logger.info(f"Indexed {uploaded_count} files ({len(files) - uploaded_count} duplicates skipped)")
+        return uploaded_count
 
     async def search(self, query: str, top_k: int = 3) -> str:
         """
@@ -268,19 +314,25 @@ async def clear(self) -> None:
             return
 
         loop = asyncio.get_event_loop()
+        store_name = self._store_name
+
+        # Delete the store with force=True to also delete all documents
         await loop.run_in_executor(
             None,
-            lambda: self._client.file_search_stores.delete(name=self._store_name),
+            lambda: self._client.file_search_stores.delete(
+                name=store_name, config={"force": True}
+            ),
         )
-        logger.info(f"Deleted GeminiFilesearchRAG: {self._store_name}")
+        logger.info(f"Deleted GeminiFilesearchRAG: {store_name}")
         self._store_name = None
         self._uploaded_files = []
+        self._uploaded_hashes = set()
 
     async def close(self) -> None:
         """Close resources. Note: does not delete the store."""
         pass
 
-    def get_tool(self) -> Tool:
+    def get_tool(self) -> types.Tool:
         """
         Get the File Search tool configuration for use with Gemini LLM.
 
@@ -290,7 +342,11 @@ def get_tool(self) -> Tool:
         if not self._store_name:
             raise ValueError("Store not created. Call create() first.")
 
-        return Tool(file_search=FileSearch(file_search_store_names=[self._store_name]))
+        return types.Tool(
+            file_search=types.FileSearch(
+                file_search_store_names=[self._store_name]
+            )
+        )
 
     def get_tool_config(self) -> dict:
         """