Skip to content

Commit 622aa8b

Browse files
committed
feat(rag): Add OpenAI Vector Store RAG and Local RAG providers
- Add OpenAIVectorStoreRAG provider using OpenAI's managed vector store API
  - Supports file upload, search with relevance scores, and citations
  - Automatic store creation and management
  - Integrates with OpenAI LLM via set_rag_provider()
- Add LocalRAG provider with pluggable components:
  - EmbeddingProvider interface with OpenAIEmbeddings implementation
  - VectorStore interface with InMemoryVectorStore implementation
  - Chunker interface with FixedSizeChunker and SentenceChunker
- Add comprehensive unit tests for all new components
- Export new components from core rag module and OpenAI plugin
1 parent d1a8dad commit 622aa8b

File tree

11 files changed

+1551
-2
lines changed

11 files changed

+1551
-2
lines changed

agents-core/vision_agents/core/rag/__init__.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,36 @@
77
RAGRetrievalCompleteEvent,
88
RAGRetrievalStartEvent,
99
)
10+
from .local import (
11+
Chunker,
12+
EmbeddingProvider,
13+
FixedSizeChunker,
14+
InMemoryVectorStore,
15+
LocalRAG,
16+
OpenAIEmbeddings,
17+
SentenceChunker,
18+
VectorStore,
19+
)
1020
from .types import Chunk, Document, RetrievalResult
1121

1222
__all__ = [
23+
# Base
1324
"RAGProvider",
1425
"Document",
1526
"Chunk",
1627
"RetrievalResult",
28+
# Events
1729
"RAGRetrievalStartEvent",
1830
"RAGRetrievalCompleteEvent",
1931
"RAGDocumentAddedEvent",
2032
"RAGFileAddedEvent",
33+
# Local RAG
34+
"LocalRAG",
35+
"EmbeddingProvider",
36+
"OpenAIEmbeddings",
37+
"VectorStore",
38+
"InMemoryVectorStore",
39+
"Chunker",
40+
"FixedSizeChunker",
41+
"SentenceChunker",
2142
]
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
"""Local RAG implementation with pluggable embeddings and vector stores."""
2+
3+
from .chunker import Chunker, FixedSizeChunker, SentenceChunker
4+
from .embeddings import EmbeddingProvider, OpenAIEmbeddings
5+
from .local_rag import LocalRAG
6+
from .vector_store import InMemoryVectorStore, VectorStore
7+
8+
__all__ = [
9+
"LocalRAG",
10+
"EmbeddingProvider",
11+
"OpenAIEmbeddings",
12+
"VectorStore",
13+
"InMemoryVectorStore",
14+
"Chunker",
15+
"FixedSizeChunker",
16+
"SentenceChunker",
17+
]
Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
"""Text chunking strategies for local RAG."""
2+
3+
import abc
4+
import re
5+
from dataclasses import dataclass
6+
7+
8+
@dataclass
class TextChunk:
    """One piece of a source text, as emitted by a chunker.

    Attributes:
        content: The text of the chunk.
        start: Character offset in the source text at which the chunk begins.
        end: Character offset in the source text at which the chunk ends.
        index: Position of the chunk within the chunker's output, counted
            from zero.
    """

    content: str
    start: int
    end: int
    index: int
16+
17+
18+
class Chunker(abc.ABC):
    """Interface that every text chunking strategy implements."""

    @abc.abstractmethod
    def chunk(self, text: str) -> list[TextChunk]:
        """Break ``text`` into an ordered list of chunks.

        Args:
            text: The text to chunk.

        Returns:
            The chunks, each carrying its position information.
        """
31+
32+
33+
class FixedSizeChunker(Chunker):
    """Chunk text into fixed-size pieces with optional overlap.

    Consecutive chunks share ``overlap`` characters: each chunk after the
    first starts ``overlap`` characters before the previous chunk's end.

    Example:
        ```python
        chunker = FixedSizeChunker(chunk_size=500, overlap=50)
        chunks = chunker.chunk("Long document text...")
        ```
    """

    def __init__(self, chunk_size: int = 500, overlap: int = 50):
        """Initialize the fixed-size chunker.

        Args:
            chunk_size: Maximum size of each chunk in characters. Must be
                positive.
            overlap: Number of characters to overlap between chunks. Must be
                non-negative and smaller than ``chunk_size``.

        Raises:
            ValueError: If ``chunk_size`` is not positive, ``overlap`` is
                negative, or ``overlap`` is not smaller than ``chunk_size``.
        """
        # A non-positive size would produce empty chunks or negative slice
        # bounds in chunk(); reject it up front.
        if chunk_size <= 0:
            raise ValueError("chunk_size must be greater than 0")
        # A negative overlap would make chunk() jump forward past the end of
        # the previous chunk and silently drop text.
        if overlap < 0:
            raise ValueError("overlap must not be negative")
        # Guarantees that every iteration of chunk() advances the start.
        if overlap >= chunk_size:
            raise ValueError("Overlap must be less than chunk_size")
        self._chunk_size = chunk_size
        self._overlap = overlap

    def chunk(self, text: str) -> list[TextChunk]:
        """Split text into fixed-size chunks.

        Args:
            text: The text to chunk.

        Returns:
            Chunks in source order. ``start``/``end`` are the slice bounds of
            each chunk in ``text`` and ``index`` counts from zero. An empty
            ``text`` yields an empty list.
        """
        if not text:
            return []

        text_length = len(text)
        chunks: list[TextChunk] = []
        start = 0

        while start < text_length:
            end = min(start + self._chunk_size, text_length)
            chunks.append(
                TextChunk(
                    content=text[start:end],
                    start=start,
                    end=end,
                    index=len(chunks),
                )
            )

            # Step back by the overlap for the next chunk. Once the end of
            # the text is reached, jump to it so the loop terminates instead
            # of re-emitting the tail.
            start = end - self._overlap if end < text_length else end

        return chunks
82+
83+
84+
class SentenceChunker(Chunker):
    """Chunk text by sentences, respecting a maximum chunk size.

    This chunker tries to keep sentences together while staying
    within the maximum chunk size. A single sentence longer than the
    maximum is kept whole rather than split.

    Example:
        ```python
        chunker = SentenceChunker(max_chunk_size=500)
        chunks = chunker.chunk("First sentence. Second sentence. Third sentence.")
        ```
    """

    # A sentence boundary is the whitespace run that follows '.', '!' or '?'.
    _SENTENCE_PATTERN = re.compile(r"(?<=[.!?])\s+")

    def __init__(self, max_chunk_size: int = 500, min_chunk_size: int = 100):
        """Initialize the sentence chunker.

        Args:
            max_chunk_size: Maximum size of each chunk in characters.
            min_chunk_size: Minimum size before starting a new chunk.
        """
        self._max_chunk_size = max_chunk_size
        self._min_chunk_size = min_chunk_size

    def chunk(self, text: str) -> list[TextChunk]:
        """Split text into sentence-based chunks.

        Args:
            text: The text to chunk.

        Returns:
            Chunks in source order. ``content`` is the chunk's sentences
            joined by single spaces and stripped. ``start`` is the offset in
            ``text`` of the chunk's first sentence; ``end`` is the offset at
            which the next chunk's first sentence begins, or ``len(text)``
            for the final chunk. An empty ``text`` yields an empty list.
        """
        if not text:
            return []

        chunks: list[TextChunk] = []
        current_chunk = ""
        current_start = 0

        for offset, sentence in self._split_sentences(text):
            sentence_with_space = sentence + " "

            # Flush the accumulated chunk when this sentence would push it
            # past the maximum, but only once it has reached the minimum
            # size. The emptiness check prevents emitting an empty chunk
            # when min_chunk_size is 0 and the first sentence is oversized.
            if (
                current_chunk
                and len(current_chunk) + len(sentence_with_space)
                > self._max_chunk_size
                and len(current_chunk) >= self._min_chunk_size
            ):
                chunks.append(
                    TextChunk(
                        content=current_chunk.strip(),
                        start=current_start,
                        end=offset,
                        index=len(chunks),
                    )
                )
                current_chunk = sentence_with_space
                current_start = offset
            else:
                current_chunk += sentence_with_space

        # Emit whatever is left as the final chunk.
        if current_chunk.strip():
            chunks.append(
                TextChunk(
                    content=current_chunk.strip(),
                    start=current_start,
                    end=len(text),
                    index=len(chunks),
                )
            )

        return chunks

    def _split_sentences(self, text: str) -> list[tuple[int, str]]:
        """Return ``(start offset, sentence)`` pairs for ``text``.

        Yields the same pieces as splitting on the sentence pattern, but also
        records where each piece really starts in ``text``. Separators may be
        any whitespace run (several spaces, newlines), so offsets cannot be
        reconstructed from sentence lengths alone.
        """
        sentences: list[tuple[int, str]] = []
        cursor = 0
        for boundary in self._SENTENCE_PATTERN.finditer(text):
            sentences.append((cursor, text[cursor : boundary.start()]))
            cursor = boundary.end()
        sentences.append((cursor, text[cursor:]))
        return sentences
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
"""Embedding providers for local RAG."""
2+
3+
import abc
4+
from typing import Optional
5+
6+
7+
class EmbeddingProvider(abc.ABC):
    """Interface for components that turn text into embedding vectors.

    The vectors are intended for semantic similarity search.
    """

    @property
    @abc.abstractmethod
    def dimension(self) -> int:
        """Number of components in each embedding vector."""

    @abc.abstractmethod
    async def embed(self, text: str) -> list[float]:
        """Embed one piece of text.

        Args:
            text: The text to embed.

        Returns:
            The embedding vector as a list of floats.
        """

    async def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """Embed several texts, preserving input order.

        This fallback awaits ``embed()`` once per text, one after another.
        Providers with a native batch API should override it.

        Args:
            texts: List of texts to embed.

        Returns:
            One embedding vector per input text, in the same order.
        """
        vectors: list[list[float]] = []
        for item in texts:
            vectors.append(await self.embed(item))
        return vectors
43+
44+
45+
class OpenAIEmbeddings(EmbeddingProvider):
    """Embedding provider using OpenAI's embedding models.

    Example:
        ```python
        embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
        vector = await embeddings.embed("Hello world")
        ```
    """

    # Largest input list sent in a single embeddings request. The OpenAI API
    # reference documents a cap of 2048 entries per input array; longer
    # batches are split across several requests.
    _MAX_BATCH_SIZE = 2048

    def __init__(
        self,
        model: str = "text-embedding-3-small",
        api_key: Optional[str] = None,
        dimensions: Optional[int] = None,
    ):
        """Initialize OpenAI embeddings provider.

        Args:
            model: The embedding model to use. Defaults to text-embedding-3-small.
            api_key: Optional API key. By default loads from OPENAI_API_KEY.
            dimensions: Optional dimension for the embeddings (for models that support it).
        """
        # Imported here so the module can be imported without the openai
        # package installed.
        from openai import AsyncOpenAI

        self._model = model
        self._dimensions = dimensions
        self._client = AsyncOpenAI(api_key=api_key) if api_key else AsyncOpenAI()

        # Default dimensions for known models
        self._default_dimensions = {
            "text-embedding-3-small": 1536,
            "text-embedding-3-large": 3072,
            "text-embedding-ada-002": 1536,
        }

    @property
    def dimension(self) -> int:
        """Return the dimension of the embedding vectors.

        The explicit ``dimensions`` override wins when one was given;
        otherwise the model's known default is used, falling back to 1536
        for models not in the table.
        """
        # Same `is not None` test as the request path, so both agree on
        # whether an override is in effect.
        if self._dimensions is not None:
            return self._dimensions
        return self._default_dimensions.get(self._model, 1536)

    def _request_kwargs(self, payload: object) -> dict:
        """Build the keyword arguments for one embeddings request."""
        kwargs: dict = {"model": self._model, "input": payload}
        # Only send `dimensions` when the caller asked for an override.
        if self._dimensions is not None:
            kwargs["dimensions"] = self._dimensions
        return kwargs

    async def embed(self, text: str) -> list[float]:
        """Generate an embedding for a single text.

        Args:
            text: The text to embed.

        Returns:
            The embedding vector returned by the API for ``text``.
        """
        response = await self._client.embeddings.create(**self._request_kwargs(text))
        return response.data[0].embedding

    async def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings for multiple texts efficiently.

        Texts are sent in as few requests as possible, at most
        ``_MAX_BATCH_SIZE`` per request.

        Args:
            texts: List of texts to embed.

        Returns:
            One embedding vector per input text, in input order. An empty
            ``texts`` yields an empty list without calling the API.
        """
        if not texts:
            return []

        embeddings: list[list[float]] = []
        for offset in range(0, len(texts), self._MAX_BATCH_SIZE):
            batch = texts[offset : offset + self._MAX_BATCH_SIZE]
            response = await self._client.embeddings.create(
                **self._request_kwargs(batch)
            )
            # Sort by index to maintain order within the batch
            ordered = sorted(response.data, key=lambda item: item.index)
            embeddings.extend(item.embedding for item in ordered)
        return embeddings

0 commit comments

Comments
 (0)