
Commit 39e71c2

Refactor RAG pipeline and scraper: update pipeline methods for consistency, enhance throttling in scraper, and improve vector store distance handling; add unit tests for pipeline indexing and markdown conversion.
1 parent d0ccf9e commit 39e71c2

File tree

8 files changed: 173 additions, 47 deletions

Makefile

Lines changed: 16 additions & 16 deletions
@@ -188,19 +188,19 @@ scrape-list:
 scrape-list-parallel:
     python -m RAGnificent -o $(OUTPUT_DIR) --links-file $(LINKS_FILE) --parallel --max-workers $(WORKERS)

-# Run the complete RAG uv pipeline with a single URL
-.PHONY: rag-uv pipeline
-rag-uv pipeline:
+# Run the complete RAG pipeline with a single URL
+.PHONY: rag_pipeline
+rag_pipeline:
 ifeq ($(URL),)
-    @echo "Error: URL is required. Use 'make rag-uv pipeline URL=https://example.com'"
+    @echo "Error: URL is required. Use 'make rag_pipeline URL=https://example.com'"
     @exit 1
 endif
-    python -c "from RAGnificent.rag.uv pipeline import uv pipeline; uv pipeline = uv pipeline(collection_name='$(COLLECTION)'); uv pipeline.run_uv pipeline(url='$(URL)', run_extract=True, run_chunk=True, run_embed=True, run_store=True)"
+    python -c "from RAGnificent.rag.pipeline import Pipeline; pipeline = Pipeline(collection_name='$(COLLECTION)'); pipeline.run_pipeline(url='$(URL)', run_extract=True, run_chunk=True, run_embed=True, run_store=True)"

-# Run the complete RAG uv pipeline with a list of URLs
-.PHONY: rag-uv pipeline-list
-rag-uv pipeline-list:
-    python -c "from RAGnificent.rag.uv pipeline import uv pipeline; uv pipeline = uv pipeline(collection_name='$(COLLECTION)'); uv pipeline.run_uv pipeline(links_file='$(LINKS_FILE)', run_extract=True, run_chunk=True, run_embed=True, run_store=True)"
+# Run the complete RAG pipeline with a list of URLs
+.PHONY: rag_pipeline_list
+rag_pipeline_list:
+    python -c "from RAGnificent.rag.pipeline import Pipeline; pipeline = Pipeline(collection_name='$(COLLECTION)'); pipeline.run_pipeline(links_file='$(LINKS_FILE)', run_extract=True, run_chunk=True, run_embed=True, run_store=True)"

 # Extract content from a URL
 .PHONY: extract
@@ -209,27 +209,27 @@ ifeq ($(URL),)
     @echo "Error: URL is required. Use 'make extract URL=https://example.com'"
     @exit 1
 endif
-    python -c "from RAGnificent.rag.uv pipeline import uv pipeline; uv pipeline().extract_content(url='$(URL)', output_file='$(RAW_DOCUMENTS)', output_format='$(FORMAT)')"
+    python -c "from RAGnificent.rag.pipeline import Pipeline; Pipeline().extract_content(url='$(URL)', output_file='$(RAW_DOCUMENTS)', output_format='$(FORMAT)')"

 # Extract content from a list of URLs
 .PHONY: extract-list
 extract-list:
-    python -c "from RAGnificent.rag.uv pipeline import uv pipeline; uv pipeline().extract_content(links_file='$(LINKS_FILE)', output_file='$(RAW_DOCUMENTS)', output_format='$(FORMAT)')"
+    python -c "from RAGnificent.rag.pipeline import Pipeline; Pipeline().extract_content(links_file='$(LINKS_FILE)', output_file='$(RAW_DOCUMENTS)', output_format='$(FORMAT)')"

 # Chunk documents
 .PHONY: chunk
 chunk:
-    python -c "from RAGnificent.rag.uv pipeline import uv pipeline; uv pipeline().chunk_documents('$(RAW_DOCUMENTS)', '$(DOCUMENT_CHUNKS)')"
+    python -c "from RAGnificent.rag.pipeline import Pipeline; Pipeline().chunk_documents('$(RAW_DOCUMENTS)', '$(DOCUMENT_CHUNKS)')"

 # Embed chunks
 .PHONY: embed
 embed:
-    python -c "from RAGnificent.rag.uv pipeline import uv pipeline; uv pipeline().embed_chunks('$(DOCUMENT_CHUNKS)', '$(EMBEDDED_CHUNKS)')"
+    python -c "from RAGnificent.rag.pipeline import Pipeline; Pipeline().embed_chunks('$(DOCUMENT_CHUNKS)', '$(EMBEDDED_CHUNKS)')"

 # Store chunks in vector database
 .PHONY: store
 store:
-    python -c "from RAGnificent.rag.uv pipeline import uv pipeline; uv pipeline(collection_name='$(COLLECTION)').store_chunks('$(EMBEDDED_CHUNKS)')"
+    python -c "from RAGnificent.rag.pipeline import Pipeline; Pipeline(collection_name='$(COLLECTION)').store_chunks('$(EMBEDDED_CHUNKS)')"

 # Search the vector database
 .PHONY: search
@@ -247,7 +247,7 @@ ifeq ($(QUERY),)
     @echo "Error: QUERY is required. Use 'make query QUERY=\"your query\"'"
     @exit 1
 endif
-    python -c "from RAGnificent.rag.uv pipeline import uv pipeline; response = uv pipeline(collection_name='$(COLLECTION)').query_with_context('$(QUERY)', $(LIMIT)); print(f'Response: {response[\"response\"]}\n\nSources:\n' + '\n'.join([f'- {r[\"source_url\"]}' for r in response['context']]))"
+    python -c "from RAGnificent.rag.pipeline import Pipeline; response = Pipeline(collection_name='$(COLLECTION)').query_with_context('$(QUERY)', $(LIMIT)); print(f'Response: {response[\"response\"]}\n\nSources:\n' + '\n'.join([f'- {r[\"source_url\"]}' for r in response['context']]))"

 # Run the demo for all output formats
 .PHONY: run-demo
@@ -262,7 +262,7 @@ run-hello:
 # Run the RAG uv pipeline example
 .PHONY: run-rag-example
 run-rag-example:
-    python examples/rag_uv pipeline_example.py
+    python examples/rag_pipeline_example.py

 # Visualize Qdrant data
 .PHONY: view-qdrant
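
For anyone who prefers a script to the inline python -c one-liners, the renamed target reduces to roughly the following. This is a sketch built only from the Pipeline API exactly as invoked in the Makefile above; the collection name is a placeholder:

    # Equivalent of: make rag_pipeline URL=https://example.com COLLECTION=docs
    from RAGnificent.rag.pipeline import Pipeline

    pipeline = Pipeline(collection_name="docs")
    pipeline.run_pipeline(
        url="https://example.com",
        run_extract=True,  # scrape the page
        run_chunk=True,    # split into chunks
        run_embed=True,    # embed the chunks
        run_store=True,    # store vectors in the collection
    )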

RAGnificent/core/scraper.py

Lines changed: 2 additions & 3 deletions
@@ -18,7 +18,6 @@
 import requests
 from bs4 import BeautifulSoup, Tag
-from bs4 import BeautifulSoup, Tag
 from RAGnificent.core.cache import RequestCache
 from RAGnificent.core.logging import get_logger
 from RAGnificent.core.security import redact_sensitive_data
@@ -216,7 +215,8 @@ def _cache_response(self, url: str, content: str) -> None:
     def _fetch_with_retries(self, url: str) -> str:
         for attempt in range(self.max_retries):
             try:
-                self.throttler.throttle()
+                # Use domain-aware throttling
+                self.throttler.throttle(url)
                 response = self.session.get(url, timeout=self.timeout)
                 response.raise_for_status()

@@ -429,7 +429,6 @@ def convert_to_markdown(self, html_content: str, url: str = "") -> str:
         ]:
             if element_markdown := self._get_element_markdown(element, base_url):
                 markdown_content += element_markdown + "\n\n"
-                markdown_content += element_markdown + "\n\n"

         logger.info("Conversion to Markdown completed.")
         return markdown_content.strip()
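
The switch from throttle() to throttle(url) means the throttler can rate-limit per domain rather than globally. The throttler implementation itself is not part of this diff; a minimal, hypothetical sketch of what a domain-aware version could look like:

    import time
    from urllib.parse import urlparse

    class DomainThrottler:
        """Hypothetical per-domain throttler: each host gets its own clock."""

        def __init__(self, requests_per_second: float = 1.0):
            self.min_interval = 1.0 / requests_per_second
            self.last_request: dict[str, float] = {}

        def throttle(self, url: str) -> None:
            # Key the rate limit on the host, not the whole process
            domain = urlparse(url).netloc
            elapsed = time.monotonic() - self.last_request.get(domain, 0.0)
            if elapsed < self.min_interval:
                time.sleep(self.min_interval - elapsed)
            self.last_request[domain] = time.monotonic()

Keying on netloc keeps one slow site from stalling scrapes of every other host.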

RAGnificent/core/security.py

Lines changed: 28 additions & 1 deletion
@@ -69,15 +69,17 @@ def wrapper(*args, **kwargs):
 class ThrottledSession:
     """Session with built-in throttling for HTTP requests."""

-    def __init__(self, requests_per_second: float = 1.0):
+    def __init__(self, requests_per_second: float = 1.0, default_timeout: float = 30.0):
         """
         Initialize throttled session.

         Args:
             requests_per_second: Maximum number of requests per second
+            default_timeout: Default network timeout in seconds if none is provided
         """
         self.min_interval = 1.0 / requests_per_second
         self.last_request_time = 0
+        self.default_timeout = default_timeout

     def request(self, method: str, url: str, **kwargs) -> Any:
         """
@@ -102,9 +104,34 @@ def request(self, method: str, url: str, **kwargs) -> Any:
             time.sleep(wait_time)

         self.last_request_time = time.time()
+        # Ensure a sane default timeout is always applied
+        kwargs.setdefault("timeout", self.default_timeout)
         return requests.request(method, url, **kwargs)


+def validate_file_access(path: str) -> bool:
+    """
+    Validate that a file path exists and is safely readable.
+
+    Args:
+        path: Absolute or relative path to a file
+
+    Returns:
+        bool: True if the file exists, is a regular file, and is readable
+    """
+    import os
+
+    try:
+        # Resolve symlinks and ensure it points to a regular file
+        real_path = os.path.realpath(path)
+        if not os.path.isfile(real_path):
+            return False
+        # Check read permission
+        return os.access(real_path, os.R_OK)
+    except Exception:
+        return False
+
+
 def redact_sensitive_data(
     text: str, patterns: Optional[List[Tuple[str, str]]] = None
 ) -> str:
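
A quick usage sketch for both changes (hypothetical call sites; the file path is a placeholder). Note that an explicit timeout= passed by the caller still wins over the default, since setdefault only fills in a missing key:

    from RAGnificent.core.security import ThrottledSession, validate_file_access

    session = ThrottledSession(requests_per_second=2.0, default_timeout=10.0)
    # No timeout given here, so timeout=10.0 is injected automatically
    resp = session.request("GET", "https://example.com")

    # Refuse to read anything that is not an existing, readable regular file
    if validate_file_access("data/links.txt"):
        with open("data/links.txt", encoding="utf-8") as f:
            links = f.read().splitlines()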

RAGnificent/rag/pipeline.py

Lines changed: 35 additions & 25 deletions
@@ -372,20 +372,19 @@ def _execute_index_step(self, config: Dict[str, Any]) -> Dict[str, Any]:
                 content, source_url=str(md_file)
             )

-            # Generate embeddings and store
-            for chunk in chunks:
-                embedding = self.embedding_service.generate_embeddings(
-                    [chunk["content"]]
-                )[0]
-                self.vector_store.store_documents(
-                    [
-                        {
-                            "content": chunk["content"],
-                            "metadata": chunk["metadata"],
-                            "embedding": embedding,
-                        }
-                    ]
-                )
+            # Generate embeddings for all chunks and store
+            chunk_dicts = [
+                {
+                    "content": chunk.content,
+                    "metadata": chunk.metadata,
+                    "source_url": chunk.source_url,
+                    "id": chunk.id,
+                }
+                for chunk in chunks
+            ]
+
+            embedded_chunks = self.embedding_service.embed_chunks(chunk_dicts)
+            self.vector_store.store_documents(embedded_chunks)

             indexed_count += len(chunks)

@@ -435,7 +434,7 @@ def _process_single_url(
         Document dictionary or None if extraction failed
     """
     try:
-        from ragnificent_rs import OutputFormat
+        from RAGnificent.ragnificent_rs import OutputFormat

         # Convert string to OutputFormat enum if it's not already an enum
         output_format_enum = output_format
@@ -1099,16 +1098,27 @@ def query_with_context(
         ]

         try:
-            # Generate response
-            completion = openai.chat.completions.create(
-                model=model,
-                messages=messages,
-                temperature=temperature,
-                max_tokens=self.config.openai.max_tokens,
-                timeout=self.config.openai.request_timeout,
-            )
-
-            response = completion.choices[0].message.content
+            # Generate response (support both legacy and client-style APIs)
+            try:
+                from openai import OpenAI  # type: ignore
+
+                client = OpenAI()
+                completion = client.chat.completions.create(
+                    model=model,
+                    messages=messages,
+                    temperature=temperature,
+                    max_tokens=self.config.openai.max_tokens,
+                )
+                response = completion.choices[0].message.content
+            except Exception:  # Fallback to legacy if client API unavailable
+                completion = openai.chat.completions.create(
+                    model=model,
+                    messages=messages,
+                    temperature=temperature,
+                    max_tokens=self.config.openai.max_tokens,
+                    timeout=self.config.openai.request_timeout,
+                )
+                response = completion.choices[0].message.content

             return {
                 "query": query,

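The nested try/except in query_with_context is a version-compatibility shim: it first attempts the client-style API (the OpenAI() class introduced in openai 1.0, which reads OPENAI_API_KEY from the environment) and falls back to the module-level call the code used before. Stripped of pipeline plumbing, the pattern looks roughly like this; the model name and parameter defaults are placeholders:

    import openai

    def chat(messages, model="gpt-4o-mini", temperature=0.0, max_tokens=512):
        try:
            from openai import OpenAI  # client-style API (openai>=1.0)

            client = OpenAI()  # picks up OPENAI_API_KEY automatically
            completion = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
            )
        except Exception:
            # Fall back to the module-level namespace used by the old code path
            completion = openai.chat.completions.create(
                model=model,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
            )
        return completion.choices[0].message.content
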
RAGnificent/rag/vector_store.py

Lines changed: 12 additions & 1 deletion
@@ -195,9 +195,20 @@ def initialize_collection(
         logger.info(
             f"Creating collection '{collection_name}' with vector size {vector_size}"
         )
+        # Map distance string to Qdrant enum
+        distance_map = {
+            "Cosine": models.Distance.COSINE,
+            "cosine": models.Distance.COSINE,
+            "Euclid": models.Distance.EUCLID,
+            "euclid": models.Distance.EUCLID,
+            "Dot": models.Distance.DOT,
+            "dot": models.Distance.DOT,
+        }
+        dist_enum = distance_map.get(distance, models.Distance.COSINE)
+
         client.create_collection(
             collection_name=collection_name,
-            vectors_config=models.VectorParams(size=vector_size, distance=distance),
+            vectors_config=models.VectorParams(size=vector_size, distance=dist_enum),
         )

         logger.info(f"Successfully created collection '{collection_name}'")
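
With the mapping in place, initialize_collection accepts the distance as a plain string in either case and silently falls back to cosine for anything unrecognized. The same normalization can be written more compactly; a sketch of the idea against qdrant_client directly, where the collection name and the 384-dim vector size are placeholders:

    from qdrant_client import QdrantClient, models

    def to_distance(name: str) -> models.Distance:
        # Case-insensitive string -> enum, defaulting to cosine
        return {
            "cosine": models.Distance.COSINE,
            "euclid": models.Distance.EUCLID,
            "dot": models.Distance.DOT,
        }.get(name.lower(), models.Distance.COSINE)

    client = QdrantClient(":memory:")  # in-memory instance for illustration
    client.create_collection(
        collection_name="docs",
        vectors_config=models.VectorParams(size=384, distance=to_distance("Cosine")),
    )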

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,6 @@ dependencies = [
     "sentry-sdk[fastapi]>=2.29.1",
     "rich>=14.0.0",
     "bleach>=6.2.0",
-    "code2flow>=2.5.1",
     "responses>=0.25.7",
 ]

@@ -29,6 +28,7 @@ dev = [
     "mypy>=1.15.0",
     "isort>=6.0.1",
     "sourcery>=1.36.0",
+    "code2flow>=2.5.1",
 ]
 test = [
     "pytest>=8.3.5",
Lines changed: 55 additions & 0 deletions

@@ -0,0 +1,55 @@
+import os
+from pathlib import Path
+
+import numpy as np
+import pytest
+
+
+def test_execute_index_step_embeds_and_stores(tmp_path, monkeypatch):
+    # Arrange: create a simple markdown file in a temp data dir
+    data_dir = tmp_path / "data"
+    data_dir.mkdir(parents=True, exist_ok=True)
+    md_file = data_dir / "doc.md"
+    md_file.write_text("# Title\n\nHello world paragraph.", encoding="utf-8")
+
+    # Dummy vector store that just records documents
+    class DummyVS:
+        def __init__(self, *args, **kwargs):
+            self.stored = []
+
+        def store_documents(self, docs, **kwargs):
+            # Ensure embeddings present
+            for d in docs:
+                assert "embedding" in d
+                emb = d["embedding"]
+                # Accept numpy array or list
+                assert isinstance(emb, (list, np.ndarray))
+            self.stored.extend(docs)
+            return True
+
+        def count_documents(self):
+            return len(self.stored)
+
+    # Dummy search object used by pipeline
+    class DummySearch:
+        def __init__(self, *args, **kwargs):
+            pass
+
+    import RAGnificent.rag.pipeline as pl
+
+    # Patch to avoid real Qdrant and search init
+    monkeypatch.setattr(pl, "get_vector_store", lambda *a, **k: DummyVS())
+    monkeypatch.setattr(pl, "get_search", lambda *a, **k: DummySearch())
+
+    # Act: construct pipeline with simple embedder to avoid heavy models
+    pipeline = pl.Pipeline(
+        data_dir=str(data_dir),
+        embedding_model_type="simpler",
+        continue_on_error=True,
+    )
+
+    result = pipeline._execute_index_step({"input_dir": "."})
+
+    # Assert
+    assert isinstance(result, dict)
+    assert result.get("indexed_documents", 0) >= 1
Lines changed: 24 additions & 0 deletions

@@ -0,0 +1,24 @@
+import pytest
+
+from RAGnificent.core.scraper import MarkdownScraper
+
+
+def test_markdown_conversion_no_duplicate_elements():
+    html = """
+    <html>
+      <head><title>Page</title></head>
+      <body>
+        <main>
+          <h1>Header</h1>
+          <p>One</p>
+          <p>Two</p>
+        </main>
+      </body>
+    </html>
+    """
+    s = MarkdownScraper()
+    md = s.convert_to_markdown(html, url="https://example.com")
+    # Expect title once and paragraphs once each
+    assert md.count("# Header") == 1
+    assert md.count("One") == 1
+    assert md.count("Two") == 1
