@@ -71,7 +71,6 @@ def __init__(
             follow_redirects=True,
         )

-        # Initialize async throttler
         self.throttler = AsyncRequestThrottler(
             requests_per_second=requests_per_second,
             domain_specific_limits=domain_specific_limits,
@@ -84,7 +83,6 @@ def __init__(
         self.chunker = ContentChunker(chunk_size, chunk_overlap)
         self.max_workers = max_workers

-        # Initialize request cache
         self.cache_enabled = cache_enabled
         self.request_cache = (
             RequestCache(max_age=cache_max_age) if cache_enabled else None
@@ -146,7 +144,6 @@ async def scrape_website(self, url: str, skip_cache: bool = False) -> str:
         )
         url = sanitized_url

-        # Check cache first
         cached_content = self._check_cache(url, skip_cache)
         if cached_content is not None:
             return cached_content
@@ -254,20 +251,17 @@ async def scrape_multiple_urls(
         Returns:
             List of successfully scraped URLs
         """
-        # Create output directory
         Path(output_dir).mkdir(parents=True, exist_ok=True)
         if save_chunks and chunk_dir:
             Path(chunk_dir).mkdir(parents=True, exist_ok=True)

-        # Create tasks for concurrent scraping
         tasks = []
         for url in urls:
             task = self._scrape_and_save(
                 url, output_dir, output_format, save_chunks, chunk_dir
             )
             tasks.append(task)

-        # Execute tasks concurrently with semaphore to limit parallelism
         semaphore = asyncio.Semaphore(self.max_workers)

         async def bounded_scrape(task):
@@ -300,7 +294,6 @@ async def _scrape_and_save(
         # Scrape the website
         html_content = await self.scrape_website(url)

-        # Convert content
         if self.rust_available:
             format_map = {
                 "markdown": self.OutputFormat.MARKDOWN,
@@ -315,15 +308,13 @@ async def _scrape_and_save(
         else:
             output = self._convert_content(html_content, url, output_format)

-        # Generate filename
         parsed_url = urlparse(url)
         path_parts = parsed_url.path.strip("/").split("/")
         if path_parts and path_parts[-1]:
             base_name = self._url_path_pattern.sub("_", path_parts[-1])
         else:
             base_name = self._url_path_pattern.sub("_", parsed_url.netloc)

-        # Save the converted content
         allowed_extensions = {"markdown": "md", "json": "json", "xml": "xml"}
         extension = allowed_extensions.get(output_format, "md")
         output_file = Path(output_dir) / f"{base_name}.{extension}"
@@ -348,7 +339,6 @@ async def _scrape_and_save(

         logger.info(f"Saved {output_format} to: {output_file}")

-        # Save chunks if requested
         if save_chunks and output_format == "markdown":
             chunks = self.chunker.create_chunks_from_markdown(
                 output, source_url=url
@@ -409,7 +399,6 @@ async def scrape_by_sitemap(
         Returns:
             List of successfully scraped URLs
         """
-        # Parse sitemap
         sitemap_parser = SitemapParser()
         sitemap_urls = sitemap_parser.parse_sitemap(
             url, min_priority=min_priority, url_filter=url_filter
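For reference, the concurrency pattern left untouched by this change in `scrape_multiple_urls` — building the coroutines first, then bounding their parallelism with an `asyncio.Semaphore` sized to `max_workers` — can be sketched standalone roughly as below. The `fetch` coroutine, URL list, and default worker count here are placeholders for illustration, not part of the diff:

```python
import asyncio


async def fetch(url: str) -> str:
    # Placeholder for the real scraping coroutine (e.g. scrape_website).
    await asyncio.sleep(0.1)
    return f"content of {url}"


async def scrape_all(urls: list[str], max_workers: int = 5) -> list[str]:
    # Bound how many coroutines run at once, mirroring
    # asyncio.Semaphore(self.max_workers) in scrape_multiple_urls.
    semaphore = asyncio.Semaphore(max_workers)

    async def bounded(url: str) -> str:
        async with semaphore:
            return await fetch(url)

    # gather still schedules every task, but at most max_workers
    # hold the semaphore (and thus do real work) at any moment.
    return await asyncio.gather(*(bounded(u) for u in urls))


if __name__ == "__main__":
    urls = ["https://example.com/a", "https://example.com/b"]
    print(asyncio.run(scrape_all(urls)))
```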