Commit 2c40ce1

Refactor codebase: remove unnecessary comments and improve readability across multiple modules
1 parent 328df2b

11 files changed: +84 / -122 lines

AGENTS.md

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+# Repository Guidelines
+
+## Project Structure & Module Organization
+- `RAGnificent/`: Python package
+- `core/`, `rag/`, `utils/`: scraping, RAG pipeline, helpers
+- `src/`: Rust library (exposed via PyO3/maturin as `RAGnificent.ragnificent_rs`)
+- `tests/`: pytest suite (`unit/`, `integration/`, `rust/`)
+- `examples/`, `docs/`, `data/`: demos, docs, local artifacts
+- Key config: `.env.example`, `pyproject.toml`, `Cargo.toml`, `pytest.ini`, `.editorconfig`, `justfile`, `Makefile`
+
+## Build, Test, and Development Commands
+- Setup: `just setup` (creates venv with uv, installs deps, builds Rust ext)
+- Build: `just build` or `make build` (Rust lib via maturin/cargo)
+- Test: `just test` or `make test` (runs Rust + Python); quick: `./run_tests.sh fast`
+- Format: `just format` or `make format` (Black+isort+Ruff, cargo fmt)
+- Lint/Type: `just lint` (ruff, mypy, clippy)
+- Bench: `cargo bench`; visualize: `python scripts/visualize_benchmarks.py`
+  Example: `python -m RAGnificent https://example.com -o output.md --save-chunks`
+
+## Coding Style & Naming Conventions
+- Python: Black line length 88, isort profile=black, Ruff rules in `pyproject.toml`
+- Types: mypy (py312, strict options for defs/decorators)
+- Rust: `cargo fmt`, `clippy` clean
+- EditorConfig: spaces, 4-space indent, UTF-8, CRLF, trim trailing whitespace
+- Naming: modules/functions `snake_case`, classes `PascalCase`, constants `UPPER_SNAKE`
+
+## Testing Guidelines
+- Framework: `pytest`; discovery: files `test_*.py`, classes `Test*`, funcs `test_*`
+- Markers: `unit`, `integration`, `benchmark`, `slow`, `network`, `requires_model`
+- Run subsets: `pytest -m "not benchmark and not slow"`
+- Rust bindings: `pytest tests/rust/test_python_bindings.py -v`
+
+## Commit & Pull Request Guidelines
+- Commits: imperative mood, concise summary (<72 chars), scope optional
+  Example: `Refactor scraper: faster sitemap parsing`
+- PRs: clear description, link issues (`Closes #123`), list changes, test coverage notes; include screenshots for `webui.py` or benchmark deltas when relevant
+
+## Security & Configuration Tips
+- Create `.env` from `.env.example`; never commit secrets
+- Key vars: `OPENAI_API_KEY`, `QDRANT_*`, and pipeline tuning knobs (chunking/embedding)
+- Prefer in-memory Qdrant for dev; set host/port for prod; validate via `view_qdrant_data.py`
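The last tip above recommends an in-memory Qdrant instance for development. A minimal sketch of what that looks like with the standard `qdrant_client` API — the collection name and vector size here are illustrative placeholders, not values taken from this repository:

```python
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, PointStruct, VectorParams

# In-memory instance: no server to run, data is discarded when the process exits.
client = QdrantClient(":memory:")

# Hypothetical collection name and embedding size, for illustration only.
client.create_collection(
    collection_name="dev_chunks",
    vectors_config=VectorParams(size=384, distance=Distance.COSINE),
)

client.upsert(
    collection_name="dev_chunks",
    points=[PointStruct(id=1, vector=[0.0] * 384, payload={"text": "example chunk"})],
)

# For production, point the client at a real host/port instead
# (e.g. values supplied through the QDRANT_* variables in .env):
# client = QdrantClient(host="localhost", port=6333)
```

Keeping dev data in memory avoids leaking local experiments into a shared instance and keeps tests hermetic.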
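Likewise, for the Testing Guidelines section, a short sketch of how the listed markers are typically applied and selected; the test bodies are hypothetical, and only the marker names and the `-m` expression come from the guidelines:

```python
import pytest


@pytest.mark.unit
def test_chunk_size_is_positive():
    # Hypothetical fast check; included by `pytest -m "not benchmark and not slow"`.
    assert 1000 > 0


@pytest.mark.slow
@pytest.mark.network
def test_scrape_live_site():
    # Hypothetical network-bound test; excluded by the quick subset above.
    pytest.skip("illustrative placeholder only")
```

Markers need to be registered (e.g. in `pytest.ini`, listed under key config above); otherwise pytest emits unknown-marker warnings.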

RAGnificent/__main__.py

Lines changed: 1 addition & 0 deletions
@@ -3,6 +3,7 @@
 """
 
 import sys
+
 from RAGnificent.core.scraper import main
 
 if __name__ == "__main__":

RAGnificent/core/async_scraper.py

Lines changed: 0 additions & 11 deletions
@@ -71,7 +71,6 @@ def __init__(
             follow_redirects=True,
         )
 
-        # Initialize async throttler
         self.throttler = AsyncRequestThrottler(
             requests_per_second=requests_per_second,
             domain_specific_limits=domain_specific_limits,
@@ -84,7 +83,6 @@ def __init__(
         self.chunker = ContentChunker(chunk_size, chunk_overlap)
         self.max_workers = max_workers
 
-        # Initialize request cache
         self.cache_enabled = cache_enabled
         self.request_cache = (
             RequestCache(max_age=cache_max_age) if cache_enabled else None
@@ -146,7 +144,6 @@ async def scrape_website(self, url: str, skip_cache: bool = False) -> str:
             )
             url = sanitized_url
 
-        # Check cache first
         cached_content = self._check_cache(url, skip_cache)
         if cached_content is not None:
             return cached_content
@@ -254,20 +251,17 @@ async def scrape_multiple_urls(
         Returns:
             List of successfully scraped URLs
         """
-        # Create output directory
         Path(output_dir).mkdir(parents=True, exist_ok=True)
         if save_chunks and chunk_dir:
             Path(chunk_dir).mkdir(parents=True, exist_ok=True)
 
-        # Create tasks for concurrent scraping
         tasks = []
         for url in urls:
            task = self._scrape_and_save(
                url, output_dir, output_format, save_chunks, chunk_dir
            )
            tasks.append(task)
 
-        # Execute tasks concurrently with semaphore to limit parallelism
         semaphore = asyncio.Semaphore(self.max_workers)
 
         async def bounded_scrape(task):
@@ -300,7 +294,6 @@ async def _scrape_and_save(
             # Scrape the website
             html_content = await self.scrape_website(url)
 
-            # Convert content
             if self.rust_available:
                 format_map = {
                     "markdown": self.OutputFormat.MARKDOWN,
@@ -315,15 +308,13 @@ async def _scrape_and_save(
             else:
                 output = self._convert_content(html_content, url, output_format)
 
-            # Generate filename
             parsed_url = urlparse(url)
             path_parts = parsed_url.path.strip("/").split("/")
             if path_parts and path_parts[-1]:
                 base_name = self._url_path_pattern.sub("_", path_parts[-1])
             else:
                 base_name = self._url_path_pattern.sub("_", parsed_url.netloc)
 
-            # Save the converted content
             allowed_extensions = {"markdown": "md", "json": "json", "xml": "xml"}
             extension = allowed_extensions.get(output_format, "md")
             output_file = Path(output_dir) / f"{base_name}.{extension}"
@@ -348,7 +339,6 @@ async def _scrape_and_save(
 
             logger.info(f"Saved {output_format} to: {output_file}")
 
-            # Save chunks if requested
             if save_chunks and output_format == "markdown":
                 chunks = self.chunker.create_chunks_from_markdown(
                     output, source_url=url
@@ -409,7 +399,6 @@ async def scrape_by_sitemap(
         Returns:
             List of successfully scraped URLs
         """
-        # Parse sitemap
         sitemap_parser = SitemapParser()
         sitemap_urls = sitemap_parser.parse_sitemap(
             url, min_priority=min_priority, url_filter=url_filter