@@ -71,7 +71,6 @@ def __init__(
             follow_redirects=True,
         )

-        # Initialize async throttler
         self.throttler = AsyncRequestThrottler(
             requests_per_second=requests_per_second,
             domain_specific_limits=domain_specific_limits,
@@ -84,7 +83,6 @@ def __init__(
         self.chunker = ContentChunker(chunk_size, chunk_overlap)
         self.max_workers = max_workers

-        # Initialize request cache
         self.cache_enabled = cache_enabled
         self.request_cache = (
             RequestCache(max_age=cache_max_age) if cache_enabled else None
@@ -146,7 +144,6 @@ async def scrape_website(self, url: str, skip_cache: bool = False) -> str:
         )
         url = sanitized_url

-        # Check cache first
         cached_content = self._check_cache(url, skip_cache)
         if cached_content is not None:
             return cached_content
@@ -254,20 +251,17 @@ async def scrape_multiple_urls(
         Returns:
             List of successfully scraped URLs
         """
-        # Create output directory
         Path(output_dir).mkdir(parents=True, exist_ok=True)
         if save_chunks and chunk_dir:
             Path(chunk_dir).mkdir(parents=True, exist_ok=True)

-        # Create tasks for concurrent scraping
         tasks = []
         for url in urls:
             task = self._scrape_and_save(
                 url, output_dir, output_format, save_chunks, chunk_dir
             )
             tasks.append(task)

-        # Execute tasks concurrently with semaphore to limit parallelism
         semaphore = asyncio.Semaphore(self.max_workers)

         async def bounded_scrape(task):
@@ -300,7 +294,6 @@ async def _scrape_and_save(
         # Scrape the website
         html_content = await self.scrape_website(url)

-        # Convert content
         if self.rust_available:
             format_map = {
                 "markdown": self.OutputFormat.MARKDOWN,
@@ -315,15 +308,13 @@ async def _scrape_and_save(
         else:
             output = self._convert_content(html_content, url, output_format)

-        # Generate filename
         parsed_url = urlparse(url)
         path_parts = parsed_url.path.strip("/").split("/")
         if path_parts and path_parts[-1]:
             base_name = self._url_path_pattern.sub("_", path_parts[-1])
         else:
             base_name = self._url_path_pattern.sub("_", parsed_url.netloc)

-        # Save the converted content
         allowed_extensions = {"markdown": "md", "json": "json", "xml": "xml"}
         extension = allowed_extensions.get(output_format, "md")
         output_file = Path(output_dir) / f"{base_name}.{extension}"
@@ -348,7 +339,6 @@ async def _scrape_and_save(

         logger.info(f"Saved {output_format} to: {output_file}")

-        # Save chunks if requested
         if save_chunks and output_format == "markdown":
             chunks = self.chunker.create_chunks_from_markdown(
                 output, source_url=url
@@ -409,7 +399,6 @@ async def scrape_by_sitemap(
         Returns:
             List of successfully scraped URLs
         """
-        # Parse sitemap
         sitemap_parser = SitemapParser()
         sitemap_urls = sitemap_parser.parse_sitemap(
             url, min_priority=min_priority, url_filter=url_filter
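For reference, the concurrency pattern left untouched by this change in `scrape_multiple_urls` — building the coroutines first, then bounding their parallelism with an `asyncio.Semaphore` sized to `max_workers` — can be sketched standalone roughly as below. The `fetch` coroutine, URL list, and default worker count here are placeholders for illustration, not part of the diff:

```python
import asyncio


async def fetch(url: str) -> str:
    # Placeholder for the real scraping coroutine (e.g. scrape_website).
    await asyncio.sleep(0.1)
    return f"content of {url}"


async def scrape_all(urls: list[str], max_workers: int = 5) -> list[str]:
    # Bound how many coroutines run at once, mirroring
    # asyncio.Semaphore(self.max_workers) in scrape_multiple_urls.
    semaphore = asyncio.Semaphore(max_workers)

    async def bounded(url: str) -> str:
        async with semaphore:
            return await fetch(url)

    # gather still schedules every task, but at most max_workers
    # hold the semaphore (and thus do real work) at any moment.
    return await asyncio.gather(*(bounded(u) for u in urls))


if __name__ == "__main__":
    urls = ["https://example.com/a", "https://example.com/b"]
    print(asyncio.run(scrape_all(urls)))
```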