Changes from all commits · 15 commits
1ea021b
feat(api): add seed URL endpoint and related request model
Ahmed-Tawfik94 Sep 30, 2025
bb3b290
chore: remove yoyo snapshot subproject and implemented adaptive craw…
Ahmed-Tawfik94 Sep 30, 2025
a62cfee
feat(adaptive-crawling): implement adaptive crawling endpoints and jo…
Ahmed-Tawfik94 Sep 30, 2025
1a8e023
feat(adaptive-crawling): implement adaptive crawling endpoints and in…
Ahmed-Tawfik94 Oct 1, 2025
a599db8
feat(docker): add routers directory to Dockerfile
Ahmed-Tawfik94 Oct 1, 2025
5dc34dd
feat: enhance crawling functionality with anti-bot strategies and hea…
Ahmed-Tawfik94 Oct 3, 2025
f00e8cb
Add demo script for proxy rotation and quick test suite
Ahmed-Tawfik94 Oct 6, 2025
201843a
Add comprehensive tests for anti-bot strategies and extended features
Ahmed-Tawfik94 Oct 7, 2025
8cca970
feat: add comprehensive type definitions and improve test coverage
Ahmed-Tawfik94 Oct 13, 2025
aebf5a3
Add link analysis tests and integration tests for /links/analyze endp…
Ahmed-Tawfik94 Oct 14, 2025
674d074
feat: Add HTTP-only crawling endpoints and related models
Ahmed-Tawfik94 Oct 15, 2025
74eeff4
feat: Add comprehensive tests for URL discovery and virtual scroll fu…
Ahmed-Tawfik94 Oct 16, 2025
3877335
Profiling/monitoring: Add interactive monitoring dashboard and integr…
Ahmed-Tawfik94 Oct 16, 2025
00e9904
feat: Add table extraction strategies and API documentation
Ahmed-Tawfik94 Oct 17, 2025
a3b02be
#1564 fix: Improve error handling in browser configuration serializat…
Ahmed-Tawfik94 Oct 27, 2025
8 changes: 7 additions & 1 deletion .gitignore
@@ -1,6 +1,9 @@
# Scripts folder (private tools)
.scripts/

# Docker automation scripts (personal use)
docker-scripts/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@@ -270,4 +273,7 @@ docs/**/data
.codecat/

docs/apps/linkdin/debug*/
docs/apps/linkdin/samples/insights/*
docs/apps/linkdin/samples/insights/*
.yoyo/
.github/instructions/instructions.instructions.md
.kilocode/mcp.json
2 changes: 1 addition & 1 deletion Dockerfile
@@ -124,7 +124,7 @@ COPY . /tmp/project/

# Copy supervisor config first (might need root later, but okay for now)
COPY deploy/docker/supervisord.conf .

COPY deploy/docker/routers ./routers
COPY deploy/docker/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

4 changes: 3 additions & 1 deletion crawl4ai/__init__.py
@@ -25,7 +25,8 @@
JsonCssExtractionStrategy,
JsonXPathExtractionStrategy,
JsonLxmlExtractionStrategy,
RegexExtractionStrategy
RegexExtractionStrategy,
NoExtractionStrategy, # NEW: Import NoExtractionStrategy
)
from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import DefaultMarkdownGenerator
@@ -113,6 +114,7 @@
"BrowserProfiler",
"LLMConfig",
"GeolocationConfig",
"NoExtractionStrategy",
# NEW: Add SeedingConfig and VirtualScrollConfig
"SeedingConfig",
"VirtualScrollConfig",
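Since `NoExtractionStrategy` is now exported from the package root, a crawl can opt out of structured extraction without reaching into `crawl4ai.extraction_strategy`. A minimal sketch of that usage (the URL is a placeholder; the rest follows the existing `AsyncWebCrawler`/`CrawlerRunConfig` API):

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, NoExtractionStrategy

async def main():
    # NoExtractionStrategy skips structured extraction; the crawl still
    # produces markdown/HTML, just no extracted_content payload.
    config = CrawlerRunConfig(extraction_strategy=NoExtractionStrategy())
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com", config=config)
        print(result.success, str(result.markdown)[:200])

asyncio.run(main())
```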
117 changes: 116 additions & 1 deletion crawl4ai/proxy_strategy.py
@@ -2,6 +2,11 @@
from abc import ABC, abstractmethod
from itertools import cycle
import os
import random
import time
import asyncio
import logging
from collections import defaultdict


########### ATTENTION PEOPLE OF EARTH ###########
@@ -131,7 +136,7 @@ def add_proxies(self, proxies: List[ProxyConfig]):
"""Add proxy configurations to the strategy"""
pass

class RoundRobinProxyStrategy:
class RoundRobinProxyStrategy(ProxyRotationStrategy):
"""Simple round-robin proxy rotation strategy using ProxyConfig objects"""

def __init__(self, proxies: List[ProxyConfig] = None):
@@ -156,3 +161,113 @@ async def get_next_proxy(self) -> Optional[ProxyConfig]:
if not self._proxy_cycle:
return None
return next(self._proxy_cycle)


class RandomProxyStrategy(ProxyRotationStrategy):
"""Random proxy selection strategy for unpredictable traffic patterns."""

def __init__(self, proxies: List[ProxyConfig] = None):
self._proxies = []
self._lock = asyncio.Lock()
if proxies:
self.add_proxies(proxies)

def add_proxies(self, proxies: List[ProxyConfig]):
"""Add new proxies to the rotation pool."""
self._proxies.extend(proxies)

async def get_next_proxy(self) -> Optional[ProxyConfig]:
"""Get randomly selected proxy."""
async with self._lock:
if not self._proxies:
return None
return random.choice(self._proxies)


class LeastUsedProxyStrategy(ProxyRotationStrategy):
"""Least used proxy strategy for optimal load distribution."""

def __init__(self, proxies: List[ProxyConfig] = None):
self._proxies = []
self._usage_count: Dict[str, int] = defaultdict(int)
self._lock = asyncio.Lock()
if proxies:
self.add_proxies(proxies)

def add_proxies(self, proxies: List[ProxyConfig]):
"""Add new proxies to the rotation pool."""
self._proxies.extend(proxies)
for proxy in proxies:
self._usage_count[proxy.server] = 0

async def get_next_proxy(self) -> Optional[ProxyConfig]:
"""Get least used proxy for optimal load balancing."""
async with self._lock:
if not self._proxies:
return None

# Find proxy with minimum usage
min_proxy = min(self._proxies, key=lambda p: self._usage_count[p.server])
self._usage_count[min_proxy.server] += 1
return min_proxy


class FailureAwareProxyStrategy(ProxyRotationStrategy):
"""Failure-aware proxy strategy with automatic recovery and health tracking."""

def __init__(self, proxies: List[ProxyConfig] = None, failure_threshold: int = 3, recovery_time: int = 300):
self._proxies = []
self._healthy_proxies = []
self._failure_count: Dict[str, int] = defaultdict(int)
self._last_failure_time: Dict[str, float] = defaultdict(float)
self._failure_threshold = failure_threshold
self._recovery_time = recovery_time # seconds
self._lock = asyncio.Lock()
if proxies:
self.add_proxies(proxies)

def add_proxies(self, proxies: List[ProxyConfig]):
"""Add new proxies to the rotation pool."""
self._proxies.extend(proxies)
self._healthy_proxies.extend(proxies)
for proxy in proxies:
self._failure_count[proxy.server] = 0

async def get_next_proxy(self) -> Optional[ProxyConfig]:
"""Get next healthy proxy with automatic recovery."""
async with self._lock:
# Recovery check: re-enable proxies after recovery_time
current_time = time.time()
recovered_proxies = []

for proxy in self._proxies:
if (proxy not in self._healthy_proxies and
current_time - self._last_failure_time[proxy.server] > self._recovery_time):
recovered_proxies.append(proxy)
self._failure_count[proxy.server] = 0

# Add recovered proxies back to healthy pool
self._healthy_proxies.extend(recovered_proxies)

# If no healthy proxies, reset all (emergency fallback)
if not self._healthy_proxies and self._proxies:
logging.warning("All proxies failed, resetting health status")
self._healthy_proxies = self._proxies.copy()
for proxy in self._proxies:
self._failure_count[proxy.server] = 0

if not self._healthy_proxies:
return None

return random.choice(self._healthy_proxies)

async def mark_proxy_failed(self, proxy: ProxyConfig):
"""Mark a proxy as failed and remove from healthy pool if threshold exceeded."""
async with self._lock:
self._failure_count[proxy.server] += 1
self._last_failure_time[proxy.server] = time.time()

if (self._failure_count[proxy.server] >= self._failure_threshold and
proxy in self._healthy_proxies):
self._healthy_proxies.remove(proxy)
logging.warning(f"Proxy {proxy.server} marked as unhealthy after {self._failure_count[proxy.server]} failures")
195 changes: 195 additions & 0 deletions crawl4ai/types_backup.py
@@ -0,0 +1,195 @@
from typing import TYPE_CHECKING, Union

# Logger types
AsyncLoggerBase = Union['AsyncLoggerBaseType']
AsyncLogger = Union['AsyncLoggerType']

# Crawler core types
AsyncWebCrawler = Union['AsyncWebCrawlerType']
CacheMode = Union['CacheModeType']
CrawlResult = Union['CrawlResultType']
CrawlerHub = Union['CrawlerHubType']
BrowserProfiler = Union['BrowserProfilerType']
# NEW: Add AsyncUrlSeederType
AsyncUrlSeeder = Union['AsyncUrlSeederType']

# Configuration types
BrowserConfig = Union['BrowserConfigType']
CrawlerRunConfig = Union['CrawlerRunConfigType']
HTTPCrawlerConfig = Union['HTTPCrawlerConfigType']
LLMConfig = Union['LLMConfigType']
# NEW: Add SeedingConfigType
SeedingConfig = Union['SeedingConfigType']

# Content scraping types
ContentScrapingStrategy = Union['ContentScrapingStrategyType']
LXMLWebScrapingStrategy = Union['LXMLWebScrapingStrategyType']
# Backward compatibility alias
WebScrapingStrategy = Union['LXMLWebScrapingStrategyType']

# Proxy types
ProxyRotationStrategy = Union['ProxyRotationStrategyType']
RoundRobinProxyStrategy = Union['RoundRobinProxyStrategyType']

# Extraction types
ExtractionStrategy = Union['ExtractionStrategyType']
LLMExtractionStrategy = Union['LLMExtractionStrategyType']
CosineStrategy = Union['CosineStrategyType']
JsonCssExtractionStrategy = Union['JsonCssExtractionStrategyType']
JsonXPathExtractionStrategy = Union['JsonXPathExtractionStrategyType']

# Chunking types
ChunkingStrategy = Union['ChunkingStrategyType']
RegexChunking = Union['RegexChunkingType']

# Markdown generation types
DefaultMarkdownGenerator = Union['DefaultMarkdownGeneratorType']
MarkdownGenerationResult = Union['MarkdownGenerationResultType']

# Content filter types
RelevantContentFilter = Union['RelevantContentFilterType']
PruningContentFilter = Union['PruningContentFilterType']
BM25ContentFilter = Union['BM25ContentFilterType']
LLMContentFilter = Union['LLMContentFilterType']

# Dispatcher types
BaseDispatcher = Union['BaseDispatcherType']
MemoryAdaptiveDispatcher = Union['MemoryAdaptiveDispatcherType']
SemaphoreDispatcher = Union['SemaphoreDispatcherType']
RateLimiter = Union['RateLimiterType']
CrawlerMonitor = Union['CrawlerMonitorType']
DisplayMode = Union['DisplayModeType']
RunManyReturn = Union['RunManyReturnType']

# Docker client
Crawl4aiDockerClient = Union['Crawl4aiDockerClientType']

# Deep crawling types
DeepCrawlStrategy = Union['DeepCrawlStrategyType']
BFSDeepCrawlStrategy = Union['BFSDeepCrawlStrategyType']
FilterChain = Union['FilterChainType']
ContentTypeFilter = Union['ContentTypeFilterType']
DomainFilter = Union['DomainFilterType']
URLFilter = Union['URLFilterType']
FilterStats = Union['FilterStatsType']
SEOFilter = Union['SEOFilterType']
KeywordRelevanceScorer = Union['KeywordRelevanceScorerType']
URLScorer = Union['URLScorerType']
CompositeScorer = Union['CompositeScorerType']
DomainAuthorityScorer = Union['DomainAuthorityScorerType']
FreshnessScorer = Union['FreshnessScorerType']
PathDepthScorer = Union['PathDepthScorerType']
BestFirstCrawlingStrategy = Union['BestFirstCrawlingStrategyType']
DFSDeepCrawlStrategy = Union['DFSDeepCrawlStrategyType']
DeepCrawlDecorator = Union['DeepCrawlDecoratorType']

# Only import types during type checking to avoid circular imports
if TYPE_CHECKING:
# Logger imports
from .async_logger import (
AsyncLoggerBase as AsyncLoggerBaseType,
AsyncLogger as AsyncLoggerType,
)

# Crawler core imports
from .async_webcrawler import (
AsyncWebCrawler as AsyncWebCrawlerType,
CacheMode as CacheModeType,
)
from .models import CrawlResult as CrawlResultType
from .hub import CrawlerHub as CrawlerHubType
from .browser_profiler import BrowserProfiler as BrowserProfilerType
# NEW: Import AsyncUrlSeeder for type checking
from .async_url_seeder import AsyncUrlSeeder as AsyncUrlSeederType

# Configuration imports
from .async_configs import (
BrowserConfig as BrowserConfigType,
CrawlerRunConfig as CrawlerRunConfigType,
HTTPCrawlerConfig as HTTPCrawlerConfigType,
LLMConfig as LLMConfigType,
# NEW: Import SeedingConfig for type checking
SeedingConfig as SeedingConfigType,
)

# Content scraping imports
from .content_scraping_strategy import (
ContentScrapingStrategy as ContentScrapingStrategyType,
LXMLWebScrapingStrategy as LXMLWebScrapingStrategyType,
)

# Proxy imports
from .proxy_strategy import (
ProxyRotationStrategy as ProxyRotationStrategyType,
RoundRobinProxyStrategy as RoundRobinProxyStrategyType,
)

# Extraction imports
from .extraction_strategy import (
ExtractionStrategy as ExtractionStrategyType,
LLMExtractionStrategy as LLMExtractionStrategyType,
CosineStrategy as CosineStrategyType,
JsonCssExtractionStrategy as JsonCssExtractionStrategyType,
JsonXPathExtractionStrategy as JsonXPathExtractionStrategyType,
)

# Chunking imports
from .chunking_strategy import (
ChunkingStrategy as ChunkingStrategyType,
RegexChunking as RegexChunkingType,
)

# Markdown generation imports
from .markdown_generation_strategy import (
DefaultMarkdownGenerator as DefaultMarkdownGeneratorType,
)
from .models import MarkdownGenerationResult as MarkdownGenerationResultType

# Content filter imports
from .content_filter_strategy import (
RelevantContentFilter as RelevantContentFilterType,
PruningContentFilter as PruningContentFilterType,
BM25ContentFilter as BM25ContentFilterType,
LLMContentFilter as LLMContentFilterType,
)

# Dispatcher imports
from .async_dispatcher import (
BaseDispatcher as BaseDispatcherType,
MemoryAdaptiveDispatcher as MemoryAdaptiveDispatcherType,
SemaphoreDispatcher as SemaphoreDispatcherType,
RateLimiter as RateLimiterType,
CrawlerMonitor as CrawlerMonitorType,
DisplayMode as DisplayModeType,
RunManyReturn as RunManyReturnType,
)

# Docker client
from .docker_client import Crawl4aiDockerClient as Crawl4aiDockerClientType

# Deep crawling imports
from .deep_crawling import (
DeepCrawlStrategy as DeepCrawlStrategyType,
BFSDeepCrawlStrategy as BFSDeepCrawlStrategyType,
FilterChain as FilterChainType,
ContentTypeFilter as ContentTypeFilterType,
DomainFilter as DomainFilterType,
URLFilter as URLFilterType,
FilterStats as FilterStatsType,
SEOFilter as SEOFilterType,
KeywordRelevanceScorer as KeywordRelevanceScorerType,
URLScorer as URLScorerType,
CompositeScorer as CompositeScorerType,
DomainAuthorityScorer as DomainAuthorityScorerType,
FreshnessScorer as FreshnessScorerType,
PathDepthScorer as PathDepthScorerType,
BestFirstCrawlingStrategy as BestFirstCrawlingStrategyType,
DFSDeepCrawlStrategy as DFSDeepCrawlStrategyType,
DeepCrawlDecorator as DeepCrawlDecoratorType,
)



def create_llm_config(*args, **kwargs) -> 'LLMConfigType':
from .async_configs import LLMConfig
return LLMConfig(*args, **kwargs)
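The `create_llm_config` helper defers the `async_configs` import to call time, which is what lets this types module avoid a circular import at load time. A small sketch of calling it (parameter names come from the existing `LLMConfig` API; the values are placeholders):

```python
# Sketch only: provider string and token are placeholders.
from crawl4ai.types_backup import create_llm_config

llm_config = create_llm_config(
    provider="openai/gpt-4o-mini",
    api_token="sk-...",
)
```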