Changes from all commits · 15 commits
1ea021b
feat(api): add seed URL endpoint and related request model
Ahmed-Tawfik94 Sep 30, 2025
bb3b290
chore: remove yoyo snapshot subproject and implemented adaptive craw…
Ahmed-Tawfik94 Sep 30, 2025
a62cfee
feat(adaptive-crawling): implement adaptive crawling endpoints and jo…
Ahmed-Tawfik94 Sep 30, 2025
1a8e023
feat(adaptive-crawling): implement adaptive crawling endpoints and in…
Ahmed-Tawfik94 Oct 1, 2025
a599db8
feat(docker): add routers directory to Dockerfile
Ahmed-Tawfik94 Oct 1, 2025
5dc34dd
feat: enhance crawling functionality with anti-bot strategies and hea…
Ahmed-Tawfik94 Oct 3, 2025
f00e8cb
Add demo script for proxy rotation and quick test suite
Ahmed-Tawfik94 Oct 6, 2025
201843a
Add comprehensive tests for anti-bot strategies and extended features
Ahmed-Tawfik94 Oct 7, 2025
8cca970
feat: add comprehensive type definitions and improve test coverage
Ahmed-Tawfik94 Oct 13, 2025
aebf5a3
Add link analysis tests and integration tests for /links/analyze endp…
Ahmed-Tawfik94 Oct 14, 2025
674d074
feat: Add HTTP-only crawling endpoints and related models
Ahmed-Tawfik94 Oct 15, 2025
74eeff4
feat: Add comprehensive tests for URL discovery and virtual scroll fu…
Ahmed-Tawfik94 Oct 16, 2025
3877335
Profiling/monitoring: Add interactive monitoring dashboard and integr…
Ahmed-Tawfik94 Oct 16, 2025
00e9904
feat: Add table extraction strategies and API documentation
Ahmed-Tawfik94 Oct 17, 2025
a3b02be
#1564 fix: Improve error handling in browser configuration serializat…
Ahmed-Tawfik94 Oct 27, 2025
8 changes: 7 additions & 1 deletion .gitignore
@@ -1,6 +1,9 @@
# Scripts folder (private tools)
.scripts/

# Docker automation scripts (personal use)
docker-scripts/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@@ -270,4 +273,7 @@ docs/**/data
.codecat/

docs/apps/linkdin/debug*/
docs/apps/linkdin/samples/insights/*
docs/apps/linkdin/samples/insights/*
.yoyo/
.github/instructions/instructions.instructions.md
.kilocode/mcp.json
2 changes: 1 addition & 1 deletion Dockerfile
@@ -124,7 +124,7 @@ COPY . /tmp/project/

# Copy supervisor config first (might need root later, but okay for now)
COPY deploy/docker/supervisord.conf .

COPY deploy/docker/routers ./routers
COPY deploy/docker/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

4 changes: 3 additions & 1 deletion crawl4ai/__init__.py
@@ -25,7 +25,8 @@
JsonCssExtractionStrategy,
JsonXPathExtractionStrategy,
JsonLxmlExtractionStrategy,
RegexExtractionStrategy
RegexExtractionStrategy,
NoExtractionStrategy, # NEW: Import NoExtractionStrategy
)
from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import DefaultMarkdownGenerator
@@ -113,6 +114,7 @@
"BrowserProfiler",
"LLMConfig",
"GeolocationConfig",
"NoExtractionStrategy",
# NEW: Add SeedingConfig and VirtualScrollConfig
"SeedingConfig",
"VirtualScrollConfig",
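Since `NoExtractionStrategy` is now exported from the package root, a crawl can opt out of structured extraction without reaching into `crawl4ai.extraction_strategy`. A minimal sketch of that usage (the URL is a placeholder; the rest follows the existing `AsyncWebCrawler`/`CrawlerRunConfig` API):

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, NoExtractionStrategy

async def main():
    # NoExtractionStrategy skips structured extraction; the crawl still
    # produces markdown/HTML, just no extracted_content payload.
    config = CrawlerRunConfig(extraction_strategy=NoExtractionStrategy())
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com", config=config)
        print(result.success, str(result.markdown)[:200])

asyncio.run(main())
```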
117 changes: 116 additions & 1 deletion crawl4ai/proxy_strategy.py
@@ -2,6 +2,11 @@
from abc import ABC, abstractmethod
from itertools import cycle
import os
import random
import time
import asyncio
import logging
from collections import defaultdict


########### ATTENTION PEOPLE OF EARTH ###########
@@ -131,7 +136,7 @@ def add_proxies(self, proxies: List[ProxyConfig]):
"""Add proxy configurations to the strategy"""
pass

class RoundRobinProxyStrategy:
class RoundRobinProxyStrategy(ProxyRotationStrategy):
"""Simple round-robin proxy rotation strategy using ProxyConfig objects"""

def __init__(self, proxies: List[ProxyConfig] = None):
@@ -156,3 +161,113 @@ async def get_next_proxy(self) -> Optional[ProxyConfig]:
if not self._proxy_cycle:
return None
return next(self._proxy_cycle)


class RandomProxyStrategy(ProxyRotationStrategy):
"""Random proxy selection strategy for unpredictable traffic patterns."""

def __init__(self, proxies: List[ProxyConfig] = None):
self._proxies = []
self._lock = asyncio.Lock()
if proxies:
self.add_proxies(proxies)

def add_proxies(self, proxies: List[ProxyConfig]):
"""Add new proxies to the rotation pool."""
self._proxies.extend(proxies)

async def get_next_proxy(self) -> Optional[ProxyConfig]:
"""Get randomly selected proxy."""
async with self._lock:
if not self._proxies:
return None
return random.choice(self._proxies)


class LeastUsedProxyStrategy(ProxyRotationStrategy):
"""Least used proxy strategy for optimal load distribution."""

def __init__(self, proxies: List[ProxyConfig] = None):
self._proxies = []
self._usage_count: Dict[str, int] = defaultdict(int)
self._lock = asyncio.Lock()
if proxies:
self.add_proxies(proxies)

def add_proxies(self, proxies: List[ProxyConfig]):
"""Add new proxies to the rotation pool."""
self._proxies.extend(proxies)
for proxy in proxies:
self._usage_count[proxy.server] = 0

async def get_next_proxy(self) -> Optional[ProxyConfig]:
"""Get least used proxy for optimal load balancing."""
async with self._lock:
if not self._proxies:
return None

# Find proxy with minimum usage
min_proxy = min(self._proxies, key=lambda p: self._usage_count[p.server])
self._usage_count[min_proxy.server] += 1
return min_proxy


class FailureAwareProxyStrategy(ProxyRotationStrategy):
"""Failure-aware proxy strategy with automatic recovery and health tracking."""

def __init__(self, proxies: List[ProxyConfig] = None, failure_threshold: int = 3, recovery_time: int = 300):
self._proxies = []
self._healthy_proxies = []
self._failure_count: Dict[str, int] = defaultdict(int)
self._last_failure_time: Dict[str, float] = defaultdict(float)
self._failure_threshold = failure_threshold
self._recovery_time = recovery_time # seconds
self._lock = asyncio.Lock()
if proxies:
self.add_proxies(proxies)

def add_proxies(self, proxies: List[ProxyConfig]):
"""Add new proxies to the rotation pool."""
self._proxies.extend(proxies)
self._healthy_proxies.extend(proxies)
for proxy in proxies:
self._failure_count[proxy.server] = 0

async def get_next_proxy(self) -> Optional[ProxyConfig]:
"""Get next healthy proxy with automatic recovery."""
async with self._lock:
# Recovery check: re-enable proxies after recovery_time
current_time = time.time()
recovered_proxies = []

for proxy in self._proxies:
if (proxy not in self._healthy_proxies and
current_time - self._last_failure_time[proxy.server] > self._recovery_time):
recovered_proxies.append(proxy)
self._failure_count[proxy.server] = 0

# Add recovered proxies back to healthy pool
self._healthy_proxies.extend(recovered_proxies)

# If no healthy proxies, reset all (emergency fallback)
if not self._healthy_proxies and self._proxies:
logging.warning("All proxies failed, resetting health status")
self._healthy_proxies = self._proxies.copy()
for proxy in self._proxies:
self._failure_count[proxy.server] = 0

if not self._healthy_proxies:
return None

return random.choice(self._healthy_proxies)

async def mark_proxy_failed(self, proxy: ProxyConfig):
"""Mark a proxy as failed and remove from healthy pool if threshold exceeded."""
async with self._lock:
self._failure_count[proxy.server] += 1
self._last_failure_time[proxy.server] = time.time()

if (self._failure_count[proxy.server] >= self._failure_threshold and
proxy in self._healthy_proxies):
self._healthy_proxies.remove(proxy)
logging.warning(f"Proxy {proxy.server} marked as unhealthy after {self._failure_count[proxy.server]} failures")
195 changes: 195 additions & 0 deletions crawl4ai/types_backup.py
@@ -0,0 +1,195 @@
from typing import TYPE_CHECKING, Union

# Logger types
AsyncLoggerBase = Union['AsyncLoggerBaseType']
AsyncLogger = Union['AsyncLoggerType']

# Crawler core types
AsyncWebCrawler = Union['AsyncWebCrawlerType']
CacheMode = Union['CacheModeType']
CrawlResult = Union['CrawlResultType']
CrawlerHub = Union['CrawlerHubType']
BrowserProfiler = Union['BrowserProfilerType']
# NEW: Add AsyncUrlSeederType
AsyncUrlSeeder = Union['AsyncUrlSeederType']

# Configuration types
BrowserConfig = Union['BrowserConfigType']
CrawlerRunConfig = Union['CrawlerRunConfigType']
HTTPCrawlerConfig = Union['HTTPCrawlerConfigType']
LLMConfig = Union['LLMConfigType']
# NEW: Add SeedingConfigType
SeedingConfig = Union['SeedingConfigType']

# Content scraping types
ContentScrapingStrategy = Union['ContentScrapingStrategyType']
LXMLWebScrapingStrategy = Union['LXMLWebScrapingStrategyType']
# Backward compatibility alias
WebScrapingStrategy = Union['LXMLWebScrapingStrategyType']

# Proxy types
ProxyRotationStrategy = Union['ProxyRotationStrategyType']
RoundRobinProxyStrategy = Union['RoundRobinProxyStrategyType']

# Extraction types
ExtractionStrategy = Union['ExtractionStrategyType']
LLMExtractionStrategy = Union['LLMExtractionStrategyType']
CosineStrategy = Union['CosineStrategyType']
JsonCssExtractionStrategy = Union['JsonCssExtractionStrategyType']
JsonXPathExtractionStrategy = Union['JsonXPathExtractionStrategyType']

# Chunking types
ChunkingStrategy = Union['ChunkingStrategyType']
RegexChunking = Union['RegexChunkingType']

# Markdown generation types
DefaultMarkdownGenerator = Union['DefaultMarkdownGeneratorType']
MarkdownGenerationResult = Union['MarkdownGenerationResultType']

# Content filter types
RelevantContentFilter = Union['RelevantContentFilterType']
PruningContentFilter = Union['PruningContentFilterType']
BM25ContentFilter = Union['BM25ContentFilterType']
LLMContentFilter = Union['LLMContentFilterType']

# Dispatcher types
BaseDispatcher = Union['BaseDispatcherType']
MemoryAdaptiveDispatcher = Union['MemoryAdaptiveDispatcherType']
SemaphoreDispatcher = Union['SemaphoreDispatcherType']
RateLimiter = Union['RateLimiterType']
CrawlerMonitor = Union['CrawlerMonitorType']
DisplayMode = Union['DisplayModeType']
RunManyReturn = Union['RunManyReturnType']

# Docker client
Crawl4aiDockerClient = Union['Crawl4aiDockerClientType']

# Deep crawling types
DeepCrawlStrategy = Union['DeepCrawlStrategyType']
BFSDeepCrawlStrategy = Union['BFSDeepCrawlStrategyType']
FilterChain = Union['FilterChainType']
ContentTypeFilter = Union['ContentTypeFilterType']
DomainFilter = Union['DomainFilterType']
URLFilter = Union['URLFilterType']
FilterStats = Union['FilterStatsType']
SEOFilter = Union['SEOFilterType']
KeywordRelevanceScorer = Union['KeywordRelevanceScorerType']
URLScorer = Union['URLScorerType']
CompositeScorer = Union['CompositeScorerType']
DomainAuthorityScorer = Union['DomainAuthorityScorerType']
FreshnessScorer = Union['FreshnessScorerType']
PathDepthScorer = Union['PathDepthScorerType']
BestFirstCrawlingStrategy = Union['BestFirstCrawlingStrategyType']
DFSDeepCrawlStrategy = Union['DFSDeepCrawlStrategyType']
DeepCrawlDecorator = Union['DeepCrawlDecoratorType']

# Only import types during type checking to avoid circular imports
if TYPE_CHECKING:
# Logger imports
from .async_logger import (
AsyncLoggerBase as AsyncLoggerBaseType,
AsyncLogger as AsyncLoggerType,
)

# Crawler core imports
from .async_webcrawler import (
AsyncWebCrawler as AsyncWebCrawlerType,
CacheMode as CacheModeType,
)
from .models import CrawlResult as CrawlResultType
from .hub import CrawlerHub as CrawlerHubType
from .browser_profiler import BrowserProfiler as BrowserProfilerType
# NEW: Import AsyncUrlSeeder for type checking
from .async_url_seeder import AsyncUrlSeeder as AsyncUrlSeederType

# Configuration imports
from .async_configs import (
BrowserConfig as BrowserConfigType,
CrawlerRunConfig as CrawlerRunConfigType,
HTTPCrawlerConfig as HTTPCrawlerConfigType,
LLMConfig as LLMConfigType,
# NEW: Import SeedingConfig for type checking
SeedingConfig as SeedingConfigType,
)

# Content scraping imports
from .content_scraping_strategy import (
ContentScrapingStrategy as ContentScrapingStrategyType,
LXMLWebScrapingStrategy as LXMLWebScrapingStrategyType,
)

# Proxy imports
from .proxy_strategy import (
ProxyRotationStrategy as ProxyRotationStrategyType,
RoundRobinProxyStrategy as RoundRobinProxyStrategyType,
)

# Extraction imports
from .extraction_strategy import (
ExtractionStrategy as ExtractionStrategyType,
LLMExtractionStrategy as LLMExtractionStrategyType,
CosineStrategy as CosineStrategyType,
JsonCssExtractionStrategy as JsonCssExtractionStrategyType,
JsonXPathExtractionStrategy as JsonXPathExtractionStrategyType,
)

# Chunking imports
from .chunking_strategy import (
ChunkingStrategy as ChunkingStrategyType,
RegexChunking as RegexChunkingType,
)

# Markdown generation imports
from .markdown_generation_strategy import (
DefaultMarkdownGenerator as DefaultMarkdownGeneratorType,
)
from .models import MarkdownGenerationResult as MarkdownGenerationResultType

# Content filter imports
from .content_filter_strategy import (
RelevantContentFilter as RelevantContentFilterType,
PruningContentFilter as PruningContentFilterType,
BM25ContentFilter as BM25ContentFilterType,
LLMContentFilter as LLMContentFilterType,
)

# Dispatcher imports
from .async_dispatcher import (
BaseDispatcher as BaseDispatcherType,
MemoryAdaptiveDispatcher as MemoryAdaptiveDispatcherType,
SemaphoreDispatcher as SemaphoreDispatcherType,
RateLimiter as RateLimiterType,
CrawlerMonitor as CrawlerMonitorType,
DisplayMode as DisplayModeType,
RunManyReturn as RunManyReturnType,
)

# Docker client
from .docker_client import Crawl4aiDockerClient as Crawl4aiDockerClientType

# Deep crawling imports
from .deep_crawling import (
DeepCrawlStrategy as DeepCrawlStrategyType,
BFSDeepCrawlStrategy as BFSDeepCrawlStrategyType,
FilterChain as FilterChainType,
ContentTypeFilter as ContentTypeFilterType,
DomainFilter as DomainFilterType,
URLFilter as URLFilterType,
FilterStats as FilterStatsType,
SEOFilter as SEOFilterType,
KeywordRelevanceScorer as KeywordRelevanceScorerType,
URLScorer as URLScorerType,
CompositeScorer as CompositeScorerType,
DomainAuthorityScorer as DomainAuthorityScorerType,
FreshnessScorer as FreshnessScorerType,
PathDepthScorer as PathDepthScorerType,
BestFirstCrawlingStrategy as BestFirstCrawlingStrategyType,
DFSDeepCrawlStrategy as DFSDeepCrawlStrategyType,
DeepCrawlDecorator as DeepCrawlDecoratorType,
)



def create_llm_config(*args, **kwargs) -> 'LLMConfigType':
from .async_configs import LLMConfig
return LLMConfig(*args, **kwargs)
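The `create_llm_config` helper defers the `async_configs` import to call time, which is what lets this types module avoid a circular import at load time. A small sketch of calling it (parameter names come from the existing `LLMConfig` API; the values are placeholders):

```python
# Sketch only: provider string and token are placeholders.
from crawl4ai.types_backup import create_llm_config

llm_config = create_llm_config(
    provider="openai/gpt-4o-mini",
    api_token="sk-...",
)
```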