diff --git a/CHANGELOG.md b/CHANGELOG.md index ce63516f9..82925256e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- **⏱️ Crawl-delay Directive Support**: New `respect_crawl_delay` configuration parameter + - Honors `Crawl-delay` directives from robots.txt files + - Automatically waits the specified delay between requests to the same domain + - Per-domain crawl-delay caching for efficiency + - Shared HTTP session with connection pooling for robots.txt fetching + - Race-condition safe domain initialization with asyncio locks + - Works with `arun_many()` for batch crawling scenarios + - Fully backward compatible with opt-in flag (default: `False`) + - **🔒 HTTPS Preservation for Internal Links**: New `preserve_https_for_internal_links` configuration flag - Maintains HTTPS scheme for internal links even when servers redirect to HTTP - Prevents security downgrades during deep crawling diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 10cc48d08..f8eba046b 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -1136,6 +1136,10 @@ class CrawlerRunConfig(): check_robots_txt (bool): Whether to check robots.txt rules before crawling. Default: False Default: False. + respect_crawl_delay (bool): Whether to respect Crawl-delay directives from robots.txt. + When True, the crawler will wait the specified delay between + requests to the same domain. Requires check_robots_txt=True. + Default: False. user_agent (str): Custom User-Agent string to use. Default: None. user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided user_agent as-is. @@ -1247,6 +1251,7 @@ def __init__( stream: bool = False, url: str = None, check_robots_txt: bool = False, + respect_crawl_delay: bool = False, user_agent: str = None, user_agent_mode: str = None, user_agent_generator_config: dict = {}, @@ -1375,6 +1380,7 @@ def __init__( # Robots.txt Handling Parameters self.check_robots_txt = check_robots_txt + self.respect_crawl_delay = respect_crawl_delay # User Agent Parameters self.user_agent = user_agent @@ -1644,6 +1650,7 @@ def from_kwargs(kwargs: dict) -> "CrawlerRunConfig": method=kwargs.get("method", "GET"), stream=kwargs.get("stream", False), check_robots_txt=kwargs.get("check_robots_txt", False), + respect_crawl_delay=kwargs.get("respect_crawl_delay", False), user_agent=kwargs.get("user_agent"), user_agent_mode=kwargs.get("user_agent_mode"), user_agent_generator_config=kwargs.get("user_agent_generator_config", {}), @@ -1748,6 +1755,7 @@ def to_dict(self): "method": self.method, "stream": self.stream, "check_robots_txt": self.check_robots_txt, + "respect_crawl_delay": self.respect_crawl_delay, "user_agent": self.user_agent, "user_agent_mode": self.user_agent_mode, "user_agent_generator_config": self.user_agent_generator_config, diff --git a/crawl4ai/async_dispatcher.py b/crawl4ai/async_dispatcher.py index bd44557c7..ebf8160fb 100644 --- a/crawl4ai/async_dispatcher.py +++ b/crawl4ai/async_dispatcher.py @@ -1,4 +1,4 @@ -from typing import Dict, Optional, List, Tuple, Union +from typing import Dict, Optional, List, Tuple, Union, TYPE_CHECKING from .async_configs import CrawlerRunConfig from .models import ( CrawlResult, @@ -11,6 +11,9 @@ from .types import AsyncWebCrawler +if TYPE_CHECKING: + from .utils import RobotsParser + from collections.abc import AsyncGenerator import time @@ -32,32 +35,77 @@ def __init__( 
max_delay: float = 60.0, max_retries: int = 3, rate_limit_codes: List[int] = None, + robots_parser: Optional["RobotsParser"] = None, + respect_crawl_delay: bool = False, + default_user_agent: str = "*", ): self.base_delay = base_delay self.max_delay = max_delay self.max_retries = max_retries self.rate_limit_codes = rate_limit_codes or [429, 503] self.domains: Dict[str, DomainState] = {} + self.robots_parser = robots_parser + self.respect_crawl_delay = respect_crawl_delay + self.default_user_agent = default_user_agent + # Lock to prevent race conditions when initializing new domains + self._domain_init_lock = asyncio.Lock() def get_domain(self, url: str) -> str: return urlparse(url).netloc + async def _get_crawl_delay_for_domain(self, url: str) -> Optional[float]: + """Fetch and cache crawl-delay for a domain from robots.txt.""" + if not self.robots_parser or not self.respect_crawl_delay: + return None + + domain = self.get_domain(url) + state = self.domains.get(domain) + + # If we already have crawl_delay cached for this domain, return it + if state and state.crawl_delay is not None: + return state.crawl_delay + + # Fetch crawl-delay from robots.txt + try: + delay = await self.robots_parser.get_crawl_delay(url, self.default_user_agent) + return delay + except Exception: + return None + async def wait_if_needed(self, url: str) -> None: domain = self.get_domain(url) state = self.domains.get(domain) + # Initialize new domain with lock to prevent race conditions if not state: - self.domains[domain] = DomainState() - state = self.domains[domain] + async with self._domain_init_lock: + # Double-check after acquiring lock + state = self.domains.get(domain) + if not state: + state = DomainState() + + # Fetch crawl-delay before adding to domains dict + if self.robots_parser and self.respect_crawl_delay: + crawl_delay = await self._get_crawl_delay_for_domain(url) + state.crawl_delay = crawl_delay + + self.domains[domain] = state now = time.time() + + # Determine the effective delay - use crawl_delay if specified, otherwise current_delay + effective_delay = state.current_delay + if state.crawl_delay is not None and state.crawl_delay > 0: + # Use the larger of crawl_delay and current_delay (which may be increased due to rate limiting) + effective_delay = max(state.crawl_delay, state.current_delay) + if state.last_request_time: - wait_time = max(0, state.current_delay - (now - state.last_request_time)) + wait_time = max(0, effective_delay - (now - state.last_request_time)) if wait_time > 0: await asyncio.sleep(wait_time) - # Random delay within base range if no current delay - if state.current_delay == 0: + # Random delay within base range if no current delay and no crawl_delay + if state.current_delay == 0 and (state.crawl_delay is None or state.crawl_delay == 0): state.current_delay = random.uniform(*self.base_delay) state.last_request_time = time.time() @@ -65,6 +113,9 @@ async def wait_if_needed(self, url: str) -> None: def update_delay(self, url: str, status_code: int) -> bool: domain = self.get_domain(url) state = self.domains[domain] + + # Get minimum delay from crawl_delay if set + min_delay = state.crawl_delay if state.crawl_delay is not None else 0 if status_code in self.rate_limit_codes: state.fail_count += 1 @@ -76,10 +127,10 @@ def update_delay(self, url: str, status_code: int) -> bool: state.current_delay * 2 * random.uniform(0.75, 1.25), self.max_delay ) else: - # Gradually reduce delay on success - state.current_delay = max( - random.uniform(*self.base_delay), state.current_delay * 0.75 
- ) + # Gradually reduce delay on success, but never below crawl_delay + base = random.uniform(*self.base_delay) + reduced = state.current_delay * 0.75 + state.current_delay = max(base, reduced, min_delay) state.fail_count = 0 return True diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 4dc52adc1..e0bcf8da6 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -730,9 +730,30 @@ async def arun_many( # ) if dispatcher is None: + # Determine if we need to respect crawl-delay + respect_crawl_delay = False + user_agent = self.browser_config.user_agent or "*" + + if isinstance(config, list): + respect_crawl_delay = any(c.respect_crawl_delay for c in config) + # Use user_agent from first config that has one + for c in config: + if c.user_agent: + user_agent = c.user_agent + break + else: + respect_crawl_delay = config.respect_crawl_delay + if config.user_agent: + user_agent = config.user_agent + dispatcher = MemoryAdaptiveDispatcher( rate_limiter=RateLimiter( - base_delay=(1.0, 3.0), max_delay=60.0, max_retries=3 + base_delay=(1.0, 3.0), + max_delay=60.0, + max_retries=3, + robots_parser=self.robots_parser if respect_crawl_delay else None, + respect_crawl_delay=respect_crawl_delay, + default_user_agent=user_agent, ), ) diff --git a/crawl4ai/models.py b/crawl4ai/models.py index e46bb7fa8..845354cb5 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -17,6 +17,7 @@ class DomainState: last_request_time: float = 0 current_delay: float = 0 fail_count: int = 0 + crawl_delay: Optional[float] = None # Crawl-delay from robots.txt @dataclass diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 742160954..9d735a131 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -259,6 +259,11 @@ def __init__(self, cache_dir=None, cache_ttl=None): os.makedirs(self.cache_dir, exist_ok=True) self.db_path = os.path.join(self.cache_dir, "robots_cache.db") self._init_db() + + # Shared session for efficient connection pooling + self._session: Optional[aiohttp.ClientSession] = None + # Track in-flight requests to prevent duplicate fetches + self._pending_fetches: Dict[str, asyncio.Future] = {} def _init_db(self): # Use WAL mode for better concurrency and performance @@ -274,6 +279,54 @@ def _init_db(self): """) conn.execute("CREATE INDEX IF NOT EXISTS idx_domain ON robots_cache(domain)") + async def _get_session(self) -> aiohttp.ClientSession: + """Get or create a shared aiohttp session for connection pooling.""" + if self._session is None or self._session.closed: + timeout = aiohttp.ClientTimeout(total=2) + connector = aiohttp.TCPConnector(limit=100, limit_per_host=2) + self._session = aiohttp.ClientSession(timeout=timeout, connector=connector) + return self._session + + async def _fetch_robots_txt(self, domain: str, scheme: str) -> Optional[str]: + """ + Fetch robots.txt with deduplication of in-flight requests. + Multiple concurrent requests for the same domain will share the result. 
+ """ + # If there's already a pending fetch for this domain, wait for it + if domain in self._pending_fetches: + return await self._pending_fetches[domain] + + # Create a future for this fetch so others can wait + loop = asyncio.get_event_loop() + future = loop.create_future() + self._pending_fetches[domain] = future + + try: + robots_url = f"{scheme}://{domain}/robots.txt" + session = await self._get_session() + + async with session.get(robots_url, ssl=False) as response: + if response.status == 200: + rules = await response.text() + self._cache_rules(domain, rules) + future.set_result(rules) + return rules + else: + future.set_result(None) + return None + except Exception as _ex: + future.set_result(None) + return None + finally: + # Clean up the pending fetch + self._pending_fetches.pop(domain, None) + + async def close(self): + """Close the shared session. Call this when done with the parser.""" + if self._session and not self._session.closed: + await self._session.close() + self._session = None + def _get_cached_rules(self, domain: str) -> tuple[str, bool]: """Get cached rules. Returns (rules, is_fresh)""" with sqlite3.connect(self.db_path) as conn: @@ -335,20 +388,9 @@ async def can_fetch(self, url: str, user_agent: str = "*") -> bool: # If rules not found or stale, fetch new ones if not is_fresh: - try: - # Ensure we use the same scheme as the input URL - scheme = parsed.scheme or 'http' - robots_url = f"{scheme}://{domain}/robots.txt" - - async with aiohttp.ClientSession() as session: - async with session.get(robots_url, timeout=2, ssl=False) as response: - if response.status == 200: - rules = await response.text() - self._cache_rules(domain, rules) - else: - return True - except Exception as _ex: - # On any error (timeout, connection failed, etc), allow access + scheme = parsed.scheme or 'http' + rules = await self._fetch_robots_txt(domain, scheme) + if rules is None: return True if not rules: @@ -364,6 +406,44 @@ async def can_fetch(self, url: str, user_agent: str = "*") -> bool: return parser.can_fetch(user_agent, url) + async def get_crawl_delay(self, url: str, user_agent: str = "*") -> Optional[float]: + """ + Get the Crawl-delay directive from robots.txt for a URL/user-agent. + + Args: + url: The URL to check (used to determine the domain) + user_agent: User agent string to check against (default: "*") + + Returns: + float: Crawl delay in seconds, or None if not specified + """ + # Handle empty/invalid URLs + try: + parsed = urlparse(url) + domain = parsed.netloc + if not domain: + return None + except Exception as _ex: + return None + + # Check cache first + rules, is_fresh = self._get_cached_rules(domain) + + # If rules not found or stale, fetch new ones + if not is_fresh: + scheme = parsed.scheme or 'http' + rules = await self._fetch_robots_txt(domain, scheme) + + if not rules: + return None + + # Create parser and extract crawl-delay + parser = RobotFileParser() + parser.parse(rules.splitlines()) + + delay = parser.crawl_delay(user_agent) + return float(delay) if delay is not None else None + def clear_cache(self): """Clear all cached robots.txt entries""" with sqlite3.connect(self.db_path) as conn: diff --git a/docs/md_v2/api/arun.md b/docs/md_v2/api/arun.md index a3086a8cc..d61993629 100644 --- a/docs/md_v2/api/arun.md +++ b/docs/md_v2/api/arun.md @@ -23,7 +23,8 @@ async def main(): verbose=True, # Detailed logging cache_mode=CacheMode.ENABLED, # Use normal read/write cache check_robots_txt=True, # Respect robots.txt rules - # ... 
other parameters + respect_crawl_delay=True, # Honor Crawl-delay directives + # ... other parameters ) async with AsyncWebCrawler() as crawler: @@ -232,7 +233,8 @@ async def main(): # Core verbose=True, cache_mode=CacheMode.ENABLED, - check_robots_txt=True, # Respect robots.txt rules + check_robots_txt=True, # Respect robots.txt rules + respect_crawl_delay=True, # Honor Crawl-delay directives # Content word_count_threshold=10, diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md index 9d9075166..7e28a67ff 100644 --- a/docs/md_v2/api/parameters.md +++ b/docs/md_v2/api/parameters.md @@ -140,6 +140,7 @@ Use these for controlling whether you read or write from a local content cache. | **`wait_for_images`** | `bool` (False) | Wait for images to load before finishing. Slows down if you only want text. | | **`delay_before_return_html`** | `float` (0.1) | Additional pause (seconds) before final HTML is captured. Good for last-second updates. | | **`check_robots_txt`** | `bool` (False) | Whether to check and respect robots.txt rules before crawling. If True, caches robots.txt for efficiency. | +| **`respect_crawl_delay`** | `bool` (False) | Whether to honor `Crawl-delay` directives from robots.txt. Requires `check_robots_txt=True`. Used with `arun_many()`. | | **`mean_delay`** and **`max_range`** | `float` (0.1, 0.3) | If you call `arun_many()`, these define random delay intervals between crawls, helping avoid detection or rate limits. | | **`semaphore_count`** | `int` (5) | Max concurrency for `arun_many()`. Increase if you have resources for parallel crawls. | @@ -416,12 +417,14 @@ if __name__ == "__main__": | **Parameter** | **Type / Default** | **What It Does** | |-----------------------|-------------------------|----------------------------------------------------------------------------------------------------------------------| | **`check_robots_txt`**| `bool` (False) | When True, checks and respects robots.txt rules before crawling. Uses efficient caching with SQLite backend. | -| **`user_agent`** | `str` (None) | User agent string to identify your crawler. Used for robots.txt checking when enabled. | +| **`respect_crawl_delay`**| `bool` (False) | When True (and `check_robots_txt=True`), honors `Crawl-delay` directives from robots.txt. Waits the specified delay between requests to the same domain. | +| **`user_agent`** | `str` (None) | User agent string to identify your crawler. Used for robots.txt checking and crawl-delay lookup. | ```python run_config = CrawlerRunConfig( - check_robots_txt=True, # Enable robots.txt compliance - user_agent="MyBot/1.0" # Identify your crawler + check_robots_txt=True, # Enable robots.txt compliance + respect_crawl_delay=True, # Honor Crawl-delay directives + user_agent="MyBot/1.0" # Identify your crawler ) ``` diff --git a/docs/md_v2/complete-sdk-reference.md b/docs/md_v2/complete-sdk-reference.md index 7e6abf5c8..0df02370e 100644 --- a/docs/md_v2/complete-sdk-reference.md +++ b/docs/md_v2/complete-sdk-reference.md @@ -711,6 +711,7 @@ async def main(): verbose=True, # Detailed logging cache_mode=CacheMode.ENABLED, # Use normal read/write cache check_robots_txt=True, # Respect robots.txt rules + respect_crawl_delay=True, # Honor Crawl-delay directives # ... 
other parameters ) @@ -864,7 +865,8 @@ async def main(): # Core verbose=True, cache_mode=CacheMode.ENABLED, - check_robots_txt=True, # Respect robots.txt rules + check_robots_txt=True, # Respect robots.txt rules + respect_crawl_delay=True, # Honor Crawl-delay directives # Content word_count_threshold=10, @@ -1775,6 +1777,7 @@ run_cfg = CrawlerRunConfig( | **`wait_for_images`** | `bool` (False) | Wait for images to load before finishing. Slows down if you only want text. | | **`delay_before_return_html`** | `float` (0.1) | Additional pause (seconds) before final HTML is captured. Good for last-second updates. | | **`check_robots_txt`** | `bool` (False) | Whether to check and respect robots.txt rules before crawling. If True, caches robots.txt for efficiency. | +| **`respect_crawl_delay`** | `bool` (False) | Whether to honor `Crawl-delay` directives from robots.txt. Requires `check_robots_txt=True`. Used with `arun_many()`. | | **`mean_delay`** and **`max_range`** | `float` (0.1, 0.3) | If you call `arun_many()`, these define random delay intervals between crawls, helping avoid detection or rate limits. | | **`semaphore_count`** | `int` (5) | Max concurrency for `arun_many()`. Increase if you have resources for parallel crawls. | ### D) **Page Interaction** @@ -1962,11 +1965,13 @@ if __name__ == "__main__": | **Parameter** | **Type / Default** | **What It Does** | |-----------------------|-------------------------|----------------------------------------------------------------------------------------------------------------------| | **`check_robots_txt`**| `bool` (False) | When True, checks and respects robots.txt rules before crawling. Uses efficient caching with SQLite backend. | -| **`user_agent`** | `str` (None) | User agent string to identify your crawler. Used for robots.txt checking when enabled. | +| **`respect_crawl_delay`**| `bool` (False) | When True (and `check_robots_txt=True`), honors `Crawl-delay` directives from robots.txt. Waits the specified delay between requests to the same domain. | +| **`user_agent`** | `str` (None) | User agent string to identify your crawler. Used for robots.txt checking and crawl-delay lookup. | ```python run_config = CrawlerRunConfig( - check_robots_txt=True, # Enable robots.txt compliance - user_agent="MyBot/1.0" # Identify your crawler + check_robots_txt=True, # Enable robots.txt compliance + respect_crawl_delay=True, # Honor Crawl-delay directives + user_agent="MyBot/1.0" # Identify your crawler ) ``` # 3. **LLMConfig** - Setting up LLM providers diff --git a/tests/general/test_crawl_delay.py b/tests/general/test_crawl_delay.py new file mode 100644 index 000000000..c5ff62b81 --- /dev/null +++ b/tests/general/test_crawl_delay.py @@ -0,0 +1,335 @@ +""" +Test cases for Crawl-delay directive support in crawl4ai. + +This module tests the respect_crawl_delay feature which allows the crawler +to honor Crawl-delay directives from robots.txt files. 
+""" + +import asyncio +import tempfile +import shutil +import time +import os + +import pytest +import pytest_asyncio +from aiohttp import web + +from crawl4ai.utils import RobotsParser +from crawl4ai.async_dispatcher import RateLimiter +from crawl4ai.models import DomainState + + +class TestRobotsParserCrawlDelay: + """Test cases for RobotsParser.get_crawl_delay() method.""" + + @pytest.fixture + def temp_cache_dir(self): + """Create a temporary directory for cache.""" + temp_dir = tempfile.mkdtemp() + yield temp_dir + shutil.rmtree(temp_dir) + + @pytest_asyncio.fixture + async def test_server_with_delay(self): + """Start a test HTTP server with crawl-delay in robots.txt.""" + app = web.Application() + + async def robots_with_delay(request): + return web.Response(text="""User-agent: * +Crawl-delay: 5 +Disallow: /private/ +""") + + app.router.add_get('/robots.txt', robots_with_delay) + + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, 'localhost', 8181) + await site.start() + + yield "http://localhost:8181" + + await runner.cleanup() + + @pytest_asyncio.fixture + async def test_server_bot_specific(self): + """Start a test HTTP server with bot-specific crawl-delay.""" + app = web.Application() + + async def robots_with_bot_specific_delay(request): + return web.Response(text="""User-agent: MyBot +Crawl-delay: 10 + +User-agent: * +Crawl-delay: 2 +Disallow: /admin/ +""") + + app.router.add_get('/robots.txt', robots_with_bot_specific_delay) + + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, 'localhost', 8183) + await site.start() + + yield "http://localhost:8183" + + await runner.cleanup() + + @pytest_asyncio.fixture + async def test_server_no_delay(self): + """Start a test HTTP server without crawl-delay.""" + app = web.Application() + + async def robots_no_delay(request): + return web.Response(text="""User-agent: * +Disallow: /private/ +""") + + app.router.add_get('/robots.txt', robots_no_delay) + + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, 'localhost', 8184) + await site.start() + + yield "http://localhost:8184" + + await runner.cleanup() + + @pytest_asyncio.fixture + async def test_server_empty(self): + """Start a test HTTP server with empty robots.txt.""" + app = web.Application() + + async def robots_empty(request): + return web.Response(text="") + + app.router.add_get('/robots.txt', robots_empty) + + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, 'localhost', 8185) + await site.start() + + yield "http://localhost:8185" + + await runner.cleanup() + + @pytest.mark.asyncio + async def test_get_crawl_delay_returns_value(self, temp_cache_dir, test_server_with_delay): + """Test that get_crawl_delay returns the correct delay value.""" + parser = RobotsParser(cache_dir=temp_cache_dir) + + delay = await parser.get_crawl_delay(f"{test_server_with_delay}/page", "*") + await parser.close() + assert delay == 5.0 + + @pytest.mark.asyncio + async def test_get_crawl_delay_bot_specific(self, temp_cache_dir, test_server_bot_specific): + """Test that get_crawl_delay respects bot-specific delays.""" + parser = RobotsParser(cache_dir=temp_cache_dir) + + # MyBot should get 10 second delay + delay = await parser.get_crawl_delay(f"{test_server_bot_specific}/page", "MyBot") + assert delay == 10.0 + + # Clear cache to fetch again + parser.clear_cache() + + # Other bots should get 2 second delay + delay = await parser.get_crawl_delay(f"{test_server_bot_specific}/page", 
"OtherBot") + await parser.close() + assert delay == 2.0 + + @pytest.mark.asyncio + async def test_get_crawl_delay_returns_none_when_not_specified(self, temp_cache_dir, test_server_no_delay): + """Test that get_crawl_delay returns None when no delay is specified.""" + parser = RobotsParser(cache_dir=temp_cache_dir) + + delay = await parser.get_crawl_delay(f"{test_server_no_delay}/page", "*") + await parser.close() + assert delay is None + + @pytest.mark.asyncio + async def test_get_crawl_delay_returns_none_for_empty_robots(self, temp_cache_dir, test_server_empty): + """Test that get_crawl_delay returns None for empty robots.txt.""" + parser = RobotsParser(cache_dir=temp_cache_dir) + + delay = await parser.get_crawl_delay(f"{test_server_empty}/page", "*") + await parser.close() + assert delay is None + + @pytest.mark.asyncio + async def test_get_crawl_delay_returns_none_for_invalid_url(self, temp_cache_dir): + """Test that get_crawl_delay handles invalid URLs gracefully.""" + parser = RobotsParser(cache_dir=temp_cache_dir) + + delay = await parser.get_crawl_delay("", "*") + assert delay is None + + delay = await parser.get_crawl_delay("not_a_url", "*") + await parser.close() + assert delay is None + + @pytest.mark.asyncio + async def test_get_crawl_delay_caches_result(self, temp_cache_dir, test_server_with_delay): + """Test that get_crawl_delay uses cached results.""" + parser = RobotsParser(cache_dir=temp_cache_dir) + + # First call - should fetch + start = time.time() + delay1 = await parser.get_crawl_delay(f"{test_server_with_delay}/page", "*") + first_duration = time.time() - start + + # Second call - should be cached and faster + start = time.time() + delay2 = await parser.get_crawl_delay(f"{test_server_with_delay}/page", "*") + second_duration = time.time() - start + + await parser.close() + assert delay1 == delay2 == 5.0 + # Cached lookup should be significantly faster + assert second_duration < first_duration + + +class TestRateLimiterWithCrawlDelay: + """Test cases for RateLimiter with crawl-delay support.""" + + @pytest.fixture + def temp_cache_dir(self): + """Create a temporary directory for cache.""" + temp_dir = tempfile.mkdtemp() + yield temp_dir + shutil.rmtree(temp_dir) + + @pytest_asyncio.fixture + async def test_server(self): + """Start a test HTTP server with robots.txt that has crawl-delay.""" + app = web.Application() + + async def robots_with_delay(request): + return web.Response(text="""User-agent: * +Crawl-delay: 2 +""") + + async def page(request): + return web.Response(text="OK") + + app.router.add_get('/robots.txt', robots_with_delay) + app.router.add_get('/page', page) + + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, 'localhost', 8182) + await site.start() + + yield "http://localhost:8182" + + await runner.cleanup() + + @pytest.mark.asyncio + async def test_rate_limiter_respects_crawl_delay(self, temp_cache_dir, test_server): + """Test that RateLimiter waits for crawl-delay between requests.""" + parser = RobotsParser(cache_dir=temp_cache_dir) + + rate_limiter = RateLimiter( + base_delay=(0.1, 0.2), # Small base delay + robots_parser=parser, + respect_crawl_delay=True, + default_user_agent="*" + ) + + url = f"{test_server}/page" + + # First request + start = time.time() + await rate_limiter.wait_if_needed(url) + first_request_time = time.time() + + # Second request - should wait for crawl-delay (2 seconds) + await rate_limiter.wait_if_needed(url) + second_request_time = time.time() + + await parser.close() + + elapsed = 
second_request_time - first_request_time + # Should have waited approximately 2 seconds (crawl-delay) + assert elapsed >= 1.9, f"Expected ~2s delay, got {elapsed}s" + + @pytest.mark.asyncio + async def test_rate_limiter_without_crawl_delay_uses_base_delay(self, temp_cache_dir): + """Test that RateLimiter uses base_delay when crawl-delay is disabled.""" + parser = RobotsParser(cache_dir=temp_cache_dir) + + rate_limiter = RateLimiter( + base_delay=(0.1, 0.2), + robots_parser=parser, + respect_crawl_delay=False, # Disabled + default_user_agent="*" + ) + + url = "http://example.com/page" + + # First request + await rate_limiter.wait_if_needed(url) + first_request_time = time.time() + + # Second request - should use base_delay (0.1-0.2 seconds) + await rate_limiter.wait_if_needed(url) + second_request_time = time.time() + + await parser.close() + + elapsed = second_request_time - first_request_time + # Should have waited base_delay (0.1-0.2 seconds) + assert elapsed < 1.0, f"Expected small delay, got {elapsed}s" + + def test_domain_state_has_crawl_delay_field(self): + """Test that DomainState includes crawl_delay field.""" + state = DomainState() + assert hasattr(state, 'crawl_delay') + assert state.crawl_delay is None + + state.crawl_delay = 5.0 + assert state.crawl_delay == 5.0 + + +class TestCrawlerRunConfigWithCrawlDelay: + """Test cases for CrawlerRunConfig.respect_crawl_delay parameter.""" + + def test_config_has_respect_crawl_delay_parameter(self): + """Test that CrawlerRunConfig has respect_crawl_delay parameter.""" + from crawl4ai.async_configs import CrawlerRunConfig + + config = CrawlerRunConfig() + assert hasattr(config, 'respect_crawl_delay') + assert config.respect_crawl_delay is False + + def test_config_respect_crawl_delay_can_be_set(self): + """Test that respect_crawl_delay can be set to True.""" + from crawl4ai.async_configs import CrawlerRunConfig + + config = CrawlerRunConfig(respect_crawl_delay=True) + assert config.respect_crawl_delay is True + + def test_config_to_dict_includes_respect_crawl_delay(self): + """Test that to_dict includes respect_crawl_delay.""" + from crawl4ai.async_configs import CrawlerRunConfig + + config = CrawlerRunConfig(respect_crawl_delay=True) + config_dict = config.to_dict() + assert 'respect_crawl_delay' in config_dict + assert config_dict['respect_crawl_delay'] is True + + def test_config_from_kwargs_includes_respect_crawl_delay(self): + """Test that from_kwargs handles respect_crawl_delay.""" + from crawl4ai.async_configs import CrawlerRunConfig + + config = CrawlerRunConfig.from_kwargs({"respect_crawl_delay": True}) + assert config.respect_crawl_delay is True + + +if __name__ == "__main__": + pytest.main([__file__, "-v"])
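
For reference, a minimal usage sketch of the feature introduced by this patch, tying together the `RobotsParser.get_crawl_delay()` helper and the new `respect_crawl_delay` flag. It assumes the top-level `crawl4ai` package re-exports `AsyncWebCrawler` and `CrawlerRunConfig` (as the existing docs examples do); the URLs are placeholders.

```python
import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.utils import RobotsParser


async def main():
    # Standalone lookup of a Crawl-delay directive (mirrors the unit tests above).
    parser = RobotsParser()
    delay = await parser.get_crawl_delay("https://example.com/some/page", "MyBot/1.0")
    print(f"Crawl-delay for example.com: {delay}")  # None if robots.txt sets no delay
    await parser.close()  # release the shared aiohttp session

    # Batch crawl that honors Crawl-delay between requests to the same domain.
    # respect_crawl_delay only takes effect together with check_robots_txt=True.
    run_config = CrawlerRunConfig(
        check_robots_txt=True,
        respect_crawl_delay=True,
        user_agent="MyBot/1.0",
    )
    urls = [
        "https://example.com/page1",
        "https://example.com/page2",
        "https://example.com/page3",
    ]
    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(urls, config=run_config)
        for result in results:
            print(result.url, result.success)


if __name__ == "__main__":
    asyncio.run(main())
```

Because `arun_many()` builds a `MemoryAdaptiveDispatcher` with a `RateLimiter` that carries the robots parser, the effective per-domain delay is the larger of the robots.txt `Crawl-delay` and any backoff accumulated from 429/503 responses, and it never drops below the `Crawl-delay` on success.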