From 4be931bbc54414d9494d718d6bb3552f0db457f2 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Fri, 6 Mar 2026 17:59:19 +0000 Subject: [PATCH 1/6] add `post_navigation_hook` --- src/crawlee/crawlers/__init__.py | 19 +++- .../_abstract_http/_abstract_http_crawler.py | 19 ++++ .../crawlers/_adaptive_playwright/__init__.py | 2 + .../_adaptive_playwright_crawler.py | 46 ++++++++ .../_adaptive_playwright_crawling_context.py | 57 ++++++++++ src/crawlee/crawlers/_playwright/__init__.py | 3 + .../_playwright/_playwright_crawler.py | 106 ++++++++++++------ .../_playwright_crawling_context.py | 9 +- .../_playwright_post_nav_crawling_context.py | 23 ++++ 9 files changed, 241 insertions(+), 43 deletions(-) create mode 100644 src/crawlee/crawlers/_playwright/_playwright_post_nav_crawling_context.py diff --git a/src/crawlee/crawlers/__init__.py b/src/crawlee/crawlers/__init__.py index 5d0825eb01..6445ad5d64 100644 --- a/src/crawlee/crawlers/__init__.py +++ b/src/crawlee/crawlers/__init__.py @@ -15,13 +15,25 @@ with _try_import(__name__, 'ParselCrawler', 'ParselCrawlingContext'): from ._parsel import ParselCrawler, ParselCrawlingContext -with _try_import(__name__, 'PlaywrightCrawler', 'PlaywrightCrawlingContext', 'PlaywrightPreNavCrawlingContext'): - from ._playwright import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext +with _try_import( + __name__, + 'PlaywrightCrawler', + 'PlaywrightCrawlingContext', + 'PlaywrightPostNavCrawlingContext', + 'PlaywrightPreNavCrawlingContext', +): + from ._playwright import ( + PlaywrightCrawler, + PlaywrightCrawlingContext, + PlaywrightPostNavCrawlingContext, + PlaywrightPreNavCrawlingContext, + ) with _try_import( __name__, 'AdaptivePlaywrightCrawler', 'AdaptivePlaywrightCrawlingContext', + 'AdaptivePlaywrightPostNavCrawlingContext', 'AdaptivePlaywrightPreNavCrawlingContext', 'AdaptivePlaywrightCrawlerStatisticState', 'RenderingType', @@ -32,6 +44,7 @@ AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlerStatisticState, AdaptivePlaywrightCrawlingContext, + AdaptivePlaywrightPostNavCrawlingContext, AdaptivePlaywrightPreNavCrawlingContext, RenderingType, RenderingTypePrediction, @@ -45,6 +58,7 @@ 'AdaptivePlaywrightCrawler', 'AdaptivePlaywrightCrawlerStatisticState', 'AdaptivePlaywrightCrawlingContext', + 'AdaptivePlaywrightPostNavCrawlingContext', 'AdaptivePlaywrightPreNavCrawlingContext', 'BasicCrawler', 'BasicCrawlerOptions', @@ -62,6 +76,7 @@ 'ParselCrawlingContext', 'PlaywrightCrawler', 'PlaywrightCrawlingContext', + 'PlaywrightPostNavCrawlingContext', 'PlaywrightPreNavCrawlingContext', 'RenderingType', 'RenderingTypePrediction', diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index 7aafa49e2e..7a1eb9594d 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -32,6 +32,7 @@ TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext) TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState) +TContext = TypeVar('TContext') class HttpCrawlerOptions( @@ -77,6 +78,7 @@ def __init__( self._parser = parser self._navigation_timeout = navigation_timeout or timedelta(minutes=1) self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = [] + self._post_navigation_hooks: list[Callable[[HttpCrawlingContext], Awaitable[None]]] = [] self._shared_navigation_timeouts: dict[int, SharedTimeout] = {} if '_context_pipeline' not in kwargs: @@ -120,6 +122,7 @@ def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpC ContextPipeline() .compose(self._execute_pre_navigation_hooks) .compose(self._make_http_request) + .compose(self._execute_post_navigation_hooks) .compose(self._handle_status_code_response) .compose(self._parse_http_response) .compose(self._handle_blocked_request_by_content) @@ -140,6 +143,14 @@ async def _execute_pre_navigation_hooks( finally: self._shared_navigation_timeouts.pop(context_id, None) + async def _execute_post_navigation_hooks( + self, context: HttpCrawlingContext + ) -> AsyncGenerator[HttpCrawlingContext, None]: + for hook in self._post_navigation_hooks: + await hook(context) + + yield context + async def _parse_http_response( self, context: HttpCrawlingContext ) -> AsyncGenerator[ParsedHttpCrawlingContext[TParseResult], None]: @@ -308,3 +319,11 @@ def pre_navigation_hook(self, hook: Callable[[BasicCrawlingContext], Awaitable[N hook: A coroutine function to be called before each navigation. """ self._pre_navigation_hooks.append(hook) + + def post_navigation_hook(self, hook: Callable[[HttpCrawlingContext], Awaitable[None]]) -> None: + """Register a hook to be called after each navigation. + + Args: + hook: A coroutine function to be called after each navigation. + """ + self._post_navigation_hooks.append(hook) diff --git a/src/crawlee/crawlers/_adaptive_playwright/__init__.py b/src/crawlee/crawlers/_adaptive_playwright/__init__.py index 89d0a26888..a680ca97e6 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/__init__.py +++ b/src/crawlee/crawlers/_adaptive_playwright/__init__.py @@ -4,6 +4,7 @@ # These imports have only mandatory dependencies, so they are imported directly. from ._adaptive_playwright_crawling_context import ( AdaptivePlaywrightCrawlingContext, + AdaptivePlaywrightPostNavCrawlingContext, AdaptivePlaywrightPreNavCrawlingContext, ) @@ -22,6 +23,7 @@ 'AdaptivePlaywrightCrawler', 'AdaptivePlaywrightCrawlerStatisticState', 'AdaptivePlaywrightCrawlingContext', + 'AdaptivePlaywrightPostNavCrawlingContext', 'AdaptivePlaywrightPreNavCrawlingContext', 'RenderingType', 'RenderingTypePrediction', diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index fbbf811f11..3ae0b8742f 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -20,9 +20,11 @@ AbstractHttpParser, BasicCrawler, BeautifulSoupParserType, + HttpCrawlingContext, ParsedHttpCrawlingContext, PlaywrightCrawler, PlaywrightCrawlingContext, + PlaywrightPostNavCrawlingContext, PlaywrightPreNavCrawlingContext, ) from crawlee.crawlers._beautifulsoup._beautifulsoup_parser import BeautifulSoupParser @@ -33,6 +35,7 @@ from ._adaptive_playwright_crawler_statistics import AdaptivePlaywrightCrawlerStatisticState from ._adaptive_playwright_crawling_context import ( AdaptivePlaywrightCrawlingContext, + AdaptivePlaywrightPostNavCrawlingContext, AdaptivePlaywrightPreNavCrawlingContext, ) from ._rendering_type_predictor import DefaultRenderingTypePredictor, RenderingType, RenderingTypePredictor @@ -196,6 +199,23 @@ async def adaptive_pre_navigation_hook_pw(context: PlaywrightPreNavCrawlingConte static_crawler.pre_navigation_hook(adaptive_pre_navigation_hook_static) playwright_crawler.pre_navigation_hook(adaptive_pre_navigation_hook_pw) + # Register post navigation hooks on sub crawlers + self._post_navigation_hooks = list[Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]]() + self._post_navigation_hooks_pw_only = list[ + Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]] + ]() + + async def adaptive_post_navigation_hook_static(context: HttpCrawlingContext) -> None: + for hook in self._post_navigation_hooks: + await hook(await AdaptivePlaywrightPostNavCrawlingContext.from_post_navigation_context(context)) + + async def adaptive_post_navigation_hook_pw(context: PlaywrightPostNavCrawlingContext) -> None: + for hook in self._post_navigation_hooks + self._post_navigation_hooks_pw_only: + await hook(await AdaptivePlaywrightPostNavCrawlingContext.from_post_navigation_context(context)) + + static_crawler.post_navigation_hook(adaptive_post_navigation_hook_static) + playwright_crawler.post_navigation_hook(adaptive_post_navigation_hook_pw) + self._additional_context_managers = [ *self._additional_context_managers, self.rendering_type_predictor, @@ -437,6 +457,32 @@ def register_hooks(hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awa # Return parametrized decorator that will be executed through decorator syntax if called with parameter. return register_hooks + def post_navigation_hook( + self, + hook: Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]] | None = None, + *, + playwright_only: bool = False, + ) -> Callable[[Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]], None]: + """Post navigation hooks for adaptive crawler are delegated to sub crawlers. + + Optionally parametrized decorator. + Hooks are wrapped in context that handles possibly missing `page` and `response` objects by raising + `AdaptiveContextError`. + """ + + def register_hooks(hook: Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]) -> None: + if playwright_only: + self._post_navigation_hooks_pw_only.append(hook) + else: + self._post_navigation_hooks.append(hook) + + # No parameter in decorator. Execute directly. + if hook: + register_hooks(hook) + + # Return parametrized decorator that will be executed through decorator syntax if called with parameter. + return register_hooks + def track_http_only_request_handler_runs(self) -> None: self.statistics.state.http_only_request_handler_runs += 1 diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index 9026fb358f..0149f0914d 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -9,6 +9,7 @@ from crawlee._types import BasicCrawlingContext from crawlee._utils.docs import docs_group from crawlee.crawlers import AbstractHttpParser, ParsedHttpCrawlingContext, PlaywrightCrawlingContext +from crawlee.crawlers._playwright._playwright_post_nav_crawling_context import PlaywrightPostNavCrawlingContext from crawlee.crawlers._playwright._types import PlaywrightHttpResponse if TYPE_CHECKING: @@ -17,6 +18,7 @@ from playwright.async_api import Page, Response from typing_extensions import Self + from crawlee.crawlers._abstract_http._http_crawling_context import HttpCrawlingContext from crawlee.crawlers._playwright._types import BlockRequestsFunction, GotoOptions @@ -201,6 +203,61 @@ async def from_playwright_crawling_context( ) +@dataclass(frozen=True) +@docs_group('Crawling contexts') +class AdaptivePlaywrightPostNavCrawlingContext(BasicCrawlingContext): + """A post-navigation crawling context for the `AdaptivePlaywrightCrawler`. + + Wraps either `HttpCrawlingContext` (static sub-crawler) or `PlaywrightPostNavCrawlingContext` (browser + sub-crawler). Playwright-specific attributes (`page`, `response`) raise `AdaptiveContextError` when accessed + during static crawling. + """ + + _page: Page | None = None + _response: Response | None = None + + @property + def page(self) -> Page: + """The Playwright `Page` object for the current page. + + Raises `AdaptiveContextError` if accessed during static crawling. + """ + if not self._page: + raise AdaptiveContextError('Page was not crawled with PlaywrightCrawler.') + return self._page + + @property + def response(self) -> Response: + """The Playwright `Response` object containing the response details for the current URL. + + Raises `AdaptiveContextError` if accessed during static crawling. + """ + if not self._response: + raise AdaptiveContextError('Response was not crawled with PlaywrightCrawler.') + return self._response + + @classmethod + async def from_post_navigation_context( + cls, context: HttpCrawlingContext | PlaywrightPostNavCrawlingContext + ) -> Self: + """Initialize a new instance from an existing post-navigation context.""" + context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)} + + context_kwargs['_page'] = context_kwargs.pop('page', None) + context_kwargs['_response'] = context_kwargs.pop('response', None) + + # block_requests and goto_options are useful only on pre-navigation contexts. + context_kwargs.pop('block_requests', None) + context_kwargs.pop('goto_options', None) + + if isinstance(context, PlaywrightPostNavCrawlingContext): + protocol_guess = await context_kwargs['_page'].evaluate('() => performance.getEntries()[0].nextHopProtocol') + context_kwargs['http_response'] = await PlaywrightHttpResponse.from_playwright_response( + response=context.response, protocol=protocol_guess or '' + ) + return cls(**context_kwargs) + + @dataclass(frozen=True) @docs_group('Crawling contexts') class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext): diff --git a/src/crawlee/crawlers/_playwright/__init__.py b/src/crawlee/crawlers/_playwright/__init__.py index 58eef84772..959b84780d 100644 --- a/src/crawlee/crawlers/_playwright/__init__.py +++ b/src/crawlee/crawlers/_playwright/__init__.py @@ -11,9 +11,12 @@ from ._playwright_crawling_context import PlaywrightCrawlingContext with _try_import(__name__, 'PlaywrightPreNavCrawlingContext'): from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext +with _try_import(__name__, 'PlaywrightPostNavCrawlingContext'): + from ._playwright_post_nav_crawling_context import PlaywrightPostNavCrawlingContext __all__ = [ 'PlaywrightCrawler', 'PlaywrightCrawlingContext', + 'PlaywrightPostNavCrawlingContext', 'PlaywrightPreNavCrawlingContext', ] diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index 6f4b2b0e9d..42602c7e57 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -30,6 +30,7 @@ from ._playwright_crawling_context import PlaywrightCrawlingContext from ._playwright_http_client import PlaywrightHttpClient, browser_page_context +from ._playwright_post_nav_crawling_context import PlaywrightPostNavCrawlingContext from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext from ._types import GotoOptions from ._utils import block_requests, infinite_scroll @@ -194,12 +195,15 @@ def __init__( ContextPipeline() .compose(self._open_page) .compose(self._navigate) + .compose(self._execute_post_navigation_hooks) .compose(self._handle_status_code_response) .compose(self._handle_blocked_request_by_content) + .compose(self._create_crawling_context) ) kwargs['_additional_context_managers'] = [self._browser_pool] kwargs.setdefault('_logger', logging.getLogger(__name__)) self._pre_navigation_hooks: list[Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]] = [] + self._post_navigation_hooks: list[Callable[[PlaywrightPostNavCrawlingContext], Awaitable[None]]] = [] kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client'] @@ -274,7 +278,7 @@ async def route_handler(route: Route, _: PlaywrightRequest) -> None: async def _navigate( self, context: PlaywrightPreNavCrawlingContext, - ) -> AsyncGenerator[PlaywrightCrawlingContext, Exception | None]: + ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, Exception | None]: """Execute an HTTP request utilizing the `BrowserPool` and the `Playwright` library. Args: @@ -330,35 +334,21 @@ async def _navigate( # Set the loaded URL to the actual URL after redirection. context.request.loaded_url = context.page.url - extract_links = self._create_extract_links_function(context) - - async with browser_page_context(context.page): - error = yield PlaywrightCrawlingContext( - request=context.request, - session=context.session, - add_requests=context.add_requests, - send_request=context.send_request, - push_data=context.push_data, - use_state=context.use_state, - proxy_info=context.proxy_info, - get_key_value_store=context.get_key_value_store, - log=context.log, - page=context.page, - infinite_scroll=lambda: infinite_scroll(context.page), - response=response, - extract_links=extract_links, - enqueue_links=self._create_enqueue_links_function(context, extract_links), - block_requests=partial(block_requests, page=context.page), - goto_options=context.goto_options, - ) - - if context.session: - pw_cookies = await self._get_cookies(context.page) - context.session.cookies.set_cookies_from_playwright_format(pw_cookies) - - # Collect data in case of errors, before the page object is closed. - if error: - await self.statistics.error_tracker.add(error=error, context=context, early=True) + yield PlaywrightPostNavCrawlingContext( + request=context.request, + session=context.session, + add_requests=context.add_requests, + send_request=context.send_request, + push_data=context.push_data, + use_state=context.use_state, + proxy_info=context.proxy_info, + get_key_value_store=context.get_key_value_store, + log=context.log, + page=context.page, + block_requests=context.block_requests, + goto_options=context.goto_options, + response=response, + ) def _create_extract_links_function(self, context: PlaywrightPreNavCrawlingContext) -> ExtractLinksFunction: """Create a callback function for extracting links from context. @@ -442,8 +432,8 @@ async def extract_links( return extract_links async def _handle_status_code_response( - self, context: PlaywrightCrawlingContext - ) -> AsyncGenerator[PlaywrightCrawlingContext, None]: + self, context: PlaywrightPostNavCrawlingContext + ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, None]: """Validate the HTTP status code and raise appropriate exceptions if needed. Args: @@ -465,8 +455,8 @@ async def _handle_status_code_response( async def _handle_blocked_request_by_content( self, - context: PlaywrightCrawlingContext, - ) -> AsyncGenerator[PlaywrightCrawlingContext, None]: + context: PlaywrightPostNavCrawlingContext, + ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, None]: """Try to detect if the request is blocked based on the response content. Args: @@ -492,6 +482,46 @@ async def _handle_blocked_request_by_content( yield context + async def _execute_post_navigation_hooks( + self, context: PlaywrightPostNavCrawlingContext + ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, None]: + for hook in self._post_navigation_hooks: + await hook(context) + yield context + + async def _create_crawling_context( + self, context: PlaywrightPostNavCrawlingContext + ) -> AsyncGenerator[PlaywrightCrawlingContext, Exception | None]: + extract_links = self._create_extract_links_function(context) + + async with browser_page_context(context.page): + error = yield PlaywrightCrawlingContext( + request=context.request, + session=context.session, + add_requests=context.add_requests, + send_request=context.send_request, + push_data=context.push_data, + use_state=context.use_state, + proxy_info=context.proxy_info, + get_key_value_store=context.get_key_value_store, + log=context.log, + page=context.page, + goto_options=context.goto_options, + response=context.response, + infinite_scroll=lambda: infinite_scroll(context.page), + extract_links=extract_links, + enqueue_links=self._create_enqueue_links_function(context, extract_links), + block_requests=partial(block_requests, page=context.page), + ) + + if context.session: + pw_cookies = await self._get_cookies(context.page) + context.session.cookies.set_cookies_from_playwright_format(pw_cookies) + + # Collect data in case of errors, before the page object is closed. + if error: + await self.statistics.error_tracker.add(error=error, context=context, early=True) + def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]) -> None: """Register a hook to be called before each navigation. @@ -500,6 +530,14 @@ def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavCrawlingContext], """ self._pre_navigation_hooks.append(hook) + def post_navigation_hook(self, hook: Callable[[PlaywrightPostNavCrawlingContext], Awaitable[None]]) -> None: + """Register a hook to be called after each navigation. + + Args: + hook: A coroutine function to be called after each navigation. + """ + self._post_navigation_hooks.append(hook) + async def _get_cookies(self, page: Page) -> list[PlaywrightCookieParam]: """Get the cookies from the page.""" cookies = await page.context.cookies() diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawling_context.py b/src/crawlee/crawlers/_playwright/_playwright_crawling_context.py index b90165cce2..2c98a4c115 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawling_context.py @@ -5,27 +5,22 @@ from crawlee._utils.docs import docs_group -from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext +from ._playwright_post_nav_crawling_context import PlaywrightPostNavCrawlingContext if TYPE_CHECKING: from collections.abc import Awaitable, Callable - from playwright.async_api import Response - from crawlee._types import EnqueueLinksFunction, ExtractLinksFunction @dataclass(frozen=True) @docs_group('Crawling contexts') -class PlaywrightCrawlingContext(PlaywrightPreNavCrawlingContext): +class PlaywrightCrawlingContext(PlaywrightPostNavCrawlingContext): """The crawling context used by the `PlaywrightCrawler`. It provides access to key objects as well as utility functions for handling crawling tasks. """ - response: Response - """The Playwright `Response` object containing the response details for the current URL.""" - enqueue_links: EnqueueLinksFunction """The Playwright `EnqueueLinksFunction` implementation.""" diff --git a/src/crawlee/crawlers/_playwright/_playwright_post_nav_crawling_context.py b/src/crawlee/crawlers/_playwright/_playwright_post_nav_crawling_context.py new file mode 100644 index 0000000000..40e227349a --- /dev/null +++ b/src/crawlee/crawlers/_playwright/_playwright_post_nav_crawling_context.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING + +from crawlee._utils.docs import docs_group + +from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext + +if TYPE_CHECKING: + from playwright.async_api import Response + + +@dataclass(frozen=True) +@docs_group('Crawling contexts') +class PlaywrightPostNavCrawlingContext(PlaywrightPreNavCrawlingContext): + """The post navigation crawling context used by the `PlaywrightCrawler`. + + It provides access to the `Page` and `Response` objects, after the navigation to the URL is performed. + """ + + response: Response + """The Playwright `Response` object containing the response details for the current URL.""" From e61140bc8ccf58931acc97f12c6fb628f78b2695 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Sun, 15 Mar 2026 22:43:06 +0000 Subject: [PATCH 2/6] update tests for navigation hooks --- .../_adaptive_playwright_crawling_context.py | 102 ++++++++-------- .../test_adaptive_playwright_crawler.py | 69 ++++++++++- .../unit/crawlers/_http/test_http_crawler.py | 68 +++++++++-- .../_playwright/test_playwright_crawler.py | 115 +++++++++++++++++- 4 files changed, 290 insertions(+), 64 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index 0149f0914d..fda1457caf 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -9,6 +9,7 @@ from crawlee._types import BasicCrawlingContext from crawlee._utils.docs import docs_group from crawlee.crawlers import AbstractHttpParser, ParsedHttpCrawlingContext, PlaywrightCrawlingContext +from crawlee.crawlers._abstract_http._http_crawling_context import HttpCrawlingContext from crawlee.crawlers._playwright._playwright_post_nav_crawling_context import PlaywrightPostNavCrawlingContext from crawlee.crawlers._playwright._types import PlaywrightHttpResponse @@ -18,7 +19,6 @@ from playwright.async_api import Page, Response from typing_extensions import Self - from crawlee.crawlers._abstract_http._http_crawling_context import HttpCrawlingContext from crawlee.crawlers._playwright._types import BlockRequestsFunction, GotoOptions @@ -205,12 +205,56 @@ async def from_playwright_crawling_context( @dataclass(frozen=True) @docs_group('Crawling contexts') -class AdaptivePlaywrightPostNavCrawlingContext(BasicCrawlingContext): - """A post-navigation crawling context for the `AdaptivePlaywrightCrawler`. +class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext): + """A wrapper around BasicCrawlingContext or AdaptivePlaywrightCrawlingContext. + + Trying to access `page` on this context will raise AdaptiveContextError if wrapped context is BasicCrawlingContext. + """ + + _page: Page | None = None + block_requests: BlockRequestsFunction | None = None + """Blocks network requests matching specified URL patterns.""" + + goto_options: GotoOptions | None = None + """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported.""" + + @property + def page(self) -> Page: + """The Playwright `Page` object for the current page. + + Raises `AdaptiveContextError` if accessed during static crawling. + """ + if self._page is not None: + return self._page + raise AdaptiveContextError( + 'Page was crawled with static sub crawler and not with crawled with PlaywrightCrawler. For Playwright only ' + 'hooks please use `playwright_only`=True when registering the hook. ' + 'For example: @crawler.pre_navigation_hook(playwright_only=True)' + ) + + @classmethod + def from_pre_navigation_context(cls, context: BasicCrawlingContext) -> Self: + """Initialize a new instance from an existing pre-navigation `BasicCrawlingContext`.""" + context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)} + context_kwargs['_page'] = context_kwargs.pop('page', None) + + # For static sub crawler replace block requests by function doing nothing. + async def dummy_block_requests( + url_patterns: list[str] | None = None, # noqa:ARG001 + extra_url_patterns: list[str] | None = None, # noqa:ARG001 + ) -> None: + return + + context_kwargs['block_requests'] = context_kwargs.pop('block_requests', dummy_block_requests) + return cls(**context_kwargs) + + +@dataclass(frozen=True) +@docs_group('Crawling contexts') +class AdaptivePlaywrightPostNavCrawlingContext(HttpCrawlingContext): + """A wrapper around HttpCrawlingContext or AdaptivePlaywrightCrawlingContext. - Wraps either `HttpCrawlingContext` (static sub-crawler) or `PlaywrightPostNavCrawlingContext` (browser - sub-crawler). Playwright-specific attributes (`page`, `response`) raise `AdaptiveContextError` when accessed - during static crawling. + Trying to access `page` on this context will raise AdaptiveContextError if wrapped context is HttpCrawlingContext. """ _page: Page | None = None @@ -256,49 +300,3 @@ async def from_post_navigation_context( response=context.response, protocol=protocol_guess or '' ) return cls(**context_kwargs) - - -@dataclass(frozen=True) -@docs_group('Crawling contexts') -class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext): - """A wrapper around BasicCrawlingContext or AdaptivePlaywrightCrawlingContext. - - Trying to access `page` on this context will raise AdaptiveContextError if wrapped context is BasicCrawlingContext. - """ - - _page: Page | None = None - block_requests: BlockRequestsFunction | None = None - """Blocks network requests matching specified URL patterns.""" - - goto_options: GotoOptions | None = None - """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported.""" - - @property - def page(self) -> Page: - """The Playwright `Page` object for the current page. - - Raises `AdaptiveContextError` if accessed during static crawling. - """ - if self._page is not None: - return self._page - raise AdaptiveContextError( - 'Page was crawled with static sub crawler and not with crawled with PlaywrightCrawler. For Playwright only ' - 'hooks please use `playwright_only`=True when registering the hook. ' - 'For example: @crawler.pre_navigation_hook(playwright_only=True)' - ) - - @classmethod - def from_pre_navigation_context(cls, context: BasicCrawlingContext) -> Self: - """Initialize a new instance from an existing pre-navigation `BasicCrawlingContext`.""" - context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)} - context_kwargs['_page'] = context_kwargs.pop('page', None) - - # For static sub crawler replace block requests by function doing nothing. - async def dummy_block_requests( - url_patterns: list[str] | None = None, # noqa:ARG001 - extra_url_patterns: list[str] | None = None, # noqa:ARG001 - ) -> None: - return - - context_kwargs['block_requests'] = context_kwargs.pop('block_requests', dummy_block_requests) - return cls(**context_kwargs) diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index 94d74d365b..be203c9c12 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -17,6 +17,7 @@ from crawlee.crawlers import ( AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext, + AdaptivePlaywrightPostNavCrawlingContext, AdaptivePlaywrightPreNavCrawlingContext, BasicCrawler, RenderingType, @@ -50,7 +51,7 @@

Initial text