diff --git a/src/crawlee/crawlers/__init__.py b/src/crawlee/crawlers/__init__.py index 5d0825eb01..6445ad5d64 100644 --- a/src/crawlee/crawlers/__init__.py +++ b/src/crawlee/crawlers/__init__.py @@ -15,13 +15,25 @@ with _try_import(__name__, 'ParselCrawler', 'ParselCrawlingContext'): from ._parsel import ParselCrawler, ParselCrawlingContext -with _try_import(__name__, 'PlaywrightCrawler', 'PlaywrightCrawlingContext', 'PlaywrightPreNavCrawlingContext'): - from ._playwright import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext +with _try_import( + __name__, + 'PlaywrightCrawler', + 'PlaywrightCrawlingContext', + 'PlaywrightPostNavCrawlingContext', + 'PlaywrightPreNavCrawlingContext', +): + from ._playwright import ( + PlaywrightCrawler, + PlaywrightCrawlingContext, + PlaywrightPostNavCrawlingContext, + PlaywrightPreNavCrawlingContext, + ) with _try_import( __name__, 'AdaptivePlaywrightCrawler', 'AdaptivePlaywrightCrawlingContext', + 'AdaptivePlaywrightPostNavCrawlingContext', 'AdaptivePlaywrightPreNavCrawlingContext', 'AdaptivePlaywrightCrawlerStatisticState', 'RenderingType', @@ -32,6 +44,7 @@ AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlerStatisticState, AdaptivePlaywrightCrawlingContext, + AdaptivePlaywrightPostNavCrawlingContext, AdaptivePlaywrightPreNavCrawlingContext, RenderingType, RenderingTypePrediction, @@ -45,6 +58,7 @@ 'AdaptivePlaywrightCrawler', 'AdaptivePlaywrightCrawlerStatisticState', 'AdaptivePlaywrightCrawlingContext', + 'AdaptivePlaywrightPostNavCrawlingContext', 'AdaptivePlaywrightPreNavCrawlingContext', 'BasicCrawler', 'BasicCrawlerOptions', @@ -62,6 +76,7 @@ 'ParselCrawlingContext', 'PlaywrightCrawler', 'PlaywrightCrawlingContext', + 'PlaywrightPostNavCrawlingContext', 'PlaywrightPreNavCrawlingContext', 'RenderingType', 'RenderingTypePrediction', diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index b9ea429f98..c4ef4a14f9 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -77,6 +77,7 @@ def __init__( self._parser = parser self._navigation_timeout = navigation_timeout or timedelta(minutes=1) self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = [] + self._post_navigation_hooks: list[Callable[[HttpCrawlingContext], Awaitable[None]]] = [] self._shared_navigation_timeouts: dict[int, SharedTimeout] = {} if '_context_pipeline' not in kwargs: @@ -120,6 +121,7 @@ def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpC ContextPipeline() .compose(self._execute_pre_navigation_hooks) .compose(self._make_http_request) + .compose(self._execute_post_navigation_hooks) .compose(self._handle_status_code_response) .compose(self._parse_http_response) .compose(self._handle_blocked_request_by_content) @@ -140,6 +142,14 @@ async def _execute_pre_navigation_hooks( finally: self._shared_navigation_timeouts.pop(context_id, None) + async def _execute_post_navigation_hooks( + self, context: HttpCrawlingContext + ) -> AsyncGenerator[HttpCrawlingContext, None]: + for hook in self._post_navigation_hooks: + await hook(context) + + yield context + async def _parse_http_response( self, context: HttpCrawlingContext ) -> AsyncGenerator[ParsedHttpCrawlingContext[TParseResult], None]: @@ -311,3 +321,11 @@ def pre_navigation_hook(self, hook: Callable[[BasicCrawlingContext], Awaitable[N hook: A coroutine function to be called before each navigation. """ self._pre_navigation_hooks.append(hook) + + def post_navigation_hook(self, hook: Callable[[HttpCrawlingContext], Awaitable[None]]) -> None: + """Register a hook to be called after each navigation. + + Args: + hook: A coroutine function to be called after each navigation. + """ + self._post_navigation_hooks.append(hook) diff --git a/src/crawlee/crawlers/_adaptive_playwright/__init__.py b/src/crawlee/crawlers/_adaptive_playwright/__init__.py index 89d0a26888..a680ca97e6 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/__init__.py +++ b/src/crawlee/crawlers/_adaptive_playwright/__init__.py @@ -4,6 +4,7 @@ # These imports have only mandatory dependencies, so they are imported directly. from ._adaptive_playwright_crawling_context import ( AdaptivePlaywrightCrawlingContext, + AdaptivePlaywrightPostNavCrawlingContext, AdaptivePlaywrightPreNavCrawlingContext, ) @@ -22,6 +23,7 @@ 'AdaptivePlaywrightCrawler', 'AdaptivePlaywrightCrawlerStatisticState', 'AdaptivePlaywrightCrawlingContext', + 'AdaptivePlaywrightPostNavCrawlingContext', 'AdaptivePlaywrightPreNavCrawlingContext', 'RenderingType', 'RenderingTypePrediction', diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index fbbf811f11..ef64d1ba58 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -20,9 +20,11 @@ AbstractHttpParser, BasicCrawler, BeautifulSoupParserType, + HttpCrawlingContext, ParsedHttpCrawlingContext, PlaywrightCrawler, PlaywrightCrawlingContext, + PlaywrightPostNavCrawlingContext, PlaywrightPreNavCrawlingContext, ) from crawlee.crawlers._beautifulsoup._beautifulsoup_parser import BeautifulSoupParser @@ -33,6 +35,7 @@ from ._adaptive_playwright_crawler_statistics import AdaptivePlaywrightCrawlerStatisticState from ._adaptive_playwright_crawling_context import ( AdaptivePlaywrightCrawlingContext, + AdaptivePlaywrightPostNavCrawlingContext, AdaptivePlaywrightPreNavCrawlingContext, ) from ._rendering_type_predictor import DefaultRenderingTypePredictor, RenderingType, RenderingTypePredictor @@ -196,6 +199,25 @@ async def adaptive_pre_navigation_hook_pw(context: PlaywrightPreNavCrawlingConte static_crawler.pre_navigation_hook(adaptive_pre_navigation_hook_static) playwright_crawler.pre_navigation_hook(adaptive_pre_navigation_hook_pw) + # Register post navigation hooks on sub crawlers + self._post_navigation_hooks = list[Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]]() + self._post_navigation_hooks_pw_only = list[ + Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]] + ]() + + async def adaptive_post_navigation_hook_static(context: HttpCrawlingContext) -> None: + adaptive_context = await AdaptivePlaywrightPostNavCrawlingContext.from_post_navigation_context(context) + for hook in self._post_navigation_hooks: + await hook(adaptive_context) + + async def adaptive_post_navigation_hook_pw(context: PlaywrightPostNavCrawlingContext) -> None: + adaptive_context = await AdaptivePlaywrightPostNavCrawlingContext.from_post_navigation_context(context) + for hook in self._post_navigation_hooks + self._post_navigation_hooks_pw_only: + await hook(adaptive_context) + + static_crawler.post_navigation_hook(adaptive_post_navigation_hook_static) + playwright_crawler.post_navigation_hook(adaptive_post_navigation_hook_pw) + self._additional_context_managers = [ *self._additional_context_managers, self.rendering_type_predictor, @@ -437,6 +459,32 @@ def register_hooks(hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awa # Return parametrized decorator that will be executed through decorator syntax if called with parameter. return register_hooks + def post_navigation_hook( + self, + hook: Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]] | None = None, + *, + playwright_only: bool = False, + ) -> Callable[[Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]], None]: + """Post navigation hooks for adaptive crawler are delegated to sub crawlers. + + Optionally parametrized decorator. + Hooks are wrapped in context that handles possibly missing `page` and `response` objects by raising + `AdaptiveContextError`. + """ + + def register_hooks(hook: Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]) -> None: + if playwright_only: + self._post_navigation_hooks_pw_only.append(hook) + else: + self._post_navigation_hooks.append(hook) + + # No parameter in decorator. Execute directly. + if hook: + register_hooks(hook) + + # Return parametrized decorator that will be executed through decorator syntax if called with parameter. + return register_hooks + def track_http_only_request_handler_runs(self) -> None: self.statistics.state.http_only_request_handler_runs += 1 diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index 9026fb358f..5f7cf26a44 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -9,6 +9,8 @@ from crawlee._types import BasicCrawlingContext from crawlee._utils.docs import docs_group from crawlee.crawlers import AbstractHttpParser, ParsedHttpCrawlingContext, PlaywrightCrawlingContext +from crawlee.crawlers._abstract_http._http_crawling_context import HttpCrawlingContext +from crawlee.crawlers._playwright._playwright_post_nav_crawling_context import PlaywrightPostNavCrawlingContext from crawlee.crawlers._playwright._types import PlaywrightHttpResponse if TYPE_CHECKING: @@ -186,7 +188,7 @@ async def from_playwright_crawling_context( context_kwargs['_page'] = context_kwargs.pop('page') context_kwargs['_infinite_scroll'] = context_kwargs.pop('infinite_scroll') # This might not be always available. - protocol_guess = await context_kwargs['_page'].evaluate('() => performance.getEntries()[0].nextHopProtocol') + protocol_guess = await context_kwargs['_page'].evaluate('() => performance.getEntries()[0]?.nextHopProtocol') http_response = await PlaywrightHttpResponse.from_playwright_response( response=context.response, protocol=protocol_guess or '' ) @@ -245,3 +247,58 @@ async def dummy_block_requests( context_kwargs['block_requests'] = context_kwargs.pop('block_requests', dummy_block_requests) return cls(**context_kwargs) + + +@dataclass(frozen=True) +@docs_group('Crawling contexts') +class AdaptivePlaywrightPostNavCrawlingContext(HttpCrawlingContext): + """A wrapper around HttpCrawlingContext or AdaptivePlaywrightCrawlingContext. + + Trying to access `page` on this context will raise AdaptiveContextError if wrapped context is HttpCrawlingContext. + """ + + _page: Page | None = None + _response: Response | None = None + + @property + def page(self) -> Page: + """The Playwright `Page` object for the current page. + + Raises `AdaptiveContextError` if accessed during static crawling. + """ + if not self._page: + raise AdaptiveContextError('Page was not crawled with PlaywrightCrawler.') + return self._page + + @property + def response(self) -> Response: + """The Playwright `Response` object containing the response details for the current URL. + + Raises `AdaptiveContextError` if accessed during static crawling. + """ + if not self._response: + raise AdaptiveContextError('Response was not crawled with PlaywrightCrawler.') + return self._response + + @classmethod + async def from_post_navigation_context( + cls, context: HttpCrawlingContext | PlaywrightPostNavCrawlingContext + ) -> Self: + """Initialize a new instance from an existing post-navigation context.""" + context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)} + + context_kwargs['_page'] = context_kwargs.pop('page', None) + context_kwargs['_response'] = context_kwargs.pop('response', None) + + # block_requests and goto_options are useful only on pre-navigation contexts. + context_kwargs.pop('block_requests', None) + context_kwargs.pop('goto_options', None) + + if isinstance(context, PlaywrightPostNavCrawlingContext): + protocol_guess = await context_kwargs['_page'].evaluate( + '() => performance.getEntries()[0]?.nextHopProtocol' + ) + context_kwargs['http_response'] = await PlaywrightHttpResponse.from_playwright_response( + response=context.response, protocol=protocol_guess or '' + ) + return cls(**context_kwargs) diff --git a/src/crawlee/crawlers/_playwright/__init__.py b/src/crawlee/crawlers/_playwright/__init__.py index 58eef84772..959b84780d 100644 --- a/src/crawlee/crawlers/_playwright/__init__.py +++ b/src/crawlee/crawlers/_playwright/__init__.py @@ -11,9 +11,12 @@ from ._playwright_crawling_context import PlaywrightCrawlingContext with _try_import(__name__, 'PlaywrightPreNavCrawlingContext'): from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext +with _try_import(__name__, 'PlaywrightPostNavCrawlingContext'): + from ._playwright_post_nav_crawling_context import PlaywrightPostNavCrawlingContext __all__ = [ 'PlaywrightCrawler', 'PlaywrightCrawlingContext', + 'PlaywrightPostNavCrawlingContext', 'PlaywrightPreNavCrawlingContext', ] diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index cf70ff94bd..0aaffd7100 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -30,6 +30,7 @@ from ._playwright_crawling_context import PlaywrightCrawlingContext from ._playwright_http_client import PlaywrightHttpClient, browser_page_context +from ._playwright_post_nav_crawling_context import PlaywrightPostNavCrawlingContext from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext from ._types import GotoOptions from ._utils import block_requests, infinite_scroll @@ -194,12 +195,15 @@ def __init__( ContextPipeline() .compose(self._open_page) .compose(self._navigate) + .compose(self._execute_post_navigation_hooks) .compose(self._handle_status_code_response) .compose(self._handle_blocked_request_by_content) + .compose(self._create_crawling_context) ) kwargs['_additional_context_managers'] = [self._browser_pool] kwargs.setdefault('_logger', logging.getLogger(__name__)) self._pre_navigation_hooks: list[Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]] = [] + self._post_navigation_hooks: list[Callable[[PlaywrightPostNavCrawlingContext], Awaitable[None]]] = [] kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client'] @@ -241,12 +245,15 @@ async def _open_page( self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout) try: + # Only use the page context manager here — it sets the current page in a context variable, + # making it accessible to PlaywrightHttpClient in subsequent pipeline steps. async with browser_page_context(crawlee_page.page): for hook in self._pre_navigation_hooks: async with self._shared_navigation_timeouts[context_id]: await hook(pre_navigation_context) - yield pre_navigation_context + # Yield should be inside the browser_page_context. + yield pre_navigation_context finally: self._shared_navigation_timeouts.pop(context_id, None) @@ -274,7 +281,7 @@ async def route_handler(route: Route, _: PlaywrightRequest) -> None: async def _navigate( self, context: PlaywrightPreNavCrawlingContext, - ) -> AsyncGenerator[PlaywrightCrawlingContext, Exception | None]: + ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, Exception | None]: """Execute an HTTP request utilizing the `BrowserPool` and the `Playwright` library. Args: @@ -330,35 +337,21 @@ async def _navigate( # Set the loaded URL to the actual URL after redirection. context.request.loaded_url = context.page.url - extract_links = self._create_extract_links_function(context) - - async with browser_page_context(context.page): - error = yield PlaywrightCrawlingContext( - request=context.request, - session=context.session, - add_requests=context.add_requests, - send_request=context.send_request, - push_data=context.push_data, - use_state=context.use_state, - proxy_info=context.proxy_info, - get_key_value_store=context.get_key_value_store, - log=context.log, - page=context.page, - infinite_scroll=lambda: infinite_scroll(context.page), - response=response, - extract_links=extract_links, - enqueue_links=self._create_enqueue_links_function(context, extract_links), - block_requests=partial(block_requests, page=context.page), - goto_options=context.goto_options, - ) - - if context.session: - pw_cookies = await self._get_cookies(context.page) - context.session.cookies.set_cookies_from_playwright_format(pw_cookies) - - # Collect data in case of errors, before the page object is closed. - if error: - await self.statistics.error_tracker.add(error=error, context=context, early=True) + yield PlaywrightPostNavCrawlingContext( + request=context.request, + session=context.session, + add_requests=context.add_requests, + send_request=context.send_request, + push_data=context.push_data, + use_state=context.use_state, + proxy_info=context.proxy_info, + get_key_value_store=context.get_key_value_store, + log=context.log, + page=context.page, + block_requests=context.block_requests, + goto_options=context.goto_options, + response=response, + ) def _create_extract_links_function(self, context: PlaywrightPreNavCrawlingContext) -> ExtractLinksFunction: """Create a callback function for extracting links from context. @@ -443,8 +436,8 @@ async def extract_links( return extract_links async def _handle_status_code_response( - self, context: PlaywrightCrawlingContext - ) -> AsyncGenerator[PlaywrightCrawlingContext, None]: + self, context: PlaywrightPostNavCrawlingContext + ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, None]: """Validate the HTTP status code and raise appropriate exceptions if needed. Args: @@ -466,8 +459,8 @@ async def _handle_status_code_response( async def _handle_blocked_request_by_content( self, - context: PlaywrightCrawlingContext, - ) -> AsyncGenerator[PlaywrightCrawlingContext, None]: + context: PlaywrightPostNavCrawlingContext, + ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, None]: """Try to detect if the request is blocked based on the response content. Args: @@ -493,6 +486,45 @@ async def _handle_blocked_request_by_content( yield context + async def _execute_post_navigation_hooks( + self, context: PlaywrightPostNavCrawlingContext + ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, None]: + for hook in self._post_navigation_hooks: + await hook(context) + yield context + + async def _create_crawling_context( + self, context: PlaywrightPostNavCrawlingContext + ) -> AsyncGenerator[PlaywrightCrawlingContext, Exception | None]: + extract_links = self._create_extract_links_function(context) + + error = yield PlaywrightCrawlingContext( + request=context.request, + session=context.session, + add_requests=context.add_requests, + send_request=context.send_request, + push_data=context.push_data, + use_state=context.use_state, + proxy_info=context.proxy_info, + get_key_value_store=context.get_key_value_store, + log=context.log, + page=context.page, + goto_options=context.goto_options, + response=context.response, + infinite_scroll=lambda: infinite_scroll(context.page), + extract_links=extract_links, + enqueue_links=self._create_enqueue_links_function(context, extract_links), + block_requests=partial(block_requests, page=context.page), + ) + + if context.session: + pw_cookies = await self._get_cookies(context.page) + context.session.cookies.set_cookies_from_playwright_format(pw_cookies) + + # Collect data in case of errors, before the page object is closed. + if error: + await self.statistics.error_tracker.add(error=error, context=context, early=True) + def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]) -> None: """Register a hook to be called before each navigation. @@ -501,6 +533,14 @@ def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavCrawlingContext], """ self._pre_navigation_hooks.append(hook) + def post_navigation_hook(self, hook: Callable[[PlaywrightPostNavCrawlingContext], Awaitable[None]]) -> None: + """Register a hook to be called after each navigation. + + Args: + hook: A coroutine function to be called after each navigation. + """ + self._post_navigation_hooks.append(hook) + async def _get_cookies(self, page: Page) -> list[PlaywrightCookieParam]: """Get the cookies from the page.""" cookies = await page.context.cookies() diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawling_context.py b/src/crawlee/crawlers/_playwright/_playwright_crawling_context.py index b90165cce2..2c98a4c115 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawling_context.py @@ -5,27 +5,22 @@ from crawlee._utils.docs import docs_group -from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext +from ._playwright_post_nav_crawling_context import PlaywrightPostNavCrawlingContext if TYPE_CHECKING: from collections.abc import Awaitable, Callable - from playwright.async_api import Response - from crawlee._types import EnqueueLinksFunction, ExtractLinksFunction @dataclass(frozen=True) @docs_group('Crawling contexts') -class PlaywrightCrawlingContext(PlaywrightPreNavCrawlingContext): +class PlaywrightCrawlingContext(PlaywrightPostNavCrawlingContext): """The crawling context used by the `PlaywrightCrawler`. It provides access to key objects as well as utility functions for handling crawling tasks. """ - response: Response - """The Playwright `Response` object containing the response details for the current URL.""" - enqueue_links: EnqueueLinksFunction """The Playwright `EnqueueLinksFunction` implementation.""" diff --git a/src/crawlee/crawlers/_playwright/_playwright_post_nav_crawling_context.py b/src/crawlee/crawlers/_playwright/_playwright_post_nav_crawling_context.py new file mode 100644 index 0000000000..40e227349a --- /dev/null +++ b/src/crawlee/crawlers/_playwright/_playwright_post_nav_crawling_context.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING + +from crawlee._utils.docs import docs_group + +from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext + +if TYPE_CHECKING: + from playwright.async_api import Response + + +@dataclass(frozen=True) +@docs_group('Crawling contexts') +class PlaywrightPostNavCrawlingContext(PlaywrightPreNavCrawlingContext): + """The post navigation crawling context used by the `PlaywrightCrawler`. + + It provides access to the `Page` and `Response` objects, after the navigation to the URL is performed. + """ + + response: Response + """The Playwright `Response` object containing the response details for the current URL.""" diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index 94d74d365b..7eed877112 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -17,6 +17,7 @@ from crawlee.crawlers import ( AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext, + AdaptivePlaywrightPostNavCrawlingContext, AdaptivePlaywrightPreNavCrawlingContext, BasicCrawler, RenderingType, @@ -50,7 +51,7 @@

Initial text