Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions src/crawlee/crawlers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,25 @@
with _try_import(__name__, 'ParselCrawler', 'ParselCrawlingContext'):
from ._parsel import ParselCrawler, ParselCrawlingContext

with _try_import(__name__, 'PlaywrightCrawler', 'PlaywrightCrawlingContext', 'PlaywrightPreNavCrawlingContext'):
from ._playwright import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext
with _try_import(
__name__,
'PlaywrightCrawler',
'PlaywrightCrawlingContext',
'PlaywrightPostNavCrawlingContext',
'PlaywrightPreNavCrawlingContext',
):
from ._playwright import (
PlaywrightCrawler,
PlaywrightCrawlingContext,
PlaywrightPostNavCrawlingContext,
PlaywrightPreNavCrawlingContext,
)

with _try_import(
__name__,
'AdaptivePlaywrightCrawler',
'AdaptivePlaywrightCrawlingContext',
'AdaptivePlaywrightPostNavCrawlingContext',
'AdaptivePlaywrightPreNavCrawlingContext',
'AdaptivePlaywrightCrawlerStatisticState',
'RenderingType',
Expand All @@ -32,6 +44,7 @@
AdaptivePlaywrightCrawler,
AdaptivePlaywrightCrawlerStatisticState,
AdaptivePlaywrightCrawlingContext,
AdaptivePlaywrightPostNavCrawlingContext,
AdaptivePlaywrightPreNavCrawlingContext,
RenderingType,
RenderingTypePrediction,
Expand All @@ -45,6 +58,7 @@
'AdaptivePlaywrightCrawler',
'AdaptivePlaywrightCrawlerStatisticState',
'AdaptivePlaywrightCrawlingContext',
'AdaptivePlaywrightPostNavCrawlingContext',
'AdaptivePlaywrightPreNavCrawlingContext',
'BasicCrawler',
'BasicCrawlerOptions',
Expand All @@ -62,6 +76,7 @@
'ParselCrawlingContext',
'PlaywrightCrawler',
'PlaywrightCrawlingContext',
'PlaywrightPostNavCrawlingContext',
'PlaywrightPreNavCrawlingContext',
'RenderingType',
'RenderingTypePrediction',
Expand Down
18 changes: 18 additions & 0 deletions src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def __init__(
self._parser = parser
self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
self._post_navigation_hooks: list[Callable[[HttpCrawlingContext], Awaitable[None]]] = []
self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}

if '_context_pipeline' not in kwargs:
Expand Down Expand Up @@ -120,6 +121,7 @@ def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpC
ContextPipeline()
.compose(self._execute_pre_navigation_hooks)
.compose(self._make_http_request)
.compose(self._execute_post_navigation_hooks)
.compose(self._handle_status_code_response)
.compose(self._parse_http_response)
.compose(self._handle_blocked_request_by_content)
Expand All @@ -140,6 +142,14 @@ async def _execute_pre_navigation_hooks(
finally:
self._shared_navigation_timeouts.pop(context_id, None)

async def _execute_post_navigation_hooks(
self, context: HttpCrawlingContext
) -> AsyncGenerator[HttpCrawlingContext, None]:
for hook in self._post_navigation_hooks:
await hook(context)

yield context

async def _parse_http_response(
self, context: HttpCrawlingContext
) -> AsyncGenerator[ParsedHttpCrawlingContext[TParseResult], None]:
Expand Down Expand Up @@ -311,3 +321,11 @@ def pre_navigation_hook(self, hook: Callable[[BasicCrawlingContext], Awaitable[N
hook: A coroutine function to be called before each navigation.
"""
self._pre_navigation_hooks.append(hook)

def post_navigation_hook(self, hook: Callable[[HttpCrawlingContext], Awaitable[None]]) -> None:
    """Register a coroutine function to run after every navigation.

    Args:
        hook: A coroutine function invoked once per completed navigation.
    """
    self._post_navigation_hooks.append(hook)
2 changes: 2 additions & 0 deletions src/crawlee/crawlers/_adaptive_playwright/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# These imports have only mandatory dependencies, so they are imported directly.
from ._adaptive_playwright_crawling_context import (
AdaptivePlaywrightCrawlingContext,
AdaptivePlaywrightPostNavCrawlingContext,
AdaptivePlaywrightPreNavCrawlingContext,
)

Expand All @@ -22,6 +23,7 @@
'AdaptivePlaywrightCrawler',
'AdaptivePlaywrightCrawlerStatisticState',
'AdaptivePlaywrightCrawlingContext',
'AdaptivePlaywrightPostNavCrawlingContext',
'AdaptivePlaywrightPreNavCrawlingContext',
'RenderingType',
'RenderingTypePrediction',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@
AbstractHttpParser,
BasicCrawler,
BeautifulSoupParserType,
HttpCrawlingContext,
ParsedHttpCrawlingContext,
PlaywrightCrawler,
PlaywrightCrawlingContext,
PlaywrightPostNavCrawlingContext,
PlaywrightPreNavCrawlingContext,
)
from crawlee.crawlers._beautifulsoup._beautifulsoup_parser import BeautifulSoupParser
Expand All @@ -33,6 +35,7 @@
from ._adaptive_playwright_crawler_statistics import AdaptivePlaywrightCrawlerStatisticState
from ._adaptive_playwright_crawling_context import (
AdaptivePlaywrightCrawlingContext,
AdaptivePlaywrightPostNavCrawlingContext,
AdaptivePlaywrightPreNavCrawlingContext,
)
from ._rendering_type_predictor import DefaultRenderingTypePredictor, RenderingType, RenderingTypePredictor
Expand Down Expand Up @@ -196,6 +199,25 @@ async def adaptive_pre_navigation_hook_pw(context: PlaywrightPreNavCrawlingConte
static_crawler.pre_navigation_hook(adaptive_pre_navigation_hook_static)
playwright_crawler.pre_navigation_hook(adaptive_pre_navigation_hook_pw)

# Register post navigation hooks on sub crawlers
self._post_navigation_hooks = list[Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]]()
self._post_navigation_hooks_pw_only = list[
Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]
]()

async def adaptive_post_navigation_hook_static(context: HttpCrawlingContext) -> None:
adaptive_context = await AdaptivePlaywrightPostNavCrawlingContext.from_post_navigation_context(context)
for hook in self._post_navigation_hooks:
await hook(adaptive_context)

async def adaptive_post_navigation_hook_pw(context: PlaywrightPostNavCrawlingContext) -> None:
adaptive_context = await AdaptivePlaywrightPostNavCrawlingContext.from_post_navigation_context(context)
for hook in self._post_navigation_hooks + self._post_navigation_hooks_pw_only:
await hook(adaptive_context)

static_crawler.post_navigation_hook(adaptive_post_navigation_hook_static)
playwright_crawler.post_navigation_hook(adaptive_post_navigation_hook_pw)

self._additional_context_managers = [
*self._additional_context_managers,
self.rendering_type_predictor,
Expand Down Expand Up @@ -437,6 +459,32 @@ def register_hooks(hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awa
# Return parametrized decorator that will be executed through decorator syntax if called with parameter.
return register_hooks

def post_navigation_hook(
    self,
    hook: Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]] | None = None,
    *,
    playwright_only: bool = False,
) -> Callable[[Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]], None]:
    """Post navigation hooks for adaptive crawler are delegated to sub crawlers.

    Optionally parametrized decorator: usable both directly
    (`crawler.post_navigation_hook(my_hook)`) and with decorator syntax
    (`@crawler.post_navigation_hook(playwright_only=True)`).
    Hooks are wrapped in context that handles possibly missing `page` and `response` objects by raising
    `AdaptiveContextError`.

    Args:
        hook: A coroutine function to be called after each navigation. When omitted, the method acts as a
            parametrized decorator and registration happens through the returned callable.
        playwright_only: If True, the hook runs only for pages crawled by the Playwright sub crawler.

    Returns:
        The registration callable, enabling the parametrized-decorator usage.
    """

    def register_hooks(new_hook: Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]) -> None:
        # Route the hook to the appropriate collection based on the decorator parameter.
        if playwright_only:
            self._post_navigation_hooks_pw_only.append(new_hook)
        else:
            self._post_navigation_hooks.append(new_hook)

    # Called directly with a hook (no decorator parameters): register immediately.
    # Compare with `is not None` rather than truthiness, so registration never
    # silently depends on how an arbitrary callable evaluates in boolean context.
    if hook is not None:
        register_hooks(hook)

    # Return parametrized decorator that will be executed through decorator syntax if called with parameter.
    return register_hooks

def track_http_only_request_handler_runs(self) -> None:
self.statistics.state.http_only_request_handler_runs += 1

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from crawlee._types import BasicCrawlingContext
from crawlee._utils.docs import docs_group
from crawlee.crawlers import AbstractHttpParser, ParsedHttpCrawlingContext, PlaywrightCrawlingContext
from crawlee.crawlers._abstract_http._http_crawling_context import HttpCrawlingContext
from crawlee.crawlers._playwright._playwright_post_nav_crawling_context import PlaywrightPostNavCrawlingContext
from crawlee.crawlers._playwright._types import PlaywrightHttpResponse

if TYPE_CHECKING:
Expand Down Expand Up @@ -186,7 +188,7 @@ async def from_playwright_crawling_context(
context_kwargs['_page'] = context_kwargs.pop('page')
context_kwargs['_infinite_scroll'] = context_kwargs.pop('infinite_scroll')
# This might not be always available.
protocol_guess = await context_kwargs['_page'].evaluate('() => performance.getEntries()[0].nextHopProtocol')
protocol_guess = await context_kwargs['_page'].evaluate('() => performance.getEntries()[0]?.nextHopProtocol')
http_response = await PlaywrightHttpResponse.from_playwright_response(
response=context.response, protocol=protocol_guess or ''
)
Expand Down Expand Up @@ -245,3 +247,58 @@ async def dummy_block_requests(

context_kwargs['block_requests'] = context_kwargs.pop('block_requests', dummy_block_requests)
return cls(**context_kwargs)


@dataclass(frozen=True)
@docs_group('Crawling contexts')
class AdaptivePlaywrightPostNavCrawlingContext(HttpCrawlingContext):
    """A wrapper around HttpCrawlingContext or AdaptivePlaywrightCrawlingContext.

    Trying to access `page` on this context will raise AdaptiveContextError if wrapped context is HttpCrawlingContext.
    """

    # Playwright objects are set only when the page was crawled by the Playwright sub crawler;
    # both remain None for static (HTTP-only) crawls, and the properties below guard access.
    _page: Page | None = None
    _response: Response | None = None

    @property
    def page(self) -> Page:
        """The Playwright `Page` object for the current page.

        Raises `AdaptiveContextError` if accessed during static crawling.
        """
        if not self._page:
            raise AdaptiveContextError('Page was not crawled with PlaywrightCrawler.')
        return self._page

    @property
    def response(self) -> Response:
        """The Playwright `Response` object containing the response details for the current URL.

        Raises `AdaptiveContextError` if accessed during static crawling.
        """
        if not self._response:
            raise AdaptiveContextError('Response was not crawled with PlaywrightCrawler.')
        return self._response

    @classmethod
    async def from_post_navigation_context(
        cls, context: HttpCrawlingContext | PlaywrightPostNavCrawlingContext
    ) -> Self:
        """Initialize a new instance from an existing post-navigation context."""
        # Copy every dataclass field of the source context so the wrapper mirrors it exactly.
        context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)}

        # Move the (possibly absent) Playwright attributes onto the private backing fields
        # consumed by the guarded `page` / `response` properties above.
        context_kwargs['_page'] = context_kwargs.pop('page', None)
        context_kwargs['_response'] = context_kwargs.pop('response', None)

        # block_requests and goto_options are useful only on pre-navigation contexts.
        context_kwargs.pop('block_requests', None)
        context_kwargs.pop('goto_options', None)

        if isinstance(context, PlaywrightPostNavCrawlingContext):
            # nextHopProtocol may be unavailable for some pages; the `?.` plus the
            # `or ''` fallback below keep that case from raising.
            protocol_guess = await context_kwargs['_page'].evaluate(
                '() => performance.getEntries()[0]?.nextHopProtocol'
            )
            context_kwargs['http_response'] = await PlaywrightHttpResponse.from_playwright_response(
                response=context.response, protocol=protocol_guess or ''
            )
        return cls(**context_kwargs)
3 changes: 3 additions & 0 deletions src/crawlee/crawlers/_playwright/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,12 @@
from ._playwright_crawling_context import PlaywrightCrawlingContext
with _try_import(__name__, 'PlaywrightPreNavCrawlingContext'):
from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext
with _try_import(__name__, 'PlaywrightPostNavCrawlingContext'):
from ._playwright_post_nav_crawling_context import PlaywrightPostNavCrawlingContext

__all__ = [
'PlaywrightCrawler',
'PlaywrightCrawlingContext',
'PlaywrightPostNavCrawlingContext',
'PlaywrightPreNavCrawlingContext',
]
Loading
Loading