From 8020867b20b17e80370b1a716302c5cd2d31a9ff Mon Sep 17 00:00:00 2001 From: SentienceDev Date: Thu, 8 Jan 2026 17:20:16 -0800 Subject: [PATCH 1/3] Phase 2: full integration with backend protocol --- sentience/__init__.py | 18 + sentience/backends/__init__.py | 21 +- sentience/backends/actions.py | 341 ++++++++++++++++++ sentience/backends/browser_use_adapter.py | 4 +- sentience/backends/cdp_backend.py | 15 +- sentience/backends/playwright_backend.py | 187 ++++++++++ sentience/backends/snapshot.py | 297 ++++++++++++++++ tests/test_backends.py | 401 ++++++++++++++++++++-- 8 files changed, 1244 insertions(+), 40 deletions(-) create mode 100644 sentience/backends/actions.py create mode 100644 sentience/backends/playwright_backend.py create mode 100644 sentience/backends/snapshot.py diff --git a/sentience/__init__.py b/sentience/__init__.py index 1149bad..ecb4711 100644 --- a/sentience/__init__.py +++ b/sentience/__init__.py @@ -16,16 +16,25 @@ from .agent_config import AgentConfig from .agent_runtime import AgentRuntime +# Backend-agnostic actions (aliased to avoid conflict with existing actions) # Browser backends (for browser-use integration) from .backends import ( BrowserBackendV0, BrowserUseAdapter, BrowserUseCDPTransport, + CachedSnapshot, CDPBackendV0, CDPTransport, LayoutMetrics, + PlaywrightBackend, ViewportInfo, ) +from .backends import click as backend_click +from .backends import scroll as backend_scroll +from .backends import scroll_to_element as backend_scroll_to_element +from .backends import snapshot as backend_snapshot +from .backends import type_text as backend_type_text +from .backends import wait_for_stable as backend_wait_for_stable # Agent Layer (Phase 1 & 2) from .base_agent import BaseAgent @@ -123,10 +132,19 @@ "BrowserBackendV0", "CDPTransport", "CDPBackendV0", + "PlaywrightBackend", "BrowserUseAdapter", "BrowserUseCDPTransport", "ViewportInfo", "LayoutMetrics", + "backend_snapshot", + "CachedSnapshot", + # Backend-agnostic actions (prefixed 
to avoid conflicts) + "backend_click", + "backend_type_text", + "backend_scroll", + "backend_scroll_to_element", + "backend_wait_for_stable", # Core SDK "SentienceBrowser", "Snapshot", diff --git a/sentience/backends/__init__.py b/sentience/backends/__init__.py index 063685a..0c7d7f3 100644 --- a/sentience/backends/__init__.py +++ b/sentience/backends/__init__.py @@ -12,7 +12,7 @@ For browser-use integration: from browser_use import BrowserSession, BrowserProfile from sentience import get_extension_dir - from sentience.backends import BrowserUseAdapter, CDPBackendV0 + from sentience.backends import BrowserUseAdapter, snapshot, click, type_text # Setup browser-use with Sentience extension profile = BrowserProfile(args=[f"--load-extension={get_extension_dir()}"]) @@ -23,13 +23,18 @@ adapter = BrowserUseAdapter(session) backend = await adapter.create_backend() - # Use backend for precise operations - await backend.mouse_click(100, 200) + # Take snapshot and interact + snap = await snapshot(backend) + element = find(snap, 'role=button[name="Submit"]') + await click(backend, element.bbox) """ +from .actions import click, scroll, scroll_to_element, type_text, wait_for_stable from .browser_use_adapter import BrowserUseAdapter, BrowserUseCDPTransport from .cdp_backend import CDPBackendV0, CDPTransport +from .playwright_backend import PlaywrightBackend from .protocol_v0 import BrowserBackendV0, LayoutMetrics, ViewportInfo +from .snapshot import CachedSnapshot, snapshot __all__ = [ # Protocol @@ -40,7 +45,17 @@ # CDP Backend "CDPTransport", "CDPBackendV0", + # Playwright Backend + "PlaywrightBackend", # browser-use adapter "BrowserUseAdapter", "BrowserUseCDPTransport", + # Backend-agnostic functions + "snapshot", + "CachedSnapshot", + "click", + "type_text", + "scroll", + "scroll_to_element", + "wait_for_stable", ] diff --git a/sentience/backends/actions.py b/sentience/backends/actions.py new file mode 100644 index 0000000..c987d64 --- /dev/null +++ 
b/sentience/backends/actions.py @@ -0,0 +1,341 @@ +""" +Backend-agnostic actions for browser-use integration. + +These actions work with any BrowserBackendV0 implementation, +enabling Sentience grounding with browser-use or other frameworks. + +Usage with browser-use: + from sentience.backends import BrowserUseAdapter, snapshot + from sentience.backends.actions import click, type_text, scroll + + adapter = BrowserUseAdapter(session) + backend = await adapter.create_backend() + + # Take snapshot and click element + snap = await snapshot(backend) + element = find(snap, 'role=button[name="Submit"]') + await click(backend, element.bbox) +""" + +import asyncio +import time +from typing import TYPE_CHECKING, Any, Literal + +from ..models import ActionResult, BBox, Snapshot + +if TYPE_CHECKING: + from .protocol_v0 import BrowserBackendV0 + + +async def click( + backend: "BrowserBackendV0", + target: BBox | dict[str, float] | tuple[float, float], + button: Literal["left", "right", "middle"] = "left", + click_count: int = 1, + move_first: bool = True, +) -> ActionResult: + """ + Click at coordinates using the backend.
+ + Args: + backend: BrowserBackendV0 implementation + target: Click target - BBox (clicks center), dict with x/y, or (x, y) tuple + button: Mouse button to click + click_count: Number of clicks (1=single, 2=double) + move_first: Whether to move mouse to position before clicking + + Returns: + ActionResult with success status + + Example: + # Click at coordinates + await click(backend, (100, 200)) + + # Click element bbox center + await click(backend, element.bbox) + + # Double-click + await click(backend, element.bbox, click_count=2) + """ + start_time = time.time() + + # Resolve coordinates + x, y = _resolve_coordinates(target) + + try: + # Optional mouse move for hover effects + if move_first: + await backend.mouse_move(x, y) + await asyncio.sleep(0.02) # Brief pause for hover + + # Perform click + await backend.mouse_click(x, y, button=button, click_count=click_count) + + duration_ms = int((time.time() - start_time) * 1000) + return ActionResult( + success=True, + duration_ms=duration_ms, + outcome="dom_updated", + ) + except Exception as e: + duration_ms = int((time.time() - start_time) * 1000) + return ActionResult( + success=False, + duration_ms=duration_ms, + outcome="error", + error={"code": "click_failed", "reason": str(e)}, + ) + + +async def type_text( + backend: "BrowserBackendV0", + text: str, + target: BBox | dict[str, float] | tuple[float, float] | None = None, + clear_first: bool = False, +) -> ActionResult: + """ + Type text, optionally clicking a target first. 
+ + Args: + backend: BrowserBackendV0 implementation + text: Text to type + target: Optional click target before typing (BBox, dict, or tuple) + clear_first: If True, select all and delete before typing + + Returns: + ActionResult with success status + + Example: + # Type into focused element + await type_text(backend, "Hello World") + + # Click input then type + await type_text(backend, "search query", target=search_box.bbox) + + # Clear and type + await type_text(backend, "new value", target=input.bbox, clear_first=True) + """ + start_time = time.time() + + try: + # Click target if provided + if target is not None: + x, y = _resolve_coordinates(target) + await backend.mouse_click(x, y) + await asyncio.sleep(0.05) # Wait for focus + + # Clear existing content if requested + if clear_first: + # Select all (Ctrl+A / Cmd+A) and delete + await backend.eval("document.execCommand('selectAll')") + await asyncio.sleep(0.02) + + # Type the text + await backend.type_text(text) + + duration_ms = int((time.time() - start_time) * 1000) + return ActionResult( + success=True, + duration_ms=duration_ms, + outcome="dom_updated", + ) + except Exception as e: + duration_ms = int((time.time() - start_time) * 1000) + return ActionResult( + success=False, + duration_ms=duration_ms, + outcome="error", + error={"code": "type_failed", "reason": str(e)}, + ) + + +async def scroll( + backend: "BrowserBackendV0", + delta_y: float = 300, + target: BBox | dict[str, float] | tuple[float, float] | None = None, +) -> ActionResult: + """ + Scroll the page or element. 
+ + Args: + backend: BrowserBackendV0 implementation + delta_y: Scroll amount (positive=down, negative=up) + target: Optional position for scroll (defaults to viewport center) + + Returns: + ActionResult with success status + + Example: + # Scroll down 300px + await scroll(backend, 300) + + # Scroll up 500px + await scroll(backend, -500) + + # Scroll at specific position + await scroll(backend, 200, target=(500, 300)) + """ + start_time = time.time() + + try: + x: float | None = None + y: float | None = None + + if target is not None: + x, y = _resolve_coordinates(target) + + await backend.wheel(delta_y=delta_y, x=x, y=y) + + # Wait for scroll to settle + await asyncio.sleep(0.1) + + duration_ms = int((time.time() - start_time) * 1000) + return ActionResult( + success=True, + duration_ms=duration_ms, + outcome="dom_updated", + ) + except Exception as e: + duration_ms = int((time.time() - start_time) * 1000) + return ActionResult( + success=False, + duration_ms=duration_ms, + outcome="error", + error={"code": "scroll_failed", "reason": str(e)}, + ) + + +async def scroll_to_element( + backend: "BrowserBackendV0", + element_id: int, + behavior: Literal["smooth", "instant", "auto"] = "instant", + block: Literal["start", "center", "end", "nearest"] = "center", +) -> ActionResult: + """ + Scroll element into view using JavaScript scrollIntoView. 
+ + Args: + backend: BrowserBackendV0 implementation + element_id: Element ID from snapshot (requires sentience_registry) + behavior: Scroll behavior + block: Vertical alignment + + Returns: + ActionResult with success status + """ + start_time = time.time() + + try: + scrolled = await backend.eval(f""" + (() => {{ + const el = window.sentience_registry && window.sentience_registry[{element_id}]; + if (el && el.scrollIntoView) {{ + el.scrollIntoView({{ + behavior: '{behavior}', + block: '{block}', + inline: 'nearest' + }}); + return true; + }} + return false; + }})() + """) + + # Wait for scroll animation + wait_time = 0.3 if behavior == "smooth" else 0.05 + await asyncio.sleep(wait_time) + + duration_ms = int((time.time() - start_time) * 1000) + + if scrolled: + return ActionResult( + success=True, + duration_ms=duration_ms, + outcome="dom_updated", + ) + else: + return ActionResult( + success=False, + duration_ms=duration_ms, + outcome="error", + error={"code": "scroll_failed", "reason": "Element not found in registry"}, + ) + except Exception as e: + duration_ms = int((time.time() - start_time) * 1000) + return ActionResult( + success=False, + duration_ms=duration_ms, + outcome="error", + error={"code": "scroll_failed", "reason": str(e)}, + ) + + +async def wait_for_stable( + backend: "BrowserBackendV0", + state: Literal["interactive", "complete"] = "complete", + timeout_ms: int = 10000, +) -> ActionResult: + """ + Wait for page to reach stable state. 
+ + Args: + backend: BrowserBackendV0 implementation + state: Target document.readyState + timeout_ms: Maximum wait time + + Returns: + ActionResult with success status + """ + start_time = time.time() + + try: + await backend.wait_ready_state(state=state, timeout_ms=timeout_ms) + + duration_ms = int((time.time() - start_time) * 1000) + return ActionResult( + success=True, + duration_ms=duration_ms, + outcome="dom_updated", + ) + except TimeoutError as e: + duration_ms = int((time.time() - start_time) * 1000) + return ActionResult( + success=False, + duration_ms=duration_ms, + outcome="error", + error={"code": "timeout", "reason": str(e)}, + ) + except Exception as e: + duration_ms = int((time.time() - start_time) * 1000) + return ActionResult( + success=False, + duration_ms=duration_ms, + outcome="error", + error={"code": "wait_failed", "reason": str(e)}, + ) + + +def _resolve_coordinates( + target: BBox | dict[str, float] | tuple[float, float], +) -> tuple[float, float]: + """ + Resolve target to (x, y) coordinates. 
+ + - BBox: Returns center point + - dict: Returns x, y keys (or center if width/height present) + - tuple: Returns as-is + """ + if isinstance(target, BBox): + return (target.x + target.width / 2, target.y + target.height / 2) + elif isinstance(target, tuple): + return target + elif isinstance(target, dict): + # If has width/height, compute center + if "width" in target and "height" in target: + x = target.get("x", 0) + target["width"] / 2 + y = target.get("y", 0) + target["height"] / 2 + return (x, y) + # Otherwise use x/y directly + return (target.get("x", 0), target.get("y", 0)) + else: + raise ValueError(f"Invalid target type: {type(target)}") diff --git a/sentience/backends/browser_use_adapter.py b/sentience/backends/browser_use_adapter.py index b8b9762..c932cd3 100644 --- a/sentience/backends/browser_use_adapter.py +++ b/sentience/backends/browser_use_adapter.py @@ -158,9 +158,7 @@ def page(self) -> Any: if hasattr(self._session, "get_current_page"): # This is async, but we need sync access for property # Caller should use get_page_async() instead - raise RuntimeError( - "Use await adapter.get_page_async() to get the page" - ) + raise RuntimeError("Use await adapter.get_page_async() to get the page") raise RuntimeError("Could not find page in browser-use session") async def get_page_async(self) -> Any: diff --git a/sentience/backends/cdp_backend.py b/sentience/backends/cdp_backend.py index 0768c94..1061e1a 100644 --- a/sentience/backends/cdp_backend.py +++ b/sentience/backends/cdp_backend.py @@ -188,9 +188,7 @@ async def call( if not object_id: # Fallback: evaluate the function directly if args: - args_json = ", ".join( - repr(a) if isinstance(a, str) else str(a) for a in args - ) + args_json = ", ".join(repr(a) if isinstance(a, str) else str(a) for a in args) expression = f"({function_declaration})({args_json})" else: expression = f"({function_declaration})()" @@ -234,8 +232,12 @@ async def get_layout_metrics(self) -> LayoutMetrics: return LayoutMetrics( 
viewport_x=visual_viewport.get("pageX", 0), viewport_y=visual_viewport.get("pageY", 0), - viewport_width=visual_viewport.get("clientWidth", layout_viewport.get("clientWidth", 0)), - viewport_height=visual_viewport.get("clientHeight", layout_viewport.get("clientHeight", 0)), + viewport_width=visual_viewport.get( + "clientWidth", layout_viewport.get("clientWidth", 0) + ), + viewport_height=visual_viewport.get( + "clientHeight", layout_viewport.get("clientHeight", 0) + ), content_width=content_size.get("width", 0), content_height=content_size.get("height", 0), device_scale_factor=visual_viewport.get("scale", 1.0), @@ -375,8 +377,7 @@ async def wait_ready_state( elapsed = time.monotonic() - start if elapsed >= timeout_sec: raise TimeoutError( - f"Timed out waiting for document.readyState='{state}' " - f"after {timeout_ms}ms" + f"Timed out waiting for document.readyState='{state}' " f"after {timeout_ms}ms" ) current_state = await self.eval("document.readyState") diff --git a/sentience/backends/playwright_backend.py b/sentience/backends/playwright_backend.py new file mode 100644 index 0000000..f5ea8df --- /dev/null +++ b/sentience/backends/playwright_backend.py @@ -0,0 +1,187 @@ +""" +Playwright backend implementation for BrowserBackendV0 protocol. + +This wraps existing SentienceBrowser/AsyncSentienceBrowser to provide +a unified interface, enabling code that works with both browser-use +(CDPBackendV0) and native Playwright (PlaywrightBackend). 
+ +Usage: + from sentience import SentienceBrowserAsync + from sentience.backends import PlaywrightBackend, click, snapshot + + browser = SentienceBrowserAsync() + await browser.start() + await browser.goto("https://example.com") + + # Create backend from existing browser + backend = PlaywrightBackend(browser.page) + + # Use backend-agnostic functions + snap = await snapshot(backend) + await click(backend, element.bbox) +""" + +import asyncio +import base64 +import time +from typing import TYPE_CHECKING, Any, Literal + +from .protocol_v0 import BrowserBackendV0, LayoutMetrics, ViewportInfo + +if TYPE_CHECKING: + from playwright.async_api import Page as AsyncPage + + +class PlaywrightBackend: + """ + Playwright-based implementation of BrowserBackendV0. + + Wraps a Playwright async Page to provide the standard backend interface. + This enables using backend-agnostic actions with existing SentienceBrowser code. + """ + + def __init__(self, page: "AsyncPage") -> None: + """ + Initialize Playwright backend.
+ + Args: + page: Playwright async Page object + """ + self._page = page + self._cached_viewport: ViewportInfo | None = None + + @property + def page(self) -> "AsyncPage": + """Access the underlying Playwright page.""" + return self._page + + async def refresh_page_info(self) -> ViewportInfo: + """Cache viewport + scroll offsets; cheap & safe to call often.""" + result = await self._page.evaluate(""" + (() => ({ + width: window.innerWidth, + height: window.innerHeight, + scroll_x: window.scrollX, + scroll_y: window.scrollY, + content_width: document.documentElement.scrollWidth, + content_height: document.documentElement.scrollHeight + }))() + """) + + self._cached_viewport = ViewportInfo( + width=result.get("width", 0), + height=result.get("height", 0), + scroll_x=result.get("scroll_x", 0), + scroll_y=result.get("scroll_y", 0), + content_width=result.get("content_width"), + content_height=result.get("content_height"), + ) + return self._cached_viewport + + async def eval(self, expression: str) -> Any: + """Evaluate JavaScript expression in page context.""" + return await self._page.evaluate(expression) + + async def call( + self, + function_declaration: str, + args: list[Any] | None = None, + ) -> Any: + """Call JavaScript function, spreading args as its positional arguments.""" + if args: + return await self._page.evaluate(f"(a) => ({function_declaration})(...a)", args) + return await self._page.evaluate(f"({function_declaration})()") + + async def get_layout_metrics(self) -> LayoutMetrics: + """Get page layout metrics.""" + # Playwright doesn't expose CDP directly in the same way, + # so we approximate using JavaScript + result = await self._page.evaluate(""" + (() => ({ + viewport_x: window.scrollX, + viewport_y: window.scrollY, + viewport_width: window.innerWidth, + viewport_height: window.innerHeight, + content_width: document.documentElement.scrollWidth, + content_height: document.documentElement.scrollHeight, + device_scale_factor: window.devicePixelRatio || 1 + }))() + """) + + return LayoutMetrics(
viewport_x=result.get("viewport_x", 0), + viewport_y=result.get("viewport_y", 0), + viewport_width=result.get("viewport_width", 0), + viewport_height=result.get("viewport_height", 0), + content_width=result.get("content_width", 0), + content_height=result.get("content_height", 0), + device_scale_factor=result.get("device_scale_factor", 1.0), + ) + + async def screenshot_png(self) -> bytes: + """Capture viewport screenshot as PNG bytes.""" + return await self._page.screenshot(type="png") + + async def mouse_move(self, x: float, y: float) -> None: + """Move mouse to viewport coordinates.""" + await self._page.mouse.move(x, y) + + async def mouse_click( + self, + x: float, + y: float, + button: Literal["left", "right", "middle"] = "left", + click_count: int = 1, + ) -> None: + """Click at viewport coordinates.""" + await self._page.mouse.click(x, y, button=button, click_count=click_count) + + async def wheel( + self, + delta_y: float, + x: float | None = None, + y: float | None = None, + ) -> None: + """Scroll using mouse wheel at (x, y); defaults to the viewport center.""" + # Get viewport center if coordinates not provided + if x is None or y is None: + if self._cached_viewport is None: + await self.refresh_page_info() + assert self._cached_viewport is not None + x = x if x is not None else self._cached_viewport.width / 2 + y = y if y is not None else self._cached_viewport.height / 2 + await self._page.mouse.move(x, y) # wheel events fire at the pointer position + await self._page.mouse.wheel(0, delta_y) + + async def type_text(self, text: str) -> None: + """Type text using keyboard input.""" + await self._page.keyboard.type(text) + + async def wait_ready_state( + self, + state: Literal["interactive", "complete"] = "interactive", + timeout_ms: int = 15000, + ) -> None: + """Wait for document.readyState to reach target state.""" + acceptable_states = {"complete"} if state == "complete" else {"interactive", "complete"} + + start = time.monotonic() + timeout_sec = timeout_ms / 1000.0 + + while True: + elapsed = time.monotonic() - start + if elapsed >= timeout_sec: + raise TimeoutError(
f"Timed out waiting for document.readyState='{state}' " + f"after {timeout_ms}ms" + ) + + current_state = await self._page.evaluate("document.readyState") + if current_state in acceptable_states: + return + + await asyncio.sleep(0.1) + + +# Verify protocol compliance at import time +assert isinstance(PlaywrightBackend.__new__(PlaywrightBackend), BrowserBackendV0) diff --git a/sentience/backends/snapshot.py b/sentience/backends/snapshot.py new file mode 100644 index 0000000..6f11dd9 --- /dev/null +++ b/sentience/backends/snapshot.py @@ -0,0 +1,297 @@ +""" +Backend-agnostic snapshot for browser-use integration. + +Takes Sentience snapshots using BrowserBackendV0 protocol, +enabling element grounding with browser-use or other frameworks. + +Usage with browser-use: + from sentience.backends import BrowserUseAdapter, snapshot, CachedSnapshot + + adapter = BrowserUseAdapter(session) + backend = await adapter.create_backend() + + # Take snapshot + snap = await snapshot(backend) + print(f"Found {len(snap.elements)} elements") + + # With caching (reuse if fresh) + cache = CachedSnapshot(backend, max_age_ms=2000) + snap1 = await cache.get() # Fresh snapshot + snap2 = await cache.get() # Returns cached if < 2s old + cache.invalidate() # Force refresh on next get() +""" + +import time +from typing import TYPE_CHECKING, Any + +from ..models import Snapshot, SnapshotOptions + +if TYPE_CHECKING: + from .protocol_v0 import BrowserBackendV0 + + +class CachedSnapshot: + """ + Snapshot cache with staleness detection. + + Caches snapshots and returns cached version if still fresh. + Useful for reducing redundant snapshot calls in action loops. 
+ + Usage: + cache = CachedSnapshot(backend, max_age_ms=2000) + + # First call takes fresh snapshot + snap1 = await cache.get() + + # Second call returns cached if < 2s old + snap2 = await cache.get() + + # Invalidate after actions that change DOM + await click(backend, element.bbox) + cache.invalidate() + + # Next get() will take fresh snapshot + snap3 = await cache.get() + """ + + def __init__( + self, + backend: "BrowserBackendV0", + max_age_ms: int = 2000, + options: SnapshotOptions | None = None, + ) -> None: + """ + Initialize cached snapshot. + + Args: + backend: BrowserBackendV0 implementation + max_age_ms: Maximum cache age in milliseconds (default: 2000) + options: Default snapshot options + """ + self._backend = backend + self._max_age_ms = max_age_ms + self._options = options + self._cached: Snapshot | None = None + self._cached_at: float = 0 # timestamp in seconds + self._cached_url: str | None = None + + async def get( + self, + options: SnapshotOptions | None = None, + force_refresh: bool = False, + ) -> Snapshot: + """ + Get snapshot, using cache if fresh. + + Args: + options: Override default options for this call + force_refresh: If True, always take fresh snapshot + + Returns: + Snapshot (cached or fresh) + """ + # Check if we need to refresh + if force_refresh or self._is_stale(): + self._cached = await snapshot( + self._backend, + options or self._options, + ) + self._cached_at = time.time() + self._cached_url = self._cached.url + + assert self._cached is not None + return self._cached + + def invalidate(self) -> None: + """ + Invalidate cache, forcing refresh on next get(). + + Call this after actions that modify the DOM. 
+ """ + self._cached = None + self._cached_at = 0 + self._cached_url = None + + def _is_stale(self) -> bool: + """Check if cache is stale and needs refresh.""" + if self._cached is None: + return True + + # Check age + age_ms = (time.time() - self._cached_at) * 1000 + if age_ms > self._max_age_ms: + return True + + return False + + @property + def is_cached(self) -> bool: + """Check if a cached snapshot exists.""" + return self._cached is not None + + @property + def age_ms(self) -> float: + """Get age of cached snapshot in milliseconds.""" + if self._cached is None: + return float("inf") + return (time.time() - self._cached_at) * 1000 + + +async def snapshot( + backend: "BrowserBackendV0", + options: SnapshotOptions | None = None, +) -> Snapshot: + """ + Take a Sentience snapshot using the backend protocol. + + This function calls window.sentience.snapshot() via the backend's eval(), + enabling snapshot collection with any BrowserBackendV0 implementation. + + Requires: + - Sentience extension loaded in browser (via --load-extension) + - Extension injected window.sentience API + + Args: + backend: BrowserBackendV0 implementation (CDPBackendV0, PlaywrightBackend, etc.) + options: Snapshot options (limit, filter, screenshot, etc.) 
+ + Returns: + Snapshot with elements, viewport, and optional screenshot + + Example: + from sentience.backends import BrowserUseAdapter + from sentience.backends.snapshot import snapshot + + adapter = BrowserUseAdapter(session) + backend = await adapter.create_backend() + + # Basic snapshot + snap = await snapshot(backend) + + # With options + snap = await snapshot(backend, SnapshotOptions( + limit=100, + screenshot=True + )) + """ + if options is None: + options = SnapshotOptions() + + # Wait for extension injection + await _wait_for_extension(backend, timeout_ms=5000) + + # Build options dict for extension API + ext_options = _build_extension_options(options) + + # Call extension's snapshot function + result = await backend.eval(f""" + (() => {{ + const options = {_json_serialize(ext_options)}; + return window.sentience.snapshot(options); + }})() + """) + + if result is None: + raise RuntimeError( + "window.sentience.snapshot() returned null. " + "Is the Sentience extension loaded and injected?" + ) + + # Show overlay if requested + if options.show_overlay: + raw_elements = result.get("raw_elements", []) + if raw_elements: + await backend.eval(f""" + (() => {{ + if (window.sentience && window.sentience.showOverlay) {{ + window.sentience.showOverlay({_json_serialize(raw_elements)}, null); + }} + }})() + """) + + # Build and return Snapshot + return Snapshot(**result) + + +async def _wait_for_extension( + backend: "BrowserBackendV0", + timeout_ms: int = 5000, +) -> None: + """ + Wait for Sentience extension to inject window.sentience API.
+ + Args: + backend: BrowserBackendV0 implementation + timeout_ms: Maximum wait time + + Raises: + RuntimeError: If extension not injected within timeout + """ + import asyncio + + start = time.monotonic() + timeout_sec = timeout_ms / 1000.0 + + while True: + elapsed = time.monotonic() - start + if elapsed >= timeout_sec: + # Gather diagnostics + try: + diag = await backend.eval(""" + (() => ({ + sentience_defined: typeof window.sentience !== 'undefined', + sentience_snapshot: typeof window.sentience?.snapshot === 'function', + url: window.location.href + }))() + """) + except Exception: + diag = {"error": "Could not gather diagnostics"} + + raise RuntimeError( + f"Sentience extension failed to inject window.sentience API " + f"within {timeout_ms}ms. Diagnostics: {diag}" + ) + + # Check if extension is ready + try: + ready = await backend.eval( + "typeof window.sentience !== 'undefined' && " + "typeof window.sentience.snapshot === 'function'" + ) + if ready: + return + except Exception: + pass # Keep polling + + await asyncio.sleep(0.1) + + +def _build_extension_options(options: SnapshotOptions) -> dict[str, Any]: + """Build options dict for extension API call.""" + ext_options: dict[str, Any] = {} + + # Screenshot config + if options.screenshot is not False: + if hasattr(options.screenshot, "model_dump"): + ext_options["screenshot"] = options.screenshot.model_dump() + else: + ext_options["screenshot"] = options.screenshot + + # Limit (only if not default) + if options.limit != 50: + ext_options["limit"] = options.limit + + # Filter + if options.filter is not None: + if hasattr(options.filter, "model_dump"): + ext_options["filter"] = options.filter.model_dump() + else: + ext_options["filter"] = options.filter + + return ext_options + + +def _json_serialize(obj: Any) -> str: + """Serialize object to JSON string for embedding in JS.""" + import json + return json.dumps(obj) diff --git a/tests/test_backends.py b/tests/test_backends.py index 9c82363..a1c7d90 100644 --- 
a/tests/test_backends.py +++ b/tests/test_backends.py @@ -6,6 +6,7 @@ """ import asyncio +import time from typing import Any from unittest.mock import AsyncMock, MagicMock @@ -15,11 +16,18 @@ BrowserBackendV0, BrowserUseAdapter, BrowserUseCDPTransport, + CachedSnapshot, CDPBackendV0, CDPTransport, LayoutMetrics, + PlaywrightBackend, ViewportInfo, + click, + scroll, + type_text, + wait_for_stable, ) +from sentience.models import ActionResult, BBox class MockCDPTransport: @@ -150,9 +158,7 @@ async def test_refresh_page_info( assert info.scroll_y == 100 @pytest.mark.asyncio - async def test_eval( - self, backend: CDPBackendV0, transport: MockCDPTransport - ) -> None: + async def test_eval(self, backend: CDPBackendV0, transport: MockCDPTransport) -> None: """Test eval executes JavaScript and returns value.""" transport.set_response( "Runtime.evaluate", @@ -167,9 +173,7 @@ async def test_eval( assert transport.calls[0][1]["expression"] == "1 + 1" @pytest.mark.asyncio - async def test_eval_exception( - self, backend: CDPBackendV0, transport: MockCDPTransport - ) -> None: + async def test_eval_exception(self, backend: CDPBackendV0, transport: MockCDPTransport) -> None: """Test eval raises on JavaScript exception.""" transport.set_response( "Runtime.evaluate", @@ -211,9 +215,7 @@ async def test_get_layout_metrics( assert metrics.content_height == 5000 @pytest.mark.asyncio - async def test_screenshot_png( - self, backend: CDPBackendV0, transport: MockCDPTransport - ) -> None: + async def test_screenshot_png(self, backend: CDPBackendV0, transport: MockCDPTransport) -> None: """Test screenshot_png returns PNG bytes.""" import base64 @@ -230,9 +232,7 @@ async def test_screenshot_png( assert result.startswith(b"\x89PNG") @pytest.mark.asyncio - async def test_mouse_move( - self, backend: CDPBackendV0, transport: MockCDPTransport - ) -> None: + async def test_mouse_move(self, backend: CDPBackendV0, transport: MockCDPTransport) -> None: """Test mouse_move dispatches mouseMoved 
event.""" await backend.mouse_move(100, 200) @@ -244,9 +244,7 @@ async def test_mouse_move( assert params["y"] == 200 @pytest.mark.asyncio - async def test_mouse_click( - self, backend: CDPBackendV0, transport: MockCDPTransport - ) -> None: + async def test_mouse_click(self, backend: CDPBackendV0, transport: MockCDPTransport) -> None: """Test mouse_click dispatches press and release events.""" await backend.mouse_click(100, 200) @@ -276,9 +274,7 @@ async def test_mouse_click_right_button( assert params["button"] == "right" @pytest.mark.asyncio - async def test_wheel( - self, backend: CDPBackendV0, transport: MockCDPTransport - ) -> None: + async def test_wheel(self, backend: CDPBackendV0, transport: MockCDPTransport) -> None: """Test wheel dispatches mouseWheel event.""" # First set up viewport info for default coordinates transport.set_response( @@ -304,9 +300,7 @@ async def test_wheel( assert params["y"] == 300 @pytest.mark.asyncio - async def test_type_text( - self, backend: CDPBackendV0, transport: MockCDPTransport - ) -> None: + async def test_type_text(self, backend: CDPBackendV0, transport: MockCDPTransport) -> None: """Test type_text dispatches key events for each character.""" await backend.type_text("Hi") @@ -434,9 +428,7 @@ async def test_create_backend(self) -> None: # Create mock browser session mock_session = MagicMock() - mock_session.get_or_create_cdp_session = AsyncMock( - return_value=mock_cdp_session - ) + mock_session.get_or_create_cdp_session = AsyncMock(return_value=mock_cdp_session) adapter = BrowserUseAdapter(mock_session) backend = await adapter.create_backend() @@ -452,9 +444,7 @@ async def test_create_backend_caches_result(self) -> None: mock_cdp_session.session_id = "session-123" mock_session = MagicMock() - mock_session.get_or_create_cdp_session = AsyncMock( - return_value=mock_cdp_session - ) + mock_session.get_or_create_cdp_session = AsyncMock(return_value=mock_cdp_session) adapter = BrowserUseAdapter(mock_session) @@ -486,3 +476,360 
@@ async def test_get_page_async(self) -> None: page = await adapter.get_page_async() assert page is mock_page + + +class TestBackendAgnosticActions: + """Tests for backend-agnostic action functions.""" + + @pytest.fixture + def transport(self) -> MockCDPTransport: + """Create mock transport.""" + return MockCDPTransport() + + @pytest.fixture + def backend(self, transport: MockCDPTransport) -> CDPBackendV0: + """Create backend with mock transport.""" + return CDPBackendV0(transport) + + @pytest.mark.asyncio + async def test_click_with_tuple( + self, backend: CDPBackendV0, transport: MockCDPTransport + ) -> None: + """Test click with (x, y) tuple.""" + result = await click(backend, (100, 200)) + + assert isinstance(result, ActionResult) + assert result.success is True + + # Should have mouse move + mouse click (press + release) + mouse_events = [c for c in transport.calls if c[0] == "Input.dispatchMouseEvent"] + assert len(mouse_events) == 3 # move, press, release + + @pytest.mark.asyncio + async def test_click_with_bbox( + self, backend: CDPBackendV0, transport: MockCDPTransport + ) -> None: + """Test click with BBox (clicks center).""" + bbox = BBox(x=100, y=200, width=50, height=30) + result = await click(backend, bbox) + + assert result.success is True + + # Find the click event + press_events = [ + c + for c in transport.calls + if c[0] == "Input.dispatchMouseEvent" and c[1]["type"] == "mousePressed" + ] + assert len(press_events) == 1 + # Should click at center: (100 + 25, 200 + 15) = (125, 215) + assert press_events[0][1]["x"] == 125 + assert press_events[0][1]["y"] == 215 + + @pytest.mark.asyncio + async def test_click_with_dict( + self, backend: CDPBackendV0, transport: MockCDPTransport + ) -> None: + """Test click with dict containing x, y.""" + result = await click(backend, {"x": 150, "y": 250}) + + assert result.success is True + + @pytest.mark.asyncio + async def test_click_double(self, backend: CDPBackendV0, transport: MockCDPTransport) -> None: + 
"""Test double-click.""" + result = await click(backend, (100, 200), click_count=2) + + assert result.success is True + + # Check clickCount parameter + press_events = [ + c + for c in transport.calls + if c[0] == "Input.dispatchMouseEvent" and c[1]["type"] == "mousePressed" + ] + assert press_events[0][1]["clickCount"] == 2 + + @pytest.mark.asyncio + async def test_type_text_simple( + self, backend: CDPBackendV0, transport: MockCDPTransport + ) -> None: + """Test typing text.""" + result = await type_text(backend, "Hi") + + assert isinstance(result, ActionResult) + assert result.success is True + + # Check key events were dispatched + key_events = [c for c in transport.calls if c[0] == "Input.dispatchKeyEvent"] + assert len(key_events) == 6 # 2 chars * 3 events each + + @pytest.mark.asyncio + async def test_type_text_with_target( + self, backend: CDPBackendV0, transport: MockCDPTransport + ) -> None: + """Test typing text with click target.""" + result = await type_text(backend, "test", target=(100, 200)) + + assert result.success is True + + # Should have click + key events + mouse_events = [c for c in transport.calls if c[0] == "Input.dispatchMouseEvent"] + key_events = [c for c in transport.calls if c[0] == "Input.dispatchKeyEvent"] + assert len(mouse_events) >= 2 # At least press + release + assert len(key_events) == 12 # 4 chars * 3 events + + @pytest.mark.asyncio + async def test_scroll_down(self, backend: CDPBackendV0, transport: MockCDPTransport) -> None: + """Test scrolling down.""" + # Set up viewport for default coordinates + transport.set_response( + "Runtime.evaluate", + { + "result": { + "type": "object", + "value": {"width": 1920, "height": 1080}, + } + }, + ) + + result = await scroll(backend, delta_y=300) + + assert result.success is True + + wheel_events = [ + c + for c in transport.calls + if c[0] == "Input.dispatchMouseEvent" and c[1].get("type") == "mouseWheel" + ] + assert len(wheel_events) == 1 + assert wheel_events[0][1]["deltaY"] == 300 + 
+ @pytest.mark.asyncio + async def test_scroll_at_position( + self, backend: CDPBackendV0, transport: MockCDPTransport + ) -> None: + """Test scrolling at specific position.""" + result = await scroll(backend, delta_y=200, target=(500, 300)) + + assert result.success is True + + wheel_events = [ + c + for c in transport.calls + if c[0] == "Input.dispatchMouseEvent" and c[1].get("type") == "mouseWheel" + ] + assert wheel_events[0][1]["x"] == 500 + assert wheel_events[0][1]["y"] == 300 + + @pytest.mark.asyncio + async def test_wait_for_stable_success( + self, backend: CDPBackendV0, transport: MockCDPTransport + ) -> None: + """Test wait_for_stable with immediate success.""" + transport.set_response( + "Runtime.evaluate", + {"result": {"type": "string", "value": "complete"}}, + ) + + result = await wait_for_stable(backend, state="complete", timeout_ms=1000) + + assert result.success is True + + @pytest.mark.asyncio + async def test_wait_for_stable_timeout( + self, backend: CDPBackendV0, transport: MockCDPTransport + ) -> None: + """Test wait_for_stable timeout.""" + transport.set_response( + "Runtime.evaluate", + {"result": {"type": "string", "value": "loading"}}, + ) + + result = await wait_for_stable(backend, state="complete", timeout_ms=200) + + assert result.success is False + assert result.error["code"] == "timeout" + + +class TestPlaywrightBackend: + """Tests for PlaywrightBackend wrapper.""" + + def test_implements_protocol(self) -> None: + """Verify PlaywrightBackend implements BrowserBackendV0.""" + mock_page = MagicMock() + backend = PlaywrightBackend(mock_page) + assert isinstance(backend, BrowserBackendV0) + + def test_page_property(self) -> None: + """Test page property returns underlying page.""" + mock_page = MagicMock() + backend = PlaywrightBackend(mock_page) + assert backend.page is mock_page + + @pytest.mark.asyncio + async def test_refresh_page_info(self) -> None: + """Test refresh_page_info calls page.evaluate.""" + mock_page = AsyncMock() + 
mock_page.evaluate = AsyncMock( + return_value={ + "width": 1920, + "height": 1080, + "scroll_x": 0, + "scroll_y": 100, + "content_width": 1920, + "content_height": 5000, + } + ) + + backend = PlaywrightBackend(mock_page) + info = await backend.refresh_page_info() + + assert isinstance(info, ViewportInfo) + assert info.width == 1920 + assert info.scroll_y == 100 + + @pytest.mark.asyncio + async def test_eval(self) -> None: + """Test eval calls page.evaluate.""" + mock_page = AsyncMock() + mock_page.evaluate = AsyncMock(return_value=42) + + backend = PlaywrightBackend(mock_page) + result = await backend.eval("1 + 1") + + assert result == 42 + + @pytest.mark.asyncio + async def test_mouse_click(self) -> None: + """Test mouse_click calls page.mouse.click.""" + mock_mouse = AsyncMock() + mock_page = MagicMock() + mock_page.mouse = mock_mouse + + backend = PlaywrightBackend(mock_page) + await backend.mouse_click(100, 200, button="left", click_count=1) + + mock_mouse.click.assert_called_once_with(100, 200, button="left", click_count=1) + + @pytest.mark.asyncio + async def test_type_text(self) -> None: + """Test type_text calls page.keyboard.type.""" + mock_keyboard = AsyncMock() + mock_page = MagicMock() + mock_page.keyboard = mock_keyboard + + backend = PlaywrightBackend(mock_page) + await backend.type_text("Hello") + + mock_keyboard.type.assert_called_once_with("Hello") + + @pytest.mark.asyncio + async def test_screenshot_png(self) -> None: + """Test screenshot_png calls page.screenshot.""" + mock_page = AsyncMock() + mock_page.screenshot = AsyncMock(return_value=b"\x89PNG\r\n\x1a\n") + + backend = PlaywrightBackend(mock_page) + result = await backend.screenshot_png() + + assert result.startswith(b"\x89PNG") + mock_page.screenshot.assert_called_once_with(type="png") + + +class TestCachedSnapshot: + """Tests for CachedSnapshot caching behavior.""" + + @pytest.fixture + def mock_backend(self) -> MagicMock: + """Create mock backend.""" + backend = MagicMock() + 
backend.eval = AsyncMock() + return backend + + def test_initial_state(self, mock_backend: MagicMock) -> None: + """Test initial cache state.""" + cache = CachedSnapshot(mock_backend, max_age_ms=2000) + + assert cache.is_cached is False + assert cache.age_ms == float("inf") + + def test_invalidate(self, mock_backend: MagicMock) -> None: + """Test cache invalidation.""" + cache = CachedSnapshot(mock_backend) + cache._cached = MagicMock() # Simulate cached snapshot + cache._cached_at = time.time() + + assert cache.is_cached is True + + cache.invalidate() + + assert cache.is_cached is False + assert cache.age_ms == float("inf") + + def test_staleness_by_age(self, mock_backend: MagicMock) -> None: + """Test cache staleness detection.""" + cache = CachedSnapshot(mock_backend, max_age_ms=100) + + # Simulate old cache + cache._cached = MagicMock() + cache._cached_at = time.time() - 0.2 # 200ms ago + + assert cache._is_stale() is True + + def test_fresh_cache(self, mock_backend: MagicMock) -> None: + """Test fresh cache detection.""" + cache = CachedSnapshot(mock_backend, max_age_ms=2000) + + # Simulate fresh cache + cache._cached = MagicMock() + cache._cached_at = time.time() + + assert cache._is_stale() is False + + +class TestCoordinateResolution: + """Test coordinate resolution in actions.""" + + @pytest.mark.asyncio + async def test_bbox_center_calculation(self) -> None: + """Test BBox center calculation.""" + from sentience.backends.actions import _resolve_coordinates + + bbox = BBox(x=100, y=200, width=50, height=30) + x, y = _resolve_coordinates(bbox) + + assert x == 125 # 100 + 50/2 + assert y == 215 # 200 + 30/2 + + @pytest.mark.asyncio + async def test_dict_with_dimensions(self) -> None: + """Test dict with width/height computes center.""" + from sentience.backends.actions import _resolve_coordinates + + target = {"x": 100, "y": 200, "width": 50, "height": 30} + x, y = _resolve_coordinates(target) + + assert x == 125 + assert y == 215 + + @pytest.mark.asyncio + 
async def test_dict_without_dimensions(self) -> None: + """Test dict without width/height uses x/y directly.""" + from sentience.backends.actions import _resolve_coordinates + + target = {"x": 150, "y": 250} + x, y = _resolve_coordinates(target) + + assert x == 150 + assert y == 250 + + @pytest.mark.asyncio + async def test_tuple_passthrough(self) -> None: + """Test tuple passes through unchanged.""" + from sentience.backends.actions import _resolve_coordinates + + x, y = _resolve_coordinates((300, 400)) + + assert x == 300 + assert y == 400 From 6c1405c3b1016fef2d30a7e9985b217d9df12f76 Mon Sep 17 00:00:00 2001 From: SentienceDev Date: Thu, 8 Jan 2026 17:35:20 -0800 Subject: [PATCH 2/3] Phase 3: polish --- examples/browser_use_integration.py | 210 ++++++++++++++++++++++ sentience/backends/__init__.py | 85 ++++++++- sentience/backends/actions.py | 6 +- sentience/backends/exceptions.py | 211 +++++++++++++++++++++++ sentience/backends/playwright_backend.py | 15 +- sentience/backends/snapshot.py | 43 +++-- tests/test_backends.py | 123 +++++++++++++ 7 files changed, 662 insertions(+), 31 deletions(-) create mode 100644 examples/browser_use_integration.py create mode 100644 sentience/backends/exceptions.py diff --git a/examples/browser_use_integration.py b/examples/browser_use_integration.py new file mode 100644 index 0000000..9167c5f --- /dev/null +++ b/examples/browser_use_integration.py @@ -0,0 +1,210 @@ +""" +Example: Using Sentience with browser-use for element grounding. + +This example demonstrates how to integrate Sentience's semantic element +detection with browser-use, enabling accurate click/type/scroll operations +using Sentience's snapshot-based grounding instead of coordinate estimation. 
+ +Requirements: + pip install browser-use sentienceapi + +Usage: + python examples/browser_use_integration.py +""" + +import asyncio + +# browser-use imports (install via: pip install browser-use) +# from browser_use import BrowserSession, BrowserProfile + +# Sentience imports +from sentience import ( + find, + get_extension_dir, + query, +) +from sentience.backends import ( + BrowserUseAdapter, + CachedSnapshot, + ExtensionNotLoadedError, + click, + scroll, + snapshot, + type_text, +) + + +async def main() -> None: + """ + Demo: Search on Google using Sentience grounding with browser-use. + + This example shows the full workflow: + 1. Launch browser-use with Sentience extension loaded + 2. Create a Sentience backend adapter + 3. Take snapshots and interact with elements using semantic queries + """ + + # ========================================================================= + # STEP 1: Setup browser-use with Sentience extension + # ========================================================================= + # + # The Sentience extension must be loaded for element grounding to work. + # Use get_extension_dir() to get the path to the bundled extension. + # + # Uncomment the following when running with browser-use installed: + + # extension_path = get_extension_dir() + # print(f"Loading Sentience extension from: {extension_path}") + # + # profile = BrowserProfile( + # args=[ + # f"--load-extension={extension_path}", + # "--disable-extensions-except=" + extension_path, + # ], + # ) + # session = BrowserSession(browser_profile=profile) + # await session.start() + + # ========================================================================= + # STEP 2: Create Sentience backend adapter + # ========================================================================= + # + # The adapter bridges browser-use's CDP client to Sentience's backend protocol. 
+ # + # adapter = BrowserUseAdapter(session) + # backend = await adapter.create_backend() + + # ========================================================================= + # STEP 3: Navigate and take snapshots + # ========================================================================= + # + # await session.navigate("https://www.google.com") + # + # # Take a snapshot - this uses the Sentience extension's element detection + # try: + # snap = await snapshot(backend) + # print(f"Found {len(snap.elements)} elements") + # except ExtensionNotLoadedError as e: + # print(f"Extension not loaded: {e}") + # print("Make sure the browser was launched with --load-extension flag") + # return + + # ========================================================================= + # STEP 4: Find and interact with elements using semantic queries + # ========================================================================= + # + # Sentience provides powerful element selectors: + # - Role-based: 'role=textbox', 'role=button' + # - Name-based: 'role=button[name="Submit"]' + # - Text-based: 'text=Search' + # + # # Find the search input + # search_input = find(snap, 'role=textbox[name*="Search"]') + # if search_input: + # # Click on the search input (uses center of bounding box) + # await click(backend, search_input.bbox) + # + # # Type search query + # await type_text(backend, "Sentience AI browser automation") + # print("Typed search query") + + # ========================================================================= + # STEP 5: Using cached snapshots for efficiency + # ========================================================================= + # + # Taking snapshots has overhead. 
Use CachedSnapshot to reuse recent snapshots: + # + # cache = CachedSnapshot(backend, max_age_ms=2000) + # + # # First call takes fresh snapshot + # snap1 = await cache.get() + # + # # Second call returns cached version if less than 2 seconds old + # snap2 = await cache.get() + # + # # After actions that modify DOM, invalidate the cache + # await click(backend, some_element.bbox) + # cache.invalidate() # Next get() will take fresh snapshot + + # ========================================================================= + # STEP 6: Scrolling to elements + # ========================================================================= + # + # # Scroll down by 500 pixels + # await scroll(backend, delta_y=500) + # + # # Scroll at a specific position (useful for scrollable containers) + # await scroll(backend, delta_y=300, target=(400, 500)) + + # ========================================================================= + # STEP 7: Advanced element queries + # ========================================================================= + # + # # Find all buttons + # buttons = query(snap, 'role=button') + # print(f"Found {len(buttons)} buttons") + # + # # Find by partial text match + # links = query(snap, 'role=link[name*="Learn"]') + # + # # Find by exact text + # submit_btn = find(snap, 'role=button[name="Submit"]') + + # ========================================================================= + # STEP 8: Error handling + # ========================================================================= + # + # Sentience provides specific exceptions for common errors: + # + # from sentience.backends import ( + # ExtensionNotLoadedError, # Extension not loaded in browser + # SnapshotError, # Snapshot failed + # ActionError, # Click/type/scroll failed + # ) + # + # try: + # snap = await snapshot(backend) + # except ExtensionNotLoadedError as e: + # # The error message includes fix suggestions + # print(f"Fix: {e}") + + # 
========================================================================= + # CLEANUP + # ========================================================================= + # + # await session.stop() + + print("=" * 60) + print("browser-use + Sentience Integration Example") + print("=" * 60) + print() + print("This example demonstrates the integration pattern.") + print("To run with a real browser, uncomment the code sections above") + print("and install browser-use: pip install browser-use") + print() + print("Key imports:") + print(" from sentience import get_extension_dir, find, query") + print(" from sentience.backends import (") + print(" BrowserUseAdapter, snapshot, click, type_text, scroll") + print(" )") + print() + print("Extension path:", get_extension_dir()) + + +async def full_example() -> None: + """ + Complete working example - requires browser-use installed. + + This is the uncommented version for users who have browser-use installed. + """ + # Import browser-use (uncomment when installed) + # from browser_use import BrowserSession, BrowserProfile + + print("To run the full example:") + print("1. Install browser-use: pip install browser-use") + print("2. Uncomment the imports in this function") + print("3. Run: python examples/browser_use_integration.py") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/sentience/backends/__init__.py b/sentience/backends/__init__.py index 0c7d7f3..97601c6 100644 --- a/sentience/backends/__init__.py +++ b/sentience/backends/__init__.py @@ -5,13 +5,28 @@ Sentience actions (click, type, scroll) to work with different browser automation frameworks. -Supported backends: -- PlaywrightBackend: Default backend using Playwright (existing SentienceBrowser) -- CDPBackendV0: CDP-based backend for browser-use integration +Supported Backends +------------------ + +**PlaywrightBackend** + Wraps Playwright Page objects. Use this when integrating with existing + SentienceBrowser or Playwright-based code. 
+ +**CDPBackendV0** + Low-level CDP (Chrome DevTools Protocol) backend. Use this when you have + direct access to a CDP client and session. + +**BrowserUseAdapter** + High-level adapter for browser-use framework. Automatically creates a + CDPBackendV0 from a BrowserSession. + +Quick Start with browser-use +---------------------------- + +.. code-block:: python -For browser-use integration: from browser_use import BrowserSession, BrowserProfile - from sentience import get_extension_dir + from sentience import get_extension_dir, find from sentience.backends import BrowserUseAdapter, snapshot, click, type_text # Setup browser-use with Sentience extension @@ -23,15 +38,63 @@ adapter = BrowserUseAdapter(session) backend = await adapter.create_backend() - # Take snapshot and interact + # Take snapshot and interact with elements snap = await snapshot(backend) - element = find(snap, 'role=button[name="Submit"]') + search_box = find(snap, 'role=textbox[name*="Search"]') + await click(backend, search_box.bbox) + await type_text(backend, "Sentience AI") + +Snapshot Caching +---------------- + +Use CachedSnapshot to reduce redundant snapshot calls in action loops: + +.. code-block:: python + + from sentience.backends import CachedSnapshot + + cache = CachedSnapshot(backend, max_age_ms=2000) + + snap1 = await cache.get() # Takes fresh snapshot + snap2 = await cache.get() # Returns cached if < 2s old + await click(backend, element.bbox) + cache.invalidate() # Force refresh on next get() + +Error Handling +-------------- + +The module provides specific exceptions for common failure modes: + +- ``ExtensionNotLoadedError``: Extension not loaded in browser launch args +- ``SnapshotError``: window.sentience.snapshot() failed +- ``ActionError``: Click/type/scroll operation failed + +All exceptions inherit from ``SentienceBackendError`` and include helpful +fix suggestions in their error messages. + +.. 
code-block:: python + + from sentience.backends import ExtensionNotLoadedError, snapshot + + try: + snap = await snapshot(backend) + except ExtensionNotLoadedError as e: + print(f"Fix suggestion: {e}") """ from .actions import click, scroll, scroll_to_element, type_text, wait_for_stable from .browser_use_adapter import BrowserUseAdapter, BrowserUseCDPTransport from .cdp_backend import CDPBackendV0, CDPTransport +from .exceptions import ( + ActionError, + BackendEvalError, + ExtensionDiagnostics, + ExtensionInjectionError, + ExtensionNotLoadedError, + SentienceBackendError, + SnapshotError, +) from .playwright_backend import PlaywrightBackend from .protocol_v0 import BrowserBackendV0, LayoutMetrics, ViewportInfo from .snapshot import CachedSnapshot, snapshot @@ -58,4 +121,12 @@ "scroll", "scroll_to_element", "wait_for_stable", + # Exceptions + "SentienceBackendError", + "ExtensionNotLoadedError", + "ExtensionInjectionError", + "ExtensionDiagnostics", + "BackendEvalError", + "SnapshotError", + "ActionError", ] diff --git a/sentience/backends/actions.py b/sentience/backends/actions.py index c987d64..67ec479 100644 --- a/sentience/backends/actions.py +++ b/sentience/backends/actions.py @@ -226,7 +226,8 @@ async def scroll_to_element( start_time = time.time() try: - scrolled = await backend.eval(f""" + scrolled = await backend.eval( + f""" (() => {{ const el = window.sentience_registry && window.sentience_registry[{element_id}]; if (el && el.scrollIntoView) {{ @@ -239,7 +240,8 @@ async def scroll_to_element( }} return false; }})() - """) + """ + ) # Wait for scroll animation wait_time = 0.3 if behavior == "smooth" else 0.05 diff --git a/sentience/backends/exceptions.py b/sentience/backends/exceptions.py new file mode 100644 index 0000000..a1d176c --- /dev/null +++ b/sentience/backends/exceptions.py @@ -0,0 +1,211 @@ +""" +Custom exceptions for Sentience backends. 
+ +These exceptions provide clear, actionable error messages when things go wrong +during browser-use integration or backend operations. +""" + +from dataclasses import dataclass +from typing import Any + + +class SentienceBackendError(Exception): + """Base exception for all Sentience backend errors.""" + + pass + + +@dataclass +class ExtensionDiagnostics: + """Diagnostics collected when extension loading fails.""" + + sentience_defined: bool = False + sentience_snapshot: bool = False + url: str = "" + error: str | None = None + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "ExtensionDiagnostics": + """Create from diagnostic dict returned by browser eval.""" + return cls( + sentience_defined=data.get("sentience_defined", False), + sentience_snapshot=data.get("sentience_snapshot", False), + url=data.get("url", ""), + error=data.get("error"), + ) + + def to_dict(self) -> dict[str, Any]: + """Convert to dict for serialization.""" + return { + "sentience_defined": self.sentience_defined, + "sentience_snapshot": self.sentience_snapshot, + "url": self.url, + "error": self.error, + } + + +class ExtensionNotLoadedError(SentienceBackendError): + """ + Raised when the Sentience extension is not loaded in the browser. + + This typically means: + 1. Browser was launched without --load-extension flag + 2. Extension path is incorrect + 3. 
Extension failed to initialize + + Example fix for browser-use: + from sentience import get_extension_dir + from browser_use import BrowserSession, BrowserProfile + + profile = BrowserProfile( + args=[f"--load-extension={get_extension_dir()}"], + ) + session = BrowserSession(browser_profile=profile) + """ + + def __init__( + self, + message: str, + timeout_ms: int | None = None, + diagnostics: ExtensionDiagnostics | None = None, + ) -> None: + self.timeout_ms = timeout_ms + self.diagnostics = diagnostics + super().__init__(message) + + @classmethod + def from_timeout( + cls, + timeout_ms: int, + diagnostics: ExtensionDiagnostics | None = None, + ) -> "ExtensionNotLoadedError": + """Create error from timeout during extension wait.""" + diag_info = "" + if diagnostics: + if diagnostics.error: + diag_info = f"\n Error: {diagnostics.error}" + else: + diag_info = ( + f"\n window.sentience defined: {diagnostics.sentience_defined}" + f"\n window.sentience.snapshot available: {diagnostics.sentience_snapshot}" + f"\n Page URL: {diagnostics.url}" + ) + + message = ( + f"Sentience extension not loaded after {timeout_ms}ms.{diag_info}\n\n" + "To fix this, ensure the extension is loaded when launching the browser:\n\n" + " from sentience import get_extension_dir\n" + " from browser_use import BrowserSession, BrowserProfile\n\n" + " profile = BrowserProfile(\n" + f' args=[f"--load-extension={{get_extension_dir()}}"],\n' + " )\n" + " session = BrowserSession(browser_profile=profile)\n" + ) + return cls(message, timeout_ms=timeout_ms, diagnostics=diagnostics) + + +class ExtensionInjectionError(SentienceBackendError): + """ + Raised when window.sentience API is not available on the page. + + This can happen when: + 1. Page loaded before extension could inject + 2. Page has Content Security Policy blocking extension + 3. Extension crashed or was disabled + + Call snapshot() with a longer timeout or wait for page load. 
+ """ + + def __init__( + self, + message: str, + url: str | None = None, + ) -> None: + self.url = url + super().__init__(message) + + @classmethod + def from_page(cls, url: str) -> "ExtensionInjectionError": + """Create error for a specific page.""" + message = ( + f"window.sentience API not available on page: {url}\n\n" + "Possible causes:\n" + " 1. Page loaded before extension could inject (try increasing timeout)\n" + " 2. Page has Content Security Policy blocking the extension\n" + " 3. Extension was disabled or crashed\n\n" + "Try:\n" + " snap = await snapshot(backend, options=SnapshotOptions(timeout_ms=10000))" + ) + return cls(message, url=url) + + +class BackendEvalError(SentienceBackendError): + """ + Raised when JavaScript evaluation fails in the browser. + + This wraps underlying CDP or Playwright errors with context. + """ + + def __init__( + self, + message: str, + expression: str | None = None, + original_error: Exception | None = None, + ) -> None: + self.expression = expression + self.original_error = original_error + super().__init__(message) + + +class SnapshotError(SentienceBackendError): + """ + Raised when taking a snapshot fails. + + This can happen when: + 1. Extension returned null or invalid data + 2. Page is in an invalid state + 3. Extension threw an error + """ + + def __init__( + self, + message: str, + url: str | None = None, + raw_result: Any = None, + ) -> None: + self.url = url + self.raw_result = raw_result + super().__init__(message) + + @classmethod + def from_null_result(cls, url: str | None = None) -> "SnapshotError": + """Create error for null snapshot result.""" + message = ( + "window.sentience.snapshot() returned null.\n\n" + "Possible causes:\n" + " 1. Extension is not properly initialized\n" + " 2. Page DOM is in an invalid state\n" + " 3. Extension encountered an internal error\n\n" + "Try refreshing the page and taking a new snapshot." 
+ ) + if url: + message = f"{message}\n Page URL: {url}" + return cls(message, url=url, raw_result=None) + + +class ActionError(SentienceBackendError): + """ + Raised when a browser action (click, type, scroll) fails. + """ + + def __init__( + self, + action: str, + message: str, + coordinates: tuple[float, float] | None = None, + original_error: Exception | None = None, + ) -> None: + self.action = action + self.coordinates = coordinates + self.original_error = original_error + super().__init__(f"{action} failed: {message}") diff --git a/sentience/backends/playwright_backend.py b/sentience/backends/playwright_backend.py index f5ea8df..719561a 100644 --- a/sentience/backends/playwright_backend.py +++ b/sentience/backends/playwright_backend.py @@ -57,7 +57,8 @@ def page(self) -> "AsyncPage": async def refresh_page_info(self) -> ViewportInfo: """Cache viewport + scroll offsets; cheap & safe to call often.""" - result = await self._page.evaluate(""" + result = await self._page.evaluate( + """ (() => ({ width: window.innerWidth, height: window.innerHeight, @@ -66,7 +67,8 @@ async def refresh_page_info(self) -> ViewportInfo: content_width: document.documentElement.scrollWidth, content_height: document.documentElement.scrollHeight }))() - """) + """ + ) self._cached_viewport = ViewportInfo( width=result.get("width", 0), @@ -96,7 +98,8 @@ async def get_layout_metrics(self) -> LayoutMetrics: """Get page layout metrics.""" # Playwright doesn't expose CDP directly in the same way, # so we approximate using JavaScript - result = await self._page.evaluate(""" + result = await self._page.evaluate( + """ (() => ({ viewport_x: window.scrollX, viewport_y: window.scrollY, @@ -106,7 +109,8 @@ async def get_layout_metrics(self) -> LayoutMetrics: content_height: document.documentElement.scrollHeight, device_scale_factor: window.devicePixelRatio || 1 }))() - """) + """ + ) return LayoutMetrics( viewport_x=result.get("viewport_x", 0), @@ -172,8 +176,7 @@ async def wait_ready_state( 
elapsed = time.monotonic() - start if elapsed >= timeout_sec: raise TimeoutError( - f"Timed out waiting for document.readyState='{state}' " - f"after {timeout_ms}ms" + f"Timed out waiting for document.readyState='{state}' " f"after {timeout_ms}ms" ) current_state = await self._page.evaluate("document.readyState") diff --git a/sentience/backends/snapshot.py b/sentience/backends/snapshot.py index 6f11dd9..ffe647b 100644 --- a/sentience/backends/snapshot.py +++ b/sentience/backends/snapshot.py @@ -25,6 +25,7 @@ from typing import TYPE_CHECKING, Any from ..models import Snapshot, SnapshotOptions +from .exceptions import ExtensionDiagnostics, ExtensionNotLoadedError, SnapshotError if TYPE_CHECKING: from .protocol_v0 import BrowserBackendV0 @@ -184,30 +185,36 @@ async def snapshot( ext_options = _build_extension_options(options) # Call extension's snapshot function - result = await backend.eval(f""" + result = await backend.eval( + f""" (() => {{ const options = {_json_serialize(ext_options)}; return window.sentience.snapshot(options); }})() - """) + """ + ) if result is None: - raise RuntimeError( - "window.sentience.snapshot() returned null. " - "Is the Sentience extension loaded and injected?" 
- ) + # Try to get URL for better error message + try: + url = await backend.eval("window.location.href") + except Exception: + url = None + raise SnapshotError.from_null_result(url=url) # Show overlay if requested if options.show_overlay: raw_elements = result.get("raw_elements", []) if raw_elements: - await backend.eval(f""" + await backend.eval( + f""" (() => {{ if (window.sentience && window.sentience.showOverlay) {{ window.sentience.showOverlay({_json_serialize(raw_elements)}, null); }} }})() - """) + """ + ) # Build and return Snapshot return Snapshot(**result) @@ -237,19 +244,22 @@ async def _wait_for_extension( if elapsed >= timeout_sec: # Gather diagnostics try: - diag = await backend.eval(""" + diag_dict = await backend.eval( + """ (() => ({ sentience_defined: typeof window.sentience !== 'undefined', sentience_snapshot: typeof window.sentience?.snapshot === 'function', url: window.location.href }))() - """) - except Exception: - diag = {"error": "Could not gather diagnostics"} - - raise RuntimeError( - f"Sentience extension failed to inject window.sentience API " - f"within {timeout_ms}ms. 
Diagnostics: {diag}" + """ + ) + diagnostics = ExtensionDiagnostics.from_dict(diag_dict) + except Exception as e: + diagnostics = ExtensionDiagnostics(error=f"Could not gather diagnostics: {e}") + + raise ExtensionNotLoadedError.from_timeout( + timeout_ms=timeout_ms, + diagnostics=diagnostics, ) # Check if extension is ready @@ -294,4 +304,5 @@ def _build_extension_options(options: SnapshotOptions) -> dict[str, Any]: def _json_serialize(obj: Any) -> str: """Serialize object to JSON string for embedding in JS.""" import json + return json.dumps(obj) diff --git a/tests/test_backends.py b/tests/test_backends.py index a1c7d90..00e4325 100644 --- a/tests/test_backends.py +++ b/tests/test_backends.py @@ -833,3 +833,126 @@ async def test_tuple_passthrough(self) -> None: assert x == 300 assert y == 400 + + +class TestBackendExceptions: + """Tests for custom backend exceptions.""" + + def test_extension_diagnostics_from_dict(self) -> None: + """Test ExtensionDiagnostics.from_dict.""" + from sentience.backends.exceptions import ExtensionDiagnostics + + data = { + "sentience_defined": True, + "sentience_snapshot": False, + "url": "https://example.com", + } + diag = ExtensionDiagnostics.from_dict(data) + + assert diag.sentience_defined is True + assert diag.sentience_snapshot is False + assert diag.url == "https://example.com" + assert diag.error is None + + def test_extension_diagnostics_to_dict(self) -> None: + """Test ExtensionDiagnostics.to_dict.""" + from sentience.backends.exceptions import ExtensionDiagnostics + + diag = ExtensionDiagnostics( + sentience_defined=True, + sentience_snapshot=True, + url="https://test.com", + error=None, + ) + result = diag.to_dict() + + assert result["sentience_defined"] is True + assert result["sentience_snapshot"] is True + assert result["url"] == "https://test.com" + + def test_extension_not_loaded_error_from_timeout(self) -> None: + """Test ExtensionNotLoadedError.from_timeout creates helpful message.""" + from 
sentience.backends.exceptions import ExtensionDiagnostics, ExtensionNotLoadedError + + diag = ExtensionDiagnostics( + sentience_defined=False, + sentience_snapshot=False, + url="https://example.com", + ) + error = ExtensionNotLoadedError.from_timeout(timeout_ms=5000, diagnostics=diag) + + assert error.timeout_ms == 5000 + assert error.diagnostics is diag + assert "5000ms" in str(error) + assert "window.sentience defined: False" in str(error) + assert "get_extension_dir" in str(error) # Contains fix suggestion + + def test_extension_not_loaded_error_with_eval_error(self) -> None: + """Test ExtensionNotLoadedError when diagnostics collection failed.""" + from sentience.backends.exceptions import ExtensionDiagnostics, ExtensionNotLoadedError + + diag = ExtensionDiagnostics(error="Could not evaluate JavaScript") + error = ExtensionNotLoadedError.from_timeout(timeout_ms=3000, diagnostics=diag) + + assert "Could not evaluate JavaScript" in str(error) + + def test_snapshot_error_from_null_result(self) -> None: + """Test SnapshotError.from_null_result creates helpful message.""" + from sentience.backends.exceptions import SnapshotError + + error = SnapshotError.from_null_result(url="https://example.com/page") + + assert error.url == "https://example.com/page" + assert "returned null" in str(error) + assert "example.com/page" in str(error) + + def test_snapshot_error_from_null_result_no_url(self) -> None: + """Test SnapshotError.from_null_result without URL.""" + from sentience.backends.exceptions import SnapshotError + + error = SnapshotError.from_null_result(url=None) + + assert error.url is None + assert "returned null" in str(error) + + def test_action_error_message_format(self) -> None: + """Test ActionError formats message correctly.""" + from sentience.backends.exceptions import ActionError + + error = ActionError( + action="click", + message="Element not found", + coordinates=(100, 200), + ) + + assert error.action == "click" + assert error.coordinates == (100, 200) 
+ assert "click failed" in str(error) + assert "Element not found" in str(error) + + def test_sentience_backend_error_inheritance(self) -> None: + """Test all exceptions inherit from SentienceBackendError.""" + from sentience.backends.exceptions import ( + ActionError, + BackendEvalError, + ExtensionInjectionError, + ExtensionNotLoadedError, + SentienceBackendError, + SnapshotError, + ) + + assert issubclass(ExtensionNotLoadedError, SentienceBackendError) + assert issubclass(ExtensionInjectionError, SentienceBackendError) + assert issubclass(BackendEvalError, SentienceBackendError) + assert issubclass(SnapshotError, SentienceBackendError) + assert issubclass(ActionError, SentienceBackendError) + + def test_extension_injection_error_from_page(self) -> None: + """Test ExtensionInjectionError.from_page.""" + from sentience.backends.exceptions import ExtensionInjectionError + + error = ExtensionInjectionError.from_page("https://secure-site.com") + + assert error.url == "https://secure-site.com" + assert "secure-site.com" in str(error) + assert "Content Security Policy" in str(error) From c598d490f605ebb1718264d79b998c4568c62957 Mon Sep 17 00:00:00 2001 From: SentienceDev Date: Thu, 8 Jan 2026 21:35:32 -0800 Subject: [PATCH 3/3] backend and regular snapshot consistent --- examples/browser_use_integration.py | 12 +- sentience/__init__.py | 2 +- sentience/backends/snapshot.py | 212 ++++++++++++++++++++++------ sentience/extension/background.js | 2 +- sentience/snapshot.py | 165 ++++++++++++++++------ 5 files changed, 291 insertions(+), 102 deletions(-) diff --git a/examples/browser_use_integration.py b/examples/browser_use_integration.py index 9167c5f..d24468f 100644 --- a/examples/browser_use_integration.py +++ b/examples/browser_use_integration.py @@ -14,15 +14,8 @@ import asyncio -# browser-use imports (install via: pip install browser-use) -# from browser_use import BrowserSession, BrowserProfile - # Sentience imports -from sentience import ( - find, - 
get_extension_dir, - query, -) +from sentience import find, get_extension_dir, query from sentience.backends import ( BrowserUseAdapter, CachedSnapshot, @@ -33,6 +26,9 @@ type_text, ) +# browser-use imports (install via: pip install browser-use) +# from browser_use import BrowserSession, BrowserProfile + async def main() -> None: """ diff --git a/sentience/__init__.py b/sentience/__init__.py index ecb4711..91ebe36 100644 --- a/sentience/__init__.py +++ b/sentience/__init__.py @@ -118,7 +118,7 @@ from .visual_agent import SentienceVisualAgent, SentienceVisualAgentAsync from .wait import wait_for -__version__ = "0.92.3" +__version__ = "0.93.0" __all__ = [ # Extension helpers (for browser-use integration) diff --git a/sentience/backends/snapshot.py b/sentience/backends/snapshot.py index ffe647b..2a1ff7d 100644 --- a/sentience/backends/snapshot.py +++ b/sentience/backends/snapshot.py @@ -25,6 +25,11 @@ from typing import TYPE_CHECKING, Any from ..models import Snapshot, SnapshotOptions +from ..snapshot import ( + _build_snapshot_payload, + _merge_api_result_with_local, + _post_snapshot_to_gateway_async, +) from .exceptions import ExtensionDiagnostics, ExtensionNotLoadedError, SnapshotError if TYPE_CHECKING: @@ -145,8 +150,9 @@ async def snapshot( """ Take a Sentience snapshot using the backend protocol. - This function calls window.sentience.snapshot() via the backend's eval(), - enabling snapshot collection with any BrowserBackendV0 implementation. + This function respects the `use_api` option and can call either: + - Server-side API (Pro/Enterprise tier) when `use_api=True` and API key is provided + - Local extension (Free tier) when `use_api=False` or no API key Requires: - Sentience extension loaded in browser (via --load-extension) @@ -154,70 +160,50 @@ async def snapshot( Args: backend: BrowserBackendV0 implementation (CDPBackendV0, PlaywrightBackend, etc.) - options: Snapshot options (limit, filter, screenshot, etc.) 
+ options: Snapshot options (limit, filter, screenshot, use_api, sentience_api_key, etc.) Returns: Snapshot with elements, viewport, and optional screenshot Example: from sentience.backends import BrowserUseAdapter - from sentience.backends.snapshot import snapshot_from_backend + from sentience.backends.snapshot import snapshot + from sentience.models import SnapshotOptions adapter = BrowserUseAdapter(session) backend = await adapter.create_backend() - # Basic snapshot - snap = await snapshot_from_backend(backend) + # Basic snapshot (uses local extension) + snap = await snapshot(backend) - # With options - snap = await snapshot_from_backend(backend, SnapshotOptions( + # With server-side API (Pro/Enterprise tier) + snap = await snapshot(backend, SnapshotOptions( + use_api=True, + sentience_api_key="sk_pro_xxxxx", limit=100, screenshot=True )) + + # Force local extension (Free tier) + snap = await snapshot(backend, SnapshotOptions( + use_api=False + )) """ if options is None: options = SnapshotOptions() - # Wait for extension injection - await _wait_for_extension(backend, timeout_ms=5000) - - # Build options dict for extension API - ext_options = _build_extension_options(options) - - # Call extension's snapshot function - result = await backend.eval( - f""" - (() => {{ - const options = {_json_serialize(ext_options)}; - return window.sentience.snapshot(options); - }})() - """ + # Determine if we should use server-side API + # Same logic as main snapshot() function in sentience/snapshot.py + should_use_api = ( + options.use_api if options.use_api is not None else (options.sentience_api_key is not None) ) - if result is None: - # Try to get URL for better error message - try: - url = await backend.eval("window.location.href") - except Exception: - url = None - raise SnapshotError.from_null_result(url=url) - - # Show overlay if requested - if options.show_overlay: - raw_elements = result.get("raw_elements", []) - if raw_elements: - await backend.eval( - f""" - (() => {{ 
- if (window.sentience && window.sentience.showOverlay) {{ - window.sentience.showOverlay({_json_serialize(raw_elements)}, null); - }} - }})() - """ - ) - - # Build and return Snapshot - return Snapshot(**result) + if should_use_api and options.sentience_api_key: + # Use server-side API (Pro/Enterprise tier) + return await _snapshot_via_api(backend, options) + else: + # Use local extension (Free tier) + return await _snapshot_via_extension(backend, options) async def _wait_for_extension( @@ -235,12 +221,23 @@ async def _wait_for_extension( RuntimeError: If extension not injected within timeout """ import asyncio + import logging + + logger = logging.getLogger("sentience.backends.snapshot") start = time.monotonic() timeout_sec = timeout_ms / 1000.0 + poll_count = 0 + + logger.debug(f"Waiting for extension injection (timeout={timeout_ms}ms)...") while True: elapsed = time.monotonic() - start + poll_count += 1 + + if poll_count % 10 == 0: # Log every 10 polls (~1 second) + logger.debug(f"Extension poll #{poll_count}, elapsed={elapsed*1000:.0f}ms") + if elapsed >= timeout_sec: # Gather diagnostics try: @@ -249,11 +246,14 @@ async def _wait_for_extension( (() => ({ sentience_defined: typeof window.sentience !== 'undefined', sentience_snapshot: typeof window.sentience?.snapshot === 'function', - url: window.location.href + url: window.location.href, + extension_id: document.documentElement.dataset.sentienceExtensionId || null, + has_content_script: !!document.documentElement.dataset.sentienceExtensionId }))() """ ) diagnostics = ExtensionDiagnostics.from_dict(diag_dict) + logger.debug(f"Extension diagnostics: {diag_dict}") except Exception as e: diagnostics = ExtensionDiagnostics(error=f"Could not gather diagnostics: {e}") @@ -276,6 +276,124 @@ async def _wait_for_extension( await asyncio.sleep(0.1) +async def _snapshot_via_extension( + backend: "BrowserBackendV0", + options: SnapshotOptions, +) -> Snapshot: + """Take snapshot using local extension (Free tier)""" + # 
Wait for extension injection + await _wait_for_extension(backend, timeout_ms=5000) + + # Build options dict for extension API + ext_options = _build_extension_options(options) + + # Call extension's snapshot function + result = await backend.eval( + f""" + (() => {{ + const options = {_json_serialize(ext_options)}; + return window.sentience.snapshot(options); + }})() + """ + ) + + if result is None: + # Try to get URL for better error message + try: + url = await backend.eval("window.location.href") + except Exception: + url = None + raise SnapshotError.from_null_result(url=url) + + # Show overlay if requested + if options.show_overlay: + raw_elements = result.get("raw_elements", []) + if raw_elements: + await backend.eval( + f""" + (() => {{ + if (window.sentience && window.sentience.showOverlay) {{ + window.sentience.showOverlay({_json_serialize(raw_elements)}, null); + }} + }})() + """ + ) + + # Build and return Snapshot + return Snapshot(**result) + + +async def _snapshot_via_api( + backend: "BrowserBackendV0", + options: SnapshotOptions, +) -> Snapshot: + """Take snapshot using server-side API (Pro/Enterprise tier)""" + # Default API URL (same as main snapshot function) + api_url = "https://api.sentienceapi.com" + + # Wait for extension injection (needed even for API mode to collect raw data) + await _wait_for_extension(backend, timeout_ms=5000) + + # Step 1: Get raw data from local extension (always happens locally) + raw_options: dict[str, Any] = {} + if options.screenshot is not False: + raw_options["screenshot"] = options.screenshot + + # Call extension to get raw elements + raw_result = await backend.eval( + f""" + (() => {{ + const options = {_json_serialize(raw_options)}; + return window.sentience.snapshot(options); + }})() + """ + ) + + if raw_result is None: + try: + url = await backend.eval("window.location.href") + except Exception: + url = None + raise SnapshotError.from_null_result(url=url) + + # Step 2: Send to server for smart ranking/filtering 
+ payload = _build_snapshot_payload(raw_result, options) + + try: + api_result = await _post_snapshot_to_gateway_async( + payload, options.sentience_api_key, api_url + ) + + # Merge API result with local data (screenshot, etc.) + snapshot_data = _merge_api_result_with_local(api_result, raw_result) + + # Show visual overlay if requested (use API-ranked elements) + if options.show_overlay: + elements = api_result.get("elements", []) + if elements: + await backend.eval( + f""" + (() => {{ + if (window.sentience && window.sentience.showOverlay) {{ + window.sentience.showOverlay({_json_serialize(elements)}, null); + }} + }})() + """ + ) + + return Snapshot(**snapshot_data) + except (RuntimeError, ValueError): + # Re-raise validation errors as-is + raise + except Exception as e: + # Fallback to local extension on API error + # This matches the behavior of the main snapshot function + raise RuntimeError( + f"Server-side snapshot API failed: {e}. " + "Try using use_api=False to use local extension instead." 
+ ) from e + + def _build_extension_options(options: SnapshotOptions) -> dict[str, Any]: """Build options dict for extension API call.""" ext_options: dict[str, Any] = {} diff --git a/sentience/extension/background.js b/sentience/extension/background.js index aff49b0..02c0408 100644 --- a/sentience/extension/background.js +++ b/sentience/extension/background.js @@ -1,4 +1,4 @@ -import init, { analyze_page_with_options, analyze_page, prune_for_api } from "../pkg/sentience_core.js"; +import init, { analyze_page_with_options, analyze_page, prune_for_api } from "./pkg/sentience_core.js"; let wasmReady = !1, wasmInitPromise = null; diff --git a/sentience/snapshot.py b/sentience/snapshot.py index ec17d5a..3366141 100644 --- a/sentience/snapshot.py +++ b/sentience/snapshot.py @@ -19,6 +19,122 @@ MAX_PAYLOAD_BYTES = 10 * 1024 * 1024 +def _build_snapshot_payload( + raw_result: dict[str, Any], + options: SnapshotOptions, +) -> dict[str, Any]: + """ + Build payload dict for gateway snapshot API. + + Shared helper used by both sync and async snapshot implementations. + """ + return { + "raw_elements": raw_result.get("raw_elements", []), + "url": raw_result.get("url", ""), + "viewport": raw_result.get("viewport"), + "goal": options.goal, + "options": { + "limit": options.limit, + "filter": options.filter.model_dump() if options.filter else None, + }, + } + + +def _validate_payload_size(payload_json: str) -> None: + """ + Validate payload size before sending to gateway. + + Raises ValueError if payload exceeds server limit. + """ + payload_size = len(payload_json.encode("utf-8")) + if payload_size > MAX_PAYLOAD_BYTES: + raise ValueError( + f"Payload size ({payload_size / 1024 / 1024:.2f}MB) exceeds server limit " + f"({MAX_PAYLOAD_BYTES / 1024 / 1024:.0f}MB). " + f"Try reducing the number of elements on the page or filtering elements." 
+ ) + + +def _post_snapshot_to_gateway_sync( + payload: dict[str, Any], + api_key: str, + api_url: str = "https://api.sentienceapi.com", +) -> dict[str, Any]: + """ + Post snapshot payload to gateway (synchronous). + + Used by sync snapshot() function. + """ + payload_json = json.dumps(payload) + _validate_payload_size(payload_json) + + headers = { + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + } + + response = requests.post( + f"{api_url}/v1/snapshot", + data=payload_json, + headers=headers, + timeout=30, + ) + response.raise_for_status() + return response.json() + + +async def _post_snapshot_to_gateway_async( + payload: dict[str, Any], + api_key: str, + api_url: str = "https://api.sentienceapi.com", +) -> dict[str, Any]: + """ + Post snapshot payload to gateway (asynchronous). + + Used by async backend snapshot() function. + """ + # Lazy import httpx - only needed for async API calls + import httpx + + payload_json = json.dumps(payload) + _validate_payload_size(payload_json) + + headers = { + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + } + + async with httpx.AsyncClient(timeout=30.0) as client: + response = await client.post( + f"{api_url}/v1/snapshot", + content=payload_json, + headers=headers, + ) + response.raise_for_status() + return response.json() + + +def _merge_api_result_with_local( + api_result: dict[str, Any], + raw_result: dict[str, Any], +) -> dict[str, Any]: + """ + Merge API result with local data (screenshot, etc.). + + Shared helper used by both sync and async snapshot implementations. 
+ """ + return { + "status": api_result.get("status", "success"), + "timestamp": api_result.get("timestamp"), + "url": api_result.get("url", raw_result.get("url", "")), + "viewport": api_result.get("viewport", raw_result.get("viewport")), + "elements": api_result.get("elements", []), + "screenshot": raw_result.get("screenshot"), # Keep local screenshot + "screenshot_format": raw_result.get("screenshot_format"), + "error": api_result.get("error"), + } + + def _save_trace_to_file(raw_elements: list[dict[str, Any]], trace_path: str | None = None) -> None: """ Save raw_elements to a JSON file for benchmarking/training @@ -181,54 +297,13 @@ def _snapshot_via_api( # Step 2: Send to server for smart ranking/filtering # Use raw_elements (raw data) instead of elements (processed data) # Server validates API key and applies proprietary ranking logic - payload = { - "raw_elements": raw_result.get("raw_elements", []), # Raw data needed for server processing - "url": raw_result.get("url", ""), - "viewport": raw_result.get("viewport"), - "goal": options.goal, # Optional goal/task description - "options": { - "limit": options.limit, - "filter": options.filter.model_dump() if options.filter else None, - }, - } - - # Check payload size before sending (server has 10MB limit) - payload_json = json.dumps(payload) - payload_size = len(payload_json.encode("utf-8")) - if payload_size > MAX_PAYLOAD_BYTES: - raise ValueError( - f"Payload size ({payload_size / 1024 / 1024:.2f}MB) exceeds server limit " - f"({MAX_PAYLOAD_BYTES / 1024 / 1024:.0f}MB). " - f"Try reducing the number of elements on the page or filtering elements." 
- ) - - headers = { - "Authorization": f"Bearer {api_key}", - "Content-Type": "application/json", - } + payload = _build_snapshot_payload(raw_result, options) try: - response = requests.post( - f"{api_url}/v1/snapshot", - data=payload_json, # Reuse already-serialized JSON - headers=headers, - timeout=30, - ) - response.raise_for_status() - - api_result = response.json() + api_result = _post_snapshot_to_gateway_sync(payload, api_key, api_url) # Merge API result with local data (screenshot, etc.) - snapshot_data = { - "status": api_result.get("status", "success"), - "timestamp": api_result.get("timestamp"), - "url": api_result.get("url", raw_result.get("url", "")), - "viewport": api_result.get("viewport", raw_result.get("viewport")), - "elements": api_result.get("elements", []), - "screenshot": raw_result.get("screenshot"), # Keep local screenshot - "screenshot_format": raw_result.get("screenshot_format"), - "error": api_result.get("error"), - } + snapshot_data = _merge_api_result_with_local(api_result, raw_result) # Show visual overlay if requested (use API-ranked elements) if options.show_overlay: @@ -247,7 +322,7 @@ def _snapshot_via_api( return Snapshot(**snapshot_data) except requests.exceptions.RequestException as e: - raise RuntimeError(f"API request failed: {e}") + raise RuntimeError(f"API request failed: {e}") from e # ========== Async Snapshot Functions ==========