From 14b0f4d6d4ecacf2fbd792e53cd59d9d4dc80514 Mon Sep 17 00:00:00 2001 From: rcholic Date: Thu, 1 Jan 2026 07:28:24 -0800 Subject: [PATCH 1/7] move async function to their respective modules; re-export --- sentience/_extension_loader.py | 1 + sentience/actions.py | 402 ++++++++++- sentience/async_api.py | 1228 +++----------------------------- sentience/browser.py | 460 +++++++++++- sentience/screenshot.py | 48 +- sentience/snapshot.py | 243 ++++++- sentience/text_search.py | 144 +++- sentience/wait.py | 68 +- tests/test_async_api.py | 63 ++ tests/test_browser.py | 1 + 10 files changed, 1506 insertions(+), 1152 deletions(-) diff --git a/sentience/_extension_loader.py b/sentience/_extension_loader.py index d969ec3..58bd873 100644 --- a/sentience/_extension_loader.py +++ b/sentience/_extension_loader.py @@ -38,3 +38,4 @@ def find_extension_path() -> Path: f"2. {dev_ext_path}\n" "Make sure the extension is built and 'sentience/extension' directory exists." ) + diff --git a/sentience/actions.py b/sentience/actions.py index 5df42fc..50c26bc 100644 --- a/sentience/actions.py +++ b/sentience/actions.py @@ -4,9 +4,9 @@ import time -from .browser import SentienceBrowser +from .browser import AsyncSentienceBrowser, SentienceBrowser from .models import ActionResult, BBox, Snapshot -from .snapshot import snapshot +from .snapshot import snapshot, snapshot_async def click( # noqa: C901 @@ -437,3 +437,401 @@ def click_rect( } ), ) + + +# ========== Async Action Functions ========== + + +async def click_async( + browser: AsyncSentienceBrowser, + element_id: int, + use_mouse: bool = True, + take_snapshot: bool = False, +) -> ActionResult: + """ + Click an element by ID using hybrid approach (async) + + Args: + browser: AsyncSentienceBrowser instance + element_id: Element ID from snapshot + use_mouse: If True, use Playwright's mouse.click() at element center + take_snapshot: Whether to take snapshot after action + + Returns: + ActionResult + """ + if not browser.page: + raise 
RuntimeError("Browser not started. Call await browser.start() first.") + + start_time = time.time() + url_before = browser.page.url + + if use_mouse: + try: + snap = await snapshot_async(browser) + element = None + for el in snap.elements: + if el.id == element_id: + element = el + break + + if element: + center_x = element.bbox.x + element.bbox.width / 2 + center_y = element.bbox.y + element.bbox.height / 2 + try: + await browser.page.mouse.click(center_x, center_y) + success = True + except Exception: + success = True + else: + try: + success = await browser.page.evaluate( + """ + (id) => { + return window.sentience.click(id); + } + """, + element_id, + ) + except Exception: + success = True + except Exception: + try: + success = await browser.page.evaluate( + """ + (id) => { + return window.sentience.click(id); + } + """, + element_id, + ) + except Exception: + success = True + else: + success = await browser.page.evaluate( + """ + (id) => { + return window.sentience.click(id); + } + """, + element_id, + ) + + # Wait a bit for navigation/DOM updates + try: + await browser.page.wait_for_timeout(500) + except Exception: + pass + + duration_ms = int((time.time() - start_time) * 1000) + + # Check if URL changed + try: + url_after = browser.page.url + url_changed = url_before != url_after + except Exception: + url_after = url_before + url_changed = True + + # Determine outcome + outcome: str | None = None + if url_changed: + outcome = "navigated" + elif success: + outcome = "dom_updated" + else: + outcome = "error" + + # Optional snapshot after + snapshot_after: Snapshot | None = None + if take_snapshot: + try: + snapshot_after = await snapshot_async(browser) + except Exception: + pass + + return ActionResult( + success=success, + duration_ms=duration_ms, + outcome=outcome, + url_changed=url_changed, + snapshot_after=snapshot_after, + error=( + None + if success + else { + "code": "click_failed", + "reason": "Element not found or not clickable", + } + ), + ) + + 
+async def type_text_async( + browser: AsyncSentienceBrowser, element_id: int, text: str, take_snapshot: bool = False +) -> ActionResult: + """ + Type text into an element (async) + + Args: + browser: AsyncSentienceBrowser instance + element_id: Element ID from snapshot + text: Text to type + take_snapshot: Whether to take snapshot after action + + Returns: + ActionResult + """ + if not browser.page: + raise RuntimeError("Browser not started. Call await browser.start() first.") + + start_time = time.time() + url_before = browser.page.url + + # Focus element first + focused = await browser.page.evaluate( + """ + (id) => { + const el = window.sentience_registry[id]; + if (el) { + el.focus(); + return true; + } + return false; + } + """, + element_id, + ) + + if not focused: + return ActionResult( + success=False, + duration_ms=int((time.time() - start_time) * 1000), + outcome="error", + error={"code": "focus_failed", "reason": "Element not found"}, + ) + + # Type using Playwright keyboard + await browser.page.keyboard.type(text) + + duration_ms = int((time.time() - start_time) * 1000) + url_after = browser.page.url + url_changed = url_before != url_after + + outcome = "navigated" if url_changed else "dom_updated" + + snapshot_after: Snapshot | None = None + if take_snapshot: + snapshot_after = await snapshot_async(browser) + + return ActionResult( + success=True, + duration_ms=duration_ms, + outcome=outcome, + url_changed=url_changed, + snapshot_after=snapshot_after, + ) + + +async def press_async( + browser: AsyncSentienceBrowser, key: str, take_snapshot: bool = False +) -> ActionResult: + """ + Press a keyboard key (async) + + Args: + browser: AsyncSentienceBrowser instance + key: Key to press (e.g., "Enter", "Escape", "Tab") + take_snapshot: Whether to take snapshot after action + + Returns: + ActionResult + """ + if not browser.page: + raise RuntimeError("Browser not started. 
Call await browser.start() first.") + + start_time = time.time() + url_before = browser.page.url + + # Press key using Playwright + await browser.page.keyboard.press(key) + + # Wait a bit for navigation/DOM updates + await browser.page.wait_for_timeout(500) + + duration_ms = int((time.time() - start_time) * 1000) + url_after = browser.page.url + url_changed = url_before != url_after + + outcome = "navigated" if url_changed else "dom_updated" + + snapshot_after: Snapshot | None = None + if take_snapshot: + snapshot_after = await snapshot_async(browser) + + return ActionResult( + success=True, + duration_ms=duration_ms, + outcome=outcome, + url_changed=url_changed, + snapshot_after=snapshot_after, + ) + + +async def _highlight_rect_async( + browser: AsyncSentienceBrowser, rect: dict[str, float], duration_sec: float = 2.0 +) -> None: + """Highlight a rectangle with a red border overlay (async)""" + if not browser.page: + return + + highlight_id = f"sentience_highlight_{int(time.time() * 1000)}" + + args = { + "rect": { + "x": rect["x"], + "y": rect["y"], + "w": rect["w"], + "h": rect["h"], + }, + "highlightId": highlight_id, + "durationSec": duration_sec, + } + + await browser.page.evaluate( + """ + (args) => { + const { rect, highlightId, durationSec } = args; + const overlay = document.createElement('div'); + overlay.id = highlightId; + overlay.style.position = 'fixed'; + overlay.style.left = `${rect.x}px`; + overlay.style.top = `${rect.y}px`; + overlay.style.width = `${rect.w}px`; + overlay.style.height = `${rect.h}px`; + overlay.style.border = '3px solid red'; + overlay.style.borderRadius = '2px'; + overlay.style.boxSizing = 'border-box'; + overlay.style.pointerEvents = 'none'; + overlay.style.zIndex = '999999'; + overlay.style.backgroundColor = 'rgba(255, 0, 0, 0.1)'; + overlay.style.transition = 'opacity 0.3s ease-out'; + + document.body.appendChild(overlay); + + setTimeout(() => { + overlay.style.opacity = '0'; + setTimeout(() => { + if (overlay.parentNode) { + 
overlay.parentNode.removeChild(overlay); + } + }, 300); + }, durationSec * 1000); + } + """, + args, + ) + + +async def click_rect_async( + browser: AsyncSentienceBrowser, + rect: dict[str, float] | BBox, + highlight: bool = True, + highlight_duration: float = 2.0, + take_snapshot: bool = False, +) -> ActionResult: + """ + Click at the center of a rectangle (async) + + Args: + browser: AsyncSentienceBrowser instance + rect: Dictionary with x, y, width (w), height (h) keys, or BBox object + highlight: Whether to show a red border highlight when clicking + highlight_duration: How long to show the highlight in seconds + take_snapshot: Whether to take snapshot after action + + Returns: + ActionResult + """ + if not browser.page: + raise RuntimeError("Browser not started. Call await browser.start() first.") + + # Handle BBox object or dict + if isinstance(rect, BBox): + x = rect.x + y = rect.y + w = rect.width + h = rect.height + else: + x = rect.get("x", 0) + y = rect.get("y", 0) + w = rect.get("w") or rect.get("width", 0) + h = rect.get("h") or rect.get("height", 0) + + if w <= 0 or h <= 0: + return ActionResult( + success=False, + duration_ms=0, + outcome="error", + error={ + "code": "invalid_rect", + "reason": "Rectangle width and height must be positive", + }, + ) + + start_time = time.time() + url_before = browser.page.url + + # Calculate center of rectangle + center_x = x + w / 2 + center_y = y + h / 2 + + # Show highlight before clicking + if highlight: + await _highlight_rect_async(browser, {"x": x, "y": y, "w": w, "h": h}, highlight_duration) + await browser.page.wait_for_timeout(50) + + # Use Playwright's native mouse click + try: + await browser.page.mouse.click(center_x, center_y) + success = True + except Exception as e: + success = False + error_msg = str(e) + + # Wait a bit for navigation/DOM updates + await browser.page.wait_for_timeout(500) + + duration_ms = int((time.time() - start_time) * 1000) + url_after = browser.page.url + url_changed = 
url_before != url_after + + # Determine outcome + outcome: str | None = None + if url_changed: + outcome = "navigated" + elif success: + outcome = "dom_updated" + else: + outcome = "error" + + # Optional snapshot after + snapshot_after: Snapshot | None = None + if take_snapshot: + snapshot_after = await snapshot_async(browser) + + return ActionResult( + success=success, + duration_ms=duration_ms, + outcome=outcome, + url_changed=url_changed, + snapshot_after=snapshot_after, + error=( + None + if success + else { + "code": "click_failed", + "reason": error_msg if not success else "Click failed", + } + ), + ) diff --git a/sentience/async_api.py b/sentience/async_api.py index 12beefb..3f4ae05 100644 --- a/sentience/async_api.py +++ b/sentience/async_api.py @@ -1,1160 +1,100 @@ """ -Async API for Sentience SDK - Use this in asyncio contexts +Async API for Sentience SDK - Convenience re-exports + +This module re-exports all async functions for backward compatibility and developer convenience. +You can also import directly from their respective modules: + + # Option 1: From async_api (recommended for convenience) + from sentience.async_api import ( + AsyncSentienceBrowser, + snapshot_async, + click_async, + wait_for_async, + screenshot_async, + find_text_rect_async, + # ... all async functions in one place + ) -This module provides async versions of all Sentience SDK functions. -Use AsyncSentienceBrowser when working with async/await code. 
+ # Option 2: From respective modules (also works) + from sentience.browser import AsyncSentienceBrowser + from sentience.snapshot import snapshot_async + from sentience.actions import click_async """ -import asyncio -import base64 -import os -import shutil -import tempfile -import time -from pathlib import Path -from typing import Any, Optional -from urllib.parse import urlparse - -from playwright.async_api import BrowserContext, Page, Playwright, async_playwright - -from sentience._extension_loader import find_extension_path -from sentience.models import ( - ActionResult, - BBox, - Element, - ProxyConfig, - Snapshot, - SnapshotOptions, - StorageState, - Viewport, - WaitResult, +# ========== Browser ========== +# Re-export AsyncSentienceBrowser from browser.py (moved there for better organization) +from sentience.browser import AsyncSentienceBrowser + +# ========== Snapshot (Phase 1) ========== +# Re-export async snapshot functions from snapshot.py +from sentience.snapshot import snapshot_async + +# ========== Actions (Phase 1) ========== +# Re-export async action functions from actions.py +from sentience.actions import ( + click_async, + type_text_async, + press_async, + click_rect_async, ) -# Import stealth for bot evasion (optional - graceful fallback if not available) -try: - from playwright_stealth import stealth_async - - STEALTH_AVAILABLE = True -except ImportError: - STEALTH_AVAILABLE = False - - -class AsyncSentienceBrowser: - """Async version of SentienceBrowser for use in asyncio contexts.""" - - def __init__( - self, - api_key: str | None = None, - api_url: str | None = None, - headless: bool | None = None, - proxy: str | None = None, - user_data_dir: str | Path | None = None, - storage_state: str | Path | StorageState | dict | None = None, - record_video_dir: str | Path | None = None, - record_video_size: dict[str, int] | None = None, - viewport: Viewport | dict[str, int] | None = None, - ): - """ - Initialize Async Sentience browser - - Args: - 
api_key: Optional API key for server-side processing (Pro/Enterprise tiers) - If None, uses free tier (local extension only) - api_url: Server URL for API calls (defaults to https://api.sentienceapi.com if api_key provided) - headless: Whether to run in headless mode. If None, defaults to True in CI, False otherwise - proxy: Optional proxy server URL (e.g., 'http://user:pass@proxy.example.com:8080') - user_data_dir: Optional path to user data directory for persistent sessions - storage_state: Optional storage state to inject (cookies + localStorage) - record_video_dir: Optional directory path to save video recordings - record_video_size: Optional video resolution as dict with 'width' and 'height' keys - viewport: Optional viewport size as Viewport object or dict with 'width' and 'height' keys. - Examples: Viewport(width=1280, height=800) (default) - Viewport(width=1920, height=1080) (Full HD) - {"width": 1280, "height": 800} (dict also supported) - If None, defaults to Viewport(width=1280, height=800). 
- """ - self.api_key = api_key - # Only set api_url if api_key is provided, otherwise None (free tier) - if self.api_key and not api_url: - self.api_url = "https://api.sentienceapi.com" - else: - self.api_url = api_url - - # Determine headless mode - if headless is None: - # Default to False for local dev, True for CI - self.headless = os.environ.get("CI", "").lower() == "true" - else: - self.headless = headless - - # Support proxy from argument or environment variable - self.proxy = proxy or os.environ.get("SENTIENCE_PROXY") - - # Auth injection support - self.user_data_dir = user_data_dir - self.storage_state = storage_state - - # Video recording support - self.record_video_dir = record_video_dir - self.record_video_size = record_video_size or {"width": 1280, "height": 800} - - # Viewport configuration - convert dict to Viewport if needed - if viewport is None: - self.viewport = Viewport(width=1280, height=800) - elif isinstance(viewport, dict): - self.viewport = Viewport(width=viewport["width"], height=viewport["height"]) - else: - self.viewport = viewport - - self.playwright: Playwright | None = None - self.context: BrowserContext | None = None - self.page: Page | None = None - self._extension_path: str | None = None - - def _parse_proxy(self, proxy_string: str) -> ProxyConfig | None: - """ - Parse proxy connection string into ProxyConfig. 
- - Args: - proxy_string: Proxy URL (e.g., 'http://user:pass@proxy.example.com:8080') - - Returns: - ProxyConfig object or None if invalid - """ - if not proxy_string: - return None - - try: - parsed = urlparse(proxy_string) - - # Validate scheme - if parsed.scheme not in ("http", "https", "socks5"): - print(f"⚠️ [Sentience] Unsupported proxy scheme: {parsed.scheme}") - print(" Supported: http, https, socks5") - return None - - # Validate host and port - if not parsed.hostname or not parsed.port: - print("⚠️ [Sentience] Proxy URL must include hostname and port") - print(" Expected format: http://username:password@host:port") - return None - - # Build server URL - server = f"{parsed.scheme}://{parsed.hostname}:{parsed.port}" - - # Create ProxyConfig with optional credentials - return ProxyConfig( - server=server, - username=parsed.username if parsed.username else None, - password=parsed.password if parsed.password else None, - ) - - except Exception as e: - print(f"⚠️ [Sentience] Invalid proxy configuration: {e}") - print(" Expected format: http://username:password@host:port") - return None - - async def start(self) -> None: - """Launch browser with extension loaded (async)""" - # Get extension source path using shared utility - extension_source = find_extension_path() - - # Create temporary extension bundle - self._extension_path = tempfile.mkdtemp(prefix="sentience-ext-") - shutil.copytree(extension_source, self._extension_path, dirs_exist_ok=True) - - self.playwright = await async_playwright().start() - - # Build launch arguments - args = [ - f"--disable-extensions-except={self._extension_path}", - f"--load-extension={self._extension_path}", - "--disable-blink-features=AutomationControlled", - "--no-sandbox", - "--disable-infobars", - "--disable-features=WebRtcHideLocalIpsWithMdns", - "--force-webrtc-ip-handling-policy=disable_non_proxied_udp", - ] - - if self.headless: - args.append("--headless=new") - - # Parse proxy configuration if provided - proxy_config = 
self._parse_proxy(self.proxy) if self.proxy else None - - # Handle User Data Directory - if self.user_data_dir: - user_data_dir = str(self.user_data_dir) - Path(user_data_dir).mkdir(parents=True, exist_ok=True) - else: - user_data_dir = "" - - # Build launch_persistent_context parameters - launch_params = { - "user_data_dir": user_data_dir, - "headless": False, - "args": args, - "viewport": {"width": self.viewport.width, "height": self.viewport.height}, - "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", - } - - # Add proxy if configured - if proxy_config: - launch_params["proxy"] = proxy_config.to_playwright_dict() - launch_params["ignore_https_errors"] = True - print(f"🌐 [Sentience] Using proxy: {proxy_config.server}") - - # Add video recording if configured - if self.record_video_dir: - video_dir = Path(self.record_video_dir) - video_dir.mkdir(parents=True, exist_ok=True) - launch_params["record_video_dir"] = str(video_dir) - launch_params["record_video_size"] = self.record_video_size - print(f"🎥 [Sentience] Recording video to: {video_dir}") - print( - f" Resolution: {self.record_video_size['width']}x{self.record_video_size['height']}" - ) - - # Launch persistent context - self.context = await self.playwright.chromium.launch_persistent_context(**launch_params) - - self.page = self.context.pages[0] if self.context.pages else await self.context.new_page() - - # Inject storage state if provided - if self.storage_state: - await self._inject_storage_state(self.storage_state) - - # Apply stealth if available - if STEALTH_AVAILABLE: - await stealth_async(self.page) - - # Wait a moment for extension to initialize - await asyncio.sleep(0.5) - - async def goto(self, url: str) -> None: - """Navigate to a URL and ensure extension is ready (async)""" - if not self.page: - raise RuntimeError("Browser not started. 
Call await start() first.") - - await self.page.goto(url, wait_until="domcontentloaded") - - # Wait for extension to be ready - if not await self._wait_for_extension(): - try: - diag = await self.page.evaluate( - """() => ({ - sentience_defined: typeof window.sentience !== 'undefined', - registry_defined: typeof window.sentience_registry !== 'undefined', - snapshot_defined: window.sentience && typeof window.sentience.snapshot === 'function', - extension_id: document.documentElement.dataset.sentienceExtensionId || 'not set', - url: window.location.href - })""" - ) - except Exception as e: - diag = f"Failed to get diagnostics: {str(e)}" - - raise RuntimeError( - "Extension failed to load after navigation. Make sure:\n" - "1. Extension is built (cd sentience-chrome && ./build.sh)\n" - "2. All files are present (manifest.json, content.js, injected_api.js, pkg/)\n" - "3. Check browser console for errors (run with headless=False to see console)\n" - f"4. Extension path: {self._extension_path}\n" - f"5. Diagnostic info: {diag}" - ) - - async def _inject_storage_state(self, storage_state: str | Path | StorageState | dict) -> None: - """Inject storage state (cookies + localStorage) into browser context (async)""" - import json - - # Load storage state - if isinstance(storage_state, (str, Path)): - with open(storage_state, encoding="utf-8") as f: - state_dict = json.load(f) - state = StorageState.from_dict(state_dict) - elif isinstance(storage_state, StorageState): - state = storage_state - elif isinstance(storage_state, dict): - state = StorageState.from_dict(storage_state) - else: - raise ValueError( - f"Invalid storage_state type: {type(storage_state)}. " - "Expected str, Path, StorageState, or dict." 
- ) - - # Inject cookies - if state.cookies: - playwright_cookies = [] - for cookie in state.cookies: - cookie_dict = cookie.model_dump() - playwright_cookie = { - "name": cookie_dict["name"], - "value": cookie_dict["value"], - "domain": cookie_dict["domain"], - "path": cookie_dict["path"], - } - if cookie_dict.get("expires"): - playwright_cookie["expires"] = cookie_dict["expires"] - if cookie_dict.get("httpOnly"): - playwright_cookie["httpOnly"] = cookie_dict["httpOnly"] - if cookie_dict.get("secure"): - playwright_cookie["secure"] = cookie_dict["secure"] - if cookie_dict.get("sameSite"): - playwright_cookie["sameSite"] = cookie_dict["sameSite"] - playwright_cookies.append(playwright_cookie) - - await self.context.add_cookies(playwright_cookies) - print(f"✅ [Sentience] Injected {len(state.cookies)} cookie(s)") - - # Inject LocalStorage - if state.origins: - for origin_data in state.origins: - origin = origin_data.origin - if not origin: - continue - - try: - await self.page.goto(origin, wait_until="domcontentloaded", timeout=10000) - - if origin_data.localStorage: - localStorage_dict = { - item.name: item.value for item in origin_data.localStorage - } - await self.page.evaluate( - """(localStorage_data) => { - for (const [key, value] of Object.entries(localStorage_data)) { - localStorage.setItem(key, value); - } - }""", - localStorage_dict, - ) - print( - f"✅ [Sentience] Injected {len(origin_data.localStorage)} localStorage item(s) for {origin}" - ) - except Exception as e: - print(f"⚠️ [Sentience] Failed to inject localStorage for {origin}: {e}") - - async def _wait_for_extension(self, timeout_sec: float = 5.0) -> bool: - """Poll for window.sentience to be available (async)""" - start_time = time.time() - last_error = None - - while time.time() - start_time < timeout_sec: - try: - result = await self.page.evaluate( - """() => { - if (typeof window.sentience === 'undefined') { - return { ready: false, reason: 'window.sentience undefined' }; - } - if 
(window.sentience._wasmModule === null) { - return { ready: false, reason: 'WASM module not fully loaded' }; - } - return { ready: true }; - } - """ - ) - - if isinstance(result, dict): - if result.get("ready"): - return True - last_error = result.get("reason", "Unknown error") - except Exception as e: - last_error = f"Evaluation error: {str(e)}" - - await asyncio.sleep(0.3) - - if last_error: - import warnings - - warnings.warn(f"Extension wait timeout. Last status: {last_error}") - - return False - - async def close(self, output_path: str | Path | None = None) -> str | None: - """ - Close browser and cleanup (async) - - Args: - output_path: Optional path to rename the video file to - - Returns: - Path to video file if recording was enabled, None otherwise - """ - temp_video_path = None - - if self.record_video_dir: - try: - if self.page and self.page.video: - temp_video_path = await self.page.video.path() - elif self.context: - for page in self.context.pages: - if page.video: - temp_video_path = await page.video.path() - break - except Exception: - pass - - if self.context: - await self.context.close() - self.context = None - - if self.playwright: - await self.playwright.stop() - self.playwright = None - - if self._extension_path and os.path.exists(self._extension_path): - shutil.rmtree(self._extension_path) - - # Clear page reference after closing context - self.page = None - - final_path = temp_video_path - if temp_video_path and output_path and os.path.exists(temp_video_path): - try: - output_path = str(output_path) - Path(output_path).parent.mkdir(parents=True, exist_ok=True) - shutil.move(temp_video_path, output_path) - final_path = output_path - except Exception as e: - import warnings - - warnings.warn(f"Failed to rename video file: {e}") - final_path = temp_video_path - - return final_path - - async def __aenter__(self): - """Async context manager entry""" - await self.start() - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb): - 
"""Async context manager exit""" - await self.close() - - @classmethod - async def from_existing( - cls, - context: BrowserContext, - api_key: str | None = None, - api_url: str | None = None, - ) -> "AsyncSentienceBrowser": - """ - Create AsyncSentienceBrowser from an existing Playwright BrowserContext. - - Args: - context: Existing Playwright BrowserContext - api_key: Optional API key for server-side processing - api_url: Optional API URL - - Returns: - AsyncSentienceBrowser instance configured to use the existing context - """ - instance = cls(api_key=api_key, api_url=api_url) - instance.context = context - pages = context.pages - instance.page = pages[0] if pages else await context.new_page() - - # Apply stealth if available - if STEALTH_AVAILABLE: - await stealth_async(instance.page) - - # Wait for extension to be ready - await asyncio.sleep(0.5) - - return instance - - @classmethod - async def from_page( - cls, - page: Page, - api_key: str | None = None, - api_url: str | None = None, - ) -> "AsyncSentienceBrowser": - """ - Create AsyncSentienceBrowser from an existing Playwright Page. - - Args: - page: Existing Playwright Page - api_key: Optional API key for server-side processing - api_url: Optional API URL - - Returns: - AsyncSentienceBrowser instance configured to use the existing page - """ - instance = cls(api_key=api_key, api_url=api_url) - instance.page = page - instance.context = page.context - - # Apply stealth if available - if STEALTH_AVAILABLE: - await stealth_async(instance.page) - - # Wait for extension to be ready - await asyncio.sleep(0.5) - - return instance - - -# ========== Async Snapshot Functions ========== - - -async def snapshot_async( - browser: AsyncSentienceBrowser, - options: SnapshotOptions | None = None, -) -> Snapshot: - """ - Take a snapshot of the current page (async) - - Args: - browser: AsyncSentienceBrowser instance - options: Snapshot options (screenshot, limit, filter, etc.) - If None, uses default options. 
- - Returns: - Snapshot object - - Example: - # Basic snapshot with defaults - snap = await snapshot_async(browser) - - # With options - snap = await snapshot_async(browser, SnapshotOptions( - screenshot=True, - limit=100, - show_overlay=True - )) - """ - # Use default options if none provided - if options is None: - options = SnapshotOptions() - - # Determine if we should use server-side API - should_use_api = ( - options.use_api if options.use_api is not None else (browser.api_key is not None) - ) - - if should_use_api and browser.api_key: - # Use server-side API (Pro/Enterprise tier) - return await _snapshot_via_api_async(browser, options) - else: - # Use local extension (Free tier) - return await _snapshot_via_extension_async(browser, options) - +# ========== Phase 2A: Core Utilities ========== +# Re-export async wait function from wait.py +from sentience.wait import wait_for_async -async def _snapshot_via_extension_async( - browser: AsyncSentienceBrowser, - options: SnapshotOptions, -) -> Snapshot: - """Take snapshot using local extension (Free tier) - async""" - if not browser.page: - raise RuntimeError("Browser not started. Call await browser.start() first.") +# Re-export async screenshot function from screenshot.py +from sentience.screenshot import screenshot_async - # Wait for extension injection to complete - try: - await browser.page.wait_for_function( - "typeof window.sentience !== 'undefined'", - timeout=5000, - ) - except Exception as e: - try: - diag = await browser.page.evaluate( - """() => ({ - sentience_defined: typeof window.sentience !== 'undefined', - extension_id: document.documentElement.dataset.sentienceExtensionId || 'not set', - url: window.location.href - })""" - ) - except Exception: - diag = {"error": "Could not gather diagnostics"} +# Re-export async text search function from text_search.py +from sentience.text_search import find_text_rect_async - raise RuntimeError( - f"Sentience extension failed to inject window.sentience API. 
" - f"Is the extension loaded? Diagnostics: {diag}" - ) from e - - # Build options dict for extension API - ext_options: dict[str, Any] = {} - if options.screenshot is not False: - ext_options["screenshot"] = options.screenshot - if options.limit != 50: - ext_options["limit"] = options.limit - if options.filter is not None: - ext_options["filter"] = ( - options.filter.model_dump() if hasattr(options.filter, "model_dump") else options.filter - ) - - # Call extension API - result = await browser.page.evaluate( - """ - (options) => { - return window.sentience.snapshot(options); - } - """, - ext_options, - ) - - # Save trace if requested - if options.save_trace: - from sentience.snapshot import _save_trace_to_file - - _save_trace_to_file(result.get("raw_elements", []), options.trace_path) - - # Show visual overlay if requested - if options.show_overlay: - raw_elements = result.get("raw_elements", []) - if raw_elements: - await browser.page.evaluate( - """ - (elements) => { - if (window.sentience && window.sentience.showOverlay) { - window.sentience.showOverlay(elements, null); - } - } - """, - raw_elements, - ) - - # Validate and parse with Pydantic - snapshot_obj = Snapshot(**result) - return snapshot_obj - - -async def _snapshot_via_api_async( - browser: AsyncSentienceBrowser, - options: SnapshotOptions, -) -> Snapshot: - """Take snapshot using server-side API (Pro/Enterprise tier) - async""" - if not browser.page: - raise RuntimeError("Browser not started. Call await browser.start() first.") - - if not browser.api_key: - raise ValueError("API key required for server-side processing") - - if not browser.api_url: - raise ValueError("API URL required for server-side processing") - - # Wait for extension injection - try: - await browser.page.wait_for_function( - "typeof window.sentience !== 'undefined'", timeout=5000 - ) - except Exception as e: - raise RuntimeError( - "Sentience extension failed to inject. Cannot collect raw data for API processing." 
- ) from e - - # Step 1: Get raw data from local extension - raw_options: dict[str, any] = {} - if options.screenshot is not False: - raw_options["screenshot"] = options.screenshot - - raw_result = await browser.page.evaluate( - """ - (options) => { - return window.sentience.snapshot(options); - } - """, - raw_options, - ) - - # Save trace if requested - if options.save_trace: - from sentience.snapshot import _save_trace_to_file - - _save_trace_to_file(raw_result.get("raw_elements", []), options.trace_path) - - # Step 2: Send to server for smart ranking/filtering - import json - - from sentience.snapshot import MAX_PAYLOAD_BYTES - - payload = { - "raw_elements": raw_result.get("raw_elements", []), - "url": raw_result.get("url", ""), - "viewport": raw_result.get("viewport"), - "goal": options.goal, - "options": { - "limit": options.limit, - "filter": options.filter.model_dump() if options.filter else None, - }, - } - - # Check payload size - payload_json = json.dumps(payload) - payload_size = len(payload_json.encode("utf-8")) - if payload_size > MAX_PAYLOAD_BYTES: - raise ValueError( - f"Payload size ({payload_size / 1024 / 1024:.2f}MB) exceeds server limit " - f"({MAX_PAYLOAD_BYTES / 1024 / 1024:.0f}MB). " - f"Try reducing the number of elements on the page or filtering elements." 
- ) - - headers = { - "Authorization": f"Bearer {browser.api_key}", - "Content-Type": "application/json", - } - - try: - import aiohttp - - async with aiohttp.ClientSession() as session: - async with session.post( - f"{browser.api_url}/v1/snapshot", - data=payload_json, - headers=headers, - timeout=aiohttp.ClientTimeout(total=30), - ) as response: - response.raise_for_status() - api_result = await response.json() - - # Merge API result with local data - snapshot_data = { - "status": api_result.get("status", "success"), - "timestamp": api_result.get("timestamp"), - "url": api_result.get("url", raw_result.get("url", "")), - "viewport": api_result.get("viewport", raw_result.get("viewport")), - "elements": api_result.get("elements", []), - "screenshot": raw_result.get("screenshot"), - "screenshot_format": raw_result.get("screenshot_format"), - "error": api_result.get("error"), - } - - # Show visual overlay if requested - if options.show_overlay: - elements = api_result.get("elements", []) - if elements: - await browser.page.evaluate( - """ - (elements) => { - if (window.sentience && window.sentience.showOverlay) { - window.sentience.showOverlay(elements, null); - } - } - """, - elements, - ) - - return Snapshot(**snapshot_data) - except ImportError: - # Fallback to requests if aiohttp not available (shouldn't happen in async context) - raise RuntimeError( - "aiohttp is required for async API calls. 
Install it with: pip install aiohttp" - ) - except Exception as e: - raise RuntimeError(f"API request failed: {e}") - - -# ========== Async Action Functions ========== - - -async def click_async( - browser: AsyncSentienceBrowser, - element_id: int, - use_mouse: bool = True, - take_snapshot: bool = False, -) -> ActionResult: - """ - Click an element by ID using hybrid approach (async) - - Args: - browser: AsyncSentienceBrowser instance - element_id: Element ID from snapshot - use_mouse: If True, use Playwright's mouse.click() at element center - take_snapshot: Whether to take snapshot after action - - Returns: - ActionResult - """ - if not browser.page: - raise RuntimeError("Browser not started. Call await browser.start() first.") - - start_time = time.time() - url_before = browser.page.url - - if use_mouse: - try: - snap = await snapshot_async(browser) - element = None - for el in snap.elements: - if el.id == element_id: - element = el - break - - if element: - center_x = element.bbox.x + element.bbox.width / 2 - center_y = element.bbox.y + element.bbox.height / 2 - try: - await browser.page.mouse.click(center_x, center_y) - success = True - except Exception: - success = True - else: - try: - success = await browser.page.evaluate( - """ - (id) => { - return window.sentience.click(id); - } - """, - element_id, - ) - except Exception: - success = True - except Exception: - try: - success = await browser.page.evaluate( - """ - (id) => { - return window.sentience.click(id); - } - """, - element_id, - ) - except Exception: - success = True - else: - success = await browser.page.evaluate( - """ - (id) => { - return window.sentience.click(id); - } - """, - element_id, - ) - - # Wait a bit for navigation/DOM updates - try: - await browser.page.wait_for_timeout(500) - except Exception: - pass - - duration_ms = int((time.time() - start_time) * 1000) - - # Check if URL changed - try: - url_after = browser.page.url - url_changed = url_before != url_after - except Exception: - 
url_after = url_before - url_changed = True - - # Determine outcome - outcome: str | None = None - if url_changed: - outcome = "navigated" - elif success: - outcome = "dom_updated" - else: - outcome = "error" - - # Optional snapshot after - snapshot_after: Snapshot | None = None - if take_snapshot: - try: - snapshot_after = await snapshot_async(browser) - except Exception: - pass - - return ActionResult( - success=success, - duration_ms=duration_ms, - outcome=outcome, - url_changed=url_changed, - snapshot_after=snapshot_after, - error=( - None - if success - else { - "code": "click_failed", - "reason": "Element not found or not clickable", - } - ), - ) - - -async def type_text_async( - browser: AsyncSentienceBrowser, element_id: int, text: str, take_snapshot: bool = False -) -> ActionResult: - """ - Type text into an element (async) - - Args: - browser: AsyncSentienceBrowser instance - element_id: Element ID from snapshot - text: Text to type - take_snapshot: Whether to take snapshot after action - - Returns: - ActionResult - """ - if not browser.page: - raise RuntimeError("Browser not started. 
Call await browser.start() first.") - - start_time = time.time() - url_before = browser.page.url - - # Focus element first - focused = await browser.page.evaluate( - """ - (id) => { - const el = window.sentience_registry[id]; - if (el) { - el.focus(); - return true; - } - return false; - } - """, - element_id, - ) - - if not focused: - return ActionResult( - success=False, - duration_ms=int((time.time() - start_time) * 1000), - outcome="error", - error={"code": "focus_failed", "reason": "Element not found"}, - ) - - # Type using Playwright keyboard - await browser.page.keyboard.type(text) - - duration_ms = int((time.time() - start_time) * 1000) - url_after = browser.page.url - url_changed = url_before != url_after - - outcome = "navigated" if url_changed else "dom_updated" - - snapshot_after: Snapshot | None = None - if take_snapshot: - snapshot_after = await snapshot_async(browser) - - return ActionResult( - success=True, - duration_ms=duration_ms, - outcome=outcome, - url_changed=url_changed, - snapshot_after=snapshot_after, - ) - - -async def press_async( - browser: AsyncSentienceBrowser, key: str, take_snapshot: bool = False -) -> ActionResult: - """ - Press a keyboard key (async) - - Args: - browser: AsyncSentienceBrowser instance - key: Key to press (e.g., "Enter", "Escape", "Tab") - take_snapshot: Whether to take snapshot after action - - Returns: - ActionResult - """ - if not browser.page: - raise RuntimeError("Browser not started. 
Call await browser.start() first.") - - start_time = time.time() - url_before = browser.page.url - - # Press key using Playwright - await browser.page.keyboard.press(key) - - # Wait a bit for navigation/DOM updates - await browser.page.wait_for_timeout(500) - - duration_ms = int((time.time() - start_time) * 1000) - url_after = browser.page.url - url_changed = url_before != url_after - - outcome = "navigated" if url_changed else "dom_updated" - - snapshot_after: Snapshot | None = None - if take_snapshot: - snapshot_after = await snapshot_async(browser) - - return ActionResult( - success=True, - duration_ms=duration_ms, - outcome=outcome, - url_changed=url_changed, - snapshot_after=snapshot_after, - ) - - -async def _highlight_rect_async( - browser: AsyncSentienceBrowser, rect: dict[str, float], duration_sec: float = 2.0 -) -> None: - """Highlight a rectangle with a red border overlay (async)""" - if not browser.page: - return - - highlight_id = f"sentience_highlight_{int(time.time() * 1000)}" - - args = { - "rect": { - "x": rect["x"], - "y": rect["y"], - "w": rect["w"], - "h": rect["h"], - }, - "highlightId": highlight_id, - "durationSec": duration_sec, - } - - await browser.page.evaluate( - """ - (args) => { - const { rect, highlightId, durationSec } = args; - const overlay = document.createElement('div'); - overlay.id = highlightId; - overlay.style.position = 'fixed'; - overlay.style.left = `${rect.x}px`; - overlay.style.top = `${rect.y}px`; - overlay.style.width = `${rect.w}px`; - overlay.style.height = `${rect.h}px`; - overlay.style.border = '3px solid red'; - overlay.style.borderRadius = '2px'; - overlay.style.boxSizing = 'border-box'; - overlay.style.pointerEvents = 'none'; - overlay.style.zIndex = '999999'; - overlay.style.backgroundColor = 'rgba(255, 0, 0, 0.1)'; - overlay.style.transition = 'opacity 0.3s ease-out'; - - document.body.appendChild(overlay); - - setTimeout(() => { - overlay.style.opacity = '0'; - setTimeout(() => { - if (overlay.parentNode) { - 
overlay.parentNode.removeChild(overlay); - } - }, 300); - }, durationSec * 1000); - } - """, - args, - ) - - -async def click_rect_async( - browser: AsyncSentienceBrowser, - rect: dict[str, float] | BBox, - highlight: bool = True, - highlight_duration: float = 2.0, - take_snapshot: bool = False, -) -> ActionResult: - """ - Click at the center of a rectangle (async) - - Args: - browser: AsyncSentienceBrowser instance - rect: Dictionary with x, y, width (w), height (h) keys, or BBox object - highlight: Whether to show a red border highlight when clicking - highlight_duration: How long to show the highlight in seconds - take_snapshot: Whether to take snapshot after action - - Returns: - ActionResult - """ - if not browser.page: - raise RuntimeError("Browser not started. Call await browser.start() first.") - - # Handle BBox object or dict - if isinstance(rect, BBox): - x = rect.x - y = rect.y - w = rect.width - h = rect.height - else: - x = rect.get("x", 0) - y = rect.get("y", 0) - w = rect.get("w") or rect.get("width", 0) - h = rect.get("h") or rect.get("height", 0) - - if w <= 0 or h <= 0: - return ActionResult( - success=False, - duration_ms=0, - outcome="error", - error={ - "code": "invalid_rect", - "reason": "Rectangle width and height must be positive", - }, - ) - - start_time = time.time() - url_before = browser.page.url - - # Calculate center of rectangle - center_x = x + w / 2 - center_y = y + h / 2 - - # Show highlight before clicking - if highlight: - await _highlight_rect_async(browser, {"x": x, "y": y, "w": w, "h": h}, highlight_duration) - await browser.page.wait_for_timeout(50) - - # Use Playwright's native mouse click - try: - await browser.page.mouse.click(center_x, center_y) - success = True - except Exception as e: - success = False - error_msg = str(e) - - # Wait a bit for navigation/DOM updates - await browser.page.wait_for_timeout(500) - - duration_ms = int((time.time() - start_time) * 1000) - url_after = browser.page.url - url_changed = 
url_before != url_after - - # Determine outcome - outcome: str | None = None - if url_changed: - outcome = "navigated" - elif success: - outcome = "dom_updated" - else: - outcome = "error" - - # Optional snapshot after - snapshot_after: Snapshot | None = None - if take_snapshot: - snapshot_after = await snapshot_async(browser) - - return ActionResult( - success=success, - duration_ms=duration_ms, - outcome=outcome, - url_changed=url_changed, - snapshot_after=snapshot_after, - error=( - None - if success - else { - "code": "click_failed", - "reason": error_msg if not success else "Click failed", - } - ), - ) +# ========== Phase 2B: Supporting Utilities (Future) ========== +# TODO: Re-export when implemented +# from sentience.read import read_async +# from sentience.overlay import show_overlay_async, clear_overlay_async +# from sentience.expect import expect_async, ExpectationAsync +# ========== Phase 2C: Agent Layer (Future) ========== +# TODO: Re-export when implemented +# from sentience.agent import SentienceAgentAsync +# from sentience.base_agent import BaseAgentAsync -# ========== Re-export Query Functions (Pure Functions - No Async Needed) ========== +# ========== Phase 2D: Developer Tools (Future) ========== +# TODO: Re-export when implemented +# from sentience.recorder import RecorderAsync +# from sentience.inspector import InspectorAsync -# Query functions (find, query) are pure functions that work with Snapshot objects -# They don't need async versions, but we re-export them for convenience +# ========== Query Functions (Pure Functions - No Async Needed) ========== +# Re-export query functions (pure functions, no async needed) from sentience.query import find, query __all__ = [ - "AsyncSentienceBrowser", - "snapshot_async", - "click_async", - "type_text_async", - "press_async", - "click_rect_async", - "find", - "query", + # Browser + "AsyncSentienceBrowser", # Re-exported from browser.py + # Snapshot (Phase 1) + "snapshot_async", # Re-exported from 
snapshot.py + # Actions (Phase 1) + "click_async", # Re-exported from actions.py + "type_text_async", # Re-exported from actions.py + "press_async", # Re-exported from actions.py + "click_rect_async", # Re-exported from actions.py + # Phase 2A: Core Utilities + "wait_for_async", # Re-exported from wait.py + "screenshot_async", # Re-exported from screenshot.py + "find_text_rect_async", # Re-exported from text_search.py + # Phase 2B: Supporting Utilities (Future - uncomment when implemented) + # "read_async", + # "show_overlay_async", + # "clear_overlay_async", + # "expect_async", + # "ExpectationAsync", + # Phase 2C: Agent Layer (Future - uncomment when implemented) + # "SentienceAgentAsync", + # "BaseAgentAsync", + # Phase 2D: Developer Tools (Future - uncomment when implemented) + # "RecorderAsync", + # "InspectorAsync", + # Query Functions + "find", # Re-exported from query.py + "query", # Re-exported from query.py ] diff --git a/sentience/browser.py b/sentience/browser.py index b7617b9..9dd9014 100644 --- a/sentience/browser.py +++ b/sentience/browser.py @@ -2,6 +2,7 @@ Playwright browser harness with extension loading """ +import asyncio import os import shutil import tempfile @@ -9,6 +10,7 @@ from pathlib import Path from urllib.parse import urlparse +from playwright.async_api import BrowserContext as AsyncBrowserContext, Page as AsyncPage, Playwright as AsyncPlaywright, async_playwright from playwright.sync_api import BrowserContext, Page, Playwright, sync_playwright from sentience._extension_loader import find_extension_path @@ -16,7 +18,7 @@ # Import stealth for bot evasion (optional - graceful fallback if not available) try: - from playwright_stealth import stealth_sync + from playwright_stealth import stealth_async, stealth_sync STEALTH_AVAILABLE = True except ImportError: @@ -574,3 +576,459 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): """Context manager exit""" self.close() + + +class AsyncSentienceBrowser: + """Async version of 
SentienceBrowser for use in asyncio contexts.""" + + def __init__( + self, + api_key: str | None = None, + api_url: str | None = None, + headless: bool | None = None, + proxy: str | None = None, + user_data_dir: str | Path | None = None, + storage_state: str | Path | StorageState | dict | None = None, + record_video_dir: str | Path | None = None, + record_video_size: dict[str, int] | None = None, + viewport: Viewport | dict[str, int] | None = None, + ): + """ + Initialize Async Sentience browser + + Args: + api_key: Optional API key for server-side processing (Pro/Enterprise tiers) + If None, uses free tier (local extension only) + api_url: Server URL for API calls (defaults to https://api.sentienceapi.com if api_key provided) + headless: Whether to run in headless mode. If None, defaults to True in CI, False otherwise + proxy: Optional proxy server URL (e.g., 'http://user:pass@proxy.example.com:8080') + user_data_dir: Optional path to user data directory for persistent sessions + storage_state: Optional storage state to inject (cookies + localStorage) + record_video_dir: Optional directory path to save video recordings + record_video_size: Optional video resolution as dict with 'width' and 'height' keys + viewport: Optional viewport size as Viewport object or dict with 'width' and 'height' keys. + Examples: Viewport(width=1280, height=800) (default) + Viewport(width=1920, height=1080) (Full HD) + {"width": 1280, "height": 800} (dict also supported) + If None, defaults to Viewport(width=1280, height=800). 
+ """ + self.api_key = api_key + # Only set api_url if api_key is provided, otherwise None (free tier) + if self.api_key and not api_url: + self.api_url = "https://api.sentienceapi.com" + else: + self.api_url = api_url + + # Determine headless mode + if headless is None: + # Default to False for local dev, True for CI + self.headless = os.environ.get("CI", "").lower() == "true" + else: + self.headless = headless + + # Support proxy from argument or environment variable + self.proxy = proxy or os.environ.get("SENTIENCE_PROXY") + + # Auth injection support + self.user_data_dir = user_data_dir + self.storage_state = storage_state + + # Video recording support + self.record_video_dir = record_video_dir + self.record_video_size = record_video_size or {"width": 1280, "height": 800} + + # Viewport configuration - convert dict to Viewport if needed + if viewport is None: + self.viewport = Viewport(width=1280, height=800) + elif isinstance(viewport, dict): + self.viewport = Viewport(width=viewport["width"], height=viewport["height"]) + else: + self.viewport = viewport + + self.playwright: AsyncPlaywright | None = None + self.context: AsyncBrowserContext | None = None + self.page: AsyncPage | None = None + self._extension_path: str | None = None + + def _parse_proxy(self, proxy_string: str) -> ProxyConfig | None: + """ + Parse proxy connection string into ProxyConfig. 
+ + Args: + proxy_string: Proxy URL (e.g., 'http://user:pass@proxy.example.com:8080') + + Returns: + ProxyConfig object or None if invalid + """ + if not proxy_string: + return None + + try: + parsed = urlparse(proxy_string) + + # Validate scheme + if parsed.scheme not in ("http", "https", "socks5"): + print(f"⚠️ [Sentience] Unsupported proxy scheme: {parsed.scheme}") + print(" Supported: http, https, socks5") + return None + + # Validate host and port + if not parsed.hostname or not parsed.port: + print("⚠️ [Sentience] Proxy URL must include hostname and port") + print(" Expected format: http://username:password@host:port") + return None + + # Build server URL + server = f"{parsed.scheme}://{parsed.hostname}:{parsed.port}" + + # Create ProxyConfig with optional credentials + return ProxyConfig( + server=server, + username=parsed.username if parsed.username else None, + password=parsed.password if parsed.password else None, + ) + + except Exception as e: + print(f"⚠️ [Sentience] Invalid proxy configuration: {e}") + print(" Expected format: http://username:password@host:port") + return None + + async def start(self) -> None: + """Launch browser with extension loaded (async)""" + # Get extension source path using shared utility + extension_source = find_extension_path() + + # Create temporary extension bundle + self._extension_path = tempfile.mkdtemp(prefix="sentience-ext-") + shutil.copytree(extension_source, self._extension_path, dirs_exist_ok=True) + + self.playwright = await async_playwright().start() + + # Build launch arguments + args = [ + f"--disable-extensions-except={self._extension_path}", + f"--load-extension={self._extension_path}", + "--disable-blink-features=AutomationControlled", + "--no-sandbox", + "--disable-infobars", + "--disable-features=WebRtcHideLocalIpsWithMdns", + "--force-webrtc-ip-handling-policy=disable_non_proxied_udp", + ] + + if self.headless: + args.append("--headless=new") + + # Parse proxy configuration if provided + proxy_config = 
self._parse_proxy(self.proxy) if self.proxy else None + + # Handle User Data Directory + if self.user_data_dir: + user_data_dir = str(self.user_data_dir) + Path(user_data_dir).mkdir(parents=True, exist_ok=True) + else: + user_data_dir = "" + + # Build launch_persistent_context parameters + launch_params = { + "user_data_dir": user_data_dir, + "headless": False, + "args": args, + "viewport": {"width": self.viewport.width, "height": self.viewport.height}, + "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", + } + + # Add proxy if configured + if proxy_config: + launch_params["proxy"] = proxy_config.to_playwright_dict() + launch_params["ignore_https_errors"] = True + print(f"🌐 [Sentience] Using proxy: {proxy_config.server}") + + # Add video recording if configured + if self.record_video_dir: + video_dir = Path(self.record_video_dir) + video_dir.mkdir(parents=True, exist_ok=True) + launch_params["record_video_dir"] = str(video_dir) + launch_params["record_video_size"] = self.record_video_size + print(f"🎥 [Sentience] Recording video to: {video_dir}") + print( + f" Resolution: {self.record_video_size['width']}x{self.record_video_size['height']}" + ) + + # Launch persistent context + self.context = await self.playwright.chromium.launch_persistent_context(**launch_params) + + self.page = self.context.pages[0] if self.context.pages else await self.context.new_page() + + # Inject storage state if provided + if self.storage_state: + await self._inject_storage_state(self.storage_state) + + # Apply stealth if available + if STEALTH_AVAILABLE: + await stealth_async(self.page) + + # Wait a moment for extension to initialize + await asyncio.sleep(0.5) + + async def goto(self, url: str) -> None: + """Navigate to a URL and ensure extension is ready (async)""" + if not self.page: + raise RuntimeError("Browser not started. 
Call await start() first.") + + await self.page.goto(url, wait_until="domcontentloaded") + + # Wait for extension to be ready + if not await self._wait_for_extension(): + try: + diag = await self.page.evaluate( + """() => ({ + sentience_defined: typeof window.sentience !== 'undefined', + registry_defined: typeof window.sentience_registry !== 'undefined', + snapshot_defined: window.sentience && typeof window.sentience.snapshot === 'function', + extension_id: document.documentElement.dataset.sentienceExtensionId || 'not set', + url: window.location.href + })""" + ) + except Exception as e: + diag = f"Failed to get diagnostics: {str(e)}" + + raise RuntimeError( + "Extension failed to load after navigation. Make sure:\n" + "1. Extension is built (cd sentience-chrome && ./build.sh)\n" + "2. All files are present (manifest.json, content.js, injected_api.js, pkg/)\n" + "3. Check browser console for errors (run with headless=False to see console)\n" + f"4. Extension path: {self._extension_path}\n" + f"5. Diagnostic info: {diag}" + ) + + async def _inject_storage_state(self, storage_state: str | Path | StorageState | dict) -> None: + """Inject storage state (cookies + localStorage) into browser context (async)""" + import json + + # Load storage state + if isinstance(storage_state, (str, Path)): + with open(storage_state, encoding="utf-8") as f: + state_dict = json.load(f) + state = StorageState.from_dict(state_dict) + elif isinstance(storage_state, StorageState): + state = storage_state + elif isinstance(storage_state, dict): + state = StorageState.from_dict(storage_state) + else: + raise ValueError( + f"Invalid storage_state type: {type(storage_state)}. " + "Expected str, Path, StorageState, or dict." 
+ ) + + # Inject cookies + if state.cookies: + playwright_cookies = [] + for cookie in state.cookies: + cookie_dict = cookie.model_dump() + playwright_cookie = { + "name": cookie_dict["name"], + "value": cookie_dict["value"], + "domain": cookie_dict["domain"], + "path": cookie_dict["path"], + } + if cookie_dict.get("expires"): + playwright_cookie["expires"] = cookie_dict["expires"] + if cookie_dict.get("httpOnly"): + playwright_cookie["httpOnly"] = cookie_dict["httpOnly"] + if cookie_dict.get("secure"): + playwright_cookie["secure"] = cookie_dict["secure"] + if cookie_dict.get("sameSite"): + playwright_cookie["sameSite"] = cookie_dict["sameSite"] + playwright_cookies.append(playwright_cookie) + + await self.context.add_cookies(playwright_cookies) + print(f"✅ [Sentience] Injected {len(state.cookies)} cookie(s)") + + # Inject LocalStorage + if state.origins: + for origin_data in state.origins: + origin = origin_data.origin + if not origin: + continue + + try: + await self.page.goto(origin, wait_until="domcontentloaded", timeout=10000) + + if origin_data.localStorage: + localStorage_dict = { + item.name: item.value for item in origin_data.localStorage + } + await self.page.evaluate( + """(localStorage_data) => { + for (const [key, value] of Object.entries(localStorage_data)) { + localStorage.setItem(key, value); + } + }""", + localStorage_dict, + ) + print( + f"✅ [Sentience] Injected {len(origin_data.localStorage)} localStorage item(s) for {origin}" + ) + except Exception as e: + print(f"⚠️ [Sentience] Failed to inject localStorage for {origin}: {e}") + + async def _wait_for_extension(self, timeout_sec: float = 5.0) -> bool: + """Poll for window.sentience to be available (async)""" + start_time = time.time() + last_error = None + + while time.time() - start_time < timeout_sec: + try: + result = await self.page.evaluate( + """() => { + if (typeof window.sentience === 'undefined') { + return { ready: false, reason: 'window.sentience undefined' }; + } + if 
(window.sentience._wasmModule === null) { + return { ready: false, reason: 'WASM module not fully loaded' }; + } + return { ready: true }; + } + """ + ) + + if isinstance(result, dict): + if result.get("ready"): + return True + last_error = result.get("reason", "Unknown error") + except Exception as e: + last_error = f"Evaluation error: {str(e)}" + + await asyncio.sleep(0.3) + + if last_error: + import warnings + + warnings.warn(f"Extension wait timeout. Last status: {last_error}") + + return False + + async def close(self, output_path: str | Path | None = None) -> str | None: + """ + Close browser and cleanup (async) + + Args: + output_path: Optional path to rename the video file to + + Returns: + Path to video file if recording was enabled, None otherwise + """ + temp_video_path = None + + if self.record_video_dir: + try: + if self.page and self.page.video: + temp_video_path = await self.page.video.path() + elif self.context: + for page in self.context.pages: + if page.video: + temp_video_path = await page.video.path() + break + except Exception: + pass + + if self.context: + await self.context.close() + self.context = None + + if self.playwright: + await self.playwright.stop() + self.playwright = None + + if self._extension_path and os.path.exists(self._extension_path): + shutil.rmtree(self._extension_path) + + # Clear page reference after closing context + self.page = None + + final_path = temp_video_path + if temp_video_path and output_path and os.path.exists(temp_video_path): + try: + output_path = str(output_path) + Path(output_path).parent.mkdir(parents=True, exist_ok=True) + shutil.move(temp_video_path, output_path) + final_path = output_path + except Exception as e: + import warnings + + warnings.warn(f"Failed to rename video file: {e}") + final_path = temp_video_path + + return final_path + + async def __aenter__(self): + """Async context manager entry""" + await self.start() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + 
"""Async context manager exit""" + await self.close() + + @classmethod + async def from_existing( + cls, + context: AsyncBrowserContext, + api_key: str | None = None, + api_url: str | None = None, + ) -> "AsyncSentienceBrowser": + """ + Create AsyncSentienceBrowser from an existing Playwright BrowserContext. + + Args: + context: Existing Playwright BrowserContext + api_key: Optional API key for server-side processing + api_url: Optional API URL + + Returns: + AsyncSentienceBrowser instance configured to use the existing context + """ + instance = cls(api_key=api_key, api_url=api_url) + instance.context = context + pages = context.pages + instance.page = pages[0] if pages else await context.new_page() + + # Apply stealth if available + if STEALTH_AVAILABLE: + await stealth_async(instance.page) + + # Wait for extension to be ready + await asyncio.sleep(0.5) + + return instance + + @classmethod + async def from_page( + cls, + page: AsyncPage, + api_key: str | None = None, + api_url: str | None = None, + ) -> "AsyncSentienceBrowser": + """ + Create AsyncSentienceBrowser from an existing Playwright Page. 
+ + Args: + page: Existing Playwright Page + api_key: Optional API key for server-side processing + api_url: Optional API URL + + Returns: + AsyncSentienceBrowser instance configured to use the existing page + """ + instance = cls(api_key=api_key, api_url=api_url) + instance.page = page + instance.context = page.context + + # Apply stealth if available + if STEALTH_AVAILABLE: + await stealth_async(instance.page) + + # Wait for extension to be ready + await asyncio.sleep(0.5) + + return instance diff --git a/sentience/screenshot.py b/sentience/screenshot.py index 9414d95..9361615 100644 --- a/sentience/screenshot.py +++ b/sentience/screenshot.py @@ -2,9 +2,10 @@ Screenshot functionality - standalone screenshot capture """ +import base64 from typing import Any, Literal -from .browser import SentienceBrowser +from .browser import AsyncSentienceBrowser, SentienceBrowser def screenshot( @@ -52,3 +53,48 @@ def screenshot( # Return as data URL mime_type = "image/png" if format == "png" else "image/jpeg" return f"data:{mime_type};base64,{base64_data}" + + +async def screenshot_async( + browser: AsyncSentienceBrowser, + format: Literal["png", "jpeg"] = "png", + quality: int | None = None, +) -> str: + """ + Capture screenshot of current page (async) + + Args: + browser: AsyncSentienceBrowser instance + format: Image format - "png" or "jpeg" + quality: JPEG quality (1-100), only used for JPEG format + + Returns: + Base64-encoded screenshot data URL (e.g., "data:image/png;base64,...") + + Raises: + RuntimeError: If browser not started + ValueError: If quality is invalid for JPEG + """ + if not browser.page: + raise RuntimeError("Browser not started. 
Call await browser.start() first.") + + if format == "jpeg" and quality is not None: + if not (1 <= quality <= 100): + raise ValueError("Quality must be between 1 and 100 for JPEG format") + + # Use Playwright's screenshot with base64 encoding + screenshot_options: dict[str, Any] = { + "type": format, + } + + if format == "jpeg" and quality is not None: + screenshot_options["quality"] = quality + + # Capture screenshot as base64 + # Playwright returns bytes when encoding is not specified, so we encode manually + image_bytes = await browser.page.screenshot(**screenshot_options) + base64_data = base64.b64encode(image_bytes).decode("utf-8") + + # Return as data URL + mime_type = "image/png" if format == "png" else "image/jpeg" + return f"data:{mime_type};base64,{base64_data}" diff --git a/sentience/snapshot.py b/sentience/snapshot.py index f4d8808..dcacd17 100644 --- a/sentience/snapshot.py +++ b/sentience/snapshot.py @@ -2,14 +2,16 @@ Snapshot functionality - calls window.sentience.snapshot() or server-side API """ +import asyncio import json import os import time from typing import Any, Optional +import aiohttp import requests -from .browser import SentienceBrowser +from .browser import AsyncSentienceBrowser, SentienceBrowser from .models import Snapshot, SnapshotOptions # Maximum payload size for API requests (10MB server limit) @@ -272,3 +274,242 @@ def _snapshot_via_api( return Snapshot(**snapshot_data) except requests.exceptions.RequestException as e: raise RuntimeError(f"API request failed: {e}") + + +# ========== Async Snapshot Functions ========== + + +async def snapshot_async( + browser: AsyncSentienceBrowser, + options: SnapshotOptions | None = None, +) -> Snapshot: + """ + Take a snapshot of the current page (async) + + Args: + browser: AsyncSentienceBrowser instance + options: Snapshot options (screenshot, limit, filter, etc.) + If None, uses default options. 
+ + Returns: + Snapshot object + + Example: + # Basic snapshot with defaults + snap = await snapshot_async(browser) + + # With options + snap = await snapshot_async(browser, SnapshotOptions( + screenshot=True, + limit=100, + show_overlay=True + )) + """ + # Use default options if none provided + if options is None: + options = SnapshotOptions() + + # Determine if we should use server-side API + should_use_api = ( + options.use_api if options.use_api is not None else (browser.api_key is not None) + ) + + if should_use_api and browser.api_key: + # Use server-side API (Pro/Enterprise tier) + return await _snapshot_via_api_async(browser, options) + else: + # Use local extension (Free tier) + return await _snapshot_via_extension_async(browser, options) + + +async def _snapshot_via_extension_async( + browser: AsyncSentienceBrowser, + options: SnapshotOptions, +) -> Snapshot: + """Take snapshot using local extension (Free tier) - async""" + if not browser.page: + raise RuntimeError("Browser not started. Call await browser.start() first.") + + # Wait for extension injection to complete + try: + await browser.page.wait_for_function( + "typeof window.sentience !== 'undefined'", + timeout=5000, + ) + except Exception as e: + try: + diag = await browser.page.evaluate( + """() => ({ + sentience_defined: typeof window.sentience !== 'undefined', + extension_id: document.documentElement.dataset.sentienceExtensionId || 'not set', + url: window.location.href + })""" + ) + except Exception: + diag = {"error": "Could not gather diagnostics"} + + raise RuntimeError( + f"Sentience extension failed to inject window.sentience API. " + f"Is the extension loaded? 
Diagnostics: {diag}" + ) from e + + # Build options dict for extension API + ext_options: dict[str, Any] = {} + if options.screenshot is not False: + ext_options["screenshot"] = options.screenshot + if options.limit != 50: + ext_options["limit"] = options.limit + if options.filter is not None: + ext_options["filter"] = ( + options.filter.model_dump() if hasattr(options.filter, "model_dump") else options.filter + ) + + # Call extension API + result = await browser.page.evaluate( + """ + (options) => { + return window.sentience.snapshot(options); + } + """, + ext_options, + ) + + # Save trace if requested + if options.save_trace: + _save_trace_to_file(result.get("raw_elements", []), options.trace_path) + + # Show visual overlay if requested + if options.show_overlay: + raw_elements = result.get("raw_elements", []) + if raw_elements: + await browser.page.evaluate( + """ + (elements) => { + if (window.sentience && window.sentience.showOverlay) { + window.sentience.showOverlay(elements, null); + } + } + """, + raw_elements, + ) + + # Validate and parse with Pydantic + snapshot_obj = Snapshot(**result) + return snapshot_obj + + +async def _snapshot_via_api_async( + browser: AsyncSentienceBrowser, + options: SnapshotOptions, +) -> Snapshot: + """Take snapshot using server-side API (Pro/Enterprise tier) - async""" + if not browser.page: + raise RuntimeError("Browser not started. Call await browser.start() first.") + + if not browser.api_key: + raise ValueError("API key required for server-side processing") + + if not browser.api_url: + raise ValueError("API URL required for server-side processing") + + # Wait for extension injection + try: + await browser.page.wait_for_function( + "typeof window.sentience !== 'undefined'", timeout=5000 + ) + except Exception as e: + raise RuntimeError( + "Sentience extension failed to inject. Cannot collect raw data for API processing." 
+ ) from e + + # Step 1: Get raw data from local extension + raw_options: dict[str, Any] = {} + if options.screenshot is not False: + raw_options["screenshot"] = options.screenshot + + raw_result = await browser.page.evaluate( + """ + (options) => { + return window.sentience.snapshot(options); + } + """, + raw_options, + ) + + # Save trace if requested + if options.save_trace: + _save_trace_to_file(raw_result.get("raw_elements", []), options.trace_path) + + # Step 2: Send to server for smart ranking/filtering + payload = { + "raw_elements": raw_result.get("raw_elements", []), + "url": raw_result.get("url", ""), + "viewport": raw_result.get("viewport"), + "goal": options.goal, + "options": { + "limit": options.limit, + "filter": options.filter.model_dump() if options.filter else None, + }, + } + + # Check payload size + payload_json = json.dumps(payload) + payload_size = len(payload_json.encode("utf-8")) + if payload_size > MAX_PAYLOAD_BYTES: + raise ValueError( + f"Payload size ({payload_size / 1024 / 1024:.2f}MB) exceeds server limit " + f"({MAX_PAYLOAD_BYTES / 1024 / 1024:.0f}MB). " + f"Try reducing the number of elements on the page or filtering elements." 
+ ) + + headers = { + "Authorization": f"Bearer {browser.api_key}", + "Content-Type": "application/json", + } + + try: + async with aiohttp.ClientSession() as session: + async with session.post( + f"{browser.api_url}/v1/snapshot", + data=payload_json, + headers=headers, + timeout=aiohttp.ClientTimeout(total=30), + ) as response: + response.raise_for_status() + api_result = await response.json() + + # Merge API result with local data + snapshot_data = { + "status": api_result.get("status", "success"), + "timestamp": api_result.get("timestamp"), + "url": api_result.get("url", raw_result.get("url", "")), + "viewport": api_result.get("viewport", raw_result.get("viewport")), + "elements": api_result.get("elements", []), + "screenshot": raw_result.get("screenshot"), + "screenshot_format": raw_result.get("screenshot_format"), + "error": api_result.get("error"), + } + + # Show visual overlay if requested + if options.show_overlay: + elements = api_result.get("elements", []) + if elements: + await browser.page.evaluate( + """ + (elements) => { + if (window.sentience && window.sentience.showOverlay) { + window.sentience.showOverlay(elements, null); + } + } + """, + elements, + ) + + return Snapshot(**snapshot_data) + except ImportError: + # Fallback to requests if aiohttp not available (shouldn't happen in async context) + raise RuntimeError( + "aiohttp is required for async API calls. 
Install it with: pip install aiohttp" + ) + except Exception as e: + raise RuntimeError(f"API request failed: {e}") diff --git a/sentience/text_search.py b/sentience/text_search.py index 42b9309..f4cd0d7 100644 --- a/sentience/text_search.py +++ b/sentience/text_search.py @@ -2,7 +2,7 @@ Text search utilities - find text and get pixel coordinates """ -from .browser import SentienceBrowser +from .browser import AsyncSentienceBrowser, SentienceBrowser from .models import TextRectSearchResult @@ -146,3 +146,145 @@ def find_text_rect( # Parse and validate with Pydantic return TextRectSearchResult(**result_dict) + + +async def find_text_rect_async( + browser: AsyncSentienceBrowser, + text: str, + case_sensitive: bool = False, + whole_word: bool = False, + max_results: int = 10, +) -> TextRectSearchResult: + """ + Find all occurrences of text on the page and get their exact pixel coordinates (async). + + This function searches for text in all visible text nodes on the page and returns + the bounding rectangles for each match. 
Useful for: + - Finding specific UI elements by their text content + - Locating buttons, links, or labels without element IDs + - Getting exact coordinates for click automation + - Highlighting search results visually + + Args: + browser: AsyncSentienceBrowser instance + text: Text to search for (required) + case_sensitive: If True, search is case-sensitive (default: False) + whole_word: If True, only match whole words surrounded by whitespace (default: False) + max_results: Maximum number of matches to return (default: 10, max: 100) + + Returns: + TextRectSearchResult with: + - status: "success" or "error" + - query: The search text + - case_sensitive: Whether search was case-sensitive + - whole_word: Whether whole-word matching was used + - matches: Number of matches found + - results: List of TextMatch objects, each containing: + - text: The matched text + - rect: Absolute rectangle (with scroll offset) + - viewport_rect: Viewport-relative rectangle + - context: Surrounding text (before/after) + - in_viewport: Whether visible in current viewport + - viewport: Current viewport dimensions and scroll position + - error: Error message if status is "error" + + Examples: + # Find "Sign In" button + result = await find_text_rect_async(browser, "Sign In") + if result.status == "success" and result.results: + first_match = result.results[0] + print(f"Found at: ({first_match.rect.x}, {first_match.rect.y})") + print(f"Size: {first_match.rect.width}x{first_match.rect.height}") + print(f"In viewport: {first_match.in_viewport}") + + # Case-sensitive search + result = await find_text_rect_async(browser, "LOGIN", case_sensitive=True) + + # Whole word only + result = await find_text_rect_async(browser, "log", whole_word=True) # Won't match "login" + + # Find all matches and click the first visible one + result = await find_text_rect_async(browser, "Buy Now", max_results=5) + if result.status == "success" and result.results: + for match in result.results: + if match.in_viewport: 
+ # Use click_rect_async from actions module + from sentience.actions import click_rect_async + click_result = await click_rect_async(browser, { + "x": match.rect.x, + "y": match.rect.y, + "w": match.rect.width, + "h": match.rect.height + }) + break + """ + if not browser.page: + raise RuntimeError("Browser not started. Call await browser.start() first.") + + if not text or not text.strip(): + return TextRectSearchResult( + status="error", + error="Text parameter is required and cannot be empty", + ) + + # Limit max_results to prevent performance issues + max_results = min(max_results, 100) + + # CRITICAL: Wait for extension injection to complete (CSP-resistant architecture) + # The new architecture loads injected_api.js asynchronously, so window.sentience + # may not be immediately available after page load + try: + await browser.page.wait_for_function( + "typeof window.sentience !== 'undefined'", + timeout=5000, # 5 second timeout + ) + except Exception as e: + # Gather diagnostics if wait fails + try: + diag = await browser.page.evaluate( + """() => ({ + sentience_defined: typeof window.sentience !== 'undefined', + extension_id: document.documentElement.dataset.sentienceExtensionId || 'not set', + url: window.location.href + })""" + ) + except Exception: + diag = {"error": "Could not gather diagnostics"} + + raise RuntimeError( + f"Sentience extension failed to inject window.sentience API. " + f"Is the extension loaded? Diagnostics: {diag}" + ) from e + + # Verify findTextRect method exists (for older extension versions that don't have it) + try: + has_find_text_rect = await browser.page.evaluate( + "typeof window.sentience.findTextRect !== 'undefined'" + ) + if not has_find_text_rect: + raise RuntimeError( + "window.sentience.findTextRect is not available. " + "Please update the Sentience extension to the latest version." 
+ ) + except RuntimeError: + raise + except Exception as e: + raise RuntimeError(f"Failed to verify findTextRect availability: {e}") from e + + # Call the extension's findTextRect method + result_dict = await browser.page.evaluate( + """ + (options) => { + return window.sentience.findTextRect(options); + } + """, + { + "text": text, + "caseSensitive": case_sensitive, + "wholeWord": whole_word, + "maxResults": max_results, + }, + ) + + # Parse and validate with Pydantic + return TextRectSearchResult(**result_dict) diff --git a/sentience/wait.py b/sentience/wait.py index 417b70c..d42e899 100644 --- a/sentience/wait.py +++ b/sentience/wait.py @@ -2,12 +2,13 @@ Wait functionality - wait_for element matching selector """ +import asyncio import time -from .browser import SentienceBrowser +from .browser import AsyncSentienceBrowser, SentienceBrowser from .models import SnapshotOptions, WaitResult from .query import find -from .snapshot import snapshot +from .snapshot import snapshot, snapshot_async def wait_for( @@ -71,3 +72,66 @@ def wait_for( duration_ms=duration_ms, timeout=True, ) + + +async def wait_for_async( + browser: AsyncSentienceBrowser, + selector: str | dict, + timeout: float = 10.0, + interval: float | None = None, + use_api: bool | None = None, +) -> WaitResult: + """ + Wait for element matching selector to appear (async) + + Args: + browser: AsyncSentienceBrowser instance + selector: String DSL or dict query + timeout: Maximum time to wait (seconds) + interval: Polling interval (seconds). If None, auto-detects: + - 0.25s for local extension (use_api=False, fast) + - 1.5s for remote API (use_api=True or default, network latency) + use_api: Force use of server-side API if True, local extension if False. + If None, uses API if api_key is set, otherwise uses local extension. 
+ + Returns: + WaitResult + """ + # Auto-detect optimal interval based on API usage + if interval is None: + # Determine if using API + will_use_api = use_api if use_api is not None else (browser.api_key is not None) + if will_use_api: + interval = 1.5 # Longer interval for API calls (network latency) + else: + interval = 0.25 # Shorter interval for local extension (fast) + + start_time = time.time() + + while time.time() - start_time < timeout: + # Take snapshot (may be local extension or remote API) + snap = await snapshot_async(browser, SnapshotOptions(use_api=use_api)) + + # Try to find element + element = find(snap, selector) + + if element: + duration_ms = int((time.time() - start_time) * 1000) + return WaitResult( + found=True, + element=element, + duration_ms=duration_ms, + timeout=False, + ) + + # Wait before next poll + await asyncio.sleep(interval) + + # Timeout + duration_ms = int((time.time() - start_time) * 1000) + return WaitResult( + found=False, + element=None, + duration_ms=duration_ms, + timeout=True, + ) diff --git a/tests/test_async_api.py b/tests/test_async_api.py index b2a9d0f..a6a29d5 100644 --- a/tests/test_async_api.py +++ b/tests/test_async_api.py @@ -10,10 +10,13 @@ click_async, click_rect_async, find, + find_text_rect_async, press_async, query, + screenshot_async, snapshot_async, type_text_async, + wait_for_async, ) from sentience.models import BBox, SnapshotOptions @@ -270,3 +273,63 @@ async def test_async_snapshot_with_goal(): assert snap.status == "success" # Elements may have ML reranking metadata if API key is provided # (This test works with or without API key) + + +@pytest.mark.asyncio +@pytest.mark.requires_extension +async def test_async_wait_for(): + """Test async wait_for function""" + async with AsyncSentienceBrowser() as browser: + await browser.goto("https://example.com") + await browser.page.wait_for_load_state("networkidle") + + # Wait for a link to appear + result = await wait_for_async(browser, "role=link", 
timeout=5.0) + assert result.found is True or result.timeout is True # May or may not find link + assert result.duration_ms >= 0 + if result.found: + assert result.element is not None + assert hasattr(result.element, "id") + + +@pytest.mark.asyncio +@pytest.mark.requires_extension +async def test_async_screenshot(): + """Test async screenshot function""" + async with AsyncSentienceBrowser() as browser: + await browser.goto("https://example.com") + await browser.page.wait_for_load_state("networkidle") + + # Test PNG screenshot + data_url = await screenshot_async(browser, format="png") + assert data_url.startswith("data:image/png;base64,") + assert len(data_url) > 100 # Should have base64 data + + # Test JPEG screenshot + data_url_jpeg = await screenshot_async(browser, format="jpeg", quality=85) + assert data_url_jpeg.startswith("data:image/jpeg;base64,") + assert len(data_url_jpeg) > 100 + + +@pytest.mark.asyncio +@pytest.mark.requires_extension +async def test_async_find_text_rect(): + """Test async find_text_rect function""" + async with AsyncSentienceBrowser() as browser: + await browser.goto("https://example.com") + await browser.page.wait_for_load_state("networkidle") + + # Find text on the page + result = await find_text_rect_async(browser, "Example", max_results=5) + assert result.status == "success" + assert result.query == "Example" + assert result.matches >= 0 + assert isinstance(result.results, list) + + # If matches found, verify structure + if result.results: + match = result.results[0] + assert hasattr(match, "text") + assert hasattr(match, "rect") + assert hasattr(match, "viewport_rect") + assert hasattr(match, "in_viewport") diff --git a/tests/test_browser.py b/tests/test_browser.py index da4afe3..e94f5cc 100644 --- a/tests/test_browser.py +++ b/tests/test_browser.py @@ -168,3 +168,4 @@ def test_from_page_with_api_key(): finally: context.close() browser_instance.close() + From dd6b8ebe7a9072a27a5a32b75b52ccfb62d7f0ad Mon Sep 17 00:00:00 2001 From: 
rcholic Date: Thu, 1 Jan 2026 07:34:53 -0800 Subject: [PATCH 2/7] phase 2B completed --- sentience/_extension_loader.py | 1 - sentience/async_api.py | 56 ++++++------- sentience/browser.py | 5 +- sentience/expect.py | 100 +++++++++++++++++++++- sentience/overlay.py | 109 +++++++++++++++++++++++- sentience/read.py | 91 +++++++++++++++++++- tests/test_async_api.py | 149 +++++++++++++++++++++++++++++++++ tests/test_browser.py | 1 - 8 files changed, 477 insertions(+), 35 deletions(-) diff --git a/sentience/_extension_loader.py b/sentience/_extension_loader.py index 58bd873..d969ec3 100644 --- a/sentience/_extension_loader.py +++ b/sentience/_extension_loader.py @@ -38,4 +38,3 @@ def find_extension_path() -> Path: f"2. {dev_ext_path}\n" "Make sure the extension is built and 'sentience/extension' directory exists." ) - diff --git a/sentience/async_api.py b/sentience/async_api.py index 3f4ae05..fc4e168 100644 --- a/sentience/async_api.py +++ b/sentience/async_api.py @@ -21,38 +21,41 @@ from sentience.actions import click_async """ +# ========== Actions (Phase 1) ========== +# Re-export async action functions from actions.py +from sentience.actions import click_async, click_rect_async, press_async, type_text_async + # ========== Browser ========== # Re-export AsyncSentienceBrowser from browser.py (moved there for better organization) from sentience.browser import AsyncSentienceBrowser -# ========== Snapshot (Phase 1) ========== -# Re-export async snapshot functions from snapshot.py -from sentience.snapshot import snapshot_async +# Re-export async expect functions from expect.py +from sentience.expect import ExpectationAsync, expect_async -# ========== Actions (Phase 1) ========== -# Re-export async action functions from actions.py -from sentience.actions import ( - click_async, - type_text_async, - press_async, - click_rect_async, -) +# Re-export async overlay functions from overlay.py +from sentience.overlay import clear_overlay_async, show_overlay_async -# ========== 
Phase 2A: Core Utilities ========== -# Re-export async wait function from wait.py -from sentience.wait import wait_for_async +# ========== Query Functions (Pure Functions - No Async Needed) ========== +# Re-export query functions (pure functions, no async needed) +from sentience.query import find, query + +# ========== Phase 2B: Supporting Utilities ========== +# Re-export async read function from read.py +from sentience.read import read_async # Re-export async screenshot function from screenshot.py from sentience.screenshot import screenshot_async +# ========== Snapshot (Phase 1) ========== +# Re-export async snapshot functions from snapshot.py +from sentience.snapshot import snapshot_async + # Re-export async text search function from text_search.py from sentience.text_search import find_text_rect_async -# ========== Phase 2B: Supporting Utilities (Future) ========== -# TODO: Re-export when implemented -# from sentience.read import read_async -# from sentience.overlay import show_overlay_async, clear_overlay_async -# from sentience.expect import expect_async, ExpectationAsync +# ========== Phase 2A: Core Utilities ========== +# Re-export async wait function from wait.py +from sentience.wait import wait_for_async # ========== Phase 2C: Agent Layer (Future) ========== # TODO: Re-export when implemented @@ -64,9 +67,6 @@ # from sentience.recorder import RecorderAsync # from sentience.inspector import InspectorAsync -# ========== Query Functions (Pure Functions - No Async Needed) ========== -# Re-export query functions (pure functions, no async needed) -from sentience.query import find, query __all__ = [ # Browser @@ -82,12 +82,12 @@ "wait_for_async", # Re-exported from wait.py "screenshot_async", # Re-exported from screenshot.py "find_text_rect_async", # Re-exported from text_search.py - # Phase 2B: Supporting Utilities (Future - uncomment when implemented) - # "read_async", - # "show_overlay_async", - # "clear_overlay_async", - # "expect_async", - # 
"ExpectationAsync", + # Phase 2B: Supporting Utilities + "read_async", # Re-exported from read.py + "show_overlay_async", # Re-exported from overlay.py + "clear_overlay_async", # Re-exported from overlay.py + "expect_async", # Re-exported from expect.py + "ExpectationAsync", # Re-exported from expect.py # Phase 2C: Agent Layer (Future - uncomment when implemented) # "SentienceAgentAsync", # "BaseAgentAsync", diff --git a/sentience/browser.py b/sentience/browser.py index 9dd9014..4188e1d 100644 --- a/sentience/browser.py +++ b/sentience/browser.py @@ -10,7 +10,10 @@ from pathlib import Path from urllib.parse import urlparse -from playwright.async_api import BrowserContext as AsyncBrowserContext, Page as AsyncPage, Playwright as AsyncPlaywright, async_playwright +from playwright.async_api import BrowserContext as AsyncBrowserContext +from playwright.async_api import Page as AsyncPage +from playwright.async_api import Playwright as AsyncPlaywright +from playwright.async_api import async_playwright from playwright.sync_api import BrowserContext, Page, Playwright, sync_playwright from sentience._extension_loader import find_extension_path diff --git a/sentience/expect.py b/sentience/expect.py index a98b462..a198724 100644 --- a/sentience/expect.py +++ b/sentience/expect.py @@ -2,12 +2,13 @@ Expect/Assert functionality """ +import asyncio import time -from .browser import SentienceBrowser +from .browser import AsyncSentienceBrowser, SentienceBrowser from .models import Element from .query import query -from .wait import wait_for +from .wait import wait_for, wait_for_async class Expectation: @@ -90,3 +91,98 @@ def expect(browser: SentienceBrowser, selector: str | dict) -> Expectation: Expectation helper """ return Expectation(browser, selector) + + +class ExpectationAsync: + """Assertion helper for element expectations (async)""" + + def __init__(self, browser: AsyncSentienceBrowser, selector: str | dict): + self.browser = browser + self.selector = selector + + async def 
to_be_visible(self, timeout: float = 10.0) -> Element: + """Assert element is visible (exists and in viewport)""" + result = await wait_for_async(self.browser, self.selector, timeout=timeout) + + if not result.found: + raise AssertionError(f"Element not found: {self.selector} (timeout: {timeout}s)") + + element = result.element + if not element.in_viewport: + raise AssertionError(f"Element found but not visible in viewport: {self.selector}") + + return element + + async def to_exist(self, timeout: float = 10.0) -> Element: + """Assert element exists""" + result = await wait_for_async(self.browser, self.selector, timeout=timeout) + + if not result.found: + raise AssertionError(f"Element does not exist: {self.selector} (timeout: {timeout}s)") + + return result.element + + async def to_have_text(self, expected_text: str, timeout: float = 10.0) -> Element: + """Assert element has specific text""" + result = await wait_for_async(self.browser, self.selector, timeout=timeout) + + if not result.found: + raise AssertionError(f"Element not found: {self.selector} (timeout: {timeout}s)") + + element = result.element + if not element.text or expected_text not in element.text: + raise AssertionError( + f"Element text mismatch. Expected '{expected_text}', got '{element.text}'" + ) + + return element + + async def to_have_count(self, expected_count: int, timeout: float = 10.0) -> None: + """Assert selector matches exactly N elements""" + from .snapshot import snapshot_async + + start_time = time.time() + while time.time() - start_time < timeout: + snap = await snapshot_async(self.browser) + matches = query(snap, self.selector) + + if len(matches) == expected_count: + return + + await asyncio.sleep(0.25) + + # Final check + snap = await snapshot_async(self.browser) + matches = query(snap, self.selector) + actual_count = len(matches) + + raise AssertionError( + f"Element count mismatch. 
Expected {expected_count}, got {actual_count}" + ) + + +def expect_async(browser: AsyncSentienceBrowser, selector: str | dict) -> ExpectationAsync: + """ + Create expectation helper for assertions (async) + + Args: + browser: AsyncSentienceBrowser instance + selector: String DSL or dict query + + Returns: + ExpectationAsync helper + + Example: + # Assert element is visible + element = await expect_async(browser, "role=button").to_be_visible() + + # Assert element has text + element = await expect_async(browser, "h1").to_have_text("Welcome") + + # Assert element exists + element = await expect_async(browser, "role=link").to_exist() + + # Assert count + await expect_async(browser, "role=button").to_have_count(5) + """ + return ExpectationAsync(browser, selector) diff --git a/sentience/overlay.py b/sentience/overlay.py index f8e9fb2..2529f38 100644 --- a/sentience/overlay.py +++ b/sentience/overlay.py @@ -4,7 +4,7 @@ from typing import Any -from .browser import SentienceBrowser +from .browser import AsyncSentienceBrowser, SentienceBrowser from .models import Element, Snapshot @@ -113,3 +113,110 @@ def clear_overlay(browser: SentienceBrowser) -> None: } """ ) + + +async def show_overlay_async( + browser: AsyncSentienceBrowser, + elements: list[Element] | list[dict[str, Any]] | Snapshot, + target_element_id: int | None = None, +) -> None: + """ + Display visual overlay highlighting elements in the browser (async) + + This function shows a Shadow DOM overlay with color-coded borders around + detected elements. Useful for debugging, learning, and validating element detection. 
+ + Args: + browser: AsyncSentienceBrowser instance + elements: Can be: + - List of Element objects (from snapshot.elements) + - List of raw element dicts (from snapshot result or API response) + - Snapshot object (will use snapshot.elements) + target_element_id: Optional ID of element to highlight in red (default: None) + + Color Coding: + - Red: Target element (when target_element_id is specified) + - Blue: Primary elements (is_primary=true) + - Green: Regular interactive elements + + Visual Indicators: + - Border thickness and opacity scale with importance score + - Semi-transparent fill for better visibility + - Importance badges showing scores + - Star icon for primary elements + - Target emoji for the target element + + Auto-clear: Overlay automatically disappears after 5 seconds + + Example: + # Show overlay from snapshot + snap = await snapshot_async(browser) + await show_overlay_async(browser, snap) + + # Show overlay with custom elements + elements = [{"id": 1, "bbox": {"x": 100, "y": 100, "width": 200, "height": 50}, ...}] + await show_overlay_async(browser, elements) + + # Show overlay with target element highlighted in red + await show_overlay_async(browser, snap, target_element_id=42) + + # Clear overlay manually before 5 seconds + await clear_overlay_async(browser) + """ + if not browser.page: + raise RuntimeError("Browser not started. 
Call await browser.start() first.") + + # Handle different input types + if isinstance(elements, Snapshot): + # Extract elements from Snapshot object + elements_list = [el.model_dump() for el in elements.elements] + elif isinstance(elements, list) and len(elements) > 0: + # Check if it's a list of Element objects or dicts + if hasattr(elements[0], "model_dump"): + # List of Element objects + elements_list = [el.model_dump() for el in elements] + else: + # Already a list of dicts + elements_list = elements + else: + raise ValueError("elements must be a Snapshot, list of Element objects, or list of dicts") + + # Call extension API + await browser.page.evaluate( + """ + (args) => { + if (window.sentience && window.sentience.showOverlay) { + window.sentience.showOverlay(args.elements, args.targetId); + } else { + console.warn('[Sentience SDK] showOverlay not available - is extension loaded?'); + } + } + """, + {"elements": elements_list, "targetId": target_element_id}, + ) + + +async def clear_overlay_async(browser: AsyncSentienceBrowser) -> None: + """ + Clear the visual overlay manually (before 5-second auto-clear) (async) + + Args: + browser: AsyncSentienceBrowser instance + + Example: + await show_overlay_async(browser, snap) + # ... inspect overlay ... + await clear_overlay_async(browser) # Remove immediately + """ + if not browser.page: + raise RuntimeError("Browser not started. 
Call await browser.start() first.") + + await browser.page.evaluate( + """ + () => { + if (window.sentience && window.sentience.clearOverlay) { + window.sentience.clearOverlay(); + } + } + """ + ) diff --git a/sentience/read.py b/sentience/read.py index 33fc8a0..59cf82b 100644 --- a/sentience/read.py +++ b/sentience/read.py @@ -4,7 +4,7 @@ from typing import Literal -from .browser import SentienceBrowser +from .browser import AsyncSentienceBrowser, SentienceBrowser def read( @@ -94,3 +94,92 @@ def read( ) return result + + +async def read_async( + browser: AsyncSentienceBrowser, + output_format: Literal["raw", "text", "markdown"] = "raw", + enhance_markdown: bool = True, +) -> dict: + """ + Read page content as raw HTML, text, or markdown (async) + + Args: + browser: AsyncSentienceBrowser instance + output_format: Output format - "raw" (default, returns HTML for external processing), + "text" (plain text), or "markdown" (lightweight or enhanced markdown). + enhance_markdown: If True and output_format is "markdown", uses markdownify for better conversion. + If False, uses the extension's lightweight markdown converter. + + Returns: + dict with: + - status: "success" or "error" + - url: Current page URL + - format: "raw", "text", or "markdown" + - content: Page content as string + - length: Content length in characters + - error: Error message if status is "error" + + Examples: + # Get raw HTML (default) - can be used with markdownify for better conversion + result = await read_async(browser) + html_content = result["content"] + + # Get high-quality markdown (uses markdownify internally) + result = await read_async(browser, output_format="markdown") + markdown = result["content"] + + # Get plain text + result = await read_async(browser, output_format="text") + text = result["content"] + """ + if not browser.page: + raise RuntimeError("Browser not started. 
Call await browser.start() first.") + + if output_format == "markdown" and enhance_markdown: + # Get raw HTML from the extension first + raw_html_result = await browser.page.evaluate( + """ + (options) => { + return window.sentience.read(options); + } + """, + {"format": "raw"}, + ) + + if raw_html_result.get("status") == "success": + html_content = raw_html_result["content"] + try: + # Use markdownify for enhanced markdown conversion + from markdownify import MarkdownifyError, markdownify + + markdown_content = markdownify(html_content, heading_style="ATX", wrap=True) + return { + "status": "success", + "url": raw_html_result["url"], + "format": "markdown", + "content": markdown_content, + "length": len(markdown_content), + } + except ImportError: + print( + "Warning: 'markdownify' not installed. Install with 'pip install markdownify' for enhanced markdown. Falling back to extension's markdown." + ) + except MarkdownifyError as e: + print(f"Warning: markdownify failed ({e}), falling back to extension's markdown.") + except Exception as e: + print( + f"Warning: An unexpected error occurred with markdownify ({e}), falling back to extension's markdown." 
+ ) + + # If not enhanced markdown, or fallback, call extension with requested format + result = await browser.page.evaluate( + """ + (options) => { + return window.sentience.read(options); + } + """, + {"format": output_format}, + ) + + return result diff --git a/tests/test_async_api.py b/tests/test_async_api.py index a6a29d5..bd60356 100644 --- a/tests/test_async_api.py +++ b/tests/test_async_api.py @@ -7,13 +7,18 @@ from sentience.async_api import ( AsyncSentienceBrowser, + ExpectationAsync, + clear_overlay_async, click_async, click_rect_async, + expect_async, find, find_text_rect_async, press_async, query, + read_async, screenshot_async, + show_overlay_async, snapshot_async, type_text_async, wait_for_async, @@ -333,3 +338,147 @@ async def test_async_find_text_rect(): assert hasattr(match, "rect") assert hasattr(match, "viewport_rect") assert hasattr(match, "in_viewport") + + +@pytest.mark.asyncio +@pytest.mark.requires_extension +async def test_async_read(): + """Test async read function""" + async with AsyncSentienceBrowser() as browser: + await browser.goto("https://example.com") + await browser.page.wait_for_load_state("networkidle") + # Wait a bit more for extension to be ready + await browser.page.wait_for_timeout(500) + + # Test raw HTML format + result = await read_async(browser, output_format="raw") + assert result["status"] == "success" + assert "content" in result + assert "url" in result + assert "format" in result + assert result["format"] == "raw" + assert len(result["content"]) > 0 + + # Test text format + result = await read_async(browser, output_format="text") + assert result["status"] == "success" + assert result["format"] == "text" + assert len(result["content"]) > 0 + + # Test markdown format (may fallback to extension's markdown) + result = await read_async(browser, output_format="markdown") + assert result["status"] == "success" + assert result["format"] == "markdown" + assert len(result["content"]) > 0 + + +@pytest.mark.asyncio 
+@pytest.mark.requires_extension +async def test_async_show_overlay(): + """Test async show_overlay function""" + async with AsyncSentienceBrowser() as browser: + await browser.goto("https://example.com") + await browser.page.wait_for_load_state("networkidle") + + # Get snapshot + snap = await snapshot_async(browser) + assert len(snap.elements) > 0 + + # Show overlay with snapshot + await show_overlay_async(browser, snap) + # No exception means success + + # Show overlay with target element + if len(snap.elements) > 0: + target_id = snap.elements[0].id + await show_overlay_async(browser, snap, target_element_id=target_id) + + # Show overlay with element list + elements = [el.model_dump() for el in snap.elements[:5]] # First 5 elements + await show_overlay_async(browser, elements) + + # Clear overlay + await clear_overlay_async(browser) + + +@pytest.mark.asyncio +@pytest.mark.requires_extension +async def test_async_clear_overlay(): + """Test async clear_overlay function""" + async with AsyncSentienceBrowser() as browser: + await browser.goto("https://example.com") + await browser.page.wait_for_load_state("networkidle") + + # Clear overlay (should not raise even if no overlay is shown) + await clear_overlay_async(browser) + + +@pytest.mark.asyncio +@pytest.mark.requires_extension +async def test_async_expect_to_be_visible(): + """Test async expect to_be_visible""" + async with AsyncSentienceBrowser() as browser: + await browser.goto("https://example.com") + await browser.page.wait_for_load_state("networkidle") + + # Expect a link to be visible (more reliable on example.com) + element = await expect_async(browser, "role=link").to_be_visible() + assert element is not None + assert element.in_viewport is True + + +@pytest.mark.asyncio +@pytest.mark.requires_extension +async def test_async_expect_to_exist(): + """Test async expect to_exist""" + async with AsyncSentienceBrowser() as browser: + await browser.goto("https://example.com") + await 
browser.page.wait_for_load_state("networkidle") + + # Expect a link to exist (more reliable on example.com) + element = await expect_async(browser, "role=link").to_exist() + assert element is not None + + +@pytest.mark.asyncio +@pytest.mark.requires_extension +async def test_async_expect_to_have_text(): + """Test async expect to_have_text""" + async with AsyncSentienceBrowser() as browser: + await browser.goto("https://example.com") + await browser.page.wait_for_load_state("networkidle") + + # Expect link to have "more" text (common on example.com) + element = await expect_async(browser, "role=link").to_have_text("more") + assert element is not None + assert "more" in element.text.lower() + + +@pytest.mark.asyncio +@pytest.mark.requires_extension +async def test_async_expect_to_have_count(): + """Test async expect to_have_count""" + async with AsyncSentienceBrowser() as browser: + await browser.goto("https://example.com") + await browser.page.wait_for_load_state("networkidle") + + # Expect at least one link (more reliable on example.com) + await expect_async(browser, "role=link").to_have_count(1) + + +@pytest.mark.asyncio +@pytest.mark.requires_extension +async def test_async_expectation_class(): + """Test ExpectationAsync class directly""" + async with AsyncSentienceBrowser() as browser: + await browser.goto("https://example.com") + await browser.page.wait_for_load_state("networkidle") + + # Create expectation instance + expectation = ExpectationAsync(browser, "role=link") + assert expectation.browser == browser + assert expectation.selector == "role=link" + + # Use expectation methods + element = await expectation.to_exist() + assert element is not None diff --git a/tests/test_browser.py b/tests/test_browser.py index e94f5cc..da4afe3 100644 --- a/tests/test_browser.py +++ b/tests/test_browser.py @@ -168,4 +168,3 @@ def test_from_page_with_api_key(): finally: context.close() browser_instance.close() - From b7b4ffb391aef77033205ffa173327bd7caf8edf Mon Sep 17 
00:00:00 2001 From: rcholic Date: Thu, 1 Jan 2026 07:49:31 -0800 Subject: [PATCH 3/7] verified tests --- examples/basic_agent_async.py | 95 +++++ examples/hello_async.py | 66 +++ examples/query_demo_async.py | 50 +++ examples/read_markdown_async.py | 59 +++ examples/semantic_wait_demo_async.py | 120 ++++++ examples/wait_and_click_async.py | 64 +++ pyproject.toml | 2 +- sentience/__init__.py | 2 +- sentience/agent.py | 582 ++++++++++++++++++++++++++- sentience/async_api.py | 35 +- sentience/base_agent.py | 93 +++++ sentience/inspector.py | 182 ++++++++- sentience/recorder.py | 224 ++++++++++- tests/test_async_api.py | 191 +++++++++ 14 files changed, 1739 insertions(+), 26 deletions(-) create mode 100644 examples/basic_agent_async.py create mode 100644 examples/hello_async.py create mode 100644 examples/query_demo_async.py create mode 100644 examples/read_markdown_async.py create mode 100644 examples/semantic_wait_demo_async.py create mode 100644 examples/wait_and_click_async.py diff --git a/examples/basic_agent_async.py b/examples/basic_agent_async.py new file mode 100644 index 0000000..f1781d5 --- /dev/null +++ b/examples/basic_agent_async.py @@ -0,0 +1,95 @@ +""" +Example: Basic agent usage (Async version) +Demonstrates SentienceAgentAsync for natural language automation +""" + +import asyncio +import os + +from sentience.async_api import AsyncSentienceBrowser, SentienceAgentAsync +from sentience.llm_provider import LLMProvider, LLMResponse + + +# Simple mock LLM provider for demonstration +# In production, use OpenAIProvider, AnthropicProvider, etc. 
+class MockLLMProvider(LLMProvider): + """Mock LLM provider for testing""" + + def generate(self, system_prompt: str, user_prompt: str, **kwargs) -> LLMResponse: + # Simple mock that returns CLICK action + return LLMResponse( + content="CLICK(1)", + model_name="mock-model", + prompt_tokens=100, + completion_tokens=10, + total_tokens=110, + ) + + def supports_json_mode(self) -> bool: + return True + + @property + def model_name(self) -> str: + return "mock-model" + + +async def main(): + # Get API key from environment variable (optional - uses free tier if not set) + api_key = os.environ.get("SENTIENCE_API_KEY") + + async with AsyncSentienceBrowser(api_key=api_key, headless=False) as browser: + # Navigate to a page + await browser.goto("https://example.com", wait_until="domcontentloaded") + + # Create LLM provider + # In production, use: llm = OpenAIProvider(api_key="your-key", model="gpt-4o") + llm = MockLLMProvider() + + # Create agent + agent = SentienceAgentAsync(browser, llm, verbose=True) + + print("=== Basic Agent Demo ===\n") + + # Example 1: Simple action + print("1. Executing simple action...") + try: + result = await agent.act("Click the first link") + print(f" Result: success={result.success}, action={result.action}") + if result.element_id: + print(f" Clicked element ID: {result.element_id}") + except Exception as e: + print(f" Error: {e}") + + print() + + # Example 2: Check history + print("2. Agent execution history:") + history = agent.get_history() + print(f" Total actions: {len(history)}") + for i, entry in enumerate(history, 1): + print(f" {i}. {entry.goal} -> {entry.action} (success: {entry.success})") + + print() + + # Example 3: Token statistics + print("3. Token usage statistics:") + stats = agent.get_token_stats() + print(f" Total tokens: {stats.total_tokens}") + print(f" Prompt tokens: {stats.total_prompt_tokens}") + print(f" Completion tokens: {stats.total_completion_tokens}") + + print() + + # Example 4: Clear history + print("4. 
Clearing history...") + agent.clear_history() + print(f" History length after clear: {len(agent.get_history())}") + + print("\n✅ Basic agent demo complete!") + print("\nNote: This example uses a mock LLM provider.") + print("In production, use a real LLM provider like OpenAIProvider or AnthropicProvider.") + + +if __name__ == "__main__": + asyncio.run(main()) + diff --git a/examples/hello_async.py b/examples/hello_async.py new file mode 100644 index 0000000..c44a8b9 --- /dev/null +++ b/examples/hello_async.py @@ -0,0 +1,66 @@ +""" +Example: Verify extension bridge is loaded (Async version) +""" + +import asyncio +import os + +from sentience.async_api import AsyncSentienceBrowser + + +async def main(): + # Get API key from environment variable (optional - uses free tier if not set) + api_key = os.environ.get("SENTIENCE_API_KEY") + + try: + async with AsyncSentienceBrowser(api_key=api_key, headless=False) as browser: + # Navigate to a page to ensure extension is active + await browser.goto("https://example.com", wait_until="domcontentloaded") + + # Check if extension API is available + bridge_ok = await browser.page.evaluate( + """ + () => { + return typeof window.sentience !== 'undefined' && + typeof window.sentience.snapshot === 'function'; + } + """ + ) + print(f"bridge_ok={bridge_ok}") + + if bridge_ok: + print("✅ Extension loaded successfully!") + # Try a quick snapshot to verify it works + try: + result = await browser.page.evaluate("window.sentience.snapshot({ limit: 1 })") + if result.get("status") == "success": + print(f"✅ Snapshot test: Found {len(result.get('elements', []))} elements") + else: + print(f"⚠️ Snapshot returned: {result.get('status')}") + except Exception as e: + print(f"⚠️ Snapshot test failed: {e}") + else: + print("❌ Extension not loaded") + # Debug info + debug_info = await browser.page.evaluate( + """ + () => { + return { + sentience_defined: typeof window.sentience !== 'undefined', + registry_defined: typeof window.sentience_registry !== 
'undefined', + snapshot_defined: typeof window.sentience?.snapshot !== 'undefined' + }; + } + """ + ) + print(f"Debug info: {debug_info}") + except Exception as e: + print(f"❌ Error: {e}") + import traceback + + traceback.print_exc() + + +if __name__ == "__main__": + asyncio.run(main()) + diff --git a/examples/query_demo_async.py b/examples/query_demo_async.py new file mode 100644 index 0000000..b0a7bdf --- /dev/null +++ b/examples/query_demo_async.py @@ -0,0 +1,50 @@ +""" +Example: Query engine demonstration (Async version) +""" + +import asyncio +import os + +from sentience.async_api import AsyncSentienceBrowser, find, query, snapshot_async + + +async def main(): + # Get API key from environment variable (optional - uses free tier if not set) + api_key = os.environ.get("SENTIENCE_API_KEY") + + async with AsyncSentienceBrowser(api_key=api_key, headless=False) as browser: + # Navigate to a page with links + await browser.goto("https://example.com", wait_until="domcontentloaded") + + snap = await snapshot_async(browser) + + # Query examples + print("=== Query Examples ===\n") + + # Find all buttons + buttons = query(snap, "role=button") + print(f"Found {len(buttons)} buttons") + + # Find all links + links = query(snap, "role=link") + print(f"Found {len(links)} links") + + # Find clickable elements + clickables = query(snap, "clickable=true") + print(f"Found {len(clickables)} clickable elements") + + # Find element with text containing "More" + more_link = find(snap, "text~'More'") + if more_link: + print(f"\nFound 'More' link: {more_link.text} (id: {more_link.id})") + else: + print("\nNo 'More' link found") + + # Complex query: clickable links + clickable_links = query(snap, "role=link clickable=true") + print(f"\nFound {len(clickable_links)} clickable links") + + +if __name__ == "__main__": + asyncio.run(main()) + diff --git a/examples/read_markdown_async.py b/examples/read_markdown_async.py new file mode 100644 index 0000000..41a0c14 --- /dev/null +++ 
b/examples/read_markdown_async.py @@ -0,0 +1,59 @@ +""" +Example: Reading page content and converting to markdown (Async version) + +This example shows how to use the read_async() function to get page content +and convert it to high-quality markdown using markdownify. +""" + +import asyncio +import os + +from markdownify import markdownify + +from sentience.async_api import AsyncSentienceBrowser, read_async + + +async def main(): + # Get API key from environment variable (optional - uses free tier if not set) + api_key = os.environ.get("SENTIENCE_API_KEY") + + # Initialize browser + async with AsyncSentienceBrowser(api_key=api_key, headless=True) as browser: + # Navigate to a page + await browser.goto("https://example.com", wait_until="domcontentloaded") + + # Method 1: Get raw HTML (default) and convert with markdownify + print("=== Method 1: Raw HTML + markdownify (Recommended) ===") + result = await read_async(browser) # output_format="raw" is default + html_content = result["content"] + + # Convert to markdown using markdownify (better quality) + markdown = markdownify( + html_content, + heading_style="ATX", # Use # for headings + bullets="-", # Use - for lists + strip=["script", "style", "nav", "footer", "header"], # Strip unwanted tags + ) + print(f"Markdown length: {len(markdown)} characters") + print(markdown[:500]) # Print first 500 chars + print("\n") + + # Method 2: Get high-quality markdown directly (uses markdownify internally) + print("=== Method 2: Direct markdown (High-quality via markdownify) ===") + result = await read_async(browser, output_format="markdown") + high_quality_markdown = result["content"] + print(f"Markdown length: {len(high_quality_markdown)} characters") + print(high_quality_markdown[:500]) # Print first 500 chars + print("\n") + + # Method 3: Get plain text + print("=== Method 3: Plain text ===") + result = await read_async(browser, output_format="text") + text_content = result["content"] + print(f"Text length: {len(text_content)} 
characters") + print(text_content[:500]) # Print first 500 chars + + +if __name__ == "__main__": + asyncio.run(main()) + diff --git a/examples/semantic_wait_demo_async.py b/examples/semantic_wait_demo_async.py new file mode 100644 index 0000000..f887475 --- /dev/null +++ b/examples/semantic_wait_demo_async.py @@ -0,0 +1,120 @@ +""" +Example: Semantic wait_for using query DSL (Async version) +Demonstrates waiting for elements using semantic selectors +""" + +import asyncio +import os + +from sentience.async_api import AsyncSentienceBrowser, click_async, wait_for_async + + +async def main(): + # Get API key from environment variable (optional - uses free tier if not set) + api_key = os.environ.get("SENTIENCE_API_KEY") + + async with AsyncSentienceBrowser(api_key=api_key, headless=False) as browser: + # Navigate to example.com + await browser.goto("https://example.com", wait_until="domcontentloaded") + + print("=== Semantic wait_for_async Demo ===\n") + + # Example 1: Wait for element by role + print("1. Waiting for link element (role=link)") + wait_result = await wait_for_async(browser, "role=link", timeout=5.0) + if wait_result.found: + print(f" ✅ Found after {wait_result.duration_ms}ms") + print(f" Element: '{wait_result.element.text}' (id: {wait_result.element.id})") + else: + print(f" ❌ Not found (timeout: {wait_result.timeout})") + print() + + # Example 2: Wait for element by role and text + print("2. Waiting for link with specific text") + wait_result = await wait_for_async(browser, "role=link text~'Example'", timeout=5.0) + if wait_result.found: + print(f" ✅ Found after {wait_result.duration_ms}ms") + print(f" Element text: '{wait_result.element.text}'") + else: + print(" ❌ Not found") + print() + + # Example 3: Wait for clickable element + print("3. 
Waiting for clickable element") + wait_result = await wait_for_async(browser, "clickable=true", timeout=5.0) + if wait_result.found: + print(f" ✅ Found clickable element after {wait_result.duration_ms}ms") + print(f" Role: {wait_result.element.role}") + print(f" Text: '{wait_result.element.text}'") + print(f" Is clickable: {wait_result.element.visual_cues.is_clickable}") + else: + print(" ❌ Not found") + print() + + # Example 4: Wait for element with importance threshold + print("4. Waiting for important element (importance > 100)") + wait_result = await wait_for_async(browser, "importance>100", timeout=5.0) + if wait_result.found: + print(f" ✅ Found important element after {wait_result.duration_ms}ms") + print(f" Importance: {wait_result.element.importance}") + print(f" Role: {wait_result.element.role}") + else: + print(" ❌ Not found") + print() + + # Example 5: Wait and then click + print("5. Wait for element, then click it") + wait_result = await wait_for_async(browser, "role=link", timeout=5.0) + if wait_result.found: + print(" ✅ Found element, clicking...") + click_result = await click_async(browser, wait_result.element.id) + print( + f" Click result: success={click_result.success}, outcome={click_result.outcome}" + ) + if click_result.url_changed: + print(f" ✅ Navigation occurred: {browser.page.url}") + else: + print(" ❌ Element not found, cannot click") + print() + + # Example 6: Using local extension (fast polling) + print("6. Using local extension with auto-optimized interval") + print(" When use_api=False, interval auto-adjusts to 0.25s (fast)") + wait_result = await wait_for_async(browser, "role=link", timeout=5.0, use_api=False) + if wait_result.found: + print(f" ✅ Found after {wait_result.duration_ms}ms") + print(" (Used local extension, polled every 0.25 seconds)") + print() + + # Example 7: Using remote API (slower polling) + print("7. 
Using remote API with auto-optimized interval") + print(" When use_api=True, interval auto-adjusts to 1.5s (network-friendly)") + if api_key: + wait_result = await wait_for_async(browser, "role=link", timeout=5.0, use_api=True) + if wait_result.found: + print(f" ✅ Found after {wait_result.duration_ms}ms") + print(" (Used remote API, polled every 1.5 seconds)") + else: + print(" ⚠️ Skipped (no API key set)") + print() + + # Example 8: Custom interval override + print("8. Custom interval override (manual control)") + print(" You can still specify custom interval if needed") + wait_result = await wait_for_async( + browser, "role=link", timeout=5.0, interval=0.5, use_api=False + ) + if wait_result.found: + print(f" ✅ Found after {wait_result.duration_ms}ms") + print(" (Custom interval: 0.5 seconds)") + print() + + print("✅ Semantic wait_for_async demo complete!") + print("\nNote: wait_for_async uses the semantic query DSL to find elements.") + print("This is more robust than CSS selectors because it understands") + print("the semantic meaning of elements (role, text, clickability, etc.).") + + +if __name__ == "__main__": + asyncio.run(main()) + diff --git a/examples/wait_and_click_async.py b/examples/wait_and_click_async.py new file mode 100644 index 0000000..1d2f754 --- /dev/null +++ b/examples/wait_and_click_async.py @@ -0,0 +1,64 @@ +""" +Example: Wait for element and click (Async version) +""" + +import asyncio +import os + +from sentience.async_api import ( + AsyncSentienceBrowser, + click_async, + expect_async, + find, + snapshot_async, + wait_for_async, +) + + +async def main(): + # Get API key from environment variable (optional - uses free tier if not set) + api_key = os.environ.get("SENTIENCE_API_KEY") + + async with AsyncSentienceBrowser(api_key=api_key, headless=False) as browser: + # Navigate to example.com + await browser.goto("https://example.com", wait_until="domcontentloaded") + + # Take initial snapshot + snap = await snapshot_async(browser) + + # 
Find a link + link = find(snap, "role=link") + + if link: + print(f"Found link: {link.text} (id: {link.id})") + + # Click it + result = await click_async(browser, link.id) + print(f"Click result: success={result.success}, outcome={result.outcome}") + + print(f"New URL: {browser.page.url}") + else: + print("No link found") + + # Example: Wait for element using wait_for_async + print("\n=== Wait Example ===") + await browser.goto("https://example.com", wait_until="domcontentloaded") + + wait_result = await wait_for_async(browser, "role=link", timeout=5.0) + if wait_result.found: + print(f"✅ Found element after {wait_result.duration_ms}ms") + else: + print(f"❌ Element not found (timeout: {wait_result.timeout})") + + # Example: Expect assertion + print("\n=== Expect Example ===") + try: + element = await expect_async(browser, "role=link").to_exist(timeout=5.0) + print(f"✅ Element exists: {element.text}") + except AssertionError as e: + print(f"❌ Assertion failed: {e}") + + +if __name__ == "__main__": + asyncio.run(main()) + diff --git a/pyproject.toml b/pyproject.toml index ac5a67b..06a3772 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "sentienceapi" -version = "0.90.16" +version = "0.90.17" description = "Python SDK for Sentience AI Agent Browser Automation" readme = "README.md" requires-python = ">=3.11" diff --git a/sentience/__init__.py b/sentience/__init__.py index 96643ab..bae871d 100644 --- a/sentience/__init__.py +++ b/sentience/__init__.py @@ -70,7 +70,7 @@ ) from .wait import wait_for -__version__ = "0.90.16" +__version__ = "0.90.17" __all__ = [ # Core SDK diff --git a/sentience/agent.py b/sentience/agent.py index 404e507..9cf4367 100644 --- a/sentience/agent.py +++ b/sentience/agent.py @@ -3,13 +3,14 @@ Implements observe-think-act loop for natural language commands """ +import asyncio import re import time from typing import TYPE_CHECKING, Any, Optional -from .actions import click, 
press, type_text -from .base_agent import BaseAgent -from .browser import SentienceBrowser +from .actions import click, click_async, press, press_async, type_text, type_text_async +from .base_agent import BaseAgent, BaseAgentAsync +from .browser import AsyncSentienceBrowser, SentienceBrowser from .llm_provider import LLMProvider, LLMResponse from .models import ( ActionHistory, @@ -21,7 +22,7 @@ SnapshotOptions, TokenStats, ) -from .snapshot import snapshot +from .snapshot import snapshot, snapshot_async if TYPE_CHECKING: from .agent_config import AgentConfig @@ -670,3 +671,576 @@ def _extract_keywords(self, text: str) -> list[str]: } words = text.split() return [w for w in words if w not in stopwords and len(w) > 2] + + +class SentienceAgentAsync(BaseAgentAsync): + """ + High-level async agent that combines Sentience SDK with any LLM provider. + + Uses observe-think-act loop to execute natural language commands: + 1. OBSERVE: Get snapshot of current page state + 2. THINK: Query LLM to decide next action + 3. ACT: Execute action using SDK + + Example: + >>> from sentience.async_api import AsyncSentienceBrowser + >>> from sentience.agent import SentienceAgentAsync + >>> from sentience.llm_provider import OpenAIProvider + >>> + >>> async with AsyncSentienceBrowser() as browser: + >>> await browser.goto("https://google.com") + >>> llm = OpenAIProvider(api_key="openai_key", model="gpt-4o") + >>> agent = SentienceAgentAsync(browser, llm) + >>> await agent.act("Click the search box") + >>> await agent.act("Type 'magic mouse' into the search field") + >>> await agent.act("Press Enter key") + """ + + def __init__( + self, + browser: AsyncSentienceBrowser, + llm: LLMProvider, + default_snapshot_limit: int = 50, + verbose: bool = True, + tracer: Optional["Tracer"] = None, + config: Optional["AgentConfig"] = None, + ): + """ + Initialize Sentience Agent (async) + + Args: + browser: AsyncSentienceBrowser instance + llm: LLM provider (OpenAIProvider, AnthropicProvider, etc.) 
+ default_snapshot_limit: Default maximum elements to include in context (default: 50) + verbose: Print execution logs (default: True) + tracer: Optional Tracer instance for execution tracking (default: None) + config: Optional AgentConfig for advanced configuration (default: None) + """ + self.browser = browser + self.llm = llm + self.default_snapshot_limit = default_snapshot_limit + self.verbose = verbose + self.tracer = tracer + self.config = config + + # Execution history + self.history: list[dict[str, Any]] = [] + + # Token usage tracking (will be converted to TokenStats on get_token_stats()) + self._token_usage_raw = { + "total_prompt_tokens": 0, + "total_completion_tokens": 0, + "total_tokens": 0, + "by_action": [], + } + + # Step counter for tracing + self._step_count = 0 + + async def act( # noqa: C901 + self, + goal: str, + max_retries: int = 2, + snapshot_options: SnapshotOptions | None = None, + ) -> AgentActionResult: + """ + Execute a high-level goal using observe → think → act loop (async) + + Args: + goal: Natural language instruction (e.g., "Click the Sign In button") + max_retries: Number of retries on failure (default: 2) + snapshot_options: Optional SnapshotOptions for this specific action + + Returns: + AgentActionResult with execution details + + Example: + >>> result = await agent.act("Click the search box") + >>> print(result.success, result.action, result.element_id) + True click 42 + """ + if self.verbose: + print(f"\n{'=' * 70}") + print(f"🤖 Agent Goal: {goal}") + print(f"{'=' * 70}") + + # Generate step ID for tracing + self._step_count += 1 + step_id = f"step-{self._step_count}" + + # Emit step_start trace event if tracer is enabled + if self.tracer: + pre_url = self.browser.page.url if self.browser.page else None + self.tracer.emit_step_start( + step_id=step_id, + step_index=self._step_count, + goal=goal, + attempt=0, + pre_url=pre_url, + ) + + for attempt in range(max_retries + 1): + try: + # 1. 
OBSERVE: Get refined semantic snapshot + start_time = time.time() + + # Use provided options or create default + snap_opts = snapshot_options or SnapshotOptions(limit=self.default_snapshot_limit) + # Only set goal if not already provided + if snap_opts.goal is None: + snap_opts.goal = goal + + # Call snapshot with options object (matches TypeScript API) + snap = await snapshot_async(self.browser, snap_opts) + + if snap.status != "success": + raise RuntimeError(f"Snapshot failed: {snap.error}") + + # Apply element filtering based on goal + filtered_elements = self.filter_elements(snap, goal) + + # Emit snapshot trace event if tracer is enabled + if self.tracer: + # Include element data for live overlay visualization + # Use filtered_elements for overlay (only relevant elements) + elements_data = [ + { + "id": el.id, + "bbox": { + "x": el.bbox.x, + "y": el.bbox.y, + "width": el.bbox.width, + "height": el.bbox.height, + }, + "role": el.role, + "text": el.text[:50] if el.text else "", # Truncate for brevity + } + for el in filtered_elements[:50] # Limit to first 50 for performance + ] + + self.tracer.emit( + "snapshot", + { + "url": snap.url, + "element_count": len(snap.elements), + "timestamp": snap.timestamp, + "elements": elements_data, # Add element data for overlay + }, + step_id=step_id, + ) + + # Create filtered snapshot + filtered_snap = Snapshot( + status=snap.status, + timestamp=snap.timestamp, + url=snap.url, + viewport=snap.viewport, + elements=filtered_elements, + screenshot=snap.screenshot, + screenshot_format=snap.screenshot_format, + error=snap.error, + ) + + # 2. GROUND: Format elements for LLM context + context = self._build_context(filtered_snap, goal) + + # 3. 
THINK: Query LLM for next action + llm_response = self._query_llm(context, goal) + + # Emit LLM query trace event if tracer is enabled + if self.tracer: + self.tracer.emit( + "llm_query", + { + "prompt_tokens": llm_response.prompt_tokens, + "completion_tokens": llm_response.completion_tokens, + "model": llm_response.model_name, + "response": llm_response.content[:200], # Truncate for brevity + }, + step_id=step_id, + ) + + if self.verbose: + print(f"🧠 LLM Decision: {llm_response.content}") + + # Track token usage + self._track_tokens(goal, llm_response) + + # Parse action from LLM response + action_str = self._extract_action_from_response(llm_response.content) + + # 4. EXECUTE: Parse and run action + result_dict = await self._execute_action(action_str, filtered_snap) + + duration_ms = int((time.time() - start_time) * 1000) + + # Create AgentActionResult from execution result + result = AgentActionResult( + success=result_dict["success"], + action=result_dict["action"], + goal=goal, + duration_ms=duration_ms, + attempt=attempt, + element_id=result_dict.get("element_id"), + text=result_dict.get("text"), + key=result_dict.get("key"), + outcome=result_dict.get("outcome"), + url_changed=result_dict.get("url_changed"), + error=result_dict.get("error"), + message=result_dict.get("message"), + ) + + # Emit action execution trace event if tracer is enabled + if self.tracer: + post_url = self.browser.page.url if self.browser.page else None + + # Include element data for live overlay visualization + elements_data = [ + { + "id": el.id, + "bbox": { + "x": el.bbox.x, + "y": el.bbox.y, + "width": el.bbox.width, + "height": el.bbox.height, + }, + "role": el.role, + "text": el.text[:50] if el.text else "", + } + for el in filtered_snap.elements[:50] + ] + + self.tracer.emit( + "action", + { + "action": result.action, + "element_id": result.element_id, + "success": result.success, + "outcome": result.outcome, + "duration_ms": duration_ms, + "post_url": post_url, + "elements": 
elements_data, # Add element data for overlay + "target_element_id": result.element_id, # Highlight target in red + }, + step_id=step_id, + ) + + # 5. RECORD: Track history + self.history.append( + { + "goal": goal, + "action": action_str, + "result": result.model_dump(), # Store as dict + "success": result.success, + "attempt": attempt, + "duration_ms": duration_ms, + } + ) + + if self.verbose: + status = "✅" if result.success else "❌" + print(f"{status} Completed in {duration_ms}ms") + + # Emit step completion trace event if tracer is enabled + if self.tracer: + self.tracer.emit( + "step_end", + { + "success": result.success, + "duration_ms": duration_ms, + "action": result.action, + }, + step_id=step_id, + ) + + return result + + except Exception as e: + # Emit error trace event if tracer is enabled + if self.tracer: + self.tracer.emit_error(step_id=step_id, error=str(e), attempt=attempt) + + if attempt < max_retries: + if self.verbose: + print(f"⚠️ Retry {attempt + 1}/{max_retries}: {e}") + await asyncio.sleep(1.0) # Brief delay before retry + continue + else: + # Create error result + error_result = AgentActionResult( + success=False, + action="error", + goal=goal, + duration_ms=0, + attempt=attempt, + error=str(e), + ) + self.history.append( + { + "goal": goal, + "action": "error", + "result": error_result.model_dump(), + "success": False, + "attempt": attempt, + "duration_ms": 0, + } + ) + raise RuntimeError(f"Failed after {max_retries} retries: {e}") + + def _build_context(self, snap: Snapshot, goal: str) -> str: + """Convert snapshot elements to token-efficient prompt string (same as sync version)""" + lines = [] + # Note: elements are already filtered by filter_elements() in act() + for el in snap.elements: + # Extract visual cues + cues = [] + if el.visual_cues.is_primary: + cues.append("PRIMARY") + if el.visual_cues.is_clickable: + cues.append("CLICKABLE") + if el.visual_cues.background_color_name: + 
cues.append(f"color:{el.visual_cues.background_color_name}") + + # Format element line + cues_str = f" {{{','.join(cues)}}}" if cues else "" + text_preview = ( + (el.text[:50] + "...") if el.text and len(el.text) > 50 else (el.text or "") + ) + + lines.append( + f'[{el.id}] <{el.role}> "{text_preview}"{cues_str} ' + f"@ ({int(el.bbox.x)},{int(el.bbox.y)}) (Imp:{el.importance})" + ) + + return "\n".join(lines) + + def _extract_action_from_response(self, response: str) -> str: + """Extract action command from LLM response (same as sync version)""" + # Remove markdown code blocks if present + response = re.sub(r"```[\w]*\n?", "", response) + response = response.strip() + + # Try to find action patterns in the response + # Pattern matches: CLICK(123), TYPE(123, "text"), PRESS("key"), FINISH() + action_pattern = r'(CLICK\s*\(\s*\d+\s*\)|TYPE\s*\(\s*\d+\s*,\s*["\'].*?["\']\s*\)|PRESS\s*\(\s*["\'].*?["\']\s*\)|FINISH\s*\(\s*\))' + + match = re.search(action_pattern, response, re.IGNORECASE) + if match: + return match.group(1) + + # If no pattern match, return the original response (will likely fail parsing) + return response + + def _query_llm(self, dom_context: str, goal: str) -> LLMResponse: + """Query LLM with standardized prompt template (same as sync version)""" + system_prompt = f"""You are an AI web automation agent. + +GOAL: {goal} + +VISIBLE ELEMENTS (sorted by importance): +{dom_context} + +VISUAL CUES EXPLAINED: +- {{PRIMARY}}: Main call-to-action element on the page +- {{CLICKABLE}}: Element is clickable +- {{color:X}}: Background color name + +CRITICAL RESPONSE FORMAT: +You MUST respond with ONLY ONE of these exact action formats: +- CLICK(id) - Click element by ID +- TYPE(id, "text") - Type text into element +- PRESS("key") - Press keyboard key (Enter, Escape, Tab, ArrowDown, etc) +- FINISH() - Task complete + +DO NOT include any explanation, reasoning, or natural language. +DO NOT use markdown formatting or code blocks. +DO NOT say "The next step is..." 
or anything similar. + +CORRECT Examples: +CLICK(42) +TYPE(15, "magic mouse") +PRESS("Enter") +FINISH() + +INCORRECT Examples (DO NOT DO THIS): +"The next step is to click..." +"I will type..." +```CLICK(42)``` +""" + + user_prompt = "Return the single action command:" + + return self.llm.generate(system_prompt, user_prompt, temperature=0.0) + + async def _execute_action(self, action_str: str, snap: Snapshot) -> dict[str, Any]: + """ + Parse action string and execute SDK call (async) + + Args: + action_str: Action string from LLM (e.g., "CLICK(42)") + snap: Current snapshot (for context) + + Returns: + Execution result dictionary + """ + # Parse CLICK(42) + if match := re.match(r"CLICK\s*\(\s*(\d+)\s*\)", action_str, re.IGNORECASE): + element_id = int(match.group(1)) + result = await click_async(self.browser, element_id) + return { + "success": result.success, + "action": "click", + "element_id": element_id, + "outcome": result.outcome, + "url_changed": result.url_changed, + } + + # Parse TYPE(42, "hello world") + elif match := re.match( + r'TYPE\s*\(\s*(\d+)\s*,\s*["\']([^"\']*)["\']\s*\)', + action_str, + re.IGNORECASE, + ): + element_id = int(match.group(1)) + text = match.group(2) + result = await type_text_async(self.browser, element_id, text) + return { + "success": result.success, + "action": "type", + "element_id": element_id, + "text": text, + "outcome": result.outcome, + } + + # Parse PRESS("Enter") + elif match := re.match(r'PRESS\s*\(\s*["\']([^"\']+)["\']\s*\)', action_str, re.IGNORECASE): + key = match.group(1) + result = await press_async(self.browser, key) + return { + "success": result.success, + "action": "press", + "key": key, + "outcome": result.outcome, + } + + # Parse FINISH() + elif re.match(r"FINISH\s*\(\s*\)", action_str, re.IGNORECASE): + return { + "success": True, + "action": "finish", + "message": "Task marked as complete", + } + + else: + raise ValueError( + f"Unknown action format: {action_str}\n" + f'Expected: CLICK(id), TYPE(id, 
"text"), PRESS("key"), or FINISH()' + ) + + def _track_tokens(self, goal: str, llm_response: LLMResponse): + """Track token usage for analytics (same as sync version)""" + if llm_response.prompt_tokens: + self._token_usage_raw["total_prompt_tokens"] += llm_response.prompt_tokens + if llm_response.completion_tokens: + self._token_usage_raw["total_completion_tokens"] += llm_response.completion_tokens + if llm_response.total_tokens: + self._token_usage_raw["total_tokens"] += llm_response.total_tokens + + self._token_usage_raw["by_action"].append( + { + "goal": goal, + "prompt_tokens": llm_response.prompt_tokens or 0, + "completion_tokens": llm_response.completion_tokens or 0, + "total_tokens": llm_response.total_tokens or 0, + "model": llm_response.model_name, + } + ) + + def get_token_stats(self) -> TokenStats: + """Get token usage statistics (same as sync version)""" + by_action = [ActionTokenUsage(**action) for action in self._token_usage_raw["by_action"]] + return TokenStats( + total_prompt_tokens=self._token_usage_raw["total_prompt_tokens"], + total_completion_tokens=self._token_usage_raw["total_completion_tokens"], + total_tokens=self._token_usage_raw["total_tokens"], + by_action=by_action, + ) + + def get_history(self) -> list[ActionHistory]: + """Get execution history (same as sync version)""" + return [ActionHistory(**h) for h in self.history] + + def clear_history(self) -> None: + """Clear execution history and reset token counters (same as sync version)""" + self.history.clear() + self._token_usage_raw = { + "total_prompt_tokens": 0, + "total_completion_tokens": 0, + "total_tokens": 0, + "by_action": [], + } + + def filter_elements(self, snapshot: Snapshot, goal: str | None = None) -> list[Element]: + """Filter elements from snapshot based on goal context (same as sync version)""" + elements = snapshot.elements + + # If no goal provided, return all elements (up to limit) + if not goal: + return elements[: self.default_snapshot_limit] + + goal_lower = 
goal.lower() + + # Extract keywords from goal + keywords = self._extract_keywords(goal_lower) + + # Boost elements matching goal keywords + scored_elements = [] + for el in elements: + score = el.importance + + # Boost if element text matches goal + if el.text and any(kw in el.text.lower() for kw in keywords): + score += 0.3 + + # Boost if role matches goal intent + if "click" in goal_lower and el.visual_cues.is_clickable: + score += 0.2 + if "type" in goal_lower and el.role in ["textbox", "searchbox"]: + score += 0.2 + if "search" in goal_lower: + # Filter out non-interactive elements for search tasks + if el.role in ["link", "img"] and not el.visual_cues.is_primary: + score -= 0.5 + + scored_elements.append((score, el)) + + # Re-sort by boosted score + scored_elements.sort(key=lambda x: x[0], reverse=True) + elements = [el for _, el in scored_elements] + + return elements[: self.default_snapshot_limit] + + def _extract_keywords(self, text: str) -> list[str]: + """Extract meaningful keywords from goal text (same as sync version)""" + stopwords = { + "the", + "a", + "an", + "and", + "or", + "but", + "in", + "on", + "at", + "to", + "for", + "of", + "with", + "by", + "from", + "as", + "is", + "was", + } + words = text.split() + return [w for w in words if w not in stopwords and len(w) > 2] diff --git a/sentience/async_api.py b/sentience/async_api.py index fc4e168..70fec1b 100644 --- a/sentience/async_api.py +++ b/sentience/async_api.py @@ -25,12 +25,18 @@ # Re-export async action functions from actions.py from sentience.actions import click_async, click_rect_async, press_async, type_text_async +# ========== Phase 2C: Agent Layer ========== +# Re-export async agent classes from agent.py and base_agent.py +from sentience.agent import SentienceAgentAsync +from sentience.base_agent import BaseAgentAsync + # ========== Browser ========== # Re-export AsyncSentienceBrowser from browser.py (moved there for better organization) from sentience.browser import 
AsyncSentienceBrowser # Re-export async expect functions from expect.py from sentience.expect import ExpectationAsync, expect_async +from sentience.inspector import InspectorAsync, inspect_async # Re-export async overlay functions from overlay.py from sentience.overlay import clear_overlay_async, show_overlay_async @@ -43,6 +49,10 @@ # Re-export async read function from read.py from sentience.read import read_async +# ========== Phase 2D: Developer Tools ========== +# Re-export async recorder and inspector from their modules +from sentience.recorder import RecorderAsync, record_async + # Re-export async screenshot function from screenshot.py from sentience.screenshot import screenshot_async @@ -57,17 +67,6 @@ # Re-export async wait function from wait.py from sentience.wait import wait_for_async -# ========== Phase 2C: Agent Layer (Future) ========== -# TODO: Re-export when implemented -# from sentience.agent import SentienceAgentAsync -# from sentience.base_agent import BaseAgentAsync - -# ========== Phase 2D: Developer Tools (Future) ========== -# TODO: Re-export when implemented -# from sentience.recorder import RecorderAsync -# from sentience.inspector import InspectorAsync - - __all__ = [ # Browser "AsyncSentienceBrowser", # Re-exported from browser.py @@ -88,12 +87,14 @@ "clear_overlay_async", # Re-exported from overlay.py "expect_async", # Re-exported from expect.py "ExpectationAsync", # Re-exported from expect.py - # Phase 2C: Agent Layer (Future - uncomment when implemented) - # "SentienceAgentAsync", - # "BaseAgentAsync", - # Phase 2D: Developer Tools (Future - uncomment when implemented) - # "RecorderAsync", - # "InspectorAsync", + # Phase 2C: Agent Layer + "SentienceAgentAsync", # Re-exported from agent.py + "BaseAgentAsync", # Re-exported from base_agent.py + # Phase 2D: Developer Tools + "RecorderAsync", # Re-exported from recorder.py + "record_async", # Re-exported from recorder.py + "InspectorAsync", # Re-exported from inspector.py + "inspect_async", 
# Re-exported from inspector.py # Query Functions "find", # Re-exported from query.py "query", # Re-exported from query.py diff --git a/sentience/base_agent.py b/sentience/base_agent.py index 07ce76f..a7c1e3c 100644 --- a/sentience/base_agent.py +++ b/sentience/base_agent.py @@ -99,3 +99,96 @@ def filter_elements(self, snapshot: Snapshot, goal: str | None = None) -> list[E >>> # filtered now contains only relevant elements """ return snapshot.elements + + +class BaseAgentAsync(ABC): + """ + Abstract base class for all async Sentience agents. + + Provides a standard interface for: + - Executing natural language goals (act) + - Tracking execution history + - Monitoring token usage + - Filtering elements based on goals + + Subclasses must implement: + - act(): Execute a natural language goal (async) + - get_history(): Return execution history + - get_token_stats(): Return token usage statistics + - clear_history(): Reset history and token counters + + Subclasses can override: + - filter_elements(): Customize element filtering logic + """ + + @abstractmethod + async def act(self, goal: str, **kwargs) -> AgentActionResult: + """ + Execute a natural language goal using the agent (async). + + Args: + goal: Natural language instruction (e.g., "Click the login button") + **kwargs: Additional parameters (implementation-specific) + + Returns: + AgentActionResult with execution details + + Raises: + RuntimeError: If execution fails after retries + """ + pass + + @abstractmethod + def get_history(self) -> list[ActionHistory]: + """ + Get the execution history of all actions taken. + + Returns: + List of ActionHistory entries + """ + pass + + @abstractmethod + def get_token_stats(self) -> TokenStats: + """ + Get token usage statistics for the agent session. + + Returns: + TokenStats with cumulative token counts + """ + pass + + @abstractmethod + def clear_history(self) -> None: + """ + Clear execution history and reset token counters. + + This resets the agent to a clean state. 
+ """ + pass + + def filter_elements(self, snapshot: Snapshot, goal: str | None = None) -> list[Element]: + """ + Filter elements from a snapshot based on goal context. + + Default implementation returns all elements unchanged. + Subclasses can override to implement custom filtering logic + such as: + - Removing irrelevant elements based on goal keywords + - Boosting importance of matching elements + - Filtering by role, size, or visual properties + + Args: + snapshot: Current page snapshot + goal: User's goal (can inform filtering strategy) + + Returns: + Filtered list of elements (default: all elements) + + Example: + >>> agent = SentienceAgentAsync(browser, llm) + >>> snap = await snapshot_async(browser) + >>> filtered = agent.filter_elements(snap, goal="Click login") + >>> # filtered now contains only relevant elements + """ + return snapshot.elements diff --git a/sentience/inspector.py b/sentience/inspector.py index 04d128e..8a84c9f 100644 --- a/sentience/inspector.py +++ b/sentience/inspector.py @@ -2,7 +2,7 @@ Inspector tool - helps developers see what the agent "sees" """ -from .browser import SentienceBrowser +from .browser import AsyncSentienceBrowser, SentienceBrowser class Inspector: @@ -183,3 +183,183 @@ def inspect(browser: SentienceBrowser) -> Inspector: Inspector instance """ return Inspector(browser) + + +class InspectorAsync: + """Inspector for debugging - shows element info on hover/click (async)""" + + def __init__(self, browser: AsyncSentienceBrowser): + self.browser = browser + self._active = False + self._last_element_id: int | None = None + + async def start(self) -> None: + """Start inspection mode - prints element info on mouse move/click (async)""" + if not self.browser.page: + raise RuntimeError("Browser not started. 
Call await browser.start() first.") + + self._active = True + + # Inject inspector script into page + await self.browser.page.evaluate( + """ + (() => { + // Remove existing inspector if any + if (window.__sentience_inspector_active) { + return; + } + + window.__sentience_inspector_active = true; + window.__sentience_last_element_id = null; + + // Get element at point + function getElementAtPoint(x, y) { + const el = document.elementFromPoint(x, y); + if (!el) return null; + + // Find element in registry + if (window.sentience_registry) { + for (let i = 0; i < window.sentience_registry.length; i++) { + if (window.sentience_registry[i] === el) { + return i; + } + } + } + return null; + } + + // Mouse move handler + function handleMouseMove(e) { + if (!window.__sentience_inspector_active) return; + + const elementId = getElementAtPoint(e.clientX, e.clientY); + if (elementId === null || elementId === window.__sentience_last_element_id) { + return; + } + + window.__sentience_last_element_id = elementId; + + // Get element info from snapshot if available + if (window.sentience && window.sentience_registry) { + const el = window.sentience_registry[elementId]; + if (el) { + const rect = el.getBoundingClientRect(); + const text = el.getAttribute('aria-label') || + el.value || + el.placeholder || + el.alt || + (el.innerText || '').substring(0, 50); + + const role = el.getAttribute('role') || el.tagName.toLowerCase(); + + console.log(`[Sentience Inspector] Element #${elementId}: role=${role}, text="${text}", bbox=(${Math.round(rect.x)}, ${Math.round(rect.y)}, ${Math.round(rect.width)}, ${Math.round(rect.height)})`); + } + } + } + + // Click handler + function handleClick(e) { + if (!window.__sentience_inspector_active) return; + + e.preventDefault(); + e.stopPropagation(); + + const elementId = getElementAtPoint(e.clientX, e.clientY); + if (elementId === null) return; + + // Get full element info + if (window.sentience && window.sentience_registry) { + const el = 
window.sentience_registry[elementId]; + if (el) { + const rect = el.getBoundingClientRect(); + const info = { + id: elementId, + tag: el.tagName.toLowerCase(), + role: el.getAttribute('role') || 'generic', + text: el.getAttribute('aria-label') || + el.value || + el.placeholder || + el.alt || + (el.innerText || '').substring(0, 100), + bbox: { + x: Math.round(rect.x), + y: Math.round(rect.y), + width: Math.round(rect.width), + height: Math.round(rect.height) + }, + attributes: { + id: el.id || null, + class: el.className || null, + name: el.name || null, + type: el.type || null + } + }; + + console.log('[Sentience Inspector] Clicked element:', JSON.stringify(info, null, 2)); + + // Also try to get from snapshot if available + window.sentience.snapshot({ limit: 100 }).then(snap => { + const element = snap.elements.find(el => el.id === elementId); + if (element) { + console.log('[Sentience Inspector] Snapshot element:', JSON.stringify(element, null, 2)); + } + }).catch(() => {}); + } + } + } + + // Add event listeners + document.addEventListener('mousemove', handleMouseMove, true); + document.addEventListener('click', handleClick, true); + + // Store cleanup function + window.__sentience_inspector_cleanup = () => { + document.removeEventListener('mousemove', handleMouseMove, true); + document.removeEventListener('click', handleClick, true); + window.__sentience_inspector_active = false; + }; + + console.log('[Sentience Inspector] ✅ Inspection mode active. 
async def stop(self) -> None:
    """
    Stop inspection mode (async)

    Marks the inspector inactive and, when the browser still has a page,
    runs the window.__sentience_inspector_cleanup hook in that page if the
    hook is defined.
    """
    # Clear the flag before the page check: when the page is already gone
    # there is nothing to clean up, but the inspector must not keep
    # reporting itself as active.
    self._active = False

    if not self.browser.page:
        return

    # Cleanup inspector
    await self.browser.page.evaluate(
        """
        () => {
            if (window.__sentience_inspector_cleanup) {
                window.__sentience_inspector_cleanup();
            }
        }
    """
    )
async def _infer_selector(self, element_id: int) -> str | None:  # noqa: C901
    """
    Infer a semantic selector for an element (async)

    Uses heuristics to build a robust selector:
    - role=... text~"..."
    - If text empty: use name/aria-label/placeholder
    - Include clickable=true when relevant
    - Validate against snapshot (must match at least one element)

    Args:
        element_id: Element ID from snapshot

    Returns:
        The selector string, or None when the element is not in the snapshot,
        no selector part could be built, the selector matches nothing, or any
        step fails (inference is best-effort; the caller then falls back to
        the element ID).
    """

    def quote(value: str) -> str:
        # Escape embedded double quotes so the value stays inside its "..." pair
        return value.replace('"', '\\"')

    try:
        # Take a snapshot to get element info
        snap = await snapshot_async(self.browser)

        # Find the element in the snapshot
        element = next((el for el in snap.elements if el.id == element_id), None)
        if not element:
            return None

        # Build candidate selector
        parts = []

        # Add role
        if element.role and element.role != "generic":
            parts.append(f"role={element.role}")

        # Add text if available
        if element.text:
            # Use contains match for text. Truncate BEFORE escaping: cutting an
            # already-escaped string at 50 chars can split a \" pair and leave a
            # trailing backslash that escapes the selector's closing quote.
            text_part = quote(element.text[:50])
            parts.append(f'text~"{text_part}"')
        else:
            # Try to get name/aria-label/placeholder from DOM
            try:
                dom_attrs = await self.browser.page.evaluate(
                    f"""
                    () => {{
                        const el = window.sentience_registry[{element_id}];
                        if (!el) return null;
                        return {{
                            name: el.name || null,
                            ariaLabel: el.getAttribute('aria-label') || null,
                            placeholder: el.placeholder || null
                        }};
                    }}
                """
                )

                if dom_attrs:
                    # Same quote escaping as the text branch above
                    if dom_attrs.get("name"):
                        parts.append(f'name="{quote(dom_attrs["name"])}"')
                    elif dom_attrs.get("ariaLabel"):
                        parts.append(f'text~"{quote(dom_attrs["ariaLabel"])}"')
                    elif dom_attrs.get("placeholder"):
                        parts.append(f'text~"{quote(dom_attrs["placeholder"])}"')
            except Exception:
                # DOM lookup is optional; continue with the parts collected so far
                pass

        # Add clickable if relevant
        if element.visual_cues.is_clickable:
            parts.append("clickable=true")

        if not parts:
            return None

        selector = " ".join(parts)

        # Validate selector against the snapshot. An ambiguous selector (more
        # than one match) is still returned; extra constraints such as an
        # importance threshold are not implemented yet.
        if any(self._match_element(el, selector) for el in snap.elements):
            return selector

        # Selector doesn't match - return None (will use element_id)
        return None

    except Exception:
        return None
Element, selector: str) -> bool: + """Simple selector matching (basic implementation)""" + # This is a simplified version - in production, use the full query engine + from .query import match_element, parse_selector + + try: + query_dict = parse_selector(selector) + return match_element(element, query_dict) + except Exception: + return False + + def record_navigation(self, url: str) -> None: + """Record a navigation event""" + if self._active and self.trace: + self.trace.add_navigation(url) + + async def record_click(self, element_id: int, selector: str | None = None) -> None: + """Record a click event with smart selector inference (async)""" + if self._active and self.trace: + # If no selector provided, try to infer one + if selector is None: + selector = await self._infer_selector(element_id) + + # Optionally capture snapshot + if self.capture_snapshots: + try: + snap = await snapshot_async(self.browser) + step = TraceStep( + ts=int((datetime.now() - self.trace._start_time).total_seconds() * 1000), + type="click", + element_id=element_id, + selector=selector, + snapshot=snap, + ) + self.trace.add_step(step) + except Exception: + # If snapshot fails, just record without it + self.trace.add_click(element_id, selector) + else: + self.trace.add_click(element_id, selector) + + async def record_type(self, element_id: int, text: str, selector: str | None = None) -> None: + """Record a type event with smart selector inference (async)""" + if self._active and self.trace: + # If no selector provided, try to infer one + if selector is None: + selector = await self._infer_selector(element_id) + + mask = self._should_mask(text) + self.trace.add_type(element_id, text, selector, mask=mask) + + def record_press(self, key: str) -> None: + """Record a key press event""" + if self._active and self.trace: + self.trace.add_press(key) + + def save(self, filepath: str) -> None: + """Save trace to file""" + if not self.trace: + raise RuntimeError("No trace to save. 
@pytest.mark.asyncio
@pytest.mark.requires_extension
async def test_base_agent_async_interface():
    """Test BaseAgentAsync is an abstract class with the expected interface"""
    import inspect

    # BaseAgentAsync should be abstract and cannot be instantiated
    assert inspect.isabstract(BaseAgentAsync)
    with pytest.raises(TypeError):
        BaseAgentAsync()

    # Subclasses are required to implement these methods
    required = {"act", "get_history", "get_token_stats", "clear_history"}
    assert required <= set(BaseAgentAsync.__abstractmethods__)

    # act() is the async entry point
    assert inspect.iscoroutinefunction(BaseAgentAsync.act)

    # filter_elements() ships with a default implementation, so it is optional
    assert hasattr(BaseAgentAsync, "filter_elements")
    assert "filter_elements" not in BaseAgentAsync.__abstractmethods__
+async def test_sentience_agent_async_initialization(): + """Test SentienceAgentAsync can be initialized""" + from sentience.llm_provider import LLMProvider, LLMResponse + + # Create a simple mock LLM provider + class MockLLMProvider(LLMProvider): + def generate(self, system_prompt: str, user_prompt: str, **kwargs) -> LLMResponse: + return LLMResponse( + content="CLICK(1)", + model_name="mock", + prompt_tokens=10, + completion_tokens=5, + total_tokens=15, + ) + + def supports_json_mode(self) -> bool: + return True + + @property + def model_name(self) -> str: + return "mock-model" + + async with AsyncSentienceBrowser() as browser: + await browser.goto("https://example.com") + await browser.page.wait_for_load_state("networkidle") + + # Create a mock LLM provider + llm = MockLLMProvider() + agent = SentienceAgentAsync(browser, llm, verbose=False) + + assert agent.browser == browser + assert agent.llm == llm + assert agent.default_snapshot_limit == 50 + assert len(agent.history) == 0 + + # Test history methods + history = agent.get_history() + assert isinstance(history, list) + assert len(history) == 0 + + stats = agent.get_token_stats() + assert stats.total_tokens == 0 + assert stats.total_prompt_tokens == 0 + assert stats.total_completion_tokens == 0 + + # Test clear_history + agent.clear_history() + assert len(agent.history) == 0 + + +# ========== Phase 2D: Developer Tools Tests ========== + + +@pytest.mark.asyncio +@pytest.mark.requires_extension +async def test_recorder_async_initialization(): + """Test RecorderAsync can be initialized""" + async with AsyncSentienceBrowser() as browser: + await browser.goto("https://example.com") + await browser.page.wait_for_load_state("networkidle") + + recorder = RecorderAsync(browser, capture_snapshots=False) + assert recorder.browser == browser + assert recorder.capture_snapshots is False + assert recorder._active is False + assert recorder.trace is None + + +@pytest.mark.asyncio +@pytest.mark.requires_extension +async def 
test_recorder_async_context_manager(): + """Test RecorderAsync context manager""" + async with AsyncSentienceBrowser() as browser: + await browser.goto("https://example.com") + await browser.page.wait_for_load_state("networkidle") + + async with RecorderAsync(browser) as recorder: + assert recorder._active is True + assert recorder.trace is not None + assert recorder.trace.start_url == browser.page.url + + # After context exit, recorder should be stopped + assert recorder._active is False + + +@pytest.mark.asyncio +@pytest.mark.requires_extension +async def test_recorder_async_record_methods(): + """Test RecorderAsync record methods""" + async with AsyncSentienceBrowser() as browser: + await browser.goto("https://example.com") + await browser.page.wait_for_load_state("networkidle") + + recorder = RecorderAsync(browser) + await recorder.start() + + # Record navigation + recorder.record_navigation("https://example.com/page2") + assert len(recorder.trace.steps) == 1 + assert recorder.trace.steps[0].type == "navigation" + + # Record press + recorder.record_press("Enter") + assert len(recorder.trace.steps) == 2 + assert recorder.trace.steps[1].type == "press" + assert recorder.trace.steps[1].key == "Enter" + + recorder.stop() + + +@pytest.mark.asyncio +@pytest.mark.requires_extension +async def test_record_async_function(): + """Test record_async convenience function""" + async with AsyncSentienceBrowser() as browser: + await browser.goto("https://example.com") + await browser.page.wait_for_load_state("networkidle") + + recorder = record_async(browser, capture_snapshots=False) + assert isinstance(recorder, RecorderAsync) + assert recorder.browser == browser + assert recorder.capture_snapshots is False + + +@pytest.mark.asyncio +@pytest.mark.requires_extension +async def test_inspector_async_initialization(): + """Test InspectorAsync can be initialized""" + async with AsyncSentienceBrowser() as browser: + await browser.goto("https://example.com") + await 
browser.page.wait_for_load_state("networkidle") + + inspector = InspectorAsync(browser) + assert inspector.browser == browser + assert inspector._active is False + assert inspector._last_element_id is None + + +@pytest.mark.asyncio +@pytest.mark.requires_extension +async def test_inspector_async_context_manager(): + """Test InspectorAsync context manager""" + async with AsyncSentienceBrowser() as browser: + await browser.goto("https://example.com") + await browser.page.wait_for_load_state("networkidle") + + async with InspectorAsync(browser) as inspector: + assert inspector._active is True + + # After context exit, inspector should be stopped + assert inspector._active is False + + +@pytest.mark.asyncio +@pytest.mark.requires_extension +async def test_inspect_async_function(): + """Test inspect_async convenience function""" + async with AsyncSentienceBrowser() as browser: + await browser.goto("https://example.com") + await browser.page.wait_for_load_state("networkidle") + + inspector = inspect_async(browser) + assert isinstance(inspector, InspectorAsync) + assert inspector.browser == browser From f958e54734928ae4ba1b6590f8a09be770958dc7 Mon Sep 17 00:00:00 2001 From: rcholic Date: Thu, 1 Jan 2026 07:54:19 -0800 Subject: [PATCH 4/7] add dep --- pyproject.toml | 1 + sentience/snapshot.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 06a3772..1b502ab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,7 @@ dependencies = [ "pydantic>=2.0.0", "jsonschema>=4.0.0", "requests>=2.31.0", # For server-side API calls + "aiohttp>=3.9.0", # For async API calls "playwright-stealth>=1.0.6", # Bot evasion and stealth mode "markdownify>=0.11.6", # Enhanced HTML to Markdown conversion ] diff --git a/sentience/snapshot.py b/sentience/snapshot.py index dcacd17..c55ea0c 100644 --- a/sentience/snapshot.py +++ b/sentience/snapshot.py @@ -8,7 +8,6 @@ import time from typing import Any, Optional -import aiohttp import 
requests from .browser import AsyncSentienceBrowser, SentienceBrowser @@ -468,6 +467,9 @@ async def _snapshot_via_api_async( } try: + # Lazy import aiohttp - only needed for async API calls + import aiohttp + async with aiohttp.ClientSession() as session: async with session.post( f"{browser.api_url}/v1/snapshot", From 2d3fb2dd132d1d929bc24f4c41a748c53aa66b62 Mon Sep 17 00:00:00 2001 From: rcholic Date: Thu, 1 Jan 2026 08:02:35 -0800 Subject: [PATCH 5/7] change to httpx --- pyproject.toml | 4 ++-- sentience/snapshot.py | 21 ++++++++++----------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1b502ab..6956f63 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,8 +25,8 @@ dependencies = [ "playwright>=1.40.0", "pydantic>=2.0.0", "jsonschema>=4.0.0", - "requests>=2.31.0", # For server-side API calls - "aiohttp>=3.9.0", # For async API calls + "requests>=2.31.0", # For server-side API calls (sync) + "httpx>=0.25.0", # For async API calls "playwright-stealth>=1.0.6", # Bot evasion and stealth mode "markdownify>=0.11.6", # Enhanced HTML to Markdown conversion ] diff --git a/sentience/snapshot.py b/sentience/snapshot.py index c55ea0c..4f74bb6 100644 --- a/sentience/snapshot.py +++ b/sentience/snapshot.py @@ -467,18 +467,17 @@ async def _snapshot_via_api_async( } try: - # Lazy import aiohttp - only needed for async API calls - import aiohttp + # Lazy import httpx - only needed for async API calls + import httpx - async with aiohttp.ClientSession() as session: - async with session.post( + async with httpx.AsyncClient(timeout=30.0) as client: + response = await client.post( f"{browser.api_url}/v1/snapshot", - data=payload_json, + content=payload_json, headers=headers, - timeout=aiohttp.ClientTimeout(total=30), - ) as response: - response.raise_for_status() - api_result = await response.json() + ) + response.raise_for_status() + api_result = response.json() # Merge API result with local data snapshot_data = { @@ -509,9 
+508,9 @@ async def _snapshot_via_api_async( return Snapshot(**snapshot_data) except ImportError: - # Fallback to requests if aiohttp not available (shouldn't happen in async context) + # Fallback to requests if httpx not available (shouldn't happen in async context) raise RuntimeError( - "aiohttp is required for async API calls. Install it with: pip install aiohttp" + "httpx is required for async API calls. Install it with: pip install httpx" ) except Exception as e: raise RuntimeError(f"API request failed: {e}") From 21939eb6208d68a76df2d9eac09c92e2e3af351a Mon Sep 17 00:00:00 2001 From: rcholic Date: Thu, 1 Jan 2026 08:02:52 -0800 Subject: [PATCH 6/7] change to httpx --- examples/basic_agent_async.py | 1 - examples/hello_async.py | 1 - examples/query_demo_async.py | 1 - examples/read_markdown_async.py | 1 - examples/semantic_wait_demo_async.py | 1 - examples/wait_and_click_async.py | 1 - 6 files changed, 6 deletions(-) diff --git a/examples/basic_agent_async.py b/examples/basic_agent_async.py index f1781d5..6559ec2 100644 --- a/examples/basic_agent_async.py +++ b/examples/basic_agent_async.py @@ -92,4 +92,3 @@ async def main(): if __name__ == "__main__": asyncio.run(main()) - diff --git a/examples/hello_async.py b/examples/hello_async.py index c44a8b9..63cf150 100644 --- a/examples/hello_async.py +++ b/examples/hello_async.py @@ -63,4 +63,3 @@ async def main(): if __name__ == "__main__": asyncio.run(main()) - diff --git a/examples/query_demo_async.py b/examples/query_demo_async.py index b0a7bdf..c4c98ba 100644 --- a/examples/query_demo_async.py +++ b/examples/query_demo_async.py @@ -47,4 +47,3 @@ async def main(): if __name__ == "__main__": asyncio.run(main()) - diff --git a/examples/read_markdown_async.py b/examples/read_markdown_async.py index 41a0c14..66268ac 100644 --- a/examples/read_markdown_async.py +++ b/examples/read_markdown_async.py @@ -56,4 +56,3 @@ async def main(): if __name__ == "__main__": asyncio.run(main()) - diff --git 
a/examples/semantic_wait_demo_async.py b/examples/semantic_wait_demo_async.py index f887475..0125543 100644 --- a/examples/semantic_wait_demo_async.py +++ b/examples/semantic_wait_demo_async.py @@ -117,4 +117,3 @@ async def main(): if __name__ == "__main__": asyncio.run(main()) - diff --git a/examples/wait_and_click_async.py b/examples/wait_and_click_async.py index 1d2f754..59f6b91 100644 --- a/examples/wait_and_click_async.py +++ b/examples/wait_and_click_async.py @@ -61,4 +61,3 @@ async def main(): if __name__ == "__main__": asyncio.run(main()) - From 770958f677d492a485ec6224752c2bcf0d0e15fd Mon Sep 17 00:00:00 2001 From: rcholic Date: Thu, 1 Jan 2026 08:30:12 -0800 Subject: [PATCH 7/7] fix tests --- tests/test_video_recording.py | 4 ++-- tests/test_wait.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_video_recording.py b/tests/test_video_recording.py index 2a9fcca..d1400ce 100644 --- a/tests/test_video_recording.py +++ b/tests/test_video_recording.py @@ -117,7 +117,7 @@ def test_no_video_recording_when_disabled(): try: browser.page.goto("https://example.com") - browser.page.wait_for_load_state("networkidle") + browser.page.wait_for_load_state("networkidle", timeout=10000) video_path = browser.close() @@ -188,7 +188,7 @@ def test_video_recording_multiple_sessions(): try: browser.page.goto("https://example.com") - browser.page.wait_for_load_state("networkidle") + browser.page.wait_for_load_state("networkidle", timeout=10000) output_path = video_dir / f"video_{i}.webm" video_path = browser.close(output_path=str(output_path)) diff --git a/tests/test_wait.py b/tests/test_wait.py index 05a3ab2..7150708 100644 --- a/tests/test_wait.py +++ b/tests/test_wait.py @@ -10,7 +10,7 @@ def test_wait_for(): # Auto-detect headless mode (True in CI, False locally) with SentienceBrowser() as browser: browser.page.goto("https://example.com") - browser.page.wait_for_load_state("networkidle") + browser.page.wait_for_load_state("networkidle", 
timeout=10000) result = wait_for(browser, "role=link", timeout=5.0) assert result.found is True @@ -23,7 +23,7 @@ def test_wait_for_timeout(): """Test wait_for timeout""" with SentienceBrowser() as browser: browser.page.goto("https://example.com") - browser.page.wait_for_load_state("networkidle") + browser.page.wait_for_load_state("networkidle", timeout=10000) # Wait for non-existent element result = wait_for(browser, "role=button text~'NonExistentButton'", timeout=1.0) @@ -35,7 +35,7 @@ def test_expect_to_exist(): """Test expect().to_exist()""" with SentienceBrowser() as browser: browser.page.goto("https://example.com") - browser.page.wait_for_load_state("networkidle") + browser.page.wait_for_load_state("networkidle", timeout=10000) element = expect(browser, "role=link").to_exist(timeout=5.0) assert element is not None @@ -46,7 +46,7 @@ def test_expect_to_be_visible(): """Test expect().to_be_visible()""" with SentienceBrowser() as browser: browser.page.goto("https://example.com") - browser.page.wait_for_load_state("networkidle") + browser.page.wait_for_load_state("networkidle", timeout=10000) element = expect(browser, "role=link").to_be_visible(timeout=5.0) assert element is not None