diff --git a/README.md b/README.md index e0e93d6..0f10408 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ The SDK is open under ELv2; the core semantic geometry and reliability logic runs in Sentience-hosted services. -## Installation +## 📦 Installation ```bash # Install from PyPI @@ -22,11 +22,12 @@ pip install transformers torch # For local LLMs pip install -e . ``` -## Quick Start: Choose Your Abstraction Level +## 🚀 Quick Start: Choose Your Abstraction Level Sentience SDK offers **three abstraction levels** - use what fits your needs: -### 🎯 **Level 3: Natural Language (Easiest)** - For non-technical users +
+🎯 Level 3: Natural Language (Easiest) - For non-technical users ```python from sentience import SentienceBrowser, ConversationalAgent @@ -47,7 +48,10 @@ with browser: **Code required:** 3-5 lines **Technical knowledge:** None -### ⚙️ **Level 2: Technical Commands (Recommended)** - For AI developers +
+ +
+⚙️ Level 2: Technical Commands (Recommended) - For AI developers ```python from sentience import SentienceBrowser, SentienceAgent @@ -68,7 +72,10 @@ with browser: **Code required:** 10-15 lines **Technical knowledge:** Medium (Python basics) -### 🔧 **Level 1: Direct SDK (Most Control)** - For production automation +
+ +
+🔧 Level 1: Direct SDK (Most Control) - For production automation ```python from sentience import SentienceBrowser, snapshot, find, click @@ -91,7 +98,12 @@ with SentienceBrowser(headless=False) as browser: **Code required:** 20-50 lines **Technical knowledge:** High (SDK API, selectors) -## Real-World Example: Amazon Shopping Bot +
+ +--- + +
+

💼 Real-World Example: Amazon Shopping Bot

This example demonstrates navigating Amazon, finding products, and adding items to cart: @@ -139,26 +151,39 @@ with SentienceBrowser(headless=False) as browser: print(f"Added to cart: {cart_result.success}") ``` -**See the complete tutorial**: [Amazon Shopping Guide](../docs/AMAZON_SHOPPING_GUIDE.md) +**📖 See the complete tutorial:** [Amazon Shopping Guide](../docs/AMAZON_SHOPPING_GUIDE.md) +
+ +--- + +## 📚 Core Features -## Core Features +
+

🌐 Browser Control

-### Browser Control - **`SentienceBrowser`** - Playwright browser with Sentience extension pre-loaded - **`browser.goto(url)`** - Navigate with automatic extension readiness checks - Automatic bot evasion and stealth mode - Configurable headless/headed mode -### Snapshot - Intelligent Page Analysis -- **`snapshot(browser, screenshot=True)`** - Capture page state with AI-ranked elements +
+ +
+

📸 Snapshot - Intelligent Page Analysis

+ +**`snapshot(browser, screenshot=True, show_overlay=False)`** - Capture page state with AI-ranked elements + +Features: - Returns semantic elements with roles, text, importance scores, and bounding boxes - Optional screenshot capture (PNG/JPEG) +- Optional visual overlay to see what elements are detected - Pydantic models for type safety - **`snapshot.save(filepath)`** - Export to JSON **Example:** ```python -snap = snapshot(browser, screenshot=True) +snap = snapshot(browser, screenshot=True, show_overlay=True) # Access structured data print(f"URL: {snap.url}") @@ -170,7 +195,11 @@ for element in snap.elements: print(f"{element.role}: {element.text} (importance: {element.importance})") ``` -### Query Engine - Semantic Element Selection +
+ +
+

πŸ” Query Engine - Semantic Element Selection

+ - **`query(snapshot, selector)`** - Find all matching elements - **`find(snapshot, selector)`** - Find single best match (by importance) - Powerful query DSL with multiple operators @@ -200,7 +229,11 @@ first_row = query(snap, "bbox.y<600") **📖 [Complete Query DSL Guide](docs/QUERY_DSL.md)** - All operators, fields, and advanced patterns -### Actions - Interact with Elements +
+ +
+

👆 Actions - Interact with Elements

+ - **`click(browser, element_id)`** - Click element by ID - **`click_rect(browser, rect)`** - Click at center of rectangle (coordinate-based) - **`type_text(browser, element_id, text)`** - Type into input fields @@ -239,7 +272,11 @@ if element: }) ``` -### Wait & Assertions +
+ +
+

⏱️ Wait & Assertions

+ - **`wait_for(browser, selector, timeout=5.0, interval=None, use_api=None)`** - Wait for element to appear - **`expect(browser, selector)`** - Assertion helper with fluent API @@ -271,11 +308,56 @@ expect(browser, "role=button").to_have_text("Submit") expect(browser, "role=link").to_have_count(10) ``` -### Content Reading -- **`read(browser, format="text|markdown|raw")`** - Extract page content - - `format="text"` - Plain text extraction - - `format="markdown"` - High-quality markdown conversion (uses markdownify) - - `format="raw"` - Cleaned HTML (default) +
+ +
+

🎨 Visual Overlay - Debug Element Detection

+ +- **`show_overlay(browser, elements, target_element_id=None)`** - Display visual overlay highlighting elements +- **`clear_overlay(browser)`** - Clear overlay manually + +Show color-coded borders around detected elements to debug, validate, and understand what Sentience sees: + +```python +from sentience import show_overlay, clear_overlay + +# Take snapshot once +snap = snapshot(browser) + +# Show overlay anytime without re-snapshotting +show_overlay(browser, snap) # Auto-clears after 5 seconds + +# Highlight specific target element in red +button = find(snap, "role=button text~'Submit'") +show_overlay(browser, snap, target_element_id=button.id) + +# Clear manually before 5 seconds +import time +time.sleep(2) +clear_overlay(browser) +``` + +**Color Coding:** +- 🔴 Red: Target element +- 🔵 Blue: Primary elements (`is_primary=true`) +- 🟢 Green: Regular interactive elements + +**Visual Indicators:** +- Border thickness/opacity scales with importance +- Semi-transparent fill +- Importance badges +- Star icons for primary elements +- Auto-clear after 5 seconds + +
+ +
+

📄 Content Reading

+ +**`read(browser, format="text|markdown|raw")`** - Extract page content +- `format="text"` - Plain text extraction +- `format="markdown"` - High-quality markdown conversion (uses markdownify) +- `format="raw"` - Cleaned HTML (default) **Example:** ```python @@ -290,11 +372,15 @@ result = read(browser, format="text") print(result["content"]) # Plain text ``` -### Screenshots -- **`screenshot(browser, format="png|jpeg", quality=80)`** - Standalone screenshot capture - - Returns base64-encoded data URL - - PNG or JPEG format - - Quality control for JPEG (1-100) +
+ +
+

📷 Screenshots

+ +**`screenshot(browser, format="png|jpeg", quality=80)`** - Standalone screenshot capture +- Returns base64-encoded data URL +- PNG or JPEG format +- Quality control for JPEG (1-100) **Example:** ```python @@ -313,7 +399,14 @@ with open("screenshot.png", "wb") as f: data_url = screenshot(browser, format="jpeg", quality=85) ``` -## Element Properties +
+ +--- + +## 📋 Reference + +
+

Element Properties

Elements returned by `snapshot()` have the following properties: @@ -329,7 +422,10 @@ element.is_occluded # Is element covered by other elements? element.z_index # CSS stacking order ``` -## Query DSL Reference +
+ +
+

Query DSL Reference

### Basic Operators @@ -352,32 +448,14 @@ element.z_index # CSS stacking order - **Position**: `bbox.x`, `bbox.y`, `bbox.width`, `bbox.height` - **Layering**: `z_index` -## Examples - -See the `examples/` directory for complete working examples: - -- **`hello.py`** - Extension bridge verification -- **`basic_agent.py`** - Basic snapshot and element inspection -- **`query_demo.py`** - Query engine demonstrations -- **`wait_and_click.py`** - Waiting for elements and performing actions -- **`read_markdown.py`** - Content extraction and markdown conversion - -## Testing +
+ -```bash -# Run all tests -pytest tests/ +--- -# Run specific test file -pytest tests/test_snapshot.py +## ⚙️ Configuration -# Run with verbose output -pytest -v tests/ -``` - -## Configuration - -### Viewport Size +
+

Viewport Size

Default viewport is **1280x800** pixels. You can customize it using Playwright's API: @@ -389,7 +467,10 @@ with SentienceBrowser(headless=False) as browser: browser.goto("https://example.com") ``` -### Headless Mode +
+ +
+

Headless Mode

```python # Headed mode (default in dev, shows browser window) @@ -402,7 +483,10 @@ browser = SentienceBrowser(headless=True) browser = SentienceBrowser() # headless=True if CI=true, else False ``` -### Residential Proxy Support +
+ +
+

🌍 Residential Proxy Support

Use residential proxies to route traffic and protect your IP address. Supports HTTP, HTTPS, and SOCKS5 with automatic SSL certificate handling: @@ -432,7 +516,10 @@ with browser: See `examples/residential_proxy_agent.py` for complete examples. -### Authentication Session Injection +
+ +
+

πŸ” Authentication Session Injection

Inject pre-recorded authentication sessions (cookies + localStorage) to start your agent already logged in, bypassing login screens, 2FA, and CAPTCHAs. This saves tokens and reduces costs by eliminating login steps. @@ -467,7 +554,14 @@ browser.start() See `examples/auth_injection_agent.py` for complete examples. -## Best Practices +
+ +--- + +## 💡 Best Practices + +
+Click to expand best practices ### 1. Wait for Dynamic Content ```python @@ -507,7 +601,14 @@ snap = snapshot(browser) snap = snapshot(browser, screenshot=True) ``` -## Troubleshooting +
+ +--- + +## 🛠️ Troubleshooting + +
+Click to expand common issues and solutions ### "Extension failed to load" **Solution:** Build the extension first: @@ -527,9 +628,14 @@ cd sentience-chrome - Check visibility: `element.in_viewport and not element.is_occluded` - Scroll to element: `browser.page.evaluate(f"window.sentience_registry[{element.id}].scrollIntoView()")` -## Advanced Features (v0.12.0+) +
+ -### Agent Tracing & Debugging +--- + +## 🔬 Advanced Features (v0.12.0+) + +
+

📊 Agent Tracing & Debugging

The SDK now includes built-in tracing infrastructure for debugging and analyzing agent behavior: @@ -585,7 +691,10 @@ with browser: - Train custom models from successful runs - Monitor production agents -### Snapshot Utilities +
+ +
+

🧰 Snapshot Utilities

New utility functions for working with snapshots: @@ -607,16 +716,53 @@ print(llm_context) # Output: [1]
+ +--- + +## 📖 Documentation - **📖 [Amazon Shopping Guide](../docs/AMAZON_SHOPPING_GUIDE.md)** - Complete tutorial with real-world example - **📖 [Query DSL Guide](docs/QUERY_DSL.md)** - Advanced query patterns and operators - **📄 [API Contract](../spec/SNAPSHOT_V1.md)** - Snapshot API specification - **📄 [Type Definitions](../spec/sdk-types.md)** - TypeScript/Python type definitions -## License +--- + +## 💻 Examples & Testing + +
+

Examples

+ +See the `examples/` directory for complete working examples: + +- **`hello.py`** - Extension bridge verification +- **`basic_agent.py`** - Basic snapshot and element inspection +- **`query_demo.py`** - Query engine demonstrations +- **`wait_and_click.py`** - Waiting for elements and performing actions +- **`read_markdown.py`** - Content extraction and markdown conversion + +
+ +
+

Testing

+ +```bash +# Run all tests +pytest tests/ + +# Run specific test file +pytest tests/test_snapshot.py + +# Run with verbose output +pytest -v tests/ +``` + +
+ +--- -πŸ“œ **License** +## πŸ“œ License This SDK is licensed under the **Elastic License 2.0 (ELv2)**. @@ -626,9 +772,9 @@ The Elastic License 2.0 allows you to use, modify, and distribute this SDK for i - This SDK is a **client-side library** that communicates with proprietary Sentience services and browser components. -- The Sentience backend services (including semantic geometry grounding, ranking, visual cues, and trace processing) are **not open source** and are governed by Sentience’s Terms of Service. +- The Sentience backend services (including semantic geometry grounding, ranking, visual cues, and trace processing) are **not open source** and are governed by Sentience's Terms of Service. -- Use of this SDK does **not** grant rights to operate, replicate, or reimplement Sentience’s hosted services. +- Use of this SDK does **not** grant rights to operate, replicate, or reimplement Sentience's hosted services. For commercial usage, hosted offerings, or enterprise deployments, please contact Sentience to obtain a commercial license. 
diff --git a/pyproject.toml b/pyproject.toml index 2f540ad..8c88e6a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "sentienceapi" -version = "0.90.1" +version = "0.90.2" description = "Python SDK for Sentience AI Agent Browser Automation" readme = "README.md" requires-python = ">=3.11" diff --git a/sentience/__init__.py b/sentience/__init__.py index a03de3d..ee5915a 100644 --- a/sentience/__init__.py +++ b/sentience/__init__.py @@ -45,6 +45,7 @@ Viewport, WaitResult, ) +from .overlay import clear_overlay, show_overlay from .query import find, query from .read import read from .recorder import Recorder, Trace, TraceStep, record @@ -63,7 +64,7 @@ ) from .wait import wait_for -__version__ = "0.90.1" +__version__ = "0.90.2" __all__ = [ # Core SDK @@ -93,6 +94,8 @@ "generate", "read", "screenshot", + "show_overlay", + "clear_overlay", # Agent Layer (Phase 1 & 2) "BaseAgent", "LLMProvider", diff --git a/sentience/models.py b/sentience/models.py index 9be5221..50fb320 100644 --- a/sentience/models.py +++ b/sentience/models.py @@ -117,6 +117,7 @@ class SnapshotOptions(BaseModel): save_trace: bool = False # Save raw_elements to JSON for benchmarking/training trace_path: str | None = None # Path to save trace (default: "trace_{timestamp}.json") goal: str | None = None # Optional goal/task description for the snapshot + show_overlay: bool = False # Show visual overlay highlighting elements in browser class Config: arbitrary_types_allowed = True diff --git a/sentience/overlay.py b/sentience/overlay.py new file mode 100644 index 0000000..f8e9fb2 --- /dev/null +++ b/sentience/overlay.py @@ -0,0 +1,115 @@ +""" +Visual overlay utilities - show/clear element highlights in browser +""" + +from typing import Any + +from .browser import SentienceBrowser +from .models import Element, Snapshot + + +def show_overlay( + browser: SentienceBrowser, + elements: list[Element] | list[dict[str, Any]] | Snapshot, + 
target_element_id: int | None = None, +) -> None: + """ + Display visual overlay highlighting elements in the browser + + This function shows a Shadow DOM overlay with color-coded borders around + detected elements. Useful for debugging, learning, and validating element detection. + + Args: + browser: SentienceBrowser instance + elements: Can be: + - List of Element objects (from snapshot.elements) + - List of raw element dicts (from snapshot result or API response) + - Snapshot object (will use snapshot.elements) + target_element_id: Optional ID of element to highlight in red (default: None) + + Color Coding: + - Red: Target element (when target_element_id is specified) + - Blue: Primary elements (is_primary=true) + - Green: Regular interactive elements + + Visual Indicators: + - Border thickness and opacity scale with importance score + - Semi-transparent fill for better visibility + - Importance badges showing scores + - Star icon for primary elements + - Target emoji for the target element + + Auto-clear: Overlay automatically disappears after 5 seconds + + Example: + # Show overlay from snapshot + snap = snapshot(browser) + show_overlay(browser, snap) + + # Show overlay with custom elements + elements = [{"id": 1, "bbox": {"x": 100, "y": 100, "width": 200, "height": 50}, ...}] + show_overlay(browser, elements) + + # Show overlay with target element highlighted in red + show_overlay(browser, snap, target_element_id=42) + + # Clear overlay manually before 5 seconds + clear_overlay(browser) + """ + if not browser.page: + raise RuntimeError("Browser not started. 
Call browser.start() first.") + + # Handle different input types + if isinstance(elements, Snapshot): + # Extract elements from Snapshot object + elements_list = [el.model_dump() for el in elements.elements] + elif isinstance(elements, list) and len(elements) > 0: + # Check if it's a list of Element objects or dicts + if hasattr(elements[0], "model_dump"): + # List of Element objects + elements_list = [el.model_dump() for el in elements] + else: + # Already a list of dicts + elements_list = elements + else: + raise ValueError("elements must be a Snapshot, list of Element objects, or list of dicts") + + # Call extension API + browser.page.evaluate( + """ + (args) => { + if (window.sentience && window.sentience.showOverlay) { + window.sentience.showOverlay(args.elements, args.targetId); + } else { + console.warn('[Sentience SDK] showOverlay not available - is extension loaded?'); + } + } + """, + {"elements": elements_list, "targetId": target_element_id}, + ) + + +def clear_overlay(browser: SentienceBrowser) -> None: + """ + Clear the visual overlay manually (before 5-second auto-clear) + + Args: + browser: SentienceBrowser instance + + Example: + show_overlay(browser, snap) + # ... inspect overlay ... + clear_overlay(browser) # Remove immediately + """ + if not browser.page: + raise RuntimeError("Browser not started. Call browser.start() first.") + + browser.page.evaluate( + """ + () => { + if (window.sentience && window.sentience.clearOverlay) { + window.sentience.clearOverlay(); + } + } + """ + ) diff --git a/sentience/snapshot.py b/sentience/snapshot.py index 9a0f6f7..6c90693 100644 --- a/sentience/snapshot.py +++ b/sentience/snapshot.py @@ -44,6 +44,7 @@ def snapshot( use_api: bool | None = None, save_trace: bool = False, trace_path: str | None = None, + show_overlay: bool = False, ) -> Snapshot: """ Take a snapshot of the current page @@ -57,6 +58,7 @@ def snapshot( If None, uses API if api_key is set, otherwise uses local extension. 
save_trace: Whether to save raw_elements to JSON for benchmarking/training trace_path: Path to save trace file. If None, uses "trace_{timestamp}.json" + show_overlay: Show visual overlay highlighting elements in browser Returns: Snapshot object @@ -69,6 +71,7 @@ def snapshot( use_api=use_api, save_trace=save_trace, trace_path=trace_path, + show_overlay=show_overlay, ) # Determine if we should use server-side API @@ -143,6 +146,21 @@ def _snapshot_via_extension( if options.save_trace: _save_trace_to_file(result.get("raw_elements", []), options.trace_path) + # Show visual overlay if requested + if options.show_overlay: + raw_elements = result.get("raw_elements", []) + if raw_elements: + browser.page.evaluate( + """ + (elements) => { + if (window.sentience && window.sentience.showOverlay) { + window.sentience.showOverlay(elements, null); + } + } + """, + raw_elements, + ) + # Validate and parse with Pydantic snapshot_obj = Snapshot(**result) return snapshot_obj @@ -231,6 +249,21 @@ def _snapshot_via_api( "error": api_result.get("error"), } + # Show visual overlay if requested (use API-ranked elements) + if options.show_overlay: + elements = api_result.get("elements", []) + if elements: + browser.page.evaluate( + """ + (elements) => { + if (window.sentience && window.sentience.showOverlay) { + window.sentience.showOverlay(elements, null); + } + } + """, + elements, + ) + return Snapshot(**snapshot_data) except requests.exceptions.RequestException as e: raise RuntimeError(f"API request failed: {e}")