From 40f8b97959e76f9d9323d1d85b9c7d7287eacd3d Mon Sep 17 00:00:00 2001 From: SentienceDEV Date: Sat, 24 Jan 2026 10:40:31 -0800 Subject: [PATCH] updated readme --- README.md | 1208 +++++------------------------------------------------ 1 file changed, 109 insertions(+), 1099 deletions(-) diff --git a/README.md b/README.md index 488b065..6a4cea2 100644 --- a/README.md +++ b/README.md @@ -1,1170 +1,180 @@ # Sentience TypeScript SDK -**Semantic snapshots and Jest-style assertions for reliable AI web agents with time-travel traces** +> **A verification & control layer for AI agents that operate browsers** -## πŸ“¦ Installation +Sentience is built for **AI agent developers** who already use Playwright / CDP / LangGraph and care about **flakiness, cost, determinism, evals, and debugging**. -```bash -# Install from npm -npm install sentienceapi - -# Install Playwright browsers (required) -npx playwright install chromium -``` - -**For local development:** - -```bash -npm install -npm run build -``` - -## Jest for AI Web Agent - -### Semantic snapshots and assertions that let agents act, verify, and know when they're done. - -Use `AgentRuntime` to add Jest-style assertions to your agent loops. 
Verify browser state, check task completion, and get clear feedback on what's working: - -```typescript -import { - SentienceBrowser, - AgentRuntime, - HumanHandoffSolver, - urlContains, - exists, - allOf, - isEnabled, - isChecked, - valueEquals, -} from 'sentienceapi'; -import { createTracer } from 'sentienceapi'; -import { Page } from 'playwright'; - -// Create browser and tracer -const browser = await SentienceBrowser.create({ apiKey: process.env.SENTIENCE_API_KEY }); -const tracer = await createTracer({ runId: 'my-run', uploadTrace: false }); - -// Create browser adapter for AgentRuntime -const browserAdapter = { - snapshot: async (_page: Page, options?: Record) => { - return await browser.snapshot(options); - }, -}; -const runtime = new AgentRuntime(browserAdapter, browser.getPage(), tracer); - -// Navigate and take snapshot -await browser.getPage().goto('https://example.com'); -runtime.beginStep('Verify page loaded'); -await runtime.snapshot(); - -// Run assertions (Jest-style) -runtime.assert(urlContains('example.com'), 'on_correct_domain'); -runtime.assert(exists('role=heading'), 'has_heading'); -runtime.assert(allOf([exists('role=button'), exists('role=link')]), 'has_interactive_elements'); - -// v1: state-aware assertions (when Gateway refinement is enabled) -runtime.assert(isEnabled('role=button'), 'button_enabled'); -runtime.assert(isChecked("role=checkbox name~'subscribe'"), 'subscribe_checked_if_present'); -runtime.assert( - valueEquals("role=textbox name~'email'", 'user@example.com'), - 'email_value_if_present' -); - -// v2: retry loop with snapshot confidence gating + exhaustion -const ok = await runtime - .check(exists('role=heading'), 'heading_eventually_visible', true) - .eventually({ timeoutMs: 10_000, pollMs: 250, minConfidence: 0.7, maxSnapshotAttempts: 3 }); -console.log('eventually() result:', ok); - -// CAPTCHA handling (detection + handoff + verify) -runtime.setCaptchaOptions({ - policy: 'callback', - handler: HumanHandoffSolver(), -}); - 
-// Check task completion -if (runtime.assertDone(exists("text~'Example'"), 'task_complete')) { - console.log('βœ… Task completed!'); -} - -console.log(`Task done: ${runtime.isTaskDone}`); -``` - -#### CAPTCHA strategies (Batteries Included) - -```typescript -import { ExternalSolver, HumanHandoffSolver, VisionSolver } from 'sentienceapi'; - -// Human-in-loop -runtime.setCaptchaOptions({ policy: 'callback', handler: HumanHandoffSolver() }); - -// Vision verification only -runtime.setCaptchaOptions({ policy: 'callback', handler: VisionSolver() }); - -// External system/webhook -runtime.setCaptchaOptions({ - policy: 'callback', - handler: ExternalSolver(async ctx => { - await fetch(process.env.CAPTCHA_WEBHOOK_URL!, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ runId: ctx.runId, url: ctx.url }), - }); - }), -}); -``` - -### Failure Artifact Buffer (Phase 1) - -Capture a short ring buffer of screenshots and persist them when a required assertion fails. - -```typescript -runtime.enableFailureArtifacts({ bufferSeconds: 15, captureOnAction: true, fps: 0 }); - -// After each action, record it (best-effort). -await runtime.recordAction('CLICK'); -``` - -### Redaction callback (Phase 3) - -Provide a user-defined callback to redact snapshots and decide whether to persist frames. The SDK does not implement image/video redaction. - -```typescript -import { RedactionContext, RedactionResult } from 'sentienceapi'; - -const redact = (_ctx: RedactionContext): RedactionResult => { - return { dropFrames: true }; -}; - -runtime.enableFailureArtifacts({ onBeforePersist: redact }); -``` - -**See examples:** [`examples/asserts/`](examples/asserts/) - -## πŸš€ Quick Start: Choose Your Abstraction Level - -Sentience SDK offers **4 levels of abstraction** - choose based on your needs: - -
-💬 Level 4: Conversational Agent (Highest Abstraction) - NEW in v0.3.0 - -Complete automation with natural conversation. Just describe what you want, and the agent plans and executes everything: - -```typescript -import { SentienceBrowser, ConversationalAgent, OpenAIProvider } from 'sentienceapi'; - -const browser = await SentienceBrowser.create({ apiKey: process.env.SENTIENCE_API_KEY }); -const llm = new OpenAIProvider(process.env.OPENAI_API_KEY!, 'gpt-4o'); -const agent = new ConversationalAgent({ llmProvider: llm, browser }); - -// Navigate to starting page -await browser.getPage().goto('https://amazon.com'); - -// ONE command does it all - automatic planning and execution! -const response = await agent.execute( - "Search for 'wireless mouse' and tell me the price of the top result" -); -console.log(response); // "I found the top result for wireless mouse on Amazon. It's priced at $24.99..." - -// Follow-up questions maintain context -const followUp = await agent.chat('Add it to cart'); -console.log(followUp); - -await browser.close(); -``` - -**When to use:** Complex multi-step tasks, conversational interfaces, maximum convenience -**Code reduction:** 99% less code - describe goals in natural language -**Requirements:** OpenAI or Anthropic API key - -
- -
-🤖 Level 3: Agent (Natural Language Commands) - Recommended for Most Users - -Zero coding knowledge needed. Just write what you want in plain English: - -```typescript -import { SentienceBrowser, SentienceAgent, OpenAIProvider } from 'sentienceapi'; - -const browser = await SentienceBrowser.create({ apiKey: process.env.SENTIENCE_API_KEY }); -const llm = new OpenAIProvider(process.env.OPENAI_API_KEY!, 'gpt-4o-mini'); -const agent = new SentienceAgent(browser, llm); - -await browser.getPage().goto('https://www.amazon.com'); - -// Just natural language commands - agent handles everything! -await agent.act('Click the search box'); -await agent.act("Type 'wireless mouse' into the search field"); -await agent.act('Press Enter key'); -await agent.act('Click the first product result'); - -// Automatic token tracking -console.log(`Tokens used: ${agent.getTokenStats().totalTokens}`); -await browser.close(); -``` - -**When to use:** Quick automation, non-technical users, rapid prototyping -**Code reduction:** 95-98% less code vs manual approach -**Requirements:** OpenAI API key (or Anthropic for Claude) - -
- -
-🔧 Level 2: Direct SDK (Technical Control) - -Full control with semantic selectors. For technical users who want precision: - -```typescript -import { SentienceBrowser, snapshot, find, click, typeText, press } from 'sentienceapi'; - -const browser = await SentienceBrowser.create({ apiKey: process.env.SENTIENCE_API_KEY }); -await browser.getPage().goto('https://www.amazon.com'); - -// Get semantic snapshot -const snap = await snapshot(browser); - -// Find elements using query DSL -const searchBox = find(snap, 'role=textbox text~"search"'); -await click(browser, searchBox!.id); - -// Type and submit -await typeText(browser, searchBox!.id, 'wireless mouse'); -await press(browser, 'Enter'); - -await browser.close(); -``` - -**When to use:** Need precise control, debugging, custom workflows -**Code reduction:** Still 80% less code vs raw Playwright -**Requirements:** Only Sentience API key - -
- -
-βš™οΈ Level 1: Raw Playwright (Maximum Control) - -For when you need complete low-level control (rare): - -```typescript -import { chromium } from 'playwright'; - -const browser = await chromium.launch(); -const page = await browser.newPage(); -await page.goto('https://www.amazon.com'); -await page.fill('#twotabsearchtextbox', 'wireless mouse'); -await page.press('#twotabsearchtextbox', 'Enter'); -await browser.close(); -``` - -**When to use:** Very specific edge cases, custom browser configs -**Tradeoffs:** No semantic intelligence, brittle selectors, more code - -
- ---- - -### Human-like Typing - -Add realistic delays between keystrokes to mimic human typing: - -```typescript -// Type instantly (default) -await typeText(browser, elementId, 'Hello World'); +Often described as _Jest for Browser AI Agents_ - but applied to end-to-end agent runs (not unit tests). -// Type with human-like delay (~10ms between keystrokes) -await typeText(browser, elementId, 'Hello World', false, 10); -``` - -### Scroll to Element - -Scroll elements into view with smooth animation: - -```typescript -const snap = await snapshot(browser); -const button = find(snap, 'role=button text~"Submit"'); - -// Scroll element into view with smooth animation -await scrollTo(browser, button.id); - -// Scroll instantly to top of viewport -await scrollTo(browser, button.id, 'instant', 'start'); -``` - ---- - -
-

📊 Agent Execution Tracing (NEW in v0.3.1)

- -Record complete agent execution traces for debugging, analysis, and replay. Traces capture every step, snapshot, LLM decision, and action in a structured JSONL format. - -### Quick Start: Agent with Tracing - -```typescript -import { - SentienceBrowser, - SentienceAgent, - OpenAIProvider, - Tracer, - JsonlTraceSink, -} from 'sentienceapi'; -import { randomUUID } from 'crypto'; - -const browser = await SentienceBrowser.create({ apiKey: process.env.SENTIENCE_API_KEY }); -const llm = new OpenAIProvider(process.env.OPENAI_API_KEY!, 'gpt-4o'); - -// Create a tracer -const runId = randomUUID(); -const sink = new JsonlTraceSink(`traces/${runId}.jsonl`); -const tracer = new Tracer(runId, sink); - -// Create agent with tracer -const agent = new SentienceAgent(browser, llm, 50, true, tracer); - -// Emit run_start -tracer.emitRunStart('SentienceAgent', 'gpt-4o'); - -try { - await browser.getPage().goto('https://google.com'); - - // Every action is automatically traced! - await agent.act('Click the search box'); - await agent.act("Type 'sentience ai' into the search field"); - await agent.act('Press Enter'); - - tracer.emitRunEnd(3); -} finally { - // Flush trace to disk - await agent.closeTracer(); - await browser.close(); -} - -console.log(`✅ Trace saved to: traces/${runId}.jsonl`); -``` +The core loop is: -### What Gets Traced +> **Agent → Snapshot → Action → Verification → Artifact** -Each agent action generates multiple events: +## What Sentience is -1. **step_start** - Before action execution (goal, URL, attempt) -2. **snapshot** - Page state with all interactive elements -3. **llm_response** - LLM decision (model, tokens, response) -4. **action** - Executed action (type, element ID, success) -5. 
**error** - Any failures (error message, retry attempt) +- A **verification-first runtime** (`AgentRuntime`) for browser agents +- Treats the browser as an adapter (Playwright / CDP); **`AgentRuntime` is the product** +- A **controlled perception** layer (semantic snapshots; pruning/limits; lowers token usage by filtering noise from what models see) +- A **debugging layer** (structured traces + failure artifacts) +- Enables **local LLM small models (3B-7B)** for browser automation (privacy, compliance, and cost control) +- Keeps vision models **optional** (use as a fallback when DOM/snapshot structure falls short, e.g. `<canvas>`) -### Schema Compatibility +## What Sentience is not -Traces are **100% compatible** with Python SDK traces - use the same tools to analyze traces from both TypeScript and Python agents! +- Not a browser driver +- Not a Playwright replacement +- Not a vision-first agent framework -**See full example:** [examples/agent-with-tracing.ts](examples/agent-with-tracing.ts) +## Install -
- -
-

πŸ” Agent Runtime Verification

- -`AgentRuntime` provides assertion predicates for runtime verification in agent loops, enabling programmatic verification of browser state during execution. - -```typescript -import { SentienceBrowser } from 'sentienceapi'; -import { AgentRuntime, urlContains, exists, allOf } from 'sentienceapi'; -import { createTracer } from 'sentienceapi'; - -const browser = new SentienceBrowser(); -await browser.start(); -const tracer = await createTracer({ runId: 'my-run', uploadTrace: false }); -const runtime = new AgentRuntime(browser, browser.getPage(), tracer); - -// Navigate and take snapshot -await browser.getPage().goto('https://example.com'); -runtime.beginStep('Verify page'); -await runtime.snapshot(); - -// Run assertions -runtime.assert(urlContains('example.com'), 'on_correct_domain'); -runtime.assert(exists('role=heading'), 'has_heading'); -runtime.assertDone(exists("text~'Example'"), 'task_complete'); - -console.log(`Task done: ${runtime.isTaskDone}`); +```bash +npm install sentienceapi +npx playwright install chromium ``` -**See example:** [examples/agent-runtime-verification.ts](examples/agent-runtime-verification.ts) +## Conceptual example (why this exists) -
+- Steps are **gated by verifiable UI assertions** +- If progress can’t be proven, the run **fails with evidence** +- This is how you make runs **reproducible** and **debuggable**, and how you run evals reliably ---- +## Quickstart: a verification-first loop -
-

💼 Real-World Example: Assertion-driven navigation

+```ts +import { SentienceBrowser, AgentRuntime } from 'sentienceapi'; +import { JsonlTraceSink, Tracer } from 'sentienceapi'; +import { exists, urlContains } from 'sentienceapi'; +import type { Page } from 'playwright'; -This example shows how to use **assertions + `.eventually()`** to make an agent loop resilient: +async function main(): Promise<void> { + const tracer = new Tracer('demo', new JsonlTraceSink('trace.jsonl')); -```typescript -import { SentienceBrowser, AgentRuntime, urlContains, exists } from 'sentienceapi'; -import { createTracer } from 'sentienceapi'; + const browser = new SentienceBrowser(); + await browser.start(); + const page = browser.getPage(); + if (!page) throw new Error('no page'); -async function main() { - const browser = await SentienceBrowser.create({ apiKey: process.env.SENTIENCE_API_KEY }); - const tracer = await createTracer({ runId: 'verified-run', uploadTrace: false }); + await page.goto('https://example.com'); - const adapter = { - snapshot: async (_page: any, options?: Record<string, any>) => { - return await browser.snapshot(options); - }, - }; - const runtime = new AgentRuntime(adapter as any, browser.getPage() as any, tracer); + // AgentRuntime needs a snapshot provider; SentienceBrowser.snapshot() does not depend on Page, + // so we wrap it to fit the runtime interface. 
+ const runtime = new AgentRuntime( + { snapshot: async (_page: Page, options?: Record<string, any>) => browser.snapshot(options) }, + page, + tracer + ); - await browser.getPage().goto('https://example.com'); - runtime.beginStep('Verify we are on the right page'); + runtime.beginStep('Verify homepage'); + await runtime.snapshot({ limit: 60 }); - await runtime - .check(urlContains('example.com'), 'on_domain', true) - .eventually({ timeoutMs: 10_000, pollMs: 250, minConfidence: 0.7, maxSnapshotAttempts: 3 }); + runtime.assert(urlContains('example.com'), 'on_domain', true); + runtime.assert(exists('role=heading'), 'has_heading'); - runtime.assert(exists('role=heading'), 'heading_present'); + runtime.assertDone(exists("text~'Example'"), 'task_complete'); - await tracer.close(); await browser.close(); } -main().catch(console.error); -``` - -
- ---- - -## 📚 Core Features - -
-

🌐 Browser Control

- -- **`SentienceBrowser`** - Playwright browser with Sentience extension pre-loaded -- **`browser.goto(url)`** - Navigate with automatic extension readiness checks -- Automatic bot evasion and stealth mode -- Configurable headless/headed mode - -
- -
-

📸 Snapshot - Intelligent Page Analysis

- -**`snapshot(browser, options?)`** - Capture page state with AI-ranked elements - -Features: - -- Returns semantic elements with roles, text, importance scores, and bounding boxes -- Optional screenshot capture (PNG/JPEG) -- Optional visual overlay to see what elements are detected -- TypeScript types for type safety - -**Example:** - -```typescript -const snap = await snapshot(browser, { screenshot: true, show_overlay: true }); - -// Access structured data -console.log(`URL: ${snap.url}`); -console.log(`Viewport: ${snap.viewport.width}x${snap.viewport.height}`); -console.log(`Elements: ${snap.elements.length}`); - -// Iterate over elements -for (const element of snap.elements) { - console.log(`${element.role}: ${element.text} (importance: ${element.importance})`); -} -``` - -
- -
-

πŸ” Query Engine - Semantic Element Selection

- -- **`query(snapshot, selector)`** - Find all matching elements -- **`find(snapshot, selector)`** - Find single best match (by importance) -- Powerful query DSL with multiple operators - -**Query Examples:** - -```typescript -// Find by role and text -const button = find(snap, 'role=button text="Sign in"'); - -// Substring match (case-insensitive) -const link = find(snap, 'role=link text~"more info"'); - -// Spatial filtering -const topLeft = find(snap, 'bbox.x<=100 bbox.y<=200'); - -// Multiple conditions (AND logic) -const primaryBtn = find(snap, 'role=button clickable=true visible=true importance>800'); - -// Prefix/suffix matching -const startsWith = find(snap, 'text^="Add"'); -const endsWith = find(snap, 'text$="Cart"'); - -// Numeric comparisons -const important = query(snap, 'importance>=700'); -const firstRow = query(snap, 'bbox.y<600'); -``` - -**πŸ“– [Complete Query DSL Guide](docs/QUERY_DSL.md)** - All operators, fields, and advanced patterns - -
- -
-

👆 Actions - Interact with Elements

- -- **`click(browser, elementId)`** - Click element by ID -- **`clickRect(browser, rect)`** - Click at center of rectangle (coordinate-based) -- **`typeText(browser, elementId, text)`** - Type into input fields -- **`press(browser, key)`** - Press keyboard keys (Enter, Escape, Tab, etc.) - -All actions return `ActionResult` with success status, timing, and outcome: - -```typescript -const result = await click(browser, element.id); - -console.log(`Success: ${result.success}`); -console.log(`Outcome: ${result.outcome}`); // "navigated", "dom_updated", "error" -console.log(`Duration: ${result.duration_ms}ms`); -console.log(`URL changed: ${result.url_changed}`); -``` - -**Coordinate-based clicking:** - -```typescript -import { clickRect } from './src'; - -// Click at center of rectangle (x, y, width, height) -await clickRect(browser, { x: 100, y: 200, w: 50, h: 30 }); - -// With visual highlight (default: red border for 2 seconds) -await clickRect(browser, { x: 100, y: 200, w: 50, h: 30 }, true, 2.0); - -// Using element's bounding box -const snap = await snapshot(browser); -const element = find(snap, 'role=button'); -if (element) { - await clickRect(browser, { - x: element.bbox.x, - y: element.bbox.y, - w: element.bbox.width, - h: element.bbox.height, - }); -} -``` - -
- -
-

⏱️ Wait & Assertions

- -- **`waitFor(browser, selector, timeout?, interval?, useApi?)`** - Wait for element to appear -- **`expect(browser, selector)`** - Assertion helper with fluent API - -**Examples:** - -```typescript -// Wait for element (auto-detects optimal interval based on API usage) -const result = await waitFor(browser, 'role=button text="Submit"', 10000); -if (result.found) { - console.log(`Found after ${result.duration_ms}ms`); -} - -// Use local extension with fast polling (250ms interval) -const result = await waitFor(browser, 'role=button', 5000, undefined, false); - -// Use remote API with network-friendly polling (1500ms interval) -const result = await waitFor(browser, 'role=button', 5000, undefined, true); - -// Custom interval override -const result = await waitFor(browser, 'role=button', 5000, 500, false); - -// Semantic wait conditions -await waitFor(browser, 'clickable=true', 5000); // Wait for clickable element -await waitFor(browser, 'importance>100', 5000); // Wait for important element -await waitFor(browser, 'role=link visible=true', 5000); // Wait for visible link - -// Assertions -await expect(browser, 'role=button text="Submit"').toExist(5000); -await expect(browser, 'role=heading').toBeVisible(); -await expect(browser, 'role=button').toHaveText('Submit'); -await expect(browser, 'role=link').toHaveCount(10); -``` - -
- -
-

🎨 Visual Overlay - Debug Element Detection

- -- **`showOverlay(browser, elements, targetElementId?)`** - Display visual overlay highlighting elements -- **`clearOverlay(browser)`** - Clear overlay manually - -Show color-coded borders around detected elements to debug, validate, and understand what Sentience sees: - -```typescript -import { showOverlay, clearOverlay } from 'sentienceapi'; - -// Take snapshot once -const snap = await snapshot(browser); - -// Show overlay anytime without re-snapshotting -await showOverlay(browser, snap); // Auto-clears after 5 seconds - -// Highlight specific target element in red -const button = find(snap, 'role=button text~"Submit"'); -await showOverlay(browser, snap, button.id); - -// Clear manually before 5 seconds -await new Promise(resolve => setTimeout(resolve, 2000)); -await clearOverlay(browser); -``` - -**Color Coding:** - -- πŸ”΄ Red: Target element -- πŸ”΅ Blue: Primary elements (`is_primary=true`) -- 🟒 Green: Regular interactive elements - -**Visual Indicators:** - -- Border thickness/opacity scales with importance -- Semi-transparent fill -- Importance badges -- Star icons for primary elements -- Auto-clear after 5 seconds - -
- -
-

📄 Content Reading

- -**`read(browser, options?)`** - Extract page content - -- `format: "text"` - Plain text extraction -- `format: "markdown"` - High-quality markdown conversion (uses Turndown) -- `format: "raw"` - Cleaned HTML (default) - -**Example:** - -```typescript -import { read } from './src'; - -// Get markdown content -const result = await read(browser, { format: 'markdown' }); -console.log(result.content); // Markdown text - -// Get plain text -const result = await read(browser, { format: 'text' }); -console.log(result.content); // Plain text -``` - -
- -
-

📷 Screenshots

- -**`screenshot(browser, options?)`** - Standalone screenshot capture - -- Returns base64-encoded data URL -- PNG or JPEG format -- Quality control for JPEG (1-100) - -**Example:** - -```typescript -import { screenshot } from './src'; -import { writeFileSync } from 'fs'; - -// Capture PNG screenshot -const dataUrl = await screenshot(browser, { format: 'png' }); - -// Save to file -const base64Data = dataUrl.split(',')[1]; -const imageData = Buffer.from(base64Data, 'base64'); -writeFileSync('screenshot.png', imageData); - -// JPEG with quality control (smaller file size) -const dataUrl = await screenshot(browser, { format: 'jpeg', quality: 85 }); -``` - -
- -
-

🔎 Text Search - Find Elements by Visible Text

- -**`findTextRect(page, options)`** - Find text on page and get exact pixel coordinates - -Find buttons, links, or any UI elements by their visible text without needing element IDs or CSS selectors. Returns exact pixel coordinates for each match. - -**Example:** - -```typescript -import { SentienceBrowser, findTextRect, clickRect } from 'sentienceapi'; - -const browser = await SentienceBrowser.create(); -await browser.getPage().goto('https://example.com'); - -// Find "Sign In" button (simple string syntax) -const result = await findTextRect(browser.getPage(), 'Sign In'); -if (result.status === 'success' && result.results) { - const firstMatch = result.results[0]; - console.log(`Found at: (${firstMatch.rect.x}, ${firstMatch.rect.y})`); - console.log(`In viewport: ${firstMatch.in_viewport}`); - - // Click on the found text - if (firstMatch.in_viewport) { - await clickRect(browser, { - x: firstMatch.rect.x, - y: firstMatch.rect.y, - w: firstMatch.rect.width, - h: firstMatch.rect.height, - }); - } -} -``` - -**Advanced Options:** - -```typescript -// Case-sensitive search -const result = await findTextRect(browser.getPage(), { - text: 'LOGIN', - caseSensitive: true, -}); - -// Whole word only (won't match "login" as part of "loginButton") -const result = await findTextRect(browser.getPage(), { - text: 'log', - wholeWord: true, -}); - -// Find multiple matches -const result = await findTextRect(browser.getPage(), { - text: 'Buy', - maxResults: 10, -}); -for (const match of result.results || []) { - if (match.in_viewport) { - console.log(`Found '${match.text}' at (${match.rect.x}, ${match.rect.y})`); - console.log(`Context: ...${match.context.before}[${match.text}]${match.context.after}...`); - } -} +void main(); ``` -**Returns:** Promise with: - -- **`status`**: "success" or "error" -- **`results`**: Array of `TextMatch` objects with: - - `text` - The matched text - - `rect` - Absolute coordinates (with scroll offset) - - `viewport_rect` - Viewport-relative coordinates 
- - `context` - Surrounding text (before/after) - - `in_viewport` - Whether visible in current viewport - -**Use Cases:** - -- Find buttons/links by visible text without CSS selectors -- Get exact pixel coordinates for click automation -- Verify text visibility and position on page -- Search dynamic content that changes frequently +## Capabilities (lifecycle guarantees) -**Note:** Does not consume API credits (runs locally in browser) +### Controlled perception -**See example:** `examples/find-text-demo.ts` +- **Semantic snapshots** instead of raw DOM dumps +- **Pruning knobs** via `SnapshotOptions` (limit/filter) +- Snapshot diagnostics that help decide when “structure is insufficient” -
- ---- - -## 📋 Reference - -
-

Element Properties

- -Elements returned by `snapshot()` have the following properties: - -```typescript -element.id; // Unique identifier for interactions -element.role; // ARIA role (button, link, textbox, heading, etc.) -element.text; // Visible text content -element.importance; // AI importance score (0-1000) -element.bbox; // Bounding box (x, y, width, height) -element.visual_cues; // Visual analysis (is_primary, is_clickable, background_color) -element.in_viewport; // Is element visible in current viewport? -element.is_occluded; // Is element covered by other elements? -element.z_index; // CSS stacking order -``` +### Constrained action space -
+- Action primitives operate on **stable IDs / rects** derived from snapshots +- Optional helpers for ordinality (“click the 3rd result”) -
-

Query DSL Reference

+### Verified progress -### Basic Operators +- Predicates like `exists(...)`, `urlMatches(...)`, `isEnabled(...)`, `valueEquals(...)` +- Fluent assertion DSL via `expect(...)` +- Retrying verification via `runtime.check(...).eventually(...)` -| Operator | Description | Example | -| --------- | ---------------------------- | ---------------- | -| `=` | Exact match | `role=button` | -| `!=` | Exclusion | `role!=link` | -| `~` | Substring (case-insensitive) | `text~"sign in"` | -| `^=` | Prefix match | `text^="Add"` | -| `$=` | Suffix match | `text$="Cart"` | -| `>`, `>=` | Greater than | `importance>500` | -| `<`, `<=` | Less than | `bbox.y<600` | +### Explained failure -### Supported Fields - -- **Role**: `role=button|link|textbox|heading|...` -- **Text**: `text`, `text~`, `text^=`, `text$=` -- **Visibility**: `clickable=true|false`, `visible=true|false` -- **Importance**: `importance`, `importance>=N`, `importance - ---- - -## ⚙️ Configuration - -
-

Viewport Size

- -Default viewport is **1280x800** pixels. You can customize it using Playwright's API: - -```typescript -const browser = new SentienceBrowser(); -await browser.start(); - -// Set custom viewport before navigating -await browser.getPage().setViewportSize({ width: 1920, height: 1080 }); - -await browser.goto('https://example.com'); -``` +- JSONL trace events (`Tracer` + `JsonlTraceSink`) +- Optional failure artifact bundles (snapshots, diagnostics, step timelines, frames/clip) +- Deterministic failure semantics: when required assertions can’t be proven, the run fails with artifacts you can replay -
+### Framework interoperability -
-

Headless Mode

+- Bring your own LLM and orchestration (LangGraph, custom loops) +- Register explicit LLM-callable tools with `ToolRegistry` -```typescript -// Headed mode (shows browser window) -const browser = new SentienceBrowser(undefined, undefined, false); +## ToolRegistry (LLM-callable tools) -// Headless mode -const browser = new SentienceBrowser(undefined, undefined, true); +```ts +import { ToolRegistry, registerDefaultTools } from 'sentienceapi'; -// Auto-detect based on environment (default) -const browser = new SentienceBrowser(); // headless=true if CI=true, else false +const registry = new ToolRegistry(); +registerDefaultTools(registry); +const toolsForLLM = registry.llmTools(); ``` -
+## Permissions (avoid Chrome permission bubbles) -
-

🌍 Residential Proxy Support

+Chrome permission prompts are outside the DOM and can be invisible to snapshots. Prefer setting a policy **before navigation**. -For users running from datacenters (AWS, DigitalOcean, etc.), you can configure a residential proxy to prevent IP-based detection by Cloudflare, Akamai, and other anti-bot services. - -**Supported Formats:** - -- HTTP: `http://username:password@host:port` -- HTTPS: `https://username:password@host:port` -- SOCKS5: `socks5://username:password@host:port` +```ts +import { SentienceBrowser } from 'sentienceapi'; +import type { PermissionPolicy } from 'sentienceapi'; -**Usage:** +const policy: PermissionPolicy = { + default: 'clear', + autoGrant: ['geolocation'], + geolocation: { latitude: 37.77, longitude: -122.41, accuracy: 50 }, + origin: 'https://example.com', +}; -```typescript -// Via constructor parameter +// `permissionPolicy` is the last constructor argument; pass `keepAlive` right before it. const browser = new SentienceBrowser( undefined, undefined, - false, - 'http://username:password@residential-proxy.com:8000' -); -await browser.start(); - -// Via environment variable -process.env.SENTIENCE_PROXY = 'http://username:password@proxy.com:8000'; -const browser = new SentienceBrowser(); -await browser.start(); - -// With agent -import { SentienceAgent, OpenAIProvider } from 'sentienceapi'; - -const browser = new SentienceBrowser( - 'your-api-key', + undefined, + undefined, + undefined, + undefined, + undefined, + undefined, + undefined, + undefined, + undefined, undefined, false, - 'http://user:pass@proxy.com:8000' + policy ); await browser.start(); - -const agent = new SentienceAgent(browser, new OpenAIProvider('openai-key')); -await agent.act('Navigate to example.com'); -``` - -**WebRTC Protection:** -The SDK automatically adds WebRTC leak protection flags when a proxy is configured, preventing your real datacenter IP from being exposed via WebRTC even when using proxies. 
- -**HTTPS Certificate Handling:** -The SDK automatically ignores HTTPS certificate errors when a proxy is configured, as residential proxies often use self-signed certificates for SSL interception. - -
- -
-

πŸ” Authentication Session Injection

- -Inject pre-recorded authentication sessions (cookies + localStorage) to start your agent already logged in, bypassing login screens, 2FA, and CAPTCHAs. This saves tokens and reduces costs by eliminating login steps. - -```typescript -// Workflow 1: Inject pre-recorded session from file -import { SentienceBrowser, saveStorageState } from 'sentienceapi'; - -// Save session after manual login -const browser = new SentienceBrowser(); -await browser.start(); -await browser.getPage().goto('https://example.com'); -// ... log in manually ... -await saveStorageState(browser.getContext(), 'auth.json'); - -// Use saved session in future runs -const browser2 = new SentienceBrowser( - undefined, // apiKey - undefined, // apiUrl - false, // headless - undefined, // proxy - undefined, // userDataDir - 'auth.json' // storageState - inject saved session -); -await browser2.start(); -// Agent starts already logged in! - -// Workflow 2: Persistent sessions (cookies persist across runs) -const browser3 = new SentienceBrowser( - undefined, // apiKey - undefined, // apiUrl - false, // headless - undefined, // proxy - './chrome_profile', // userDataDir - persist cookies - undefined // storageState -); -await browser3.start(); -// First run: Log in -// Second run: Already logged in (cookies persist automatically) -``` - -**Benefits:** - -- Bypass login screens and CAPTCHAs with valid sessions -- Save 5-10 agent steps and hundreds of tokens per run -- Maintain stateful sessions for accessing authenticated pages -- Act as authenticated users (e.g., "Go to my Orders page") - -See `examples/auth-injection-agent.ts` for complete examples. - -
- ---- - -## 💡 Best Practices - -
-Click to expand best practices - -### 1. Wait for Dynamic Content - -```typescript -await browser.goto('https://example.com'); -await browser.getPage().waitForLoadState('networkidle'); -await new Promise(resolve => setTimeout(resolve, 1000)); // Extra buffer ``` -### 2. Use Multiple Strategies for Finding Elements +If your backend supports it, you can also use ToolRegistry permission tools (`grant_permissions`, `clear_permissions`, `set_geolocation`) mid-run. -```typescript -// Try exact match first -let btn = find(snap, 'role=button text="Add to Cart"'); +## Downloads (verification predicate) -// Fallback to fuzzy match -if (!btn) { - btn = find(snap, 'role=button text~"cart"'); -} -``` - -### 3. Check Element Visibility Before Clicking - -```typescript -if (element.in_viewport && !element.is_occluded) { - await click(browser, element.id); -} -``` - -### 4. Handle Navigation - -```typescript -const result = await click(browser, linkId); -if (result.url_changed) { - await browser.getPage().waitForLoadState('networkidle'); -} -``` - -### 5. Use Screenshots Sparingly - -```typescript -// Fast - no screenshot (only element data) -const snap = await snapshot(browser); +```ts +import { downloadCompleted } from 'sentienceapi'; -// Slower - with screenshot (for debugging/verification) -const snap = await snapshot(browser, { screenshot: true }); +runtime.assert(downloadCompleted('report.csv'), 'download_ok', true); ``` -### 6. Always Close Browser +## Debugging (fast) -```typescript -const browser = new SentienceBrowser(); - -try { - await browser.start(); - // ... your automation code -} finally { - await browser.close(); // Always clean up -} -``` - -
- ---- - -## 🛠️ Troubleshooting - -
-Click to expand common issues and solutions - -### "Extension failed to load" - -**Solution:** Build the extension first: +- **Manual driver CLI**: ```bash -cd sentience-chrome -./build.sh +npx sentience driver --url https://example.com ``` -### "Cannot use import statement outside a module" - -**Solution:** Don't use `node` directly. Use `ts-node` or npm scripts: - -```bash -npx ts-node examples/hello.ts -# or -npm run example:hello -``` - -### "Element not found" - -**Solutions:** - -- Ensure page is loaded: `await browser.getPage().waitForLoadState('networkidle')` -- Use `waitFor()`: `await waitFor(browser, 'role=button', 10000)` -- Debug elements: `console.log(snap.elements.map(el => el.text))` - -### Button not clickable - -**Solutions:** - -- Check visibility: `element.in_viewport && !element.is_occluded` -- Scroll to element: ``await browser.getPage().evaluate(`window.sentience_registry[${element.id}].scrollIntoView()`)`` - -
- ---- - -## 💻 Examples & Testing - -
-

Agent Layer Examples (Level 3 - Natural Language)

- -- **`agent-google-search.ts`** - Google search automation with natural language commands -- **`agent-amazon-shopping.ts`** - Amazon shopping bot (6 lines vs 350 lines manual code) -- **`agent-with-anthropic.ts`** - Using Anthropic Claude instead of OpenAI GPT -- **`agent-with-tracing.ts`** - Agent execution tracing for debugging and analysis - -
- -
-

Direct SDK Examples (Level 2 - Technical Control)

- -- **`hello.ts`** - Extension bridge verification -- **`basic-agent.ts`** - Basic snapshot and element inspection -- **`query-demo.ts`** - Query engine demonstrations -- **`wait-and-click.ts`** - Waiting for elements and performing actions -- **`read-markdown.ts`** - Content extraction and markdown conversion - -
- -
-

Running Examples

- -**⚠️ Important**: You cannot use `node` directly to run TypeScript files. Use one of these methods: - -### Option 1: Using npm scripts (recommended) - -```bash -npm run example:hello -npm run example:basic -npm run example:query -npm run example:wait -``` - -### Option 2: Using ts-node directly - -```bash -npx ts-node examples/hello.ts -# or if ts-node is installed globally: -ts-node examples/hello.ts -``` - -### Option 3: Compile then run - -```bash -npm run build -# Then use compiled JavaScript from dist/ -``` - -
- -
-

Testing

- -```bash -# Run all tests -npm test - -# Run with coverage -npm run test:coverage - -# Run specific test file -npm test -- snapshot.test.ts -``` - -
- ---- - -## 📖 Documentation - -- **📖 [Amazon Shopping Guide](../docs/AMAZON_SHOPPING_GUIDE.md)** - Complete tutorial with real-world example -- **📖 [Query DSL Guide](docs/QUERY_DSL.md)** - Advanced query patterns and operators -- **📄 [API Contract](../spec/SNAPSHOT_V1.md)** - Snapshot API specification -- **📄 [Type Definitions](../spec/sdk-types.md)** - TypeScript/Python type definitions - ---- - -## 📜 License +- **Verification + artifacts + debugging with time-travel traces (Sentience Studio demo)**: -This project is licensed under either of: + -- Apache License, Version 2.0, ([LICENSE-APACHE](./LICENSE-APACHE)) -- MIT license ([LICENSE-MIT](./LICENSE-MIT)) +If the video tag doesn’t render in your GitHub README view, use this link: [`sentience-studio-demo.mp4`](https://github.com/user-attachments/assets/7ffde43b-1074-4d70-bb83-2eb8d0469307) -at your option. +- **Sentience SDK Documentation**: https://www.sentienceapi.com/docs