diff --git a/package-lock.json b/package-lock.json index f44ab75..9f884aa 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "sentienceapi", - "version": "0.96.1", + "version": "0.99.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "sentienceapi", - "version": "0.96.1", + "version": "0.99.0", "license": "(MIT OR Apache-2.0)", "dependencies": { "canvas": "^3.2.1", diff --git a/src/actions.ts b/src/actions.ts index 07fbacc..f73d99e 100644 --- a/src/actions.ts +++ b/src/actions.ts @@ -4,7 +4,7 @@ import { IBrowser } from './protocols/browser-protocol'; import { ActionResult, Snapshot, BBox } from './types'; -import { snapshot } from './snapshot'; +import { snapshot, SnapshotOptions } from './snapshot'; import { BrowserEvaluator } from './utils/browser-evaluator'; import { CursorPolicy, buildHumanCursorPath } from './cursor-policy'; @@ -1008,6 +1008,153 @@ export async function press( }; } +function normalizeKeyToken(token: string): string { + const lookup: Record = { + CMD: 'Meta', + COMMAND: 'Meta', + CTRL: 'Control', + CONTROL: 'Control', + ALT: 'Alt', + OPTION: 'Alt', + SHIFT: 'Shift', + ESC: 'Escape', + ESCAPE: 'Escape', + ENTER: 'Enter', + RETURN: 'Enter', + TAB: 'Tab', + SPACE: 'Space', + }; + const upper = token.trim().toUpperCase(); + return lookup[upper] ?? token.trim(); +} + +function parseKeySequence(sequence: string): string[] { + const parts: string[] = []; + for (const rawPart of sequence.replace(/,/g, ' ').split(/\s+/)) { + let raw = rawPart.trim(); + if (!raw) continue; + if (raw.startsWith('{') && raw.endsWith('}')) { + raw = raw.slice(1, -1); + } + if (raw.includes('+')) { + const combo = raw + .split('+') + .filter(Boolean) + .map(token => normalizeKeyToken(token)) + .join('+'); + parts.push(combo); + } else { + parts.push(normalizeKeyToken(raw)); + } + } + return parts; +} + +/** + * Send a sequence of key presses (e.g., "CMD+H", "CTRL+SHIFT+P"). 
+ */ +export async function sendKeys( + browser: IBrowser, + sequence: string, + takeSnapshot: boolean = false, + delayMs: number = 50 +): Promise { + const page = browser.getPage(); + if (!page) { + throw new Error('Browser not started. Call start() first.'); + } + + const startTime = Date.now(); + const urlBefore = page.url(); + + const keys = parseKeySequence(sequence); + if (keys.length === 0) { + throw new Error('send_keys sequence is empty'); + } + for (const key of keys) { + await page.keyboard.press(key); + if (delayMs > 0) { + await page.waitForTimeout(delayMs); + } + } + + const durationMs = Date.now() - startTime; + const urlAfter = page.url(); + const urlChanged = urlBefore !== urlAfter; + const outcome = urlChanged ? 'navigated' : 'dom_updated'; + + let snapshotAfter: Snapshot | undefined; + if (takeSnapshot) { + snapshotAfter = await snapshot(browser); + } + + return { + success: true, + duration_ms: durationMs, + outcome, + url_changed: urlChanged, + snapshot_after: snapshotAfter, + }; +} + +function buildSearchUrl(query: string, engine: string): string { + const q = encodeURIComponent(query).replace(/%20/g, '+'); + const key = engine.trim().toLowerCase(); + if (key === 'duckduckgo' || key === 'ddg') { + return `https://duckduckgo.com/?q=${q}`; + } + if (key === 'google.com' || key === 'google') { + return `https://www.google.com/search?q=${q}`; + } + if (key === 'bing') { + return `https://www.bing.com/search?q=${q}`; + } + throw new Error(`unsupported search engine: ${engine}`); +} + +/** + * Navigate to a search results page for the given query. + */ +export async function search( + browser: IBrowser, + query: string, + engine: string = 'duckduckgo', + takeSnapshot: boolean = false, + snapshotOptions: SnapshotOptions | undefined = undefined +): Promise { + const page = browser.getPage(); + if (!page) { + throw new Error('Browser not started. 
Call start() first.'); + } + if (!query.trim()) { + throw new Error('search query is empty'); + } + + const startTime = Date.now(); + const urlBefore = page.url(); + const url = buildSearchUrl(query, engine); + await browser.goto(url); + await page.waitForLoadState('networkidle'); + + const durationMs = Date.now() - startTime; + const urlAfter = page.url(); + const urlChanged = urlBefore !== urlAfter; + const outcome = urlChanged ? 'navigated' : 'dom_updated'; + + let snapshotAfter: Snapshot | undefined; + if (takeSnapshot) { + snapshotAfter = await snapshot(browser, snapshotOptions); + } + + return { + success: true, + duration_ms: durationMs, + outcome, + url_changed: urlChanged, + snapshot_after: snapshotAfter, + }; +} + /** * Click at the center of a rectangle using Playwright's native mouse simulation. * This uses a hybrid approach: calculates center coordinates and uses mouse.click() diff --git a/src/agent-runtime.ts b/src/agent-runtime.ts index ca3aad9..0c8af02 100644 --- a/src/agent-runtime.ts +++ b/src/agent-runtime.ts @@ -41,12 +41,21 @@ import * as fs from 'fs'; import * as path from 'path'; import { Page } from 'playwright'; -import { Snapshot } from './types'; +import { + EvaluateJsRequest, + EvaluateJsResult, + BackendCapabilities, + Snapshot, + TabInfo, + TabListResult, + TabOperationResult, +} from './types'; import { AssertContext, Predicate } from './verification'; import { Tracer } from './tracing/tracer'; import { TraceEventBuilder } from './utils/trace-event-builder'; import { LLMProvider } from './llm-provider'; import { FailureArtifactBuffer, FailureArtifactsOptions } from './failure-artifacts'; +import type { ToolRegistry } from './tools/registry'; import { CaptchaContext, CaptchaHandlingError, @@ -329,9 +338,11 @@ export class AgentRuntime { /** Browser instance for taking snapshots */ readonly browser: BrowserLike; /** Playwright Page for browser interaction */ - readonly page: Page; + page: Page; /** Tracer for event emission */ readonly 
tracer: Tracer; + /** Optional ToolRegistry for LLM-callable tools */ + readonly toolRegistry?: ToolRegistry; /** Current step identifier */ stepId: string | null = null; @@ -343,6 +354,9 @@ export class AgentRuntime { private stepPreUrl: string | null = null; /** Best-effort download records (Playwright downloads) */ private downloads: Array> = []; + /** Tab registry for tab operations */ + private tabRegistry: Map = new Map(); + private tabIds: WeakMap = new WeakMap(); /** Failure artifact buffer (Phase 1) */ private artifactBuffer: FailureArtifactBuffer | null = null; @@ -380,6 +394,20 @@ export class AgentRuntime { return (2 * common) / (a2.length + b2.length + 1e-9); } + private static stringifyEvalValue(value: any): string { + if (value === null || value === undefined) { + return 'null'; + } + if (Array.isArray(value) || typeof value === 'object') { + try { + return JSON.stringify(value); + } catch { + return String(value); + } + } + return String(value); + } + _recordOutcome( outcome: ReturnType, label: string, @@ -443,10 +471,11 @@ export class AgentRuntime { * @param page - Playwright Page for browser interaction * @param tracer - Tracer for emitting verification events */ - constructor(browser: BrowserLike, page: Page, tracer: Tracer) { + constructor(browser: BrowserLike, page: Page, tracer: Tracer, toolRegistry?: ToolRegistry) { this.browser = browser; this.page = page; this.tracer = tracer; + this.toolRegistry = toolRegistry; // Best-effort download tracking (does not change behavior unless a download occurs). 
try { @@ -458,6 +487,28 @@ export class AgentRuntime { } } + capabilities(): BackendCapabilities { + const hasTabs = typeof (this as any).listTabs === 'function'; + const hasEval = typeof (this as any).evaluateJs === 'function'; + const hasKeyboard = Boolean((this.page as any)?.keyboard); + const hasDownloads = this.downloads.length >= 0; + let hasFiles = false; + if (this.toolRegistry) { + hasFiles = Boolean(this.toolRegistry.get('read_file')); + } + return { + tabs: hasTabs, + evaluate_js: hasEval, + downloads: hasDownloads, + filesystem_tools: hasFiles, + keyboard: hasKeyboard, + }; + } + + can(name: keyof BackendCapabilities): boolean { + return Boolean(this.capabilities()[name]); + } + /** * Configure CAPTCHA handling (disabled by default unless set). */ @@ -547,6 +598,174 @@ export class AgentRuntime { return this.lastSnapshot; } + /** + * Evaluate JavaScript in the page context. + */ + async evaluateJs(request: EvaluateJsRequest): Promise { + try { + const value = await this.page.evaluate(request.code); + const text = AgentRuntime.stringifyEvalValue(value); + const maxChars = request.max_output_chars ?? 4000; + const truncate = request.truncate ?? true; + let truncated = false; + let finalText = text; + if (truncate && finalText.length > maxChars) { + finalText = `${finalText.slice(0, maxChars)}...`; + truncated = true; + } + return { + ok: true, + value, + text: finalText, + truncated, + }; + } catch (err: any) { + return { ok: false, error: String(err?.message ?? err) }; + } + } + + /** + * List open tabs in the current browser context. 
+ */ + async listTabs(): Promise { + const context = (this.page as any)?.context?.(); + if (!context || typeof context.pages !== 'function') { + return { ok: false, tabs: [], error: 'unsupported_capability' }; + } + this.pruneTabs(); + const pages: Page[] = context.pages(); + const tabs: TabInfo[] = []; + for (const page of pages) { + const tab_id = this.ensureTabId(page); + let title: string | null = null; + try { + title = await page.title(); + } catch { + title = null; + } + let url: string | null = null; + try { + url = page.url(); + } catch { + url = null; + } + tabs.push({ tab_id, url, title, is_active: page === this.page }); + } + return { ok: true, tabs }; + } + + /** + * Open a new tab and navigate to the URL. + */ + async openTab(url: string): Promise { + const context = (this.page as any)?.context?.(); + if (!context || typeof context.newPage !== 'function') { + return { ok: false, error: 'unsupported_capability' }; + } + this.pruneTabs(); + try { + const page = await context.newPage(); + await page.goto(url); + this.page = page; + const tab_id = this.ensureTabId(page); + let title: string | null = null; + try { + title = await page.title(); + } catch { + title = null; + } + return { ok: true, tab: { tab_id, url: page.url?.() ?? url, title, is_active: true } }; + } catch (err: any) { + return { ok: false, error: String(err?.message ?? err) }; + } + } + + /** + * Switch to an existing tab by id. + */ + async switchTab(tab_id: string): Promise { + this.pruneTabs(); + const page = this.tabRegistry.get(tab_id); + if (!page) { + return { ok: false, error: `unknown tab_id: ${tab_id}` }; + } + this.page = page; + try { + await page.bringToFront(); + } catch { + // best-effort + } + let title: string | null = null; + try { + title = await page.title(); + } catch { + title = null; + } + return { + ok: true, + tab: { tab_id, url: page.url?.() ?? null, title, is_active: true }, + }; + } + + /** + * Close a tab by id. 
+ */ + async closeTab(tab_id: string): Promise { + this.pruneTabs(); + const page = this.tabRegistry.get(tab_id); + if (!page) { + return { ok: false, error: `unknown tab_id: ${tab_id}` }; + } + let title: string | null = null; + try { + title = await page.title(); + } catch { + title = null; + } + const wasActive = page === this.page; + try { + await page.close(); + } catch (err: any) { + return { ok: false, error: String(err?.message ?? err) }; + } + this.tabRegistry.delete(tab_id); + if (wasActive) { + const context = (page as any)?.context?.(); + const pages: Page[] = context?.pages?.() ?? []; + if (pages.length > 0) { + this.page = pages[0]; + } + } + return { + ok: true, + tab: { tab_id, url: page.url?.() ?? null, title, is_active: wasActive }, + }; + } + + private ensureTabId(page: Page): string { + const existing = this.tabIds.get(page); + if (existing) { + return existing; + } + const tab_id = `tab-${Date.now()}-${Math.random().toString(16).slice(2)}`; + this.tabIds.set(page, tab_id); + this.tabRegistry.set(tab_id, page); + return tab_id; + } + + private pruneTabs(): void { + for (const [tab_id, page] of this.tabRegistry.entries()) { + try { + const isClosed = (page as any).isClosed?.(); + if (isClosed) { + this.tabRegistry.delete(tab_id); + } + } catch { + // ignore + } + } + } + private isCaptchaDetected(snapshot: Snapshot): boolean { const options = this.captchaOptions; if (!options) { diff --git a/src/agent.ts b/src/agent.ts index c80706f..e50c1e4 100644 --- a/src/agent.ts +++ b/src/agent.ts @@ -5,7 +5,7 @@ import { SentienceBrowser } from './browser'; import { snapshot, SnapshotOptions } from './snapshot'; -import { Snapshot } from './types'; +import { Snapshot, StepHookContext } from './types'; import { LLMProvider, LLMResponse } from './llm-provider'; import { Tracer } from './tracing/tracer'; import { randomUUID } from 'crypto'; @@ -192,7 +192,9 @@ export class SentienceAgent { async act( goal: string, maxRetries: number = 2, - snapshotOptions?: 
SnapshotOptions + snapshotOptions?: SnapshotOptions, + onStepStart?: (ctx: StepHookContext) => void | Promise, + onStepEnd?: (ctx: StepHookContext) => void | Promise ): Promise { if (this.verbose) { console.log('\n' + '='.repeat(70)); @@ -205,11 +207,18 @@ export class SentienceAgent { const stepId = randomUUID(); // Emit step_start event + const pageRef = this.browser.getPage(); + const currentUrl = typeof pageRef?.url === 'function' ? pageRef.url() : 'unknown'; if (this.tracer) { - const page = this.browser.getPage(); - const currentUrl = page ? page.url() : 'unknown'; this.tracer.emitStepStart(stepId, this.stepCount, goal, 0, currentUrl); } + await this.runHook(onStepStart, { + stepId, + stepIndex: this.stepCount, + goal, + attempt: 0, + url: currentUrl, + }); // Track data collected during step execution for step_end emission on failure let stepSnapWithDiff: Snapshot | null = null; @@ -336,9 +345,10 @@ export class SentienceAgent { } // Emit step_end event if tracer is enabled + const postPage = this.browser.getPage(); + const postUrl = typeof postPage?.url === 'function' ? 
postPage.url() : null; if (this.tracer) { const preUrl = snap.url; - const postUrl = this.browser.getPage()?.url() || null; let postSnapshotDigest: string | undefined; try { const postSnap = await snapshot(this.browser, { @@ -371,6 +381,16 @@ export class SentienceAgent { this.tracer.emit('step_end', stepEndData, stepId); } + await this.runHook(onStepEnd, { + stepId, + stepIndex: this.stepCount, + goal, + attempt, + url: postUrl, + success: result.success, + outcome: result.outcome, + error: result.error, + }); return result; } catch (error: any) { // Emit error event @@ -388,7 +408,8 @@ export class SentienceAgent { // Emit step_end with whatever data we collected before failure // This ensures diff_status and other fields are preserved in traces if (this.tracer && stepSnapWithDiff) { - const postUrl = this.browser.getPage()?.url() || null; + const postPage = this.browser.getPage(); + const postUrl = typeof postPage?.url === 'function' ? postPage.url() : null; const durationMs = Date.now() - stepStartTime; const stepEndData = TraceEventBuilder.buildPartialStepEndData({ @@ -415,6 +436,16 @@ export class SentienceAgent { durationMs: 0, }; this.history.push(errorResult as any); + await this.runHook(onStepEnd, { + stepId, + stepIndex: this.stepCount, + goal, + attempt, + url: stepPreUrl, + success: false, + outcome: 'exception', + error: error.message, + }); throw new Error(`Failed after ${maxRetries} retries: ${error.message}`); } } @@ -491,4 +522,16 @@ export class SentienceAgent { getTracer(): Tracer | undefined { return this.tracer; } + + private async runHook( + hook: ((ctx: StepHookContext) => void | Promise) | undefined, + ctx: StepHookContext + ): Promise { + if (!hook) return; + try { + await Promise.resolve(hook(ctx)); + } catch { + // best-effort hook; ignore errors + } + } } diff --git a/src/browser.ts b/src/browser.ts index a3730ab..b554b8d 100644 --- a/src/browser.ts +++ b/src/browser.ts @@ -12,6 +12,63 @@ import { SnapshotOptions } from './snapshot'; 
import { IBrowser } from './protocols/browser-protocol'; import { snapshot as snapshotFunction } from './snapshot'; +export function normalizeDomain(domain: string): string { + const raw = domain.trim(); + let host = raw; + if (raw.includes('://')) { + try { + host = new URL(raw).hostname || ''; + } catch { + host = raw; + } + } else { + host = raw.split('/', 1)[0]; + } + host = host.split(':', 1)[0]; + return host.trim().toLowerCase().replace(/^\./, ''); +} + +export function domainMatches(host: string, pattern: string): boolean { + const hostNorm = normalizeDomain(host); + let pat = normalizeDomain(pattern); + if (pat.startsWith('*.')) { + pat = pat.slice(2); + } + return hostNorm === pat || hostNorm.endsWith(`.${pat}`); +} + +export function extractHost(url: string): string | null { + let raw = url.trim(); + if (!raw.includes('://')) { + raw = `https://${raw}`; + } + try { + const parsed = new URL(raw); + return parsed.hostname || null; + } catch { + return null; + } +} + +export function isDomainAllowed( + host: string | null, + allowed?: string[], + prohibited?: string[] +): boolean { + if (!host) return false; + if (prohibited && prohibited.length > 0) { + for (const pattern of prohibited) { + if (domainMatches(host, pattern)) { + return false; + } + } + } + if (allowed && allowed.length > 0) { + return allowed.some(pattern => domainMatches(host, pattern)); + } + return true; +} + export class SentienceBrowser implements IBrowser { private context: BrowserContext | null = null; private browser: Browser | null = null; @@ -28,6 +85,9 @@ export class SentienceBrowser implements IBrowser { private _recordVideoSize?: { width: number; height: number }; private _viewport?: { width: number; height: number }; private _deviceScaleFactor?: number; + private _allowedDomains?: string[]; + private _prohibitedDomains?: string[]; + private _keepAlive: boolean; /** * Create a new SentienceBrowser instance @@ -44,6 +104,9 @@ export class SentienceBrowser implements IBrowser { 
* @param deviceScaleFactor - Optional device scale factor to emulate high-DPI (Retina) screens. * Examples: 1.0 (default, standard DPI), 2.0 (Retina/high-DPI, like MacBook Pro), 3.0 (very high DPI) * If undefined, defaults to 1.0 (standard DPI). + * @param allowedDomains - Optional list of allowed domains for navigation. + * @param prohibitedDomains - Optional list of prohibited domains for navigation. + * @param keepAlive - Keep browser alive after close() (no teardown). */ constructor( apiKey?: string, @@ -55,7 +118,10 @@ export class SentienceBrowser implements IBrowser { recordVideoDir?: string, recordVideoSize?: { width: number; height: number }, viewport?: { width: number; height: number }, - deviceScaleFactor?: number + deviceScaleFactor?: number, + allowedDomains?: string[], + prohibitedDomains?: string[], + keepAlive: boolean = false ) { this._apiKey = apiKey; @@ -93,6 +159,9 @@ export class SentienceBrowser implements IBrowser { // Device scale factor for high-DPI emulation this._deviceScaleFactor = deviceScaleFactor; + this._allowedDomains = allowedDomains; + this._prohibitedDomains = prohibitedDomains; + this._keepAlive = keepAlive; } async start(): Promise { @@ -488,6 +557,10 @@ export class SentienceBrowser implements IBrowser { if (!page) { throw new Error('Browser not started. 
Call start() first.'); } + const host = extractHost(url); + if (!isDomainAllowed(host, this._allowedDomains, this._prohibitedDomains)) { + throw new Error(`domain not allowed: ${host}`); + } await page.goto(url, { waitUntil: 'domcontentloaded' }); if (!(await this.waitForExtension(page, 15000))) { @@ -782,6 +855,9 @@ export class SentienceBrowser implements IBrowser { } async close(outputPath?: string): Promise { + if (this._keepAlive) { + return null; + } let tempVideoPath: string | null = null; // Get video path before closing (if recording was enabled) diff --git a/src/cli.ts b/src/cli.ts index c5a86e9..5b7879d 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -2,10 +2,16 @@ * CLI commands for Sentience SDK */ +import * as fs from 'fs'; +import * as path from 'path'; +import * as readline from 'readline'; import { SentienceBrowser } from './browser'; import { inspect } from './inspector'; import { record, Recorder } from './recorder'; import { ScriptGenerator } from './generator'; +import { click, press, typeText } from './actions'; +import { snapshot, SnapshotOptions } from './snapshot'; +import { screenshot } from './screenshot'; async function cmdInspect(args: string[]) { // Parse proxy from args @@ -154,6 +160,212 @@ async function cmdGen(args: string[]) { } } +function printDriverHelp(): void { + console.log('\nCommands:'); + console.log(' open Navigate to URL'); + console.log(' state [limit] List clickable elements (optional limit)'); + console.log(' click Click element by id'); + console.log(' type Type text into element'); + console.log(' press Press a key (e.g., Enter)'); + console.log(' screenshot [path] Save screenshot (png/jpg)'); + console.log(' close Close browser and exit'); + console.log(' help Show this help'); +} + +function parseDriverLine(raw: string): string[] { + const tokens: string[] = []; + const re = /"([^"]*)"|'([^']*)'|[^\s]+/g; + let match: RegExpExecArray | null = null; + while ((match = re.exec(raw)) !== null) { + if (match[1] !== 
undefined) { + tokens.push(match[1]); + } else if (match[2] !== undefined) { + tokens.push(match[2]); + } else if (match[0]) { + tokens.push(match[0]); + } + } + return tokens; +} + +async function cmdDriver(args: string[]) { + let url: string | undefined; + let limit = 50; + let headless = false; + let proxy: string | undefined; + + for (let i = 0; i < args.length; i++) { + if (args[i] === '--url' && i + 1 < args.length) { + url = args[++i]; + } else if (args[i] === '--limit' && i + 1 < args.length) { + const parsed = Number(args[++i]); + if (!Number.isFinite(parsed) || parsed <= 0) { + console.error('āŒ --limit must be a positive number'); + process.exit(1); + } + limit = Math.floor(parsed); + } else if (args[i] === '--headless') { + headless = true; + } else if (args[i] === '--proxy' && i + 1 < args.length) { + proxy = args[++i]; + } + } + + const browser = new SentienceBrowser(undefined, undefined, headless, proxy); + try { + await browser.start(); + if (url) { + const page = browser.getPage(); + if (!page) throw new Error('Browser not started. Call start() first.'); + await page.goto(url); + await page.waitForLoadState('networkidle'); + } + + console.log("āœ… Manual driver started. 
Type 'help' for commands."); + const rl = readline.createInterface({ + input: process.stdin, + output: process.stdout, + terminal: true, + }); + let closed = false; + + rl.on('close', () => { + closed = true; + }); + rl.on('SIGINT', () => { + console.log('\nšŸ‘‹ Exiting manual driver.'); + rl.close(); + }); + + const ask = async (): Promise => { + const raw = await new Promise(resolve => rl.question('sentience> ', resolve)); + const trimmed = raw.trim(); + if (!trimmed) { + return; + } + + const parts = parseDriverLine(trimmed); + if (parts.length === 0) return; + const cmd = parts[0].toLowerCase(); + const cmdArgs = parts.slice(1); + + if (cmd === 'help' || cmd === '?') { + printDriverHelp(); + return; + } + + if (cmd === 'open') { + if (cmdArgs.length < 1) { + console.log('āŒ Usage: open '); + return; + } + const target = cmdArgs[0]; + const page = browser.getPage(); + if (!page) throw new Error('Browser not started. Call start() first.'); + await page.goto(target); + await page.waitForLoadState('networkidle'); + console.log(`āœ… Opened ${target}`); + return; + } + + if (cmd === 'state') { + let currentLimit = limit; + if (cmdArgs.length > 0) { + const parsed = Number(cmdArgs[0]); + if (!Number.isFinite(parsed) || parsed <= 0) { + console.log('āŒ Usage: state [limit]'); + return; + } + currentLimit = Math.floor(parsed); + } + const snapOpts: SnapshotOptions = { limit: currentLimit }; + const snap = await snapshot(browser, snapOpts); + const clickables = snap.elements.filter(el => el.visual_cues?.is_clickable); + console.log(`URL: ${snap.url}`); + console.log(`Clickable elements: ${clickables.length}`); + for (const el of clickables) { + let text = (el.text || '').replace(/\n/g, ' ').trim(); + if (text.length > 60) { + text = `${text.slice(0, 57)}...`; + } + console.log(`- id=${el.id} role=${el.role} text='${text}'`); + } + return; + } + + if (cmd === 'click') { + if (cmdArgs.length !== 1) { + console.log('āŒ Usage: click '); + return; + } + const elementId = 
Number(cmdArgs[0]); + if (!Number.isFinite(elementId)) { + console.log('āŒ element_id must be a number'); + return; + } + await click(browser, elementId); + console.log(`āœ… Clicked element ${elementId}`); + return; + } + + if (cmd === 'type') { + if (cmdArgs.length < 2) { + console.log('āŒ Usage: type '); + return; + } + const elementId = Number(cmdArgs[0]); + if (!Number.isFinite(elementId)) { + console.log('āŒ element_id must be a number'); + return; + } + const text = cmdArgs.slice(1).join(' '); + await typeText(browser, elementId, text); + console.log(`āœ… Typed into element ${elementId}`); + return; + } + + if (cmd === 'press') { + if (cmdArgs.length !== 1) { + console.log('āŒ Usage: press (e.g., "Enter")'); + return; + } + await press(browser, cmdArgs[0]); + console.log(`āœ… Pressed ${cmdArgs[0]}`); + return; + } + + if (cmd === 'screenshot') { + let outPath = cmdArgs[0]; + if (!outPath) { + outPath = `screenshot-${Date.now()}.png`; + } + const ext = path.extname(outPath).toLowerCase(); + const format = ext === '.jpg' || ext === '.jpeg' ? 'jpeg' : 'png'; + const dataUrl = await screenshot(browser, { format }); + const comma = dataUrl.indexOf(','); + const base64 = comma >= 0 ? dataUrl.slice(comma + 1) : dataUrl; + fs.writeFileSync(outPath, Buffer.from(base64, 'base64')); + console.log(`āœ… Saved screenshot to ${outPath}`); + return; + } + + if (cmd === 'close' || cmd === 'exit' || cmd === 'quit') { + console.log('šŸ‘‹ Closing browser.'); + rl.close(); + return; + } + + console.log(`āŒ Unknown command: ${cmd}. 
Type 'help' for options.`); + }; + + while (!closed) { + await ask(); + } + } finally { + await browser.close(); + } +} + async function main() { const args = process.argv.slice(2); const command = args[0]; @@ -164,6 +376,8 @@ async function main() { await cmdRecord(args.slice(1)); } else if (command === 'gen') { await cmdGen(args.slice(1)); + } else if (command === 'driver') { + await cmdDriver(args.slice(1)); } else { console.log('Usage: sentience [options]'); console.log(''); @@ -171,6 +385,7 @@ async function main() { console.log(' inspect Start inspector mode'); console.log(' record [--url URL] Start recording mode'); console.log(' gen Generate script from trace'); + console.log(' driver [--url URL] Manual driver CLI'); console.log(''); console.log('Options:'); console.log( @@ -185,6 +400,7 @@ async function main() { ' sentience record --proxy http://user:pass@proxy.com:8000 --url https://example.com' ); console.log(' sentience gen trace.json --lang py --output script.py'); + console.log(' sentience driver --url https://example.com'); process.exit(1); } } diff --git a/src/index.ts b/src/index.ts index 6359b7d..86a62d9 100644 --- a/src/index.ts +++ b/src/index.ts @@ -13,8 +13,10 @@ export { clickRect, ClickRect, press, + search, scrollTo, selectOption, + sendKeys, submit, typeText, uncheck, @@ -26,7 +28,7 @@ export { expect, Expectation } from './expect'; export { Inspector, inspect } from './inspector'; export { Recorder, Trace, TraceStep, record } from './recorder'; export { ScriptGenerator, generate } from './generator'; -export { read, ReadOptions, ReadResult } from './read'; +export { read, extract, ReadOptions, ReadResult } from './read'; export { screenshot, ScreenshotOptions } from './screenshot'; export { showOverlay, clearOverlay } from './overlay'; export { findTextRect } from './textSearch'; @@ -90,6 +92,7 @@ export type { RuntimeStep, StepVerification } from './runtime-agent'; export { parseVisionExecutorAction, executeVisionExecutorAction } from 
'./vision-executor'; export * from './captcha/types'; export * from './captcha/strategies'; +export * from './tools'; // Ordinal Support (Phase 3) export { diff --git a/src/read.ts b/src/read.ts index 6ef710a..b3dcaba 100644 --- a/src/read.ts +++ b/src/read.ts @@ -2,9 +2,13 @@ * Read page content - supports raw HTML, text, and markdown formats */ +import { ZodTypeAny } from 'zod'; import { SentienceBrowser } from './browser'; import TurndownService from 'turndown'; import { BrowserEvaluator } from './utils/browser-evaluator'; +import { LLMProvider } from './llm-provider'; +import type { ExtractResult } from './types'; +import { zodToJsonSchema } from './utils/zod'; export interface ReadOptions { format?: 'raw' | 'text' | 'markdown'; @@ -20,6 +24,18 @@ export interface ReadResult { error?: string; } +function extractJsonPayload(text: string): Record { + const fenced = text.match(/```json\s*(\{[\s\S]*?\})\s*```/i); + if (fenced && fenced[1]) { + return JSON.parse(fenced[1]); + } + const inline = text.match(/(\{[\s\S]*\})/); + if (inline && inline[1]) { + return JSON.parse(inline[1]); + } + return JSON.parse(text); +} + /** * Read page content as raw HTML, text, or markdown * @@ -114,3 +130,41 @@ export async function read( return result; } + +/** + * Extract structured data from the current page using read() markdown + LLM. + */ +export async function extract( + browser: SentienceBrowser, + llm: LLMProvider, + query: string, + schema?: ZodTypeAny, + maxChars: number = 12000 +): Promise { + const result = await read(browser, { format: 'markdown', enhanceMarkdown: true }); + if (result.status !== 'success') { + return { ok: false, error: result.error ?? 'read failed' }; + } + + const content = result.content.slice(0, maxChars); + const schemaDesc = schema ? JSON.stringify(zodToJsonSchema(schema)) : ''; + const system = 'You extract structured data from markdown content. Return only JSON. 
No prose.'; + const user = `QUERY:\n${query}\n\nSCHEMA:\n${schemaDesc}\n\nCONTENT:\n${content}`; + const response = await llm.generate(system, user); + const raw = response.content.trim(); + + if (!schema) { + return { ok: true, data: { text: raw }, raw }; + } + + try { + const payload = extractJsonPayload(raw); + const parsed = schema.safeParse(payload); + if (!parsed.success) { + return { ok: false, error: parsed.error.message, raw }; + } + return { ok: true, data: parsed.data, raw }; + } catch (err: any) { + return { ok: false, error: String(err?.message ?? err), raw }; + } +} diff --git a/src/runtime-agent.ts b/src/runtime-agent.ts index 1f1837c..d72aa54 100644 --- a/src/runtime-agent.ts +++ b/src/runtime-agent.ts @@ -10,7 +10,7 @@ import { AgentRuntime } from './agent-runtime'; import { LLMProvider } from './llm-provider'; import { LLMInteractionHandler } from './utils/llm-interaction-handler'; -import type { Snapshot, Element, BBox } from './types'; +import type { Snapshot, Element, BBox, StepHookContext } from './types'; import type { Predicate } from './verification'; export interface StepVerification { @@ -74,17 +74,33 @@ export class RuntimeAgent { this.structuredLLM = new LLMInteractionHandler(this.executor, false); } - async runStep(opts: { taskGoal: string; step: RuntimeStep }): Promise { - const { taskGoal, step } = opts; - this.runtime.beginStep(step.goal); + async runStep(opts: { + taskGoal: string; + step: RuntimeStep; + onStepStart?: (ctx: StepHookContext) => void | Promise; + onStepEnd?: (ctx: StepHookContext) => void | Promise; + }): Promise { + const { taskGoal, step, onStepStart, onStepEnd } = opts; + const stepId = this.runtime.beginStep(step.goal); + + await this.runHook(onStepStart, { + stepId, + stepIndex: this.runtime.stepIndex, + goal: step.goal, + attempt: 0, + url: this.runtime.lastSnapshot?.url ?? this.runtime.page?.url?.() ?? 
null, + }); let ok = false; let emitted = false; + let outcome: string | null = null; + let errorMessage: string | null = null; try { const snap = await this.snapshotWithRamp(step); if (await this.shouldShortCircuitToVision(step, snap)) { ok = await this.visionExecutorAttempt({ taskGoal, step, snap }); + outcome = ok ? 'ok' : 'verification_failed'; return ok; } @@ -92,22 +108,29 @@ export class RuntimeAgent { const action = await this.proposeStructuredAction({ taskGoal, step, snap }); await this.executeAction(action, snap); ok = await this.applyVerifications(step); - if (ok) return true; + if (ok) { + outcome = 'ok'; + return true; + } // 2) Optional vision executor fallback (bounded). const enabled = step.visionExecutorEnabled ?? true; const maxAttempts = step.maxVisionExecutorAttempts ?? 1; if (enabled && maxAttempts > 0) { ok = await this.visionExecutorAttempt({ taskGoal, step, snap }); + outcome = ok ? 'ok' : 'verification_failed'; return ok; } + outcome = 'verification_failed'; return false; } catch (error: any) { + errorMessage = String(error?.message ?? error); + outcome = 'exception'; this.runtime.emitStepEnd({ success: false, verifyPassed: false, - error: String(error?.message ?? error), + error: errorMessage, outcome: 'exception', }); emitted = true; @@ -120,6 +143,28 @@ export class RuntimeAgent { outcome: ok ? 'ok' : 'verification_failed', }); } + await this.runHook(onStepEnd, { + stepId, + stepIndex: this.runtime.stepIndex, + goal: step.goal, + attempt: 0, + url: this.runtime.lastSnapshot?.url ?? this.runtime.page?.url?.() ?? 
/**
 * Error thrown when a tool needs a backend capability that is not available.
 *
 * Carries a machine-readable `error` code plus the capability name so callers
 * can branch on the failure without parsing the message.
 */
export class UnsupportedCapabilityError extends Error {
  /** Stable machine-readable error code. */
  readonly error = 'unsupported_capability';
  /** Human-readable explanation; always identical to `message`. */
  readonly detail: string;
  /** Name of the capability that was requested. */
  readonly capability: string;

  /**
   * @param capability - Name of the missing capability.
   * @param detail - Optional custom message; defaults to
   *   `"<capability> not supported by backend"`.
   */
  constructor(capability: string, detail?: string) {
    const message = detail ?? `${capability} not supported by backend`;
    super(message);
    // Without an explicit name, stack traces and String(err) report a plain "Error".
    this.name = 'UnsupportedCapabilityError';
    // Keeps `instanceof` working when the class is down-levelled to ES5.
    Object.setPrototypeOf(this, new.target.prototype);
    this.detail = message;
    this.capability = capability;
  }
}
new FileSandbox(root); + } + + capabilities(): BackendCapabilities { + return this.runtime.capabilities(); + } + + can(name: keyof BackendCapabilities): boolean { + return Boolean(this.capabilities()[name]); + } + + require(name: keyof BackendCapabilities): void { + if (!this.can(name)) { + throw new UnsupportedCapabilityError(name); + } + } +} diff --git a/src/tools/defaults.ts b/src/tools/defaults.ts new file mode 100644 index 0000000..ab32340 --- /dev/null +++ b/src/tools/defaults.ts @@ -0,0 +1,346 @@ +import { z } from 'zod'; +import type { AgentRuntime } from '../agent-runtime'; +import type { ActionResult, Snapshot, EvaluateJsResult } from '../types'; +import { ToolContext } from './context'; +import { defineTool, ToolRegistry } from './registry'; + +const snapshotSchema = z + .object({ + status: z.enum(['success', 'error']), + url: z.string(), + elements: z.array(z.any()), + }) + .passthrough(); + +const actionResultSchema = z + .object({ + success: z.boolean(), + duration_ms: z.number(), + outcome: z.enum(['navigated', 'dom_updated', 'no_change', 'error']).optional(), + url_changed: z.boolean().optional(), + snapshot_after: z.any().optional(), + cursor: z.record(z.any()).optional(), + error: z + .object({ + code: z.string(), + reason: z.string(), + recovery_hint: z.string().optional(), + }) + .optional(), + }) + .passthrough(); + +const evaluateJsOutput = z + .object({ + ok: z.boolean(), + value: z.any().optional(), + text: z.string().nullable().optional(), + truncated: z.boolean().optional(), + error: z.string().nullable().optional(), + }) + .passthrough(); + +const snapshotInput = z.object({ + limit: z.number().int().min(1).max(500).default(50), +}); + +const clickInput = z.object({ + element_id: z.number().int().min(1), +}); + +const typeInput = z.object({ + element_id: z.number().int().min(1), + text: z.string().min(1), + clear_first: z.boolean().default(false), +}); + +const scrollInput = z.object({ + delta_y: z.number(), + x: z.number().optional(), + 
/**
 * Pick the AgentRuntime a tool handler should operate on.
 *
 * A truthy per-call `ctx` always wins. Otherwise the registration-time
 * `runtime` argument is used, which may be either a ToolContext wrapper or a
 * bare AgentRuntime.
 *
 * @throws Error when neither a context nor a runtime was supplied.
 */
function getRuntime(ctx: ToolContext | null, runtime?: ToolContext | AgentRuntime): AgentRuntime {
  if (ctx) {
    return ctx.runtime;
  }
  if (!runtime) {
    throw new Error('ToolContext with runtime is required');
  }
  // Unwrap a ToolContext passed at registration time; a bare runtime is used as-is.
  return runtime instanceof ToolContext ? runtime.runtime : runtime;
}
/**
 * Compute the centre point of a bounding box.
 *
 * @param bbox - Rectangle given by its `x`/`y` origin and `width`/`height`.
 * @returns The centre as an `[x, y]` tuple.
 */
function bboxCenter(bbox: {
  x: number;
  y: number;
  width: number;
  height: number;
}): [number, number] {
  const { x, y, width, height } = bbox;
  const centerX = x + width / 2;
  const centerY = y + height / 2;
  return [centerX, centerY];
}
(await runtimeRef.snapshot({ goal: 'tool_click' })); + if (!snap) throw new Error('snapshot() returned null'); + const el = snap.elements.find(e => e.id === params.element_id); + if (!el) throw new Error(`element_id not found: ${params.element_id}`); + const [x, y] = bboxCenter(el.bbox); + const start = Date.now(); + const urlBefore = page.url(); + await page.mouse.click(x, y); + try { + await page.waitForTimeout(250); + } catch { + /* best-effort */ + } + const urlAfter = page.url(); + return { + success: true, + duration_ms: Date.now() - start, + ...buildOutcome(urlBefore, urlAfter, true), + }; + }, + }) + ); + + registry.register( + defineTool< + { element_id: number; text: string; clear_first: boolean }, + ActionResult, + ToolContext | null + >({ + name: 'type', + description: 'Type text into an element by id from the latest snapshot.', + input: typeInput, + output: actionResultSchema, + handler: async (ctx, params): Promise => { + const runtimeRef = getRuntime(ctx, runtime); + const page = runtimeRef.page; + const snap = runtimeRef.lastSnapshot ?? (await runtimeRef.snapshot({ goal: 'tool_type' })); + if (!snap) throw new Error('snapshot() returned null'); + const el = snap.elements.find(e => e.id === params.element_id); + if (!el) throw new Error(`element_id not found: ${params.element_id}`); + const [x, y] = bboxCenter(el.bbox); + const start = Date.now(); + const urlBefore = page.url(); + await page.mouse.click(x, y); + if (params.clear_first) { + const selectAll = process.platform === 'darwin' ? 
'Meta+A' : 'Control+A'; + await page.keyboard.press(selectAll); + await page.keyboard.press('Backspace'); + } + await page.keyboard.type(params.text); + const urlAfter = page.url(); + return { + success: true, + duration_ms: Date.now() - start, + ...buildOutcome(urlBefore, urlAfter, true), + }; + }, + }) + ); + + registry.register( + defineTool<{ delta_y: number; x?: number; y?: number }, ActionResult, ToolContext | null>({ + name: 'scroll', + description: 'Scroll the page by a delta amount.', + input: scrollInput, + output: actionResultSchema, + handler: async (ctx, params): Promise => { + const runtimeRef = getRuntime(ctx, runtime); + const page = runtimeRef.page; + const start = Date.now(); + const urlBefore = page.url(); + try { + if (page.mouse?.wheel) { + await page.mouse.wheel(params.x ?? 0, params.delta_y); + } else { + await page.evaluate( + ({ dx, dy }) => { + window.scrollBy(dx || 0, dy || 0); + }, + { dx: params.x ?? 0, dy: params.delta_y } + ); + } + } catch { + // best-effort + } + const urlAfter = page.url(); + return { + success: true, + duration_ms: Date.now() - start, + ...buildOutcome(urlBefore, urlAfter, true), + }; + }, + }) + ); + + registry.register( + defineTool< + { element_id: number; behavior: string; block: string }, + ActionResult, + ToolContext | null + >({ + name: 'scroll_to_element', + description: 'Scroll the page to bring an element into view.', + input: scrollToElementInput, + output: actionResultSchema, + handler: async (ctx, params): Promise => { + const runtimeRef = getRuntime(ctx, runtime); + const page = runtimeRef.page; + const start = Date.now(); + const urlBefore = page.url(); + await page.evaluate( + ({ id, behavior, block }) => { + const el = (window as any).sentience_registry?.[id]; + if (el && el.scrollIntoView) { + el.scrollIntoView({ behavior, block, inline: 'nearest' }); + } + }, + { id: params.element_id, behavior: params.behavior, block: params.block } + ); + const urlAfter = page.url(); + return { + success: 
true, + duration_ms: Date.now() - start, + ...buildOutcome(urlBefore, urlAfter, true), + }; + }, + }) + ); + + registry.register( + defineTool< + { x: number; y: number; width: number; height: number }, + ActionResult, + ToolContext | null + >({ + name: 'click_rect', + description: 'Click at the center of a rectangle.', + input: clickRectInput, + output: actionResultSchema, + handler: async (ctx, params): Promise => { + const runtimeRef = getRuntime(ctx, runtime); + const page = runtimeRef.page; + const start = Date.now(); + const urlBefore = page.url(); + const x = params.x + params.width / 2; + const y = params.y + params.height / 2; + await page.mouse.click(x, y); + const urlAfter = page.url(); + return { + success: true, + duration_ms: Date.now() - start, + ...buildOutcome(urlBefore, urlAfter, true), + }; + }, + }) + ); + + registry.register( + defineTool<{ key: string }, ActionResult, ToolContext | null>({ + name: 'press', + description: 'Press a key (e.g., Enter).', + input: pressInput, + output: actionResultSchema, + handler: async (ctx, params): Promise => { + const runtimeRef = getRuntime(ctx, runtime); + const page = runtimeRef.page; + const start = Date.now(); + const urlBefore = page.url(); + await page.keyboard.press(params.key); + const urlAfter = page.url(); + return { + success: true, + duration_ms: Date.now() - start, + ...buildOutcome(urlBefore, urlAfter, true), + }; + }, + }) + ); + + registry.register( + defineTool< + { code: string; max_output_chars: number; truncate: boolean }, + EvaluateJsResult, + ToolContext | null + >({ + name: 'evaluate_js', + description: 'Execute JavaScript in the browser context.', + input: evaluateJsInput, + output: evaluateJsOutput, + handler: async (ctx, params): Promise => { + const runtimeRef = getRuntime(ctx, runtime); + return await runtimeRef.evaluateJs({ + code: params.code, + max_output_chars: params.max_output_chars, + truncate: params.truncate, + }); + }, + }) + ); + + return registry; +} diff --git 
/**
 * Text-file access confined to a single root directory.
 *
 * Every path is interpreted relative to `baseDir` and rejected when it would
 * resolve outside of it.
 *
 * NOTE(review): the containment check is purely lexical; symlinks inside the
 * sandbox are not resolved, so a symlink pointing elsewhere is still followed.
 */
export class FileSandbox {
  /** Absolute path of the sandbox root. */
  readonly baseDir: string;

  /**
   * @param baseDir - Sandbox root; resolved to an absolute path and created
   *   (recursively) if it does not exist.
   */
  constructor(baseDir: string) {
    this.baseDir = path.resolve(baseDir);
    fs.mkdirSync(this.baseDir, { recursive: true });
  }

  /**
   * Resolve `relPath` against the sandbox root.
   *
   * @throws Error when the resolved path lies outside the sandbox root.
   */
  private resolvePath(relPath: string): string {
    const candidate = path.resolve(this.baseDir, relPath);
    const relative = path.relative(this.baseDir, candidate);
    // Only a leading ".." path SEGMENT escapes the root. A plain prefix test
    // would also reject legitimate names such as "..notes.txt".
    const escapes =
      relative === '..' || relative.startsWith(`..${path.sep}`) || path.isAbsolute(relative);
    if (escapes) {
      throw new Error('path escapes sandbox root');
    }
    return candidate;
  }

  /** Read a UTF-8 text file from the sandbox. */
  readText(relPath: string): string {
    return fs.readFileSync(this.resolvePath(relPath), 'utf-8');
  }

  /**
   * Write a UTF-8 text file, creating parent directories as needed.
   *
   * @param overwrite - When false, an existing file causes an error.
   * @returns Number of bytes written.
   * @throws Error when the file exists and `overwrite` is false.
   */
  writeText(relPath: string, content: string, overwrite: boolean = true): number {
    const target = this.resolvePath(relPath);
    if (fs.existsSync(target) && !overwrite) {
      throw new Error('file exists and overwrite is false');
    }
    fs.mkdirSync(path.dirname(target), { recursive: true });
    fs.writeFileSync(target, content, 'utf-8');
    return Buffer.byteLength(content, 'utf-8');
  }

  /**
   * Append UTF-8 text to a file, creating it and its parent directories if missing.
   *
   * @returns Number of bytes appended.
   */
  appendText(relPath: string, content: string): number {
    const target = this.resolvePath(relPath);
    fs.mkdirSync(path.dirname(target), { recursive: true });
    fs.appendFileSync(target, content, 'utf-8');
    return Buffer.byteLength(content, 'utf-8');
  }

  /**
   * Replace every occurrence of `oldText` with `newText` in a file.
   *
   * @returns Number of occurrences replaced.
   * @throws Error when `oldText` is empty, or when the file cannot be read.
   */
  replaceText(relPath: string, oldText: string, newText: string): number {
    const target = this.resolvePath(relPath);
    if (oldText === '') {
      // Splitting on '' would splice newText between every character of the file.
      throw new Error('old text must not be empty');
    }
    const segments = fs.readFileSync(target, 'utf-8').split(oldText);
    const replaced = segments.length - 1;
    if (replaced > 0) {
      // Leave the file untouched when there is nothing to replace.
      fs.writeFileSync(target, segments.join(newText), 'utf-8');
    }
    return replaced;
  }
}
z.object({ + content: z.string(), +}); + +const writeFileInput = z.object({ + path: z.string().min(1), + content: z.string(), + overwrite: z.boolean().default(true), +}); + +const writeFileOutput = z.object({ + path: z.string(), + bytes_written: z.number(), +}); + +const appendFileInput = z.object({ + path: z.string().min(1), + content: z.string(), +}); + +const appendFileOutput = z.object({ + path: z.string(), + bytes_written: z.number(), +}); + +const replaceFileInput = z.object({ + path: z.string().min(1), + old: z.string(), + new: z.string(), +}); + +const replaceFileOutput = z.object({ + path: z.string(), + replaced: z.number(), +}); + +export function registerFilesystemTools( + registry: ToolRegistry, + sandbox?: FileSandbox +): ToolRegistry { + const getFiles = (ctx: ToolContext | null) => { + if (ctx) return ctx.files; + if (sandbox) return sandbox; + throw new Error('FileSandbox is required for filesystem tools'); + }; + + registry.register( + defineTool<{ path: string }, { content: string }, ToolContext | null>({ + name: 'read_file', + description: 'Read a file from the sandbox.', + input: readFileInput, + output: readFileOutput, + handler: (ctx, params) => { + const files = getFiles(ctx); + return { content: files.readText(params.path) }; + }, + }) + ); + + registry.register( + defineTool< + { path: string; content: string; overwrite: boolean }, + { path: string; bytes_written: number }, + ToolContext | null + >({ + name: 'write_file', + description: 'Write a file to the sandbox.', + input: writeFileInput, + output: writeFileOutput, + handler: (ctx, params) => { + const files = getFiles(ctx); + const bytes = files.writeText(params.path, params.content, params.overwrite); + return { path: params.path, bytes_written: bytes }; + }, + }) + ); + + registry.register( + defineTool< + { path: string; content: string }, + { path: string; bytes_written: number }, + ToolContext | null + >({ + name: 'append_file', + description: 'Append text to a file in the 
/**
 * In-memory catalogue of tool specifications keyed by tool name.
 *
 * Validates tool inputs and outputs against each spec's schemas and, when the
 * call context exposes a tracer, emits a `tool_call` event per execution.
 */
export class ToolRegistry {
  private tools = new Map<string, ToolSpec<any, any, any>>();

  /**
   * Add a tool to the registry.
   *
   * @returns This registry, so registrations can be chained.
   * @throws Error when a tool with the same name is already registered.
   */
  register<TInput, TOutput, TContext>(spec: ToolSpec<TInput, TOutput, TContext>): ToolRegistry {
    const { name } = spec;
    if (this.tools.has(name)) {
      throw new Error(`tool already registered: ${name}`);
    }
    this.tools.set(name, spec);
    return this;
  }

  /** Look up a tool by name; `undefined` when it is not registered. */
  get(name: string): ToolSpec<any, any, any> | undefined {
    return this.tools.get(name);
  }

  /** All registered tools, in registration order. */
  list(): ToolSpec<any, any, any>[] {
    const specs: ToolSpec<any, any, any>[] = [];
    this.tools.forEach(spec => specs.push(spec));
    return specs;
  }

  /**
   * Describe every tool as `{ name, description, parameters }`.
   * Explicit `parameters` on a spec take precedence over the schema derived
   * from its input validator.
   */
  llmTools(): Array<{ name: string; description: string; parameters: Record<string, any> }> {
    return this.list().map(({ name, description, parameters, input }) => ({
      name,
      description: description ?? '',
      parameters: parameters ?? zodToJsonSchema(input),
    }));
  }

  /**
   * Parse `payload` with the tool's input schema.
   *
   * @throws Error when the tool is unknown or the payload is invalid.
   */
  validateInput<TInput>(name: string, payload: unknown): TInput {
    const outcome = this.specOrThrow(name).input.safeParse(payload);
    if (outcome.success) {
      return outcome.data as TInput;
    }
    throw new Error(outcome.error.message);
  }

  /**
   * Parse `payload` with the tool's output schema.
   *
   * @throws Error when the tool is unknown or the payload is invalid.
   */
  validateOutput<TOutput>(name: string, payload: unknown): TOutput {
    const outcome = this.specOrThrow(name).output.safeParse(payload);
    if (outcome.success) {
      return outcome.data as TOutput;
    }
    throw new Error(outcome.error.message);
  }

  /**
   * Resolve a tool and validate its input in one step.
   *
   * @throws Error when the tool is unknown or the payload is invalid.
   */
  validateCall<TInput>(
    name: string,
    payload: unknown
  ): { input: TInput; spec: ToolSpec<any, any, any> } {
    const spec = this.specOrThrow(name);
    const input = this.validateInput<TInput>(name, payload);
    return { input, spec };
  }

  /**
   * Validate the input, run the tool's handler and validate its output.
   *
   * When `ctx.runtime.tracer` is present, one `tool_call` event is emitted for
   * a successful call and one for a call that fails after input validation.
   * Failures are always re-thrown to the caller.
   *
   * @throws Error when the tool is unknown, has no handler, the input or
   *   output fails validation, or the handler itself throws.
   */
  async execute<TOutput>(
    name: string,
    payload: unknown,
    ctx: {
      runtime?: { tracer?: { emit: (...args: any[]) => void }; stepId?: string; step_id?: string };
    } | null = null
  ): Promise<TOutput> {
    const startedAt = Date.now();
    const { input, spec } = this.validateCall(name, payload);
    if (!spec.handler) {
      throw new Error(`tool has no handler: ${name}`);
    }

    const tracer = ctx?.runtime?.tracer;
    const stepId = ctx?.runtime?.stepId ?? ctx?.runtime?.step_id ?? null;

    // Shared emitter for the success and failure events; a no-op without a tracer.
    const trace = (fields: Record<string, unknown>): void => {
      if (!tracer) return;
      tracer.emit(
        'tool_call',
        { tool_name: name, inputs: input, ...fields, duration_ms: Date.now() - startedAt },
        stepId || undefined
      );
    };

    try {
      const raw = await spec.handler(ctx, input);
      const validated = this.validateOutput<TOutput>(name, raw);
      trace({ outputs: validated, success: true });
      return validated;
    } catch (err: any) {
      trace({ success: false, error: String(err?.message ?? err) });
      throw err;
    }
  }

  /** Fetch a registered spec or fail with the registry's "not found" error. */
  private specOrThrow(name: string): ToolSpec<any, any, any> {
    const spec = this.tools.get(name);
    if (!spec) {
      throw new Error(`tool not found: ${name}`);
    }
    return spec;
  }
}
/**
 * Convert a zod schema into a minimal JSON-Schema-like object.
 *
 * Handles string, number, boolean, literal, enum, array, object and union
 * schemas; any other schema kind yields `{}`. Optional and default wrappers
 * only affect an enclosing object's `required` list, while a nullable wrapper
 * adds a `{ type: 'null' }` alternative via `anyOf`.
 *
 * @param schema - The zod schema to describe.
 * @returns A plain object describing the schema.
 */
export function zodToJsonSchema(schema: ZodTypeAny): JsonSchema {
  const { schema: base, nullable } = unwrapSchema(schema);
  const typeName = base._def?.typeName as ZodFirstPartyTypeKind | undefined;

  let result: JsonSchema;
  switch (typeName) {
    case ZodFirstPartyTypeKind.ZodString:
      result = { type: 'string' };
      break;
    case ZodFirstPartyTypeKind.ZodNumber:
      result = { type: 'number' };
      break;
    case ZodFirstPartyTypeKind.ZodBoolean:
      result = { type: 'boolean' };
      break;
    case ZodFirstPartyTypeKind.ZodLiteral: {
      const literal = base._def.value;
      // `typeof null` is 'object', which is not the JSON Schema type of a null literal.
      result = { const: literal, type: literal === null ? 'null' : typeof literal };
      break;
    }
    case ZodFirstPartyTypeKind.ZodEnum:
      result = { type: 'string', enum: base._def.values };
      break;
    case ZodFirstPartyTypeKind.ZodArray:
      result = { type: 'array', items: zodToJsonSchema(base._def.type) };
      break;
    case ZodFirstPartyTypeKind.ZodObject: {
      const shape = base._def.shape();
      const properties: Record<string, JsonSchema> = {};
      const required: string[] = [];
      for (const [key, value] of Object.entries(shape)) {
        const property = value as ZodTypeAny;
        // Convert the property as declared, not its unwrapped core: the
        // recursive call does its own unwrapping, so a nullable property
        // keeps its `null` alternative instead of silently losing it.
        properties[key] = zodToJsonSchema(property);
        if (!unwrapSchema(property).optional) required.push(key);
      }
      result = { type: 'object', properties, required };
      break;
    }
    case ZodFirstPartyTypeKind.ZodUnion:
      result = {
        anyOf: base._def.options.map((opt: ZodTypeAny) => zodToJsonSchema(opt)),
      };
      break;
    default:
      result = {};
      break;
  }

  if (nullable) {
    return { anyOf: [result, { type: 'null' }] };
  }
  return result;
}
=> { + const browser = await createTestBrowser(); + try { + const page = getPageOrThrow(browser); + await page.goto('https://example.com'); + await page.waitForLoadState('networkidle', { timeout: 10000 }); + + const result = await search(browser, 'sentience sdk', 'duckduckgo'); + expect(result.success).toBe(true); + expect(result.duration_ms).toBeGreaterThan(0); + + expect((await search(browser, 'sentience sdk', 'google')).success).toBe(true); + expect((await search(browser, 'sentience sdk', 'bing')).success).toBe(true); + expect((await search(browser, 'sentience sdk', 'google.com')).success).toBe(true); + } finally { + await browser.close(); + } + }, 90000); + + it('should reject empty query', async () => { + const browser = await createTestBrowser(); + try { + const page = getPageOrThrow(browser); + await page.goto('https://example.com'); + await page.waitForLoadState('networkidle', { timeout: 10000 }); + + await expect(search(browser, '')).rejects.toThrow('empty'); + } finally { + await browser.close(); + } + }, 60000); + + it('should reject disallowed domains', async () => { + const browser = new SentienceBrowser( + undefined, + undefined, + true, + undefined, + undefined, + undefined, + undefined, + undefined, + undefined, + undefined, + ['example.com'] + ); + try { + await browser.start(); + const page = getPageOrThrow(browser); + await page.goto('https://example.com'); + await page.waitForLoadState('networkidle', { timeout: 10000 }); + + await expect(search(browser, 'sentience sdk', 'duckduckgo')).rejects.toThrow( + 'domain not allowed' + ); + } finally { + await browser.close(); + } + }, 60000); + }); + describe('scrollTo', () => { it('should scroll an element into view', async () => { const browser = await createTestBrowser(); diff --git a/tests/agent-runtime-tabs-evaluate.test.ts b/tests/agent-runtime-tabs-evaluate.test.ts new file mode 100644 index 0000000..913beec --- /dev/null +++ b/tests/agent-runtime-tabs-evaluate.test.ts @@ -0,0 +1,95 @@ +import { 
/**
 * Build an AgentRuntime for tests, backed by a no-op trace sink and a stub
 * browser whose `snapshot()` reports the page's current URL with no elements.
 */
function makeTestRuntime(page: any) {
  const tracer = new Tracer('test-run', new MockSink());
  const takeSnapshot = async () => ({
    status: 'success',
    url: page.url(),
    elements: [],
    timestamp: 't1',
  });
  return new AgentRuntime({ snapshot: takeSnapshot } as any, page as any, tracer);
}
/**
 * Create a jest-mocked page object for the tab tests.
 *
 * Closing the page flips `isClosed()` to true and removes the page from the
 * shared `pages` array.
 */
function makePage(url: string, pages: any[], context: any) {
  const page: any = {
    url: jest.fn().mockReturnValue(url),
    title: jest.fn().mockResolvedValue(`Title ${url}`),
    goto: jest.fn().mockResolvedValue(undefined),
    bringToFront: jest.fn().mockResolvedValue(undefined),
    isClosed: jest.fn().mockReturnValue(false),
    close: jest.fn().mockImplementation(async () => {
      page.isClosed.mockReturnValue(true);
      const position = pages.indexOf(page);
      if (position !== -1) {
        pages.splice(position, 1);
      }
    }),
    context: () => context,
    on: jest.fn(),
  };
  return page;
}
jest.fn().mockResolvedValue({ + success: true, + duration_ms: 150, + outcome: 'dom_updated', + url_changed: false, + } as ActionResult); + jest.spyOn(actionsModule, 'click').mockImplementation(mockClick); + + const started: StepHookContext[] = []; + const ended: StepHookContext[] = []; + + const result = await agent.act( + 'Click the button', + 0, + undefined, + ctx => { + started.push(ctx); + }, + ctx => { + ended.push(ctx); + } + ); + + expect(result.success).toBe(true); + expect(started).toHaveLength(1); + expect(ended).toHaveLength(1); + expect(started[0].goal).toBe('Click the button'); + expect(ended[0].success).toBe(true); + }); }); describe('token tracking', () => { diff --git a/tests/browser.test.ts b/tests/browser.test.ts index b3e7a34..393ecaa 100644 --- a/tests/browser.test.ts +++ b/tests/browser.test.ts @@ -2,7 +2,7 @@ * Test browser proxy support and Phase 2 features (viewport, from_existing, from_page) */ -import { SentienceBrowser } from '../src/browser'; +import { SentienceBrowser, domainMatches, extractHost, isDomainAllowed } from '../src/browser'; import { chromium, BrowserContext, Page } from 'playwright'; import * as fs from 'fs'; import * as os from 'os'; @@ -334,3 +334,69 @@ describe('Browser Proxy Support', () => { }, 30000); }); }); + +describe('Browser Domain Policies', () => { + it('should match domains with suffix rules', () => { + expect(domainMatches('sub.example.com', 'example.com')).toBe(true); + expect(domainMatches('example.com', 'example.com')).toBe(true); + expect(domainMatches('example.com', '*.example.com')).toBe(true); + expect(domainMatches('other.com', 'example.com')).toBe(false); + expect(domainMatches('example.com', 'https://example.com')).toBe(true); + expect(domainMatches('localhost', 'http://localhost:3000')).toBe(true); + }); + + it('should enforce allow/deny lists', () => { + expect(isDomainAllowed('a.example.com', ['example.com'], [])).toBe(true); + expect(isDomainAllowed('a.example.com', ['example.com'], 
['bad.com'])).toBe(true); + expect(isDomainAllowed('bad.example.com', [], ['example.com'])).toBe(false); + expect(isDomainAllowed('x.com', ['example.com'], [])).toBe(false); + expect(isDomainAllowed('example.com', ['https://example.com'], [])).toBe(true); + }); + + it('should extract host from ports', () => { + expect(extractHost('http://localhost:3000')).toBe('localhost'); + expect(extractHost('localhost:3000')).toBe('localhost'); + }); +}); + +describe('Browser keepAlive', () => { + it('should skip close when keepAlive is true', async () => { + const browser = new SentienceBrowser( + undefined, + undefined, + true, + undefined, + undefined, + undefined, + undefined, + undefined, + undefined, + undefined, + undefined, + undefined, + true + ); + + const dummyContext = { + closed: false, + close: jest.fn().mockImplementation(() => { + dummyContext.closed = true; + }), + }; + const dummyBrowser = { + closed: false, + close: jest.fn().mockImplementation(() => { + dummyBrowser.closed = true; + }), + }; + (browser as any).context = dummyContext; + (browser as any).browser = dummyBrowser; + (browser as any).extensionPath = null; + (browser as any).userDataDir = null; + + const result = await browser.close(); + expect(result).toBeNull(); + expect(dummyContext.closed).toBe(false); + expect(dummyBrowser.closed).toBe(false); + }); +}); diff --git a/tests/runtime-agent.test.ts b/tests/runtime-agent.test.ts index 2aba8c5..2cea20a 100644 --- a/tests/runtime-agent.test.ts +++ b/tests/runtime-agent.test.ts @@ -5,7 +5,7 @@ import { TraceSink } from '../src/tracing/sink'; import { MockPage } from './mocks/browser-mock'; import { LLMProvider } from '../src/llm-provider'; import type { LLMResponse } from '../src/llm-provider'; -import type { Element, Snapshot } from '../src/types'; +import type { Element, Snapshot, StepHookContext } from '../src/types'; import type { Predicate } from '../src/verification'; class MockSink extends TraceSink { @@ -345,4 +345,45 @@ describe('RuntimeAgent 
(runtime-backed agent)', () => { expect(vision.visionCalls.length).toBe(1); expect(page.mouseClickCalls).toEqual([{ x: 100, y: 200 }]); }); + + it('invokes step hooks once per step', async () => { + const sink = new MockSink(); + const tracer = new Tracer('run', sink); + const page = new MockPage('https://example.com/start') as any; + + const snapshots: Snapshot[] = [ + { + status: 'success', + url: 'https://example.com/start', + elements: [makeClickableElement(1)], + timestamp: 't1', + }, + ]; + + const browserLike = { + snapshot: async () => snapshots.shift() as Snapshot, + }; + + const runtime = new AgentRuntime(browserLike as any, page as any, tracer); + const executor = new ProviderStub(['CLICK(1)']); + const agent = new RuntimeAgent({ runtime, executor }); + + const started: StepHookContext[] = []; + const ended: StepHookContext[] = []; + + const ok = await agent.runStep({ + taskGoal: 'test', + step: { goal: 'click once', maxSnapshotAttempts: 1 }, + onStepStart: ctx => { + started.push(ctx); + }, + onStepEnd: ctx => { + ended.push(ctx); + }, + }); + + expect(ok).toBe(true); + expect(started).toHaveLength(1); + expect(ended).toHaveLength(1); + }); }); diff --git a/tests/tool-registry.test.ts b/tests/tool-registry.test.ts new file mode 100644 index 0000000..d2c34ff --- /dev/null +++ b/tests/tool-registry.test.ts @@ -0,0 +1,37 @@ +import fs from 'fs'; +import os from 'os'; +import path from 'path'; +import { z } from 'zod'; +import { defineTool, ToolRegistry } from '../src/tools/registry'; +import { FileSandbox, registerFilesystemTools } from '../src/tools/filesystem'; + +describe('ToolRegistry', () => { + it('validates and executes tools', async () => { + const registry = new ToolRegistry(); + registry.register( + defineTool<{ msg: string }, { msg: string }, null>({ + name: 'echo', + description: 'Echo input', + input: z.object({ msg: z.string() }), + output: z.object({ msg: z.string() }), + handler: async (_ctx, params) => ({ msg: params.msg }), + }) + ); + + 
const result = await registry.execute<{ msg: string }>('echo', { msg: 'hello' }); + expect(result.msg).toBe('hello'); + }); +}); + +describe('Filesystem tools', () => { + it('writes and reads from sandbox', async () => { + const tmpRoot = fs.mkdtempSync(path.join(os.tmpdir(), 'sentience-tools-')); + const sandbox = new FileSandbox(tmpRoot); + const registry = new ToolRegistry(); + registerFilesystemTools(registry, sandbox); + + await registry.execute('write_file', { path: 'note.txt', content: 'hi', overwrite: true }); + const result = await registry.execute<{ content: string }>('read_file', { path: 'note.txt' }); + expect(result.content).toBe('hi'); + }); +});