From 01fc4d9c058adae443bd5ae71d9261565cc4557d Mon Sep 17 00:00:00 2001 From: SentienceDEV Date: Wed, 21 Jan 2026 18:07:24 -0800 Subject: [PATCH 1/2] Agent with integrated Runtime for verification gate --- examples/runtime_agent_minimal.py | 89 +++++ sentience/__init__.py | 4 + sentience/agent_runtime.py | 2 +- sentience/extension/background.js | 6 +- sentience/extension/content.js | 18 +- sentience/extension/injected_api.js | 80 ++-- sentience/extension/pkg/sentience_core.js | 14 +- sentience/runtime_agent.py | 423 ++++++++++++++++++++++ tests/unit/test_runtime_agent.py | 338 +++++++++++++++++ 9 files changed, 914 insertions(+), 60 deletions(-) create mode 100644 examples/runtime_agent_minimal.py create mode 100644 sentience/runtime_agent.py create mode 100644 tests/unit/test_runtime_agent.py diff --git a/examples/runtime_agent_minimal.py b/examples/runtime_agent_minimal.py new file mode 100644 index 0000000..8595354 --- /dev/null +++ b/examples/runtime_agent_minimal.py @@ -0,0 +1,89 @@ +""" +Example: RuntimeAgent (AgentRuntime-backed) minimal demo. 
+ +This demonstrates the verification-first loop: +snapshot -> propose action (structured executor) -> execute -> verify (AgentRuntime predicates) + +Usage: + python examples/runtime_agent_minimal.py +""" + +import asyncio + +from sentience import AsyncSentienceBrowser +from sentience.agent_runtime import AgentRuntime +from sentience.llm_provider import LLMProvider, LLMResponse +from sentience.runtime_agent import RuntimeAgent, RuntimeStep, StepVerification +from sentience.tracing import JsonlTraceSink, Tracer +from sentience.verification import AssertContext, AssertOutcome, exists, url_contains + + +class FixedActionProvider(LLMProvider): + """A tiny in-process provider for examples/tests.""" + + def __init__(self, action: str): + super().__init__(model="fixed-action") + self._action = action + + def generate(self, system_prompt: str, user_prompt: str, **kwargs) -> LLMResponse: + _ = system_prompt, user_prompt, kwargs + return LLMResponse(content=self._action, model_name=self.model_name) + + def supports_json_mode(self) -> bool: + return False + + @property + def model_name(self) -> str: + return "fixed-action" + + +async def main() -> None: + # Local trace (viewable in Studio if uploaded later). + run_id = "runtime-agent-minimal" + tracer = Tracer(run_id=run_id, sink=JsonlTraceSink(f"traces/{run_id}.jsonl")) + + async with AsyncSentienceBrowser(headless=False) as browser: + page = await browser.new_page() + await page.goto("https://example.com") + await page.wait_for_load_state("networkidle") + + runtime = await AgentRuntime.from_sentience_browser(browser=browser, page=page, tracer=tracer) + + # Structured executor (for demo, we just return FINISH()). + executor = FixedActionProvider("FINISH()") + + agent = RuntimeAgent( + runtime=runtime, + executor=executor, + # vision_executor=... (optional) + # vision_verifier=... (optional, for AgentRuntime assertion vision fallback) + ) + + # One step: no action needed; we just verify structure + URL. 
+ def has_example_heading(ctx: AssertContext) -> AssertOutcome: + # Demonstrates custom predicates (you can also use exists/url_contains helpers). + snap = ctx.snapshot + ok = bool(snap and any((el.role == "heading" and (el.text or "").startswith("Example")) for el in snap.elements)) + return AssertOutcome(passed=ok, reason="" if ok else "missing heading", details={}) + + step = RuntimeStep( + goal="Confirm Example Domain page is loaded", + verifications=[ + StepVerification(predicate=url_contains("example.com"), label="url_contains_example", required=True), + StepVerification(predicate=exists("role=heading"), label="has_heading", required=True), + StepVerification(predicate=has_example_heading, label="heading_text_matches", required=False), + ], + max_snapshot_attempts=2, + snapshot_limit_base=60, + ) + + ok = await agent.run_step(task_goal="Open example.com and verify", step=step) + print(f"step ok: {ok}") + + tracer.close() + print(f"trace written to traces/{run_id}.jsonl") + + +if __name__ == "__main__": + asyncio.run(main()) + diff --git a/sentience/__init__.py b/sentience/__init__.py index 5da3711..b80d313 100644 --- a/sentience/__init__.py +++ b/sentience/__init__.py @@ -89,6 +89,7 @@ from .query import find, query from .read import read from .recorder import Recorder, Trace, TraceStep, record +from .runtime_agent import RuntimeAgent, RuntimeStep, StepVerification from .screenshot import screenshot from .sentience_methods import AgentAction, SentienceMethod from .snapshot import snapshot @@ -210,6 +211,9 @@ "MLXVLMProvider", "SentienceAgent", "SentienceAgentAsync", + "RuntimeAgent", + "RuntimeStep", + "StepVerification", "SentienceVisualAgent", "SentienceVisualAgentAsync", "ConversationalAgent", diff --git a/sentience/agent_runtime.py b/sentience/agent_runtime.py index 01ba2ff..bac9837 100644 --- a/sentience/agent_runtime.py +++ b/sentience/agent_runtime.py @@ -582,7 +582,7 @@ def assert_done( True if task is complete (assertion passed), False otherwise """ 
# Convenience wrapper for assert_ with required=True - ok = self.assert_(predicate, label=label, required=True) + ok = self.assert_(predicate, label=label, required=True) if ok: self._task_done = True self._task_done_label = label diff --git a/sentience/extension/background.js b/sentience/extension/background.js index 2923f55..aff49b0 100644 --- a/sentience/extension/background.js +++ b/sentience/extension/background.js @@ -28,14 +28,14 @@ async function handleSnapshotProcessing(rawData, options = {}) { const startTime = performance.now(); try { if (!Array.isArray(rawData)) throw new Error("rawData must be an array"); - if (rawData.length > 1e4 && (rawData = rawData.slice(0, 1e4)), await initWASM(), + if (rawData.length > 1e4 && (rawData = rawData.slice(0, 1e4)), await initWASM(), !wasmReady) throw new Error("WASM module not initialized"); let analyzedElements, prunedRawData; try { const wasmPromise = new Promise((resolve, reject) => { try { let result; - result = options.limit || options.filter ? analyze_page_with_options(rawData, options) : analyze_page(rawData), + result = options.limit || options.filter ? 
analyze_page_with_options(rawData, options) : analyze_page(rawData), resolve(result); } catch (e) { reject(e); @@ -101,4 +101,4 @@ initWASM().catch(err => {}), chrome.runtime.onMessage.addListener((request, send event.preventDefault(); }), self.addEventListener("unhandledrejection", event => { event.preventDefault(); -}); \ No newline at end of file +}); diff --git a/sentience/extension/content.js b/sentience/extension/content.js index b65cfb5..97923a2 100644 --- a/sentience/extension/content.js +++ b/sentience/extension/content.js @@ -82,7 +82,7 @@ if (!elements || !Array.isArray(elements)) return; removeOverlay(); const host = document.createElement("div"); - host.id = OVERLAY_HOST_ID, host.style.cssText = "\n position: fixed !important;\n top: 0 !important;\n left: 0 !important;\n width: 100vw !important;\n height: 100vh !important;\n pointer-events: none !important;\n z-index: 2147483647 !important;\n margin: 0 !important;\n padding: 0 !important;\n ", + host.id = OVERLAY_HOST_ID, host.style.cssText = "\n position: fixed !important;\n top: 0 !important;\n left: 0 !important;\n width: 100vw !important;\n height: 100vh !important;\n pointer-events: none !important;\n z-index: 2147483647 !important;\n margin: 0 !important;\n padding: 0 !important;\n ", document.body.appendChild(host); const shadow = host.attachShadow({ mode: "closed" @@ -94,15 +94,15 @@ let color; color = isTarget ? "#FF0000" : isPrimary ? "#0066FF" : "#00FF00"; const importanceRatio = maxImportance > 0 ? importance / maxImportance : .5, borderOpacity = isTarget ? 1 : isPrimary ? .9 : Math.max(.4, .5 + .5 * importanceRatio), fillOpacity = .2 * borderOpacity, borderWidth = isTarget ? 2 : isPrimary ? 
1.5 : Math.max(.5, Math.round(2 * importanceRatio)), hexOpacity = Math.round(255 * fillOpacity).toString(16).padStart(2, "0"), box = document.createElement("div"); - if (box.style.cssText = `\n position: absolute;\n left: ${bbox.x}px;\n top: ${bbox.y}px;\n width: ${bbox.width}px;\n height: ${bbox.height}px;\n border: ${borderWidth}px solid ${color};\n background-color: ${color}${hexOpacity};\n box-sizing: border-box;\n opacity: ${borderOpacity};\n pointer-events: none;\n `, + if (box.style.cssText = `\n position: absolute;\n left: ${bbox.x}px;\n top: ${bbox.y}px;\n width: ${bbox.width}px;\n height: ${bbox.height}px;\n border: ${borderWidth}px solid ${color};\n background-color: ${color}${hexOpacity};\n box-sizing: border-box;\n opacity: ${borderOpacity};\n pointer-events: none;\n `, importance > 0 || isPrimary) { const badge = document.createElement("span"); - badge.textContent = isPrimary ? `⭐${importance}` : `${importance}`, badge.style.cssText = `\n position: absolute;\n top: -18px;\n left: 0;\n background: ${color};\n color: white;\n font-size: 11px;\n font-weight: bold;\n padding: 2px 6px;\n font-family: Arial, sans-serif;\n border-radius: 3px;\n opacity: 0.95;\n white-space: nowrap;\n pointer-events: none;\n `, + badge.textContent = isPrimary ? 
`⭐${importance}` : `${importance}`, badge.style.cssText = `\n position: absolute;\n top: -18px;\n left: 0;\n background: ${color};\n color: white;\n font-size: 11px;\n font-weight: bold;\n padding: 2px 6px;\n font-family: Arial, sans-serif;\n border-radius: 3px;\n opacity: 0.95;\n white-space: nowrap;\n pointer-events: none;\n `, box.appendChild(badge); } if (isTarget) { const targetIndicator = document.createElement("span"); - targetIndicator.textContent = "🎯", targetIndicator.style.cssText = "\n position: absolute;\n top: -18px;\n right: 0;\n font-size: 16px;\n pointer-events: none;\n ", + targetIndicator.textContent = "🎯", targetIndicator.style.cssText = "\n position: absolute;\n top: -18px;\n right: 0;\n font-size: 16px;\n pointer-events: none;\n ", box.appendChild(targetIndicator); } shadow.appendChild(box); @@ -122,7 +122,7 @@ if (!grids || !Array.isArray(grids)) return; removeOverlay(); const host = document.createElement("div"); - host.id = OVERLAY_HOST_ID, host.style.cssText = "\n position: fixed !important;\n top: 0 !important;\n left: 0 !important;\n width: 100vw !important;\n height: 100vh !important;\n pointer-events: none !important;\n z-index: 2147483647 !important;\n margin: 0 !important;\n padding: 0 !important;\n ", + host.id = OVERLAY_HOST_ID, host.style.cssText = "\n position: fixed !important;\n top: 0 !important;\n left: 0 !important;\n width: 100vw !important;\n height: 100vh !important;\n pointer-events: none !important;\n z-index: 2147483647 !important;\n margin: 0 !important;\n padding: 0 !important;\n ", document.body.appendChild(host); const shadow = host.attachShadow({ mode: "closed" @@ -138,10 +138,10 @@ let labelText = grid.label ? 
`Grid ${grid.grid_id}: ${grid.label}` : `Grid ${grid.grid_id}`; grid.is_dominant && (labelText = `⭐ ${labelText} (dominant)`); const badge = document.createElement("span"); - if (badge.textContent = labelText, badge.style.cssText = `\n position: absolute;\n top: -18px;\n left: 0;\n background: ${color};\n color: white;\n font-size: 11px;\n font-weight: bold;\n padding: 2px 6px;\n font-family: Arial, sans-serif;\n border-radius: 3px;\n opacity: 0.95;\n white-space: nowrap;\n pointer-events: none;\n `, + if (badge.textContent = labelText, badge.style.cssText = `\n position: absolute;\n top: -18px;\n left: 0;\n background: ${color};\n color: white;\n font-size: 11px;\n font-weight: bold;\n padding: 2px 6px;\n font-family: Arial, sans-serif;\n border-radius: 3px;\n opacity: 0.95;\n white-space: nowrap;\n pointer-events: none;\n `, box.appendChild(badge), isTarget) { const targetIndicator = document.createElement("span"); - targetIndicator.textContent = "🎯", targetIndicator.style.cssText = "\n position: absolute;\n top: -18px;\n right: 0;\n font-size: 16px;\n pointer-events: none;\n ", + targetIndicator.textContent = "🎯", targetIndicator.style.cssText = "\n position: absolute;\n top: -18px;\n right: 0;\n font-size: 16px;\n pointer-events: none;\n ", box.appendChild(targetIndicator); } shadow.appendChild(box); @@ -155,7 +155,7 @@ let overlayTimeout = null; function removeOverlay() { const existing = document.getElementById(OVERLAY_HOST_ID); - existing && existing.remove(), overlayTimeout && (clearTimeout(overlayTimeout), + existing && existing.remove(), overlayTimeout && (clearTimeout(overlayTimeout), overlayTimeout = null); } -}(); \ No newline at end of file +}(); diff --git a/sentience/extension/injected_api.js b/sentience/extension/injected_api.js index 9230b8e..d8c46a4 100644 --- a/sentience/extension/injected_api.js +++ b/sentience/extension/injected_api.js @@ -103,9 +103,9 @@ const iframes = document.querySelectorAll("iframe"); for (const iframe of iframes) { 
const src = iframe.getAttribute("src") || "", title = iframe.getAttribute("title") || ""; - if (src) for (const [provider, hints] of Object.entries(CAPTCHA_IFRAME_HINTS)) matchHints(src, hints) && (hasIframeHit = !0, + if (src) for (const [provider, hints] of Object.entries(CAPTCHA_IFRAME_HINTS)) matchHints(src, hints) && (hasIframeHit = !0, providerSignals[provider] += 1, addEvidence(evidence.iframe_src_hits, truncateText(src, 120))); - if (title && matchHints(title, [ "captcha", "recaptcha" ]) && (hasContainerHit = !0, + if (title && matchHints(title, [ "captcha", "recaptcha" ]) && (hasContainerHit = !0, addEvidence(evidence.selector_hits, 'iframe[title*="captcha"]')), evidence.iframe_src_hits.length >= 5) break; } } catch (e) {} @@ -114,14 +114,14 @@ for (const script of scripts) { const src = script.getAttribute("src") || ""; if (src) { - for (const [provider, hints] of Object.entries(CAPTCHA_SCRIPT_HINTS)) matchHints(src, hints) && (hasScriptHit = !0, + for (const [provider, hints] of Object.entries(CAPTCHA_SCRIPT_HINTS)) matchHints(src, hints) && (hasScriptHit = !0, providerSignals[provider] += 1, addEvidence(evidence.selector_hits, `script[src*="${hints[0]}"]`)); if (evidence.selector_hits.length >= 5) break; } } } catch (e) {} for (const {selector: selector, provider: provider} of CAPTCHA_CONTAINER_SELECTORS) try { - document.querySelector(selector) && (hasContainerHit = !0, addEvidence(evidence.selector_hits, selector), + document.querySelector(selector) && (hasContainerHit = !0, addEvidence(evidence.selector_hits, selector), "unknown" !== provider && (providerSignals[provider] += 1)); } catch (e) {} const textSnippet = function() { @@ -139,7 +139,7 @@ } catch (e) {} try { let bodyText = document.body?.innerText || ""; - return !bodyText && document.body?.textContent && (bodyText = document.body.textContent), + return !bodyText && document.body?.textContent && (bodyText = document.body.textContent), truncateText(bodyText.replace(/\s+/g, " ").trim(), 2e3); 
} catch (e) { return ""; @@ -147,21 +147,21 @@ }(); if (textSnippet) { const lowerText = textSnippet.toLowerCase(); - for (const keyword of CAPTCHA_TEXT_KEYWORDS) lowerText.includes(keyword) && (hasKeywordHit = !0, + for (const keyword of CAPTCHA_TEXT_KEYWORDS) lowerText.includes(keyword) && (hasKeywordHit = !0, addEvidence(evidence.text_hits, keyword)); } try { const lowerUrl = (window.location?.href || "").toLowerCase(); - for (const hint of CAPTCHA_URL_HINTS) lowerUrl.includes(hint) && (hasUrlHit = !0, + for (const hint of CAPTCHA_URL_HINTS) lowerUrl.includes(hint) && (hasUrlHit = !0, addEvidence(evidence.url_hits, hint)); } catch (e) {} let confidence = 0; - hasIframeHit && (confidence += .7), hasContainerHit && (confidence += .5), hasScriptHit && (confidence += .5), - hasKeywordHit && (confidence += .3), hasUrlHit && (confidence += .2), confidence = Math.min(1, confidence), + hasIframeHit && (confidence += .7), hasContainerHit && (confidence += .5), hasScriptHit && (confidence += .5), + hasKeywordHit && (confidence += .3), hasUrlHit && (confidence += .2), confidence = Math.min(1, confidence), hasIframeHit && (confidence = Math.max(confidence, .8)), !hasKeywordHit || hasIframeHit || hasContainerHit || hasScriptHit || hasUrlHit || (confidence = Math.min(confidence, .4)); const detected = confidence >= .7; let providerHint = null; - return providerSignals.recaptcha > 0 ? providerHint = "recaptcha" : providerSignals.hcaptcha > 0 ? providerHint = "hcaptcha" : providerSignals.turnstile > 0 ? providerHint = "turnstile" : providerSignals.arkose > 0 ? providerHint = "arkose" : providerSignals.awswaf > 0 ? providerHint = "awswaf" : detected && (providerHint = "unknown"), + return providerSignals.recaptcha > 0 ? providerHint = "recaptcha" : providerSignals.hcaptcha > 0 ? providerHint = "hcaptcha" : providerSignals.turnstile > 0 ? providerHint = "turnstile" : providerSignals.arkose > 0 ? providerHint = "arkose" : providerSignals.awswaf > 0 ? 
providerHint = "awswaf" : detected && (providerHint = "unknown"), { detected: detected, provider_hint: providerHint, @@ -271,7 +271,7 @@ if (labelEl) { let text = ""; try { - if (text = (labelEl.innerText || "").trim(), !text && labelEl.textContent && (text = labelEl.textContent.trim()), + if (text = (labelEl.innerText || "").trim(), !text && labelEl.textContent && (text = labelEl.textContent.trim()), !text && labelEl.getAttribute) { const ariaLabel = labelEl.getAttribute("aria-label"); ariaLabel && (text = ariaLabel.trim()); @@ -466,7 +466,7 @@ }); const checkStable = () => { const timeSinceLastChange = Date.now() - lastChange, totalWait = Date.now() - startTime; - timeSinceLastChange >= quietPeriod || totalWait >= maxWait ? (observer.disconnect(), + timeSinceLastChange >= quietPeriod || totalWait >= maxWait ? (observer.disconnect(), resolve()) : setTimeout(checkStable, 50); }; checkStable(); @@ -492,7 +492,7 @@ }); const checkQuiet = () => { const timeSinceLastChange = Date.now() - lastChange, totalWait = Date.now() - startTime; - timeSinceLastChange >= quietPeriod || totalWait >= maxWait ? (quietObserver.disconnect(), + timeSinceLastChange >= quietPeriod || totalWait >= maxWait ? (quietObserver.disconnect(), resolve()) : setTimeout(checkQuiet, 50); }; checkQuiet(); @@ -607,7 +607,7 @@ }(el); let safeValue = null, valueRedacted = null; try { - if (void 0 !== el.value || el.getAttribute && null !== el.getAttribute("value")) if (isPasswordInput) safeValue = null, + if (void 0 !== el.value || el.getAttribute && null !== el.getAttribute("value")) if (isPasswordInput) safeValue = null, valueRedacted = "true"; else { const rawValue = void 0 !== el.value ? String(el.value) : String(el.getAttribute("value")); safeValue = rawValue.length > 200 ? 
rawValue.substring(0, 200) : rawValue, valueRedacted = "false"; @@ -734,8 +734,8 @@ const requestId = `iframe-${idx}-${Date.now()}`, timeout = setTimeout(() => { resolve(null); }, 5e3), listener = event => { - "SENTIENCE_IFRAME_SNAPSHOT_RESPONSE" === event.data?.type && event.data, "SENTIENCE_IFRAME_SNAPSHOT_RESPONSE" === event.data?.type && event.data?.requestId === requestId && (clearTimeout(timeout), - window.removeEventListener("message", listener), event.data.error ? resolve(null) : (event.data.snapshot, + "SENTIENCE_IFRAME_SNAPSHOT_RESPONSE" === event.data?.type && event.data, "SENTIENCE_IFRAME_SNAPSHOT_RESPONSE" === event.data?.type && event.data?.requestId === requestId && (clearTimeout(timeout), + window.removeEventListener("message", listener), event.data.error ? resolve(null) : (event.data.snapshot, resolve({ iframe: iframe, data: event.data.snapshot, @@ -751,7 +751,7 @@ ...options, collectIframes: !0 } - }, "*") : (clearTimeout(timeout), window.removeEventListener("message", listener), + }, "*") : (clearTimeout(timeout), window.removeEventListener("message", listener), resolve(null)); } catch (error) { clearTimeout(timeout), window.removeEventListener("message", listener), resolve(null); @@ -801,7 +801,7 @@ }, 25e3), listener = e => { if ("SENTIENCE_SNAPSHOT_RESULT" === e.data.type && e.data.requestId === requestId) { if (resolved) return; - resolved = !0, clearTimeout(timeout), window.removeEventListener("message", listener), + resolved = !0, clearTimeout(timeout), window.removeEventListener("message", listener), e.data.error ? 
reject(new Error(e.data.error)) : resolve({ elements: e.data.elements, raw_elements: e.data.raw_elements, @@ -818,7 +818,7 @@ options: options }, "*"); } catch (error) { - resolved || (resolved = !0, clearTimeout(timeout), window.removeEventListener("message", listener), + resolved || (resolved = !0, clearTimeout(timeout), window.removeEventListener("message", listener), reject(new Error(`Failed to send snapshot request: ${error.message}`))); } }); @@ -828,7 +828,7 @@ options.screenshot && (screenshot = await function(options) { return new Promise(resolve => { const requestId = Math.random().toString(36).substring(7), listener = e => { - "SENTIENCE_SCREENSHOT_RESULT" === e.data.type && e.data.requestId === requestId && (window.removeEventListener("message", listener), + "SENTIENCE_SCREENSHOT_RESULT" === e.data.type && e.data.requestId === requestId && (window.removeEventListener("message", listener), resolve(e.data.screenshot)); }; window.addEventListener("message", listener), window.postMessage({ @@ -888,15 +888,15 @@ } if (node.nodeType !== Node.ELEMENT_NODE) return; const tag = node.tagName.toLowerCase(); - if ("h1" === tag && (markdown += "\n# "), "h2" === tag && (markdown += "\n## "), - "h3" === tag && (markdown += "\n### "), "li" === tag && (markdown += "\n- "), insideLink || "p" !== tag && "div" !== tag && "br" !== tag || (markdown += "\n"), - "strong" !== tag && "b" !== tag || (markdown += "**"), "em" !== tag && "i" !== tag || (markdown += "_"), - "a" === tag && (markdown += "[", insideLink = !0), node.shadowRoot ? 
Array.from(node.shadowRoot.childNodes).forEach(walk) : node.childNodes.forEach(walk), + if ("h1" === tag && (markdown += "\n# "), "h2" === tag && (markdown += "\n## "), + "h3" === tag && (markdown += "\n### "), "li" === tag && (markdown += "\n- "), insideLink || "p" !== tag && "div" !== tag && "br" !== tag || (markdown += "\n"), + "strong" !== tag && "b" !== tag || (markdown += "**"), "em" !== tag && "i" !== tag || (markdown += "_"), + "a" === tag && (markdown += "[", insideLink = !0), node.shadowRoot ? Array.from(node.shadowRoot.childNodes).forEach(walk) : node.childNodes.forEach(walk), "a" === tag) { const href = node.getAttribute("href"); markdown += href ? `](${href})` : "]", insideLink = !1; } - "strong" !== tag && "b" !== tag || (markdown += "**"), "em" !== tag && "i" !== tag || (markdown += "_"), + "strong" !== tag && "b" !== tag || (markdown += "**"), "em" !== tag && "i" !== tag || (markdown += "_"), insideLink || "h1" !== tag && "h2" !== tag && "h3" !== tag && "p" !== tag && "div" !== tag || (markdown += "\n"); }(tempDiv), markdown.replace(/\n{3,}/g, "\n\n").trim(); }(document.body) : function(root) { @@ -909,7 +909,7 @@ const style = window.getComputedStyle(node); if ("none" === style.display || "hidden" === style.visibility) return; const isBlock = "block" === style.display || "flex" === style.display || "P" === node.tagName || "DIV" === node.tagName; - isBlock && (text += " "), node.shadowRoot ? Array.from(node.shadowRoot.childNodes).forEach(walk) : node.childNodes.forEach(walk), + isBlock && (text += " "), node.shadowRoot ? 
Array.from(node.shadowRoot.childNodes).forEach(walk) : node.childNodes.forEach(walk), isBlock && (text += "\n"); } } else text += node.textContent; @@ -1008,25 +1008,25 @@ } function startRecording(options = {}) { const {highlightColor: highlightColor = "#ff0000", successColor: successColor = "#00ff00", autoDisableTimeout: autoDisableTimeout = 18e5, keyboardShortcut: keyboardShortcut = "Ctrl+Shift+I"} = options; - if (!window.sentience_registry || 0 === window.sentience_registry.length) return alert("Registry empty. Run `await window.sentience.snapshot()` first!"), + if (!window.sentience_registry || 0 === window.sentience_registry.length) return alert("Registry empty. Run `await window.sentience.snapshot()` first!"), () => {}; window.sentience_registry_map = new Map, window.sentience_registry.forEach((el, idx) => { el && window.sentience_registry_map.set(el, idx); }); let highlightBox = document.getElementById("sentience-highlight-box"); - highlightBox || (highlightBox = document.createElement("div"), highlightBox.id = "sentience-highlight-box", - highlightBox.style.cssText = `\n position: fixed;\n pointer-events: none;\n z-index: 2147483647;\n border: 2px solid ${highlightColor};\n background: rgba(255, 0, 0, 0.1);\n display: none;\n transition: all 0.1s ease;\n box-sizing: border-box;\n `, + highlightBox || (highlightBox = document.createElement("div"), highlightBox.id = "sentience-highlight-box", + highlightBox.style.cssText = `\n position: fixed;\n pointer-events: none;\n z-index: 2147483647;\n border: 2px solid ${highlightColor};\n background: rgba(255, 0, 0, 0.1);\n display: none;\n transition: all 0.1s ease;\n box-sizing: border-box;\n `, document.body.appendChild(highlightBox)); let recordingIndicator = document.getElementById("sentience-recording-indicator"); - recordingIndicator || (recordingIndicator = document.createElement("div"), recordingIndicator.id = "sentience-recording-indicator", - recordingIndicator.style.cssText = `\n position: fixed;\n top: 
0;\n left: 0;\n right: 0;\n height: 3px;\n background: ${highlightColor};\n z-index: 2147483646;\n pointer-events: none;\n `, + recordingIndicator || (recordingIndicator = document.createElement("div"), recordingIndicator.id = "sentience-recording-indicator", + recordingIndicator.style.cssText = `\n position: fixed;\n top: 0;\n left: 0;\n right: 0;\n height: 3px;\n background: ${highlightColor};\n z-index: 2147483646;\n pointer-events: none;\n `, document.body.appendChild(recordingIndicator)), recordingIndicator.style.display = "block"; const mouseOverHandler = e => { const el = e.target; if (!el || el === highlightBox || el === recordingIndicator) return; const rect = el.getBoundingClientRect(); - highlightBox.style.display = "block", highlightBox.style.top = rect.top + window.scrollY + "px", - highlightBox.style.left = rect.left + window.scrollX + "px", highlightBox.style.width = rect.width + "px", + highlightBox.style.display = "block", highlightBox.style.top = rect.top + window.scrollY + "px", + highlightBox.style.left = rect.left + window.scrollX + "px", highlightBox.style.width = rect.width + "px", highlightBox.style.height = rect.height + "px"; }, clickHandler = e => { e.preventDefault(), e.stopPropagation(); @@ -1103,7 +1103,7 @@ debug_snapshot: rawData }, jsonString = JSON.stringify(snippet, null, 2); navigator.clipboard.writeText(jsonString).then(() => { - highlightBox.style.border = `2px solid ${successColor}`, highlightBox.style.background = "rgba(0, 255, 0, 0.2)", + highlightBox.style.border = `2px solid ${successColor}`, highlightBox.style.background = "rgba(0, 255, 0, 0.2)", setTimeout(() => { highlightBox.style.border = `2px solid ${highlightColor}`, highlightBox.style.background = "rgba(255, 0, 0, 0.1)"; }, 500); @@ -1113,15 +1113,15 @@ }; let timeoutId = null; const stopRecording = () => { - document.removeEventListener("mouseover", mouseOverHandler, !0), document.removeEventListener("click", clickHandler, !0), - 
document.removeEventListener("keydown", keyboardHandler, !0), timeoutId && (clearTimeout(timeoutId), - timeoutId = null), highlightBox && (highlightBox.style.display = "none"), recordingIndicator && (recordingIndicator.style.display = "none"), + document.removeEventListener("mouseover", mouseOverHandler, !0), document.removeEventListener("click", clickHandler, !0), + document.removeEventListener("keydown", keyboardHandler, !0), timeoutId && (clearTimeout(timeoutId), + timeoutId = null), highlightBox && (highlightBox.style.display = "none"), recordingIndicator && (recordingIndicator.style.display = "none"), window.sentience_registry_map && window.sentience_registry_map.clear(), window.sentience_stopRecording === stopRecording && delete window.sentience_stopRecording; }, keyboardHandler = e => { - (e.ctrlKey || e.metaKey) && e.shiftKey && "I" === e.key && (e.preventDefault(), + (e.ctrlKey || e.metaKey) && e.shiftKey && "I" === e.key && (e.preventDefault(), stopRecording()); }; - return document.addEventListener("mouseover", mouseOverHandler, !0), document.addEventListener("click", clickHandler, !0), + return document.addEventListener("mouseover", mouseOverHandler, !0), document.addEventListener("click", clickHandler, !0), document.addEventListener("keydown", keyboardHandler, !0), autoDisableTimeout > 0 && (timeoutId = setTimeout(() => { stopRecording(); }, autoDisableTimeout)), window.sentience_stopRecording = stopRecording, stopRecording; @@ -1190,4 +1190,4 @@ } }), window.sentience_iframe_handler_setup = !0)); })(); -}(); \ No newline at end of file +}(); diff --git a/sentience/extension/pkg/sentience_core.js b/sentience/extension/pkg/sentience_core.js index bb9cae0..c50ad61 100644 --- a/sentience/extension/pkg/sentience_core.js +++ b/sentience/extension/pkg/sentience_core.js @@ -25,7 +25,7 @@ function __wbg_get_imports() { }, __wbg___wbindgen_bigint_get_as_i64_8fcf4ce7f1ca72a2: function(arg0, arg1) { const v = getObject(arg1), ret = "bigint" == typeof v ? 
v : void 0; - getDataViewMemory0().setBigInt64(arg0 + 8, isLikeNone(ret) ? BigInt(0) : ret, !0), + getDataViewMemory0().setBigInt64(arg0 + 8, isLikeNone(ret) ? BigInt(0) : ret, !0), getDataViewMemory0().setInt32(arg0 + 0, !isLikeNone(ret), !0); }, __wbg___wbindgen_boolean_get_bbbb1c18aa2f5e25: function(arg0) { @@ -224,7 +224,7 @@ function getArrayU8FromWasm0(ptr, len) { let cachedDataViewMemory0 = null; function getDataViewMemory0() { - return (null === cachedDataViewMemory0 || !0 === cachedDataViewMemory0.buffer.detached || void 0 === cachedDataViewMemory0.buffer.detached && cachedDataViewMemory0.buffer !== wasm.memory.buffer) && (cachedDataViewMemory0 = new DataView(wasm.memory.buffer)), + return (null === cachedDataViewMemory0 || !0 === cachedDataViewMemory0.buffer.detached || void 0 === cachedDataViewMemory0.buffer.detached && cachedDataViewMemory0.buffer !== wasm.memory.buffer) && (cachedDataViewMemory0 = new DataView(wasm.memory.buffer)), cachedDataViewMemory0; } @@ -235,7 +235,7 @@ function getStringFromWasm0(ptr, len) { let cachedUint8ArrayMemory0 = null; function getUint8ArrayMemory0() { - return null !== cachedUint8ArrayMemory0 && 0 !== cachedUint8ArrayMemory0.byteLength || (cachedUint8ArrayMemory0 = new Uint8Array(wasm.memory.buffer)), + return null !== cachedUint8ArrayMemory0 && 0 !== cachedUint8ArrayMemory0.byteLength || (cachedUint8ArrayMemory0 = new Uint8Array(wasm.memory.buffer)), cachedUint8ArrayMemory0; } @@ -264,7 +264,7 @@ function isLikeNone(x) { function passStringToWasm0(arg, malloc, realloc) { if (void 0 === realloc) { const buf = cachedTextEncoder.encode(arg), ptr = malloc(buf.length, 1) >>> 0; - return getUint8ArrayMemory0().subarray(ptr, ptr + buf.length).set(buf), WASM_VECTOR_LEN = buf.length, + return getUint8ArrayMemory0().subarray(ptr, ptr + buf.length).set(buf), WASM_VECTOR_LEN = buf.length, ptr; } let len = arg.length, ptr = malloc(len, 1) >>> 0; @@ -319,7 +319,7 @@ const cachedTextEncoder = new TextEncoder; let wasmModule, wasm, 
WASM_VECTOR_LEN = 0; function __wbg_finalize_init(instance, module) { - return wasm = instance.exports, wasmModule = module, cachedDataViewMemory0 = null, + return wasm = instance.exports, wasmModule = module, cachedDataViewMemory0 = null, cachedUint8ArrayMemory0 = null, wasm; } @@ -360,7 +360,7 @@ function initSync(module) { async function __wbg_init(module_or_path) { if (void 0 !== wasm) return wasm; - void 0 !== module_or_path && Object.getPrototypeOf(module_or_path) === Object.prototype && ({module_or_path: module_or_path} = module_or_path), + void 0 !== module_or_path && Object.getPrototypeOf(module_or_path) === Object.prototype && ({module_or_path: module_or_path} = module_or_path), void 0 === module_or_path && (module_or_path = new URL("sentience_core_bg.wasm", import.meta.url)); const imports = __wbg_get_imports(); ("string" == typeof module_or_path || "function" == typeof Request && module_or_path instanceof Request || "function" == typeof URL && module_or_path instanceof URL) && (module_or_path = fetch(module_or_path)); @@ -368,4 +368,4 @@ async function __wbg_init(module_or_path) { return __wbg_finalize_init(instance, module); } -export { initSync, __wbg_init as default }; \ No newline at end of file +export { initSync, __wbg_init as default }; diff --git a/sentience/runtime_agent.py b/sentience/runtime_agent.py new file mode 100644 index 0000000..8e2be77 --- /dev/null +++ b/sentience/runtime_agent.py @@ -0,0 +1,423 @@ +""" +AgentRuntime-backed agent with optional vision executor fallback. + +This module intentionally keeps the control plane verification-first: +- Actions may be proposed by either a structured executor (DOM snapshot prompt) + or a vision executor (screenshot prompt). +- Verification is always executed via AgentRuntime predicates. 
+""" + +from __future__ import annotations + +import base64 +import re +from dataclasses import dataclass, field +from typing import Any, Literal + +from .agent_runtime import AgentRuntime +from .backends import actions as backend_actions +from .llm_interaction_handler import LLMInteractionHandler +from .llm_provider import LLMProvider +from .models import BBox, Snapshot +from .verification import AssertContext, AssertOutcome, Predicate + + +@dataclass(frozen=True) +class StepVerification: + predicate: Predicate + label: str + required: bool = True + eventually: bool = True + timeout_s: float = 10.0 + poll_s: float = 0.25 + max_snapshot_attempts: int = 3 + min_confidence: float | None = None + + +@dataclass(frozen=True) +class RuntimeStep: + goal: str + intent: str | None = None + verifications: list[StepVerification] = field(default_factory=list) + + # Snapshot quality policy (handled at agent layer; SDK core unchanged). + snapshot_limit_base: int = 60 + snapshot_limit_step: int = 40 + snapshot_limit_max: int = 220 + max_snapshot_attempts: int = 3 + min_confidence: float | None = None + min_actionables: int | None = None + + # Vision executor fallback (bounded). 
+ vision_executor_enabled: bool = True + max_vision_executor_attempts: int = 1 + + +class RuntimeAgent: + """ + A thin orchestration layer over AgentRuntime: + - snapshot (with limit ramp) + - propose action (structured executor; optionally vision executor fallback) + - execute action (backend-agnostic primitives) + - verify (AgentRuntime predicates) + """ + + def __init__( + self, + *, + runtime: AgentRuntime, + executor: LLMProvider, + vision_executor: LLMProvider | None = None, + vision_verifier: LLMProvider | None = None, + short_circuit_canvas: bool = True, + ) -> None: + self.runtime = runtime + self.executor = executor + self.vision_executor = vision_executor + self.vision_verifier = vision_verifier + self.short_circuit_canvas = short_circuit_canvas + + self._structured_llm = LLMInteractionHandler(executor) + + async def run_step( + self, + *, + task_goal: str, + step: RuntimeStep, + ) -> bool: + self.runtime.begin_step(step.goal) + + snap = await self._snapshot_with_ramp(step=step) + + if await self._should_short_circuit_to_vision(step=step, snap=snap): + ok = await self._vision_executor_attempt(task_goal=task_goal, step=step, snap=snap) + return ok + + # 1) Structured executor attempt. + action = self._propose_structured_action(task_goal=task_goal, step=step, snap=snap) + await self._execute_action(action=action, snap=snap) + ok = await self._apply_verifications(step=step) + if ok: + return True + + # 2) Optional vision executor fallback (bounded). 
+ if step.vision_executor_enabled and step.max_vision_executor_attempts > 0: + ok2 = await self._vision_executor_attempt(task_goal=task_goal, step=step, snap=snap) + return ok2 + + return False + + async def _snapshot_with_ramp(self, *, step: RuntimeStep) -> Snapshot: + limit = step.snapshot_limit_base + last: Snapshot | None = None + + for _attempt in range(max(1, step.max_snapshot_attempts)): + last = await self.runtime.snapshot(limit=limit, goal=step.goal) + if last is None: + limit = min(step.snapshot_limit_max, limit + step.snapshot_limit_step) + continue + + if step.min_confidence is not None: + conf = getattr(getattr(last, "diagnostics", None), "confidence", None) + if isinstance(conf, (int, float)) and conf < step.min_confidence: + limit = min(step.snapshot_limit_max, limit + step.snapshot_limit_step) + continue + + if step.min_actionables is not None: + if self._count_actionables(last) < step.min_actionables: + limit = min(step.snapshot_limit_max, limit + step.snapshot_limit_step) + continue + + return last + + # If we didn't return early, use last snapshot (may be low quality). 
+ if last is None: + raise RuntimeError("snapshot() returned None repeatedly") + return last + + def _propose_structured_action(self, *, task_goal: str, step: RuntimeStep, snap: Snapshot) -> str: + dom_context = self._structured_llm.build_context(snap, step.goal) + combined_goal = f"{task_goal}\n\nSTEP: {step.goal}" + resp = self._structured_llm.query_llm(dom_context, combined_goal) + return self._structured_llm.extract_action(resp.content) + + async def _vision_executor_attempt( + self, + *, + task_goal: str, + step: RuntimeStep, + snap: Snapshot | None, + ) -> bool: + if not self.vision_executor or not self.vision_executor.supports_vision(): + return False + + url = await self._get_url_for_prompt() + image_b64 = await self._screenshot_base64_png() + system_prompt, user_prompt = self._vision_executor_prompts( + task_goal=task_goal, + step=step, + url=url, + snap=snap, + ) + + resp = self.vision_executor.generate_with_image( + system_prompt, + user_prompt, + image_b64, + temperature=0.0, + ) + + action = self._extract_action_from_text(resp.content) + await self._execute_action(action=action, snap=snap) + # Important: vision executor fallback is a *retry* of the same step. + # Clear prior step assertions so required_assertions_passed reflects the final attempt. + self.runtime.flush_assertions() + return await self._apply_verifications(step=step) + + async def _apply_verifications(self, *, step: RuntimeStep) -> bool: + if not step.verifications: + # No explicit verifications provided: treat as pass. 
+ return True + + all_ok = True + for v in step.verifications: + if v.eventually: + ok = await self.runtime.check(v.predicate, label=v.label, required=v.required).eventually( + timeout_s=v.timeout_s, + poll_s=v.poll_s, + max_snapshot_attempts=v.max_snapshot_attempts, + min_confidence=v.min_confidence, + vision_provider=self.vision_verifier, + ) + else: + ok = self.runtime.assert_(v.predicate, label=v.label, required=v.required) + all_ok = all_ok and ok + + # Respect required verifications semantics. + return self.runtime.required_assertions_passed() and all_ok + + async def _execute_action(self, *, action: str, snap: Snapshot | None) -> None: + url = None + try: + url = await self.runtime.get_url() + except Exception: + url = getattr(snap, "url", None) + + await self.runtime.record_action(action, url=url) + + # Coordinate-backed execution (by snapshot id or explicit coordinates). + kind, payload = self._parse_action(action) + + if kind == "finish": + return + + if kind == "press": + await self._press_key_best_effort(payload["key"]) + await self._stabilize_best_effort() + return + + if kind == "click_xy": + await backend_actions.click(self.runtime.backend, (payload["x"], payload["y"])) + await self._stabilize_best_effort() + return + + if kind == "click_rect": + bbox = BBox(x=payload["x"], y=payload["y"], width=payload["w"], height=payload["h"]) + await backend_actions.click(self.runtime.backend, bbox) + await self._stabilize_best_effort() + return + + if snap is None: + raise RuntimeError("Cannot execute CLICK(id)/TYPE(id, ...) 
without a snapshot") + + if kind == "click": + el = self._find_element(snap, payload["id"]) + if el is None: + raise RuntimeError(f"Element id {payload['id']} not found in snapshot") + await backend_actions.click(self.runtime.backend, el.bbox) + await self._stabilize_best_effort() + return + + if kind == "type": + el = self._find_element(snap, payload["id"]) + if el is None: + raise RuntimeError(f"Element id {payload['id']} not found in snapshot") + await backend_actions.type_text(self.runtime.backend, payload["text"], target=el.bbox) + await self._stabilize_best_effort() + return + + raise ValueError(f"Unknown action kind: {kind}") + + async def _stabilize_best_effort(self) -> None: + try: + await self.runtime.backend.wait_ready_state(state="interactive", timeout_ms=15000) + except Exception: + return + + async def _press_key_best_effort(self, key: str) -> None: + # BrowserBackend does not expose a dedicated keypress primitive; do best-effort JS events. + key_esc = key.replace("\\", "\\\\").replace("'", "\\'") + await self.runtime.backend.eval( + f""" + (() => {{ + const el = document.activeElement || document.body; + const down = new KeyboardEvent('keydown', {{key: '{key_esc}', bubbles: true}}); + const up = new KeyboardEvent('keyup', {{key: '{key_esc}', bubbles: true}}); + el.dispatchEvent(down); + el.dispatchEvent(up); + return true; + }})() + """ + ) + + async def _screenshot_base64_png(self) -> str: + png = await self.runtime.backend.screenshot_png() + return base64.b64encode(png).decode("utf-8") + + async def _get_url_for_prompt(self) -> str | None: + try: + return await self.runtime.get_url() + except Exception: + return getattr(self.runtime.last_snapshot, "url", None) + + async def _should_short_circuit_to_vision(self, *, step: RuntimeStep, snap: Snapshot | None) -> bool: + if not (step.vision_executor_enabled and self.vision_executor and self.vision_executor.supports_vision()): + return False + + if snap is None: + return True + + if step.min_actionables 
is not None and self._count_actionables(snap) < step.min_actionables: + if self.short_circuit_canvas: + try: + n_canvas = await self.runtime.backend.eval("document.querySelectorAll('canvas').length") + if isinstance(n_canvas, (int, float)) and n_canvas > 0: + return True + except Exception: + pass + + return False + + def _vision_executor_prompts( + self, + *, + task_goal: str, + step: RuntimeStep, + url: str | None, + snap: Snapshot | None, + ) -> tuple[str, str]: + # Include URL as text: screenshots generally don't include browser chrome reliably. + verify_targets = self._verification_targets_human(step.verifications) + + snapshot_summary = "" + if snap is not None: + snapshot_summary = ( + f"\n\nStructured snapshot summary:\n" + f"- url: {getattr(snap, 'url', None)}\n" + f"- elements: {len(getattr(snap, 'elements', []) or [])}\n" + ) + + system_prompt = f"""You are a vision-capable web automation executor. + +TASK GOAL: +{task_goal} + +STEP GOAL: +{step.goal} + +CURRENT URL (text): +{url or "(unknown)"} + +VERIFICATION TARGETS (text): +{verify_targets or "(none provided)"} +{snapshot_summary} + +RESPONSE FORMAT: +Return ONLY ONE of: +- CLICK(id) +- TYPE(id, "text") +- CLICK_XY(x, y) +- CLICK_RECT(x, y, w, h) +- PRESS("key") +- FINISH() + +No explanations, no markdown. 
+""" + + user_prompt = "From the screenshot, return the single best next action:" + return system_prompt, user_prompt + + def _verification_targets_human(self, verifications: list[StepVerification]) -> str: + if not verifications: + return "" + lines: list[str] = [] + for v in verifications: + req = "required" if v.required else "optional" + lines.append(f"- {v.label} ({req})") + return "\n".join(lines) + + def _count_actionables(self, snap: Snapshot) -> int: + n = 0 + for el in snap.elements or []: + cues = getattr(el, "visual_cues", None) + clickable = bool(getattr(cues, "is_clickable", False)) + if clickable: + n += 1 + return n + + def _find_element(self, snap: Snapshot, element_id: int) -> Any | None: + for el in snap.elements or []: + if getattr(el, "id", None) == element_id: + return el + return None + + def _parse_action( + self, + action: str, + ) -> tuple[Literal["click", "type", "press", "finish", "click_xy", "click_rect"], dict[str, Any]]: + action = action.strip() + + if re.match(r"FINISH\s*\(\s*\)\s*$", action, re.IGNORECASE): + return "finish", {} + + if m := re.match( + r"CLICK_XY\s*\(\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*\)\s*$", + action, + re.IGNORECASE, + ): + return "click_xy", {"x": float(m.group(1)), "y": float(m.group(2))} + + if m := re.match( + r"CLICK_RECT\s*\(\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*\)\s*$", + action, + re.IGNORECASE, + ): + return "click_rect", { + "x": float(m.group(1)), + "y": float(m.group(2)), + "w": float(m.group(3)), + "h": float(m.group(4)), + } + + if m := re.match(r"CLICK\s*\(\s*(\d+)\s*\)\s*$", action, re.IGNORECASE): + return "click", {"id": int(m.group(1))} + + if m := re.match( + r'TYPE\s*\(\s*(\d+)\s*,\s*["\']([^"\']*)["\']\s*\)\s*$', + action, + re.IGNORECASE, + ): + return "type", {"id": int(m.group(1)), "text": m.group(2)} + + if m := re.match(r'PRESS\s*\(\s*["\']([^"\']+)["\']\s*\)\s*$', action, re.IGNORECASE): + return "press", {"key": 
m.group(1)} + + raise ValueError(f"Unknown action format: {action}") + + def _extract_action_from_text(self, text: str) -> str: + # Keep consistent with LLMInteractionHandler.extract_action, but without DOM context dependency. + text = re.sub(r"```[\w]*\n?", "", text).strip() + pat = r'(CLICK_XY\s*\(\s*-?\d+(?:\.\d+)?\s*,\s*-?\d+(?:\.\d+)?\s*\)|CLICK_RECT\s*\(\s*-?\d+(?:\.\d+)?\s*,\s*-?\d+(?:\.\d+)?\s*,\s*-?\d+(?:\.\d+)?\s*,\s*-?\d+(?:\.\d+)?\s*\)|CLICK\s*\(\s*\d+\s*\)|TYPE\s*\(\s*\d+\s*,\s*["\'].*?["\']\s*\)|PRESS\s*\(\s*["\'].*?["\']\s*\)|FINISH\s*\(\s*\))' + m = re.search(pat, text, re.IGNORECASE) + return m.group(1) if m else text + diff --git a/tests/unit/test_runtime_agent.py b/tests/unit/test_runtime_agent.py new file mode 100644 index 0000000..e069b76 --- /dev/null +++ b/tests/unit/test_runtime_agent.py @@ -0,0 +1,338 @@ +from __future__ import annotations + +from unittest.mock import AsyncMock + +import pytest + +from sentience.agent_runtime import AgentRuntime +from sentience.llm_provider import LLMProvider, LLMResponse +from sentience.models import BBox, Element, Snapshot, SnapshotDiagnostics, Viewport, VisualCues +from sentience.runtime_agent import RuntimeAgent, RuntimeStep, StepVerification +from sentience.verification import AssertContext, AssertOutcome + + +class MockTracer: + def __init__(self) -> None: + self.events: list[dict] = [] + + def emit(self, event_type: str, data: dict, step_id: str | None = None) -> None: + self.events.append({"type": event_type, "data": data, "step_id": step_id}) + + +class MockBackend: + def __init__(self) -> None: + self._url = "https://example.com/start" + self.mouse_clicks: list[tuple[float, float]] = [] + self.typed: list[str] = [] + self.eval_calls: list[str] = [] + + async def get_url(self) -> str: + return self._url + + async def refresh_page_info(self): + return None + + async def eval(self, expression: str): + self.eval_calls.append(expression) + # default: no canvas + if "querySelectorAll('canvas')" in 
expression: + return 0 + return None + + async def call(self, function_declaration: str, args=None): + _ = function_declaration, args + return None + + async def get_layout_metrics(self): + return None + + async def screenshot_png(self) -> bytes: + return b"png" + + async def screenshot_jpeg(self, quality: int | None = None) -> bytes: + _ = quality + return b"jpeg" + + async def mouse_move(self, x: float, y: float) -> None: + _ = x, y + return None + + async def mouse_click(self, x: float, y: float, button="left", click_count=1) -> None: + _ = button, click_count + self.mouse_clicks.append((float(x), float(y))) + + async def wheel(self, delta_y: float, x=None, y=None) -> None: + _ = delta_y, x, y + return None + + async def type_text(self, text: str) -> None: + self.typed.append(text) + + async def wait_ready_state(self, state="interactive", timeout_ms=15000) -> None: + _ = state, timeout_ms + return None + + +class ProviderStub(LLMProvider): + def __init__(self, *, model: str = "stub", responses: list[str] | None = None): + super().__init__(model) + self._responses = responses or [] + self.calls: list[dict] = [] + + def generate(self, system_prompt: str, user_prompt: str, **kwargs) -> LLMResponse: + self.calls.append({"system": system_prompt, "user": user_prompt, "kwargs": kwargs}) + content = self._responses.pop(0) if self._responses else "FINISH()" + return LLMResponse(content=content, model_name=self.model_name) + + def supports_json_mode(self) -> bool: + return True + + @property + def model_name(self) -> str: + return self._model_name + + +class VisionProviderStub(ProviderStub): + def supports_vision(self) -> bool: + return True + + def generate_with_image(self, system_prompt: str, user_prompt: str, image_base64: str, **kwargs): + self.calls.append( + {"system": system_prompt, "user": user_prompt, "image_base64": image_base64, "kwargs": kwargs} + ) + content = self._responses.pop(0) if self._responses else "FINISH()" + return LLMResponse(content=content, 
model_name=self.model_name) + + +def make_snapshot(*, url: str, elements: list[Element], confidence: float | None = None) -> Snapshot: + diagnostics = SnapshotDiagnostics(confidence=confidence) if confidence is not None else None + return Snapshot( + status="success", + url=url, + elements=elements, + viewport=Viewport(width=1280, height=720), + diagnostics=diagnostics, + ) + + +def make_clickable_element(element_id: int) -> Element: + return Element( + id=element_id, + role="button", + text="OK", + importance=100, + bbox=BBox(x=10, y=20, width=100, height=40), + visual_cues=VisualCues(is_primary=True, is_clickable=True, background_color_name=None), + in_viewport=True, + is_occluded=False, + ) + + +@pytest.mark.asyncio +async def test_runtime_agent_structured_executor_success_no_vision_used() -> None: + backend = MockBackend() + tracer = MockTracer() + runtime = AgentRuntime(backend=backend, tracer=tracer) + + # snapshot (ramp) -> S0, then verification eventually -> S1 + s0 = make_snapshot(url="https://example.com/start", elements=[make_clickable_element(1)]) + s1 = make_snapshot(url="https://example.com/done", elements=[make_clickable_element(1)]) + + async def fake_snapshot(**_kwargs): + runtime.last_snapshot = snaps.pop(0) + return runtime.last_snapshot + + snaps = [s0, s1] + runtime.snapshot = AsyncMock(side_effect=fake_snapshot) # type: ignore[method-assign] + + executor = ProviderStub(responses=["CLICK(1)"]) + agent = RuntimeAgent(runtime=runtime, executor=executor, vision_executor=None) + + def pred(ctx: AssertContext) -> AssertOutcome: + ok = (ctx.url or "").endswith("/done") + return AssertOutcome(passed=ok, reason="" if ok else "not done", details={}) + + step = RuntimeStep( + goal="Click OK", + verifications=[ + StepVerification( + predicate=pred, + label="url_done", + required=True, + eventually=True, + timeout_s=0.1, + poll_s=0.0, + max_snapshot_attempts=1, + ) + ], + max_snapshot_attempts=1, + ) + + ok = await agent.run_step(task_goal="test", 
step=step) + assert ok is True + assert len(executor.calls) == 1 + assert backend.mouse_clicks # click happened + + +@pytest.mark.asyncio +async def test_runtime_agent_vision_executor_fallback_after_verification_fail() -> None: + backend = MockBackend() + tracer = MockTracer() + runtime = AgentRuntime(backend=backend, tracer=tracer) + + s0 = make_snapshot(url="https://example.com/start", elements=[make_clickable_element(1)]) + s1 = make_snapshot(url="https://example.com/still", elements=[make_clickable_element(1)]) + s2 = make_snapshot(url="https://example.com/done", elements=[make_clickable_element(1)]) + + async def fake_snapshot(**_kwargs): + runtime.last_snapshot = snaps.pop(0) + return runtime.last_snapshot + + # ramp -> s0, first verification -> s1 (fail), second verification -> s2 (pass) + snaps = [s0, s1, s2] + runtime.snapshot = AsyncMock(side_effect=fake_snapshot) # type: ignore[method-assign] + + executor = ProviderStub(responses=["CLICK(1)"]) + vision = VisionProviderStub(responses=["CLICK(1)"]) + agent = RuntimeAgent(runtime=runtime, executor=executor, vision_executor=vision) + + def pred(ctx: AssertContext) -> AssertOutcome: + ok = (ctx.url or "").endswith("/done") + return AssertOutcome(passed=ok, reason="" if ok else "not done", details={}) + + step = RuntimeStep( + goal="Try click; fallback if needed", + verifications=[ + StepVerification( + predicate=pred, + label="url_done", + required=True, + eventually=True, + timeout_s=0.0, + poll_s=0.0, + max_snapshot_attempts=1, + ) + ], + max_snapshot_attempts=1, + vision_executor_enabled=True, + max_vision_executor_attempts=1, + ) + + ok = await agent.run_step(task_goal="test", step=step) + assert ok is True + assert len(executor.calls) == 1 + assert len(vision.calls) == 1 + + +@pytest.mark.asyncio +async def test_snapshot_limit_ramp_increases_limit_on_low_confidence() -> None: + backend = MockBackend() + tracer = MockTracer() + runtime = AgentRuntime(backend=backend, tracer=tracer) + + s_low = 
make_snapshot(url="https://example.com/start", elements=[make_clickable_element(1)], confidence=0.1) + s_hi = make_snapshot(url="https://example.com/start", elements=[make_clickable_element(1)], confidence=0.9) + s_done = make_snapshot(url="https://example.com/done", elements=[make_clickable_element(1)]) + + seen_limits: list[int] = [] + + async def fake_snapshot(**kwargs): + if kwargs.get("limit") is not None: + seen_limits.append(int(kwargs["limit"])) + runtime.last_snapshot = snaps.pop(0) + return runtime.last_snapshot + + # ramp tries low then high; verification uses done + snaps = [s_low, s_hi, s_done] + runtime.snapshot = AsyncMock(side_effect=fake_snapshot) # type: ignore[method-assign] + + executor = ProviderStub(responses=["CLICK(1)"]) + agent = RuntimeAgent(runtime=runtime, executor=executor) + + def pred(ctx: AssertContext) -> AssertOutcome: + ok = (ctx.url or "").endswith("/done") + return AssertOutcome(passed=ok, reason="" if ok else "not done", details={}) + + step = RuntimeStep( + goal="ramp snapshot", + min_confidence=0.7, + snapshot_limit_base=60, + snapshot_limit_step=40, + snapshot_limit_max=220, + max_snapshot_attempts=2, + verifications=[ + StepVerification( + predicate=pred, + label="url_done", + required=True, + eventually=True, + timeout_s=0.1, + poll_s=0.0, + max_snapshot_attempts=1, + ) + ], + ) + + ok = await agent.run_step(task_goal="test", step=step) + assert ok is True + assert seen_limits[:2] == [60, 100] + + +@pytest.mark.asyncio +async def test_short_circuit_to_vision_on_canvas_and_low_actionables() -> None: + backend = MockBackend() + + async def eval_canvas(expression: str): + backend.eval_calls.append(expression) + if "querySelectorAll('canvas')" in expression: + return 1 + return None + + backend.eval = eval_canvas # type: ignore[method-assign] + + tracer = MockTracer() + runtime = AgentRuntime(backend=backend, tracer=tracer) + + s0 = make_snapshot(url="https://example.com/start", elements=[]) # no actionables + s1 = 
make_snapshot(url="https://example.com/done", elements=[]) + + async def fake_snapshot(**_kwargs): + runtime.last_snapshot = snaps.pop(0) + return runtime.last_snapshot + + snaps = [s0, s1] + runtime.snapshot = AsyncMock(side_effect=fake_snapshot) # type: ignore[method-assign] + + executor = ProviderStub(responses=["CLICK(999)"]) # should NOT be called + vision = VisionProviderStub(responses=["CLICK_XY(100, 200)"]) + agent = RuntimeAgent(runtime=runtime, executor=executor, vision_executor=vision, short_circuit_canvas=True) + + def pred(ctx: AssertContext) -> AssertOutcome: + ok = (ctx.url or "").endswith("/done") + return AssertOutcome(passed=ok, reason="" if ok else "not done", details={}) + + step = RuntimeStep( + goal="canvas step", + min_actionables=1, + max_snapshot_attempts=1, + verifications=[ + StepVerification( + predicate=pred, + label="url_done", + required=True, + eventually=True, + timeout_s=0.1, + poll_s=0.0, + max_snapshot_attempts=1, + ) + ], + vision_executor_enabled=True, + max_vision_executor_attempts=1, + ) + + ok = await agent.run_step(task_goal="test", step=step) + assert ok is True + assert len(executor.calls) == 0 + assert len(vision.calls) == 1 + assert backend.mouse_clicks == [(100.0, 200.0)] + From 3582cd6d2337f560f9a04fddb8ed48ab9a7f6f93 Mon Sep 17 00:00:00 2001 From: SentienceDEV Date: Wed, 21 Jan 2026 18:22:37 -0800 Subject: [PATCH 2/2] correction --- sentience/agent_runtime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sentience/agent_runtime.py b/sentience/agent_runtime.py index bac9837..01ba2ff 100644 --- a/sentience/agent_runtime.py +++ b/sentience/agent_runtime.py @@ -582,7 +582,7 @@ def assert_done( True if task is complete (assertion passed), False otherwise """ # Convenience wrapper for assert_ with required=True - ok = self.assertTrue(predicate, label=label, required=True) + ok = self.assert_(predicate, label=label, required=True) if ok: self._task_done = True self._task_done_label = label