From fc83a3ec16b73920b1764c3300d6d1eec5e390a6 Mon Sep 17 00:00:00 2001 From: SentienceDEV Date: Thu, 22 Jan 2026 17:01:40 -0800 Subject: [PATCH 1/3] add verification/assertion results to trace --- sentience/agent.py | 31 ++++++++++++++++++++ sentience/integrations/langchain/core.py | 1 + sentience/integrations/pydanticai/toolset.py | 1 + sentience/trace_event_builder.py | 7 +++++ sentience/visual_agent.py | 10 +++++++ 5 files changed, 50 insertions(+) diff --git a/sentience/agent.py b/sentience/agent.py index d4a9c0c..2f7f4ac 100644 --- a/sentience/agent.py +++ b/sentience/agent.py @@ -143,6 +143,25 @@ def _compute_hash(self, text: str) -> str: """Compute SHA256 hash of text.""" return hashlib.sha256(text.encode("utf-8")).hexdigest() + def _best_effort_post_snapshot_digest(self, goal: str) -> str | None: + """ + Best-effort post-action snapshot digest for tracing. + """ + try: + snap_opts = SnapshotOptions( + limit=min(10, self.default_snapshot_limit), + goal=f"{goal} (post)", + ) + snap_opts.screenshot = False + snap_opts.show_overlay = self.config.show_overlay if self.config else None + post_snap = snapshot(self.browser, snap_opts) + if post_snap.status != "success": + return None + digest_input = f"{post_snap.url}{post_snap.timestamp}" + return f"sha256:{self._compute_hash(digest_input)}" + except Exception: + return None + def _get_element_bbox(self, element_id: int | None, snap: Snapshot) -> dict[str, float] | None: """Get bounding box for an element from snapshot.""" if element_id is None: @@ -513,6 +532,10 @@ def act( # noqa: C901 snapshot_event_data = TraceEventBuilder.build_snapshot_event(snap_with_diff) pre_elements = snapshot_event_data.get("elements", []) + post_snapshot_digest = ( + self._best_effort_post_snapshot_digest(goal) if self.tracer else None + ) + # Build complete step_end event step_end_data = TraceEventBuilder.build_step_end_event( step_id=step_id, @@ -522,6 +545,7 @@ def act( # noqa: C901 pre_url=pre_url, post_url=post_url, snapshot_digest=snapshot_digest, + post_snapshot_digest=post_snapshot_digest, llm_data=llm_data, exec_data=exec_data, verify_data=verify_data, @@ -601,6 +625,7 @@ def act( # noqa: C901 pre_url=_step_pre_url, post_url=post_url, snapshot_digest=snapshot_digest, + post_snapshot_digest=None, llm_data=llm_data, exec_data=exec_data, verify_data=None, @@ -1155,6 +1180,10 @@ async def act( # noqa: C901 snapshot_event_data = TraceEventBuilder.build_snapshot_event(snap_with_diff) pre_elements = snapshot_event_data.get("elements", []) + post_snapshot_digest = ( + self._best_effort_post_snapshot_digest(goal) if self.tracer else None + ) + # Build complete step_end event step_end_data = TraceEventBuilder.build_step_end_event( step_id=step_id, @@ -1164,6 +1193,7 @@ async def act( # noqa: C901 pre_url=pre_url, post_url=post_url, snapshot_digest=snapshot_digest, + post_snapshot_digest=post_snapshot_digest, llm_data=llm_data, exec_data=exec_data, verify_data=verify_data, @@ -1243,6 +1273,7 @@ async def act( # noqa: C901 pre_url=_step_pre_url, post_url=post_url, snapshot_digest=snapshot_digest, + post_snapshot_digest=None, llm_data=llm_data, exec_data=exec_data, verify_data=None, diff --git a/sentience/integrations/langchain/core.py b/sentience/integrations/langchain/core.py index ea24073..28280fa 100644 --- a/sentience/integrations/langchain/core.py +++ b/sentience/integrations/langchain/core.py @@ -113,6 +113,7 @@ async def _trace(self, tool_name: str, exec_coro, exec_meta: dict[str, Any]): pre_url=pre_url or "", post_url=post_url or "", snapshot_digest=None, + post_snapshot_digest=None, llm_data={}, exec_data=exec_data, verify_data=verify_data, diff --git a/sentience/integrations/pydanticai/toolset.py b/sentience/integrations/pydanticai/toolset.py index 033f606..1e8e82a 100644 --- a/sentience/integrations/pydanticai/toolset.py +++ b/sentience/integrations/pydanticai/toolset.py @@ -126,6 +126,7 @@ async def _trace_tool_call(ctx: Any, tool_name: str, exec_coro, exec_meta: dict[ pre_url=pre_url or "", post_url=post_url or "", snapshot_digest=None, + post_snapshot_digest=None, llm_data={}, exec_data=exec_data, verify_data=verify_data, diff --git a/sentience/trace_event_builder.py b/sentience/trace_event_builder.py index 8b5b911..372b209 100644 --- a/sentience/trace_event_builder.py +++ b/sentience/trace_event_builder.py @@ -94,6 +94,7 @@ def build_step_end_event( verify_data: dict[str, Any], pre_elements: list[dict[str, Any]] | None = None, assertions: list[dict[str, Any]] | None = None, + post_snapshot_digest: str | None = None, ) -> dict[str, Any]: """ Build step_end trace event data. @@ -106,6 +107,7 @@ def build_step_end_event( pre_url: URL before action execution post_url: URL after action execution snapshot_digest: Digest of snapshot before action + post_snapshot_digest: Digest of snapshot after action (optional) llm_data: LLM interaction data exec_data: Action execution data verify_data: Verification data @@ -153,6 +155,11 @@ def build_step_end_event( "exec": exec_data, "post": { "url": post_url, + **( + {"snapshot_digest": post_snapshot_digest} + if post_snapshot_digest + else {} + ), }, "verify": final_verify_data, } diff --git a/sentience/visual_agent.py b/sentience/visual_agent.py index 60c3851..12bb323 100644 --- a/sentience/visual_agent.py +++ b/sentience/visual_agent.py @@ -1101,6 +1101,10 @@ async def act( }, } + post_snapshot_digest = ( + self._best_effort_post_snapshot_digest(goal) if self.tracer else None + ) + # Build complete step_end event step_end_data = TraceEventBuilder.build_step_end_event( step_id=step_id, @@ -1110,6 +1114,7 @@ async def act( pre_url=pre_url, post_url=post_url or pre_url, snapshot_digest=snapshot_digest, + post_snapshot_digest=post_snapshot_digest, llm_data={ "response_text": llm_response_text, "response_hash": f"sha256:{self._compute_hash(llm_response_text)}", @@ -2011,6 +2016,10 @@ def act( }, } + post_snapshot_digest = ( + self._best_effort_post_snapshot_digest(goal) if self.tracer else None + ) + # Build complete step_end event step_end_data = TraceEventBuilder.build_step_end_event( step_id=step_id, @@ -2020,6 +2029,7 @@ def act( pre_url=pre_url, post_url=post_url or pre_url, snapshot_digest=snapshot_digest, + post_snapshot_digest=post_snapshot_digest, llm_data={ "response_text": llm_response_text, "response_hash": f"sha256:{self._compute_hash(llm_response_text)}", From 355e72df5e3d6a6c6b9c621a7d46389b4b5f9a6c Mon Sep 17 00:00:00 2001 From: SentienceDEV Date: Thu, 22 Jan 2026 19:26:58 -0800 Subject: [PATCH 2/3] verification payment step_end in agent runtime --- sentience/agent_runtime.py | 130 +++++++++++++++++++++++++++++++++++++ sentience/runtime_agent.py | 64 +++++++++++++----- 2 files changed, 177 insertions(+), 17 deletions(-) diff --git a/sentience/agent_runtime.py b/sentience/agent_runtime.py index e0d8032..2852699 100644 --- a/sentience/agent_runtime.py +++ b/sentience/agent_runtime.py @@ -65,6 +65,7 @@ import asyncio import difflib +import hashlib import time from dataclasses import dataclass from typing import TYPE_CHECKING, Any @@ -72,6 +73,7 @@ from .captcha import CaptchaContext, CaptchaHandlingError, CaptchaOptions, CaptchaResolution from .failure_artifacts import FailureArtifactBuffer, FailureArtifactsOptions from .models import Snapshot, SnapshotOptions +from .trace_event_builder import TraceEventBuilder from .verification import AssertContext, AssertOutcome, Predicate if TYPE_CHECKING: @@ -138,6 +140,8 @@ def __init__( # Snapshot state self.last_snapshot: Snapshot | None = None + self._step_pre_snapshot: Snapshot | None = None + self._step_pre_url: str | None = None # Failure artifacts (Phase 1) self._artifact_buffer: FailureArtifactBuffer | None = None @@ -148,6 +152,12 @@ def __init__( # Assertions accumulated during current step self._assertions_this_step: list[dict[str, Any]] = [] + self._step_goal: str | None = None + self._last_action: str | None = None + self._last_action_error: str | None = None + self._last_action_outcome: str | None = None + self._last_action_duration_ms: int | None = None + self._last_action_success: bool | None = None # Task completion tracking self._task_done: bool = False @@ -250,6 +260,11 @@ async def snapshot(self, **kwargs: Any) -> Snapshot: # Check if using legacy browser (backward compat) if hasattr(self, "_legacy_browser") and hasattr(self, "_legacy_page"): self.last_snapshot = await self._legacy_browser.snapshot(self._legacy_page, **kwargs) + if self.last_snapshot is not None: + self._cached_url = self.last_snapshot.url + if self._step_pre_snapshot is None: + self._step_pre_snapshot = self.last_snapshot + self._step_pre_url = self.last_snapshot.url return self.last_snapshot # Use backend-agnostic snapshot @@ -262,6 +277,11 @@ async def snapshot(self, **kwargs: Any) -> Snapshot: options = SnapshotOptions(**options_dict) self.last_snapshot = await backend_snapshot(self.backend, options=options) + if self.last_snapshot is not None: + self._cached_url = self.last_snapshot.url + if self._step_pre_snapshot is None: + self._step_pre_snapshot = self.last_snapshot + self._step_pre_url = self.last_snapshot.url if not skip_captcha_handling: await self._handle_captcha_if_needed(self.last_snapshot, source="gateway") return self.last_snapshot @@ -414,6 +434,7 @@ async def record_action( """ Record an action in the artifact timeline and capture a frame if enabled. """ + self._last_action = action if not self._artifact_buffer: return self._artifact_buffer.record_step( @@ -425,6 +446,107 @@ async def record_action( if self._artifact_buffer.options.capture_on_action: await self._capture_artifact_frame() + def _compute_snapshot_digest(self, snap: Snapshot | None) -> str | None: + if snap is None: + return None + try: + return ( + "sha256:" + + hashlib.sha256(f"{snap.url}{snap.timestamp}".encode("utf-8")).hexdigest() + ) + except Exception: + return None + + async def emit_step_end( + self, + *, + action: str | None = None, + success: bool | None = None, + error: str | None = None, + outcome: str | None = None, + duration_ms: int | None = None, + attempt: int = 0, + verify_passed: bool | None = None, + verify_signals: dict[str, Any] | None = None, + post_url: str | None = None, + post_snapshot_digest: str | None = None, + ) -> dict[str, Any]: + """ + Emit a step_end event using TraceEventBuilder. + """ + goal = self._step_goal or "" + pre_snap = self._step_pre_snapshot or self.last_snapshot + pre_url = ( + self._step_pre_url + or (pre_snap.url if pre_snap else None) + or self._cached_url + or "" + ) + + if post_url is None: + try: + post_url = await self.get_url() + except Exception: + post_url = ( + (self.last_snapshot.url if self.last_snapshot else None) or self._cached_url + ) + post_url = post_url or pre_url + + pre_digest = self._compute_snapshot_digest(pre_snap) + post_digest = post_snapshot_digest or self._compute_snapshot_digest(self.last_snapshot) + url_changed = bool(pre_url and post_url and str(pre_url) != str(post_url)) + + assertions_data = self.get_assertions_for_step_end() + assertions = assertions_data.get("assertions") or [] + + signals = dict(verify_signals or {}) + signals.setdefault("url_changed", url_changed) + if error and "error" not in signals: + signals["error"] = error + + passed = ( + bool(verify_passed) + if verify_passed is not None + else self.required_assertions_passed() + ) + + exec_success = bool(success) if success is not None else bool( + self._last_action_success if self._last_action_success is not None else passed + ) + + exec_data: dict[str, Any] = { + "success": exec_success, + "action": action or self._last_action or "unknown", + "outcome": outcome or self._last_action_outcome or "", + } + if duration_ms is not None: + exec_data["duration_ms"] = int(duration_ms) + if error: + exec_data["error"] = error + + verify_data = { + "passed": bool(passed), + "signals": signals, + } + + step_end_data = TraceEventBuilder.build_step_end_event( + step_id=self.step_id or "", + step_index=int(self.step_index), + goal=goal, + attempt=int(attempt), + pre_url=str(pre_url or ""), + post_url=str(post_url or ""), + snapshot_digest=pre_digest, + llm_data={}, + exec_data=exec_data, + verify_data=verify_data, + pre_elements=None, + assertions=assertions, + post_snapshot_digest=post_digest, + ) + self.tracer.emit("step_end", step_end_data, step_id=self.step_id) + return step_end_data + async def _capture_artifact_frame(self) -> None: if not self._artifact_buffer: return @@ -511,6 +633,14 @@ def begin_step(self, goal: str, step_index: int | None = None) -> str: """ # Clear previous step state self._assertions_this_step = [] + self._step_pre_snapshot = None + self._step_pre_url = None + self._step_goal = goal + self._last_action = None + self._last_action_error = None + self._last_action_outcome = None + self._last_action_duration_ms = None + self._last_action_success = None # Update step index if step_index is not None: diff --git a/sentience/runtime_agent.py b/sentience/runtime_agent.py index 231f5e3..13dbbea 100644 --- a/sentience/runtime_agent.py +++ b/sentience/runtime_agent.py @@ -86,26 +86,56 @@ async def run_step( step: RuntimeStep, ) -> bool: self.runtime.begin_step(step.goal) + emitted = False + ok = False + try: + snap = await self._snapshot_with_ramp(step=step) - snap = await self._snapshot_with_ramp(step=step) - - if await self._should_short_circuit_to_vision(step=step, snap=snap): - ok = await self._vision_executor_attempt(task_goal=task_goal, step=step, snap=snap) - return ok - - # 1) Structured executor attempt. - action = self._propose_structured_action(task_goal=task_goal, step=step, snap=snap) - await self._execute_action(action=action, snap=snap) - ok = await self._apply_verifications(step=step) - if ok: - return True + if await self._should_short_circuit_to_vision(step=step, snap=snap): + ok = await self._vision_executor_attempt( + task_goal=task_goal, step=step, snap=snap + ) + return ok - # 2) Optional vision executor fallback (bounded). - if step.vision_executor_enabled and step.max_vision_executor_attempts > 0: - ok2 = await self._vision_executor_attempt(task_goal=task_goal, step=step, snap=snap) - return ok2 + # 1) Structured executor attempt. + action = self._propose_structured_action( + task_goal=task_goal, step=step, snap=snap + ) + await self._execute_action(action=action, snap=snap) + ok = await self._apply_verifications(step=step) + if ok: + return True + + # 2) Optional vision executor fallback (bounded). + if step.vision_executor_enabled and step.max_vision_executor_attempts > 0: + ok = await self._vision_executor_attempt( + task_goal=task_goal, step=step, snap=snap + ) + return ok - return False + return False + except Exception as exc: + try: + await self.runtime.emit_step_end( + success=False, + error=str(exc), + outcome="exception", + verify_passed=False, + ) + emitted = True + except Exception: + pass + raise + finally: + if not emitted: + try: + await self.runtime.emit_step_end( + success=ok, + outcome=("ok" if ok else "verification_failed"), + verify_passed=ok, + ) + except Exception: + pass async def _snapshot_with_ramp(self, *, step: RuntimeStep) -> Snapshot: limit = step.snapshot_limit_base From 3c5c904383370987e1f6db90ebe779a3fd48e91b Mon Sep 17 00:00:00 2001 From: SentienceDEV Date: Thu, 22 Jan 2026 19:37:03 -0800 Subject: [PATCH 3/3] update models with modal detection --- sentience/models.py | 9 ++++ sentience/runtime_agent.py | 12 ++--- sentience/trace_event_builder.py | 6 +-- tests/test_grid_bounds.py | 91 ++++++++++++++++++++++++++++++++ 4 files changed, 104 insertions(+), 14 deletions(-) diff --git a/sentience/models.py b/sentience/models.py index 34a947c..2daef70 100644 --- a/sentience/models.py +++ b/sentience/models.py @@ -144,6 +144,12 @@ class GridInfo(BaseModel): ) is_dominant: bool = False # Whether this grid is the dominant group (main content area) + # Z-index and modal detection fields (from gateway/sentience-core) + z_index: int = 0 # Z-index of this grid (max among elements in this grid) + z_index_max: int = 0 # Global max z-index across ALL grids (for comparison) + blocks_interaction: bool = False # Whether this grid blocks interaction with content behind it + viewport_coverage: float = 0.0 # Ratio of grid area to viewport area (0.0-1.0) + class Snapshot(BaseModel): """Snapshot response from extension""" @@ -161,6 +167,9 @@ class Snapshot(BaseModel): dominant_group_key: str | None = None # The most common group_key (main content group) # Phase 2: Runtime stability/debug info (confidence/reasons/metrics) diagnostics: SnapshotDiagnostics | None = None + # Modal detection fields (from gateway) + modal_detected: bool | None = None # True if a modal/overlay grid was detected + modal_grids: list[GridInfo] | None = None # Array of GridInfo for detected modal grids def save(self, filepath: str) -> None: """Save snapshot as JSON file""" diff --git a/sentience/runtime_agent.py b/sentience/runtime_agent.py index 13dbbea..3c107e2 100644 --- a/sentience/runtime_agent.py +++ b/sentience/runtime_agent.py @@ -92,15 +92,11 @@ async def run_step( snap = await self._snapshot_with_ramp(step=step) if await self._should_short_circuit_to_vision(step=step, snap=snap): - ok = await self._vision_executor_attempt( - task_goal=task_goal, step=step, snap=snap - ) + ok = await self._vision_executor_attempt(task_goal=task_goal, step=step, snap=snap) return ok # 1) Structured executor attempt. - action = self._propose_structured_action( - task_goal=task_goal, step=step, snap=snap - ) + action = self._propose_structured_action(task_goal=task_goal, step=step, snap=snap) await self._execute_action(action=action, snap=snap) ok = await self._apply_verifications(step=step) if ok: @@ -108,9 +104,7 @@ async def run_step( # 2) Optional vision executor fallback (bounded). if step.vision_executor_enabled and step.max_vision_executor_attempts > 0: - ok = await self._vision_executor_attempt( - task_goal=task_goal, step=step, snap=snap - ) + ok = await self._vision_executor_attempt(task_goal=task_goal, step=step, snap=snap) return ok return False diff --git a/sentience/trace_event_builder.py b/sentience/trace_event_builder.py index 372b209..f6156c8 100644 --- a/sentience/trace_event_builder.py +++ b/sentience/trace_event_builder.py @@ -155,11 +155,7 @@ def build_step_end_event( "exec": exec_data, "post": { "url": post_url, - **( - {"snapshot_digest": post_snapshot_digest} - if post_snapshot_digest - else {} - ), + **({"snapshot_digest": post_snapshot_digest} if post_snapshot_digest else {}), }, "verify": final_verify_data, } diff --git a/tests/test_grid_bounds.py b/tests/test_grid_bounds.py index 93bb526..e9c8ca1 100644 --- a/tests/test_grid_bounds.py +++ b/tests/test_grid_bounds.py @@ -348,3 +348,94 @@ def test_sorted_by_grid_id(self): assert result[0].grid_id == 0 assert result[1].grid_id == 1 assert result[2].grid_id == 2 + + +class TestGridInfoModalFields: + """Tests for GridInfo z-index and modal detection fields""" + + def test_grid_info_default_values(self): + """Test that GridInfo has correct default values for new fields""" + grid_info = GridInfo( + grid_id=0, + bbox=BBox(x=0, y=0, width=100, height=100), + row_count=1, + col_count=1, + item_count=1, + ) + # New optional fields should have defaults + assert grid_info.z_index == 0 + assert grid_info.z_index_max == 0 + assert grid_info.blocks_interaction is False + assert grid_info.viewport_coverage == 0.0 + + def test_grid_info_with_modal_fields(self): + """Test creating GridInfo with modal detection fields""" + grid_info = GridInfo( + grid_id=1, + bbox=BBox(x=100, y=100, width=500, height=400), + row_count=2, + col_count=3, + item_count=6, + confidence=0.95, + z_index=1000, + z_index_max=1000, + blocks_interaction=True, + viewport_coverage=0.25, + ) + assert grid_info.z_index == 1000 + assert grid_info.z_index_max == 1000 + assert grid_info.blocks_interaction is True + assert grid_info.viewport_coverage == 0.25 + + +class TestSnapshotModalFields: + """Tests for Snapshot modal detection fields""" + + def test_snapshot_without_modal(self): + """Test snapshot with no modal detected""" + snapshot = Snapshot( + status="success", + url="https://example.com", + elements=[], + ) + # modal_detected and modal_grids should be None by default + assert snapshot.modal_detected is None + assert snapshot.modal_grids is None + + def test_snapshot_with_modal_detected(self): + """Test snapshot with modal detected""" + modal_grid = GridInfo( + grid_id=1, + bbox=BBox(x=200, y=150, width=600, height=400), + row_count=1, + col_count=2, + item_count=5, + z_index=1000, + z_index_max=1000, + blocks_interaction=True, + viewport_coverage=0.20, + ) + snapshot = Snapshot( + status="success", + url="https://example.com", + elements=[], + modal_detected=True, + modal_grids=[modal_grid], + ) + assert snapshot.modal_detected is True + assert snapshot.modal_grids is not None + assert len(snapshot.modal_grids) == 1 + assert snapshot.modal_grids[0].z_index == 1000 + assert snapshot.modal_grids[0].blocks_interaction is True + + def test_snapshot_modal_false(self): + """Test snapshot with modal_detected explicitly False""" + snapshot = Snapshot( + status="success", + url="https://example.com", + elements=[], + modal_detected=False, + modal_grids=None, + ) + assert snapshot.modal_detected is False + assert snapshot.modal_grids is None