def _best_effort_post_snapshot_digest(self, goal: str) -> str | None:
    """
    Take a small post-action snapshot and return a digest of it for tracing.

    The digest is ``"sha256:"`` followed by ``self._compute_hash`` applied to
    the snapshot URL immediately followed by the snapshot timestamp.

    Args:
        goal: Goal text of the current step; it is passed to the snapshot
            options with a ``" (post)"`` suffix.

    Returns:
        The ``sha256:<hex>`` string, or ``None`` when the snapshot status is
        not ``"success"`` or when anything raises along the way.
    """
    try:
        options = SnapshotOptions(
            limit=min(10, self.default_snapshot_limit),
            goal=f"{goal} (post)",
        )
        # No screenshot is requested for this tracing-only snapshot.
        options.screenshot = False
        if self.config:
            options.show_overlay = self.config.show_overlay
        else:
            options.show_overlay = None

        result = snapshot(self.browser, options)
        if result.status == "success":
            fingerprint = self._compute_hash(f"{result.url}{result.timestamp}")
            return f"sha256:{fingerprint}"
        return None
    except Exception:
        # Best-effort by design: a failed post-snapshot must not fail the step.
        return None
TraceEventBuilder.build_snapshot_event(snap_with_diff) pre_elements = snapshot_event_data.get("elements", []) + post_snapshot_digest = ( + self._best_effort_post_snapshot_digest(goal) if self.tracer else None + ) + # Build complete step_end event step_end_data = TraceEventBuilder.build_step_end_event( step_id=step_id, @@ -1164,6 +1193,7 @@ async def act( # noqa: C901 pre_url=pre_url, post_url=post_url, snapshot_digest=snapshot_digest, + post_snapshot_digest=post_snapshot_digest, llm_data=llm_data, exec_data=exec_data, verify_data=verify_data, @@ -1243,6 +1273,7 @@ async def act( # noqa: C901 pre_url=_step_pre_url, post_url=post_url, snapshot_digest=snapshot_digest, + post_snapshot_digest=None, llm_data=llm_data, exec_data=exec_data, verify_data=None, diff --git a/sentience/agent_runtime.py b/sentience/agent_runtime.py index e0d8032..2852699 100644 --- a/sentience/agent_runtime.py +++ b/sentience/agent_runtime.py @@ -65,6 +65,7 @@ import asyncio import difflib +import hashlib import time from dataclasses import dataclass from typing import TYPE_CHECKING, Any @@ -72,6 +73,7 @@ from .captcha import CaptchaContext, CaptchaHandlingError, CaptchaOptions, CaptchaResolution from .failure_artifacts import FailureArtifactBuffer, FailureArtifactsOptions from .models import Snapshot, SnapshotOptions +from .trace_event_builder import TraceEventBuilder from .verification import AssertContext, AssertOutcome, Predicate if TYPE_CHECKING: @@ -138,6 +140,8 @@ def __init__( # Snapshot state self.last_snapshot: Snapshot | None = None + self._step_pre_snapshot: Snapshot | None = None + self._step_pre_url: str | None = None # Failure artifacts (Phase 1) self._artifact_buffer: FailureArtifactBuffer | None = None @@ -148,6 +152,12 @@ def __init__( # Assertions accumulated during current step self._assertions_this_step: list[dict[str, Any]] = [] + self._step_goal: str | None = None + self._last_action: str | None = None + self._last_action_error: str | None = None + 
self._last_action_outcome: str | None = None + self._last_action_duration_ms: int | None = None + self._last_action_success: bool | None = None # Task completion tracking self._task_done: bool = False @@ -250,6 +260,11 @@ async def snapshot(self, **kwargs: Any) -> Snapshot: # Check if using legacy browser (backward compat) if hasattr(self, "_legacy_browser") and hasattr(self, "_legacy_page"): self.last_snapshot = await self._legacy_browser.snapshot(self._legacy_page, **kwargs) + if self.last_snapshot is not None: + self._cached_url = self.last_snapshot.url + if self._step_pre_snapshot is None: + self._step_pre_snapshot = self.last_snapshot + self._step_pre_url = self.last_snapshot.url return self.last_snapshot # Use backend-agnostic snapshot @@ -262,6 +277,11 @@ async def snapshot(self, **kwargs: Any) -> Snapshot: options = SnapshotOptions(**options_dict) self.last_snapshot = await backend_snapshot(self.backend, options=options) + if self.last_snapshot is not None: + self._cached_url = self.last_snapshot.url + if self._step_pre_snapshot is None: + self._step_pre_snapshot = self.last_snapshot + self._step_pre_url = self.last_snapshot.url if not skip_captcha_handling: await self._handle_captcha_if_needed(self.last_snapshot, source="gateway") return self.last_snapshot @@ -414,6 +434,7 @@ async def record_action( """ Record an action in the artifact timeline and capture a frame if enabled. 
""" + self._last_action = action if not self._artifact_buffer: return self._artifact_buffer.record_step( @@ -425,6 +446,107 @@ async def record_action( if self._artifact_buffer.options.capture_on_action: await self._capture_artifact_frame() + def _compute_snapshot_digest(self, snap: Snapshot | None) -> str | None: + if snap is None: + return None + try: + return ( + "sha256:" + + hashlib.sha256(f"{snap.url}{snap.timestamp}".encode("utf-8")).hexdigest() + ) + except Exception: + return None + + async def emit_step_end( + self, + *, + action: str | None = None, + success: bool | None = None, + error: str | None = None, + outcome: str | None = None, + duration_ms: int | None = None, + attempt: int = 0, + verify_passed: bool | None = None, + verify_signals: dict[str, Any] | None = None, + post_url: str | None = None, + post_snapshot_digest: str | None = None, + ) -> dict[str, Any]: + """ + Emit a step_end event using TraceEventBuilder. + """ + goal = self._step_goal or "" + pre_snap = self._step_pre_snapshot or self.last_snapshot + pre_url = ( + self._step_pre_url + or (pre_snap.url if pre_snap else None) + or self._cached_url + or "" + ) + + if post_url is None: + try: + post_url = await self.get_url() + except Exception: + post_url = ( + (self.last_snapshot.url if self.last_snapshot else None) or self._cached_url + ) + post_url = post_url or pre_url + + pre_digest = self._compute_snapshot_digest(pre_snap) + post_digest = post_snapshot_digest or self._compute_snapshot_digest(self.last_snapshot) + url_changed = bool(pre_url and post_url and str(pre_url) != str(post_url)) + + assertions_data = self.get_assertions_for_step_end() + assertions = assertions_data.get("assertions") or [] + + signals = dict(verify_signals or {}) + signals.setdefault("url_changed", url_changed) + if error and "error" not in signals: + signals["error"] = error + + passed = ( + bool(verify_passed) + if verify_passed is not None + else self.required_assertions_passed() + ) + + exec_success = 
bool(success) if success is not None else bool( + self._last_action_success if self._last_action_success is not None else passed + ) + + exec_data: dict[str, Any] = { + "success": exec_success, + "action": action or self._last_action or "unknown", + "outcome": outcome or self._last_action_outcome or "", + } + if duration_ms is not None: + exec_data["duration_ms"] = int(duration_ms) + if error: + exec_data["error"] = error + + verify_data = { + "passed": bool(passed), + "signals": signals, + } + + step_end_data = TraceEventBuilder.build_step_end_event( + step_id=self.step_id or "", + step_index=int(self.step_index), + goal=goal, + attempt=int(attempt), + pre_url=str(pre_url or ""), + post_url=str(post_url or ""), + snapshot_digest=pre_digest, + llm_data={}, + exec_data=exec_data, + verify_data=verify_data, + pre_elements=None, + assertions=assertions, + post_snapshot_digest=post_digest, + ) + self.tracer.emit("step_end", step_end_data, step_id=self.step_id) + return step_end_data + async def _capture_artifact_frame(self) -> None: if not self._artifact_buffer: return @@ -511,6 +633,14 @@ def begin_step(self, goal: str, step_index: int | None = None) -> str: """ # Clear previous step state self._assertions_this_step = [] + self._step_pre_snapshot = None + self._step_pre_url = None + self._step_goal = goal + self._last_action = None + self._last_action_error = None + self._last_action_outcome = None + self._last_action_duration_ms = None + self._last_action_success = None # Update step index if step_index is not None: diff --git a/sentience/integrations/langchain/core.py b/sentience/integrations/langchain/core.py index ea24073..28280fa 100644 --- a/sentience/integrations/langchain/core.py +++ b/sentience/integrations/langchain/core.py @@ -113,6 +113,7 @@ async def _trace(self, tool_name: str, exec_coro, exec_meta: dict[str, Any]): pre_url=pre_url or "", post_url=post_url or "", snapshot_digest=None, + post_snapshot_digest=None, llm_data={}, exec_data=exec_data, 
verify_data=verify_data, diff --git a/sentience/integrations/pydanticai/toolset.py b/sentience/integrations/pydanticai/toolset.py index 033f606..1e8e82a 100644 --- a/sentience/integrations/pydanticai/toolset.py +++ b/sentience/integrations/pydanticai/toolset.py @@ -126,6 +126,7 @@ async def _trace_tool_call(ctx: Any, tool_name: str, exec_coro, exec_meta: dict[ pre_url=pre_url or "", post_url=post_url or "", snapshot_digest=None, + post_snapshot_digest=None, llm_data={}, exec_data=exec_data, verify_data=verify_data, diff --git a/sentience/models.py b/sentience/models.py index 34a947c..2daef70 100644 --- a/sentience/models.py +++ b/sentience/models.py @@ -144,6 +144,12 @@ class GridInfo(BaseModel): ) is_dominant: bool = False # Whether this grid is the dominant group (main content area) + # Z-index and modal detection fields (from gateway/sentience-core) + z_index: int = 0 # Z-index of this grid (max among elements in this grid) + z_index_max: int = 0 # Global max z-index across ALL grids (for comparison) + blocks_interaction: bool = False # Whether this grid blocks interaction with content behind it + viewport_coverage: float = 0.0 # Ratio of grid area to viewport area (0.0-1.0) + class Snapshot(BaseModel): """Snapshot response from extension""" @@ -161,6 +167,9 @@ class Snapshot(BaseModel): dominant_group_key: str | None = None # The most common group_key (main content group) # Phase 2: Runtime stability/debug info (confidence/reasons/metrics) diagnostics: SnapshotDiagnostics | None = None + # Modal detection fields (from gateway) + modal_detected: bool | None = None # True if a modal/overlay grid was detected + modal_grids: list[GridInfo] | None = None # Array of GridInfo for detected modal grids def save(self, filepath: str) -> None: """Save snapshot as JSON file""" diff --git a/sentience/runtime_agent.py b/sentience/runtime_agent.py index 231f5e3..3c107e2 100644 --- a/sentience/runtime_agent.py +++ b/sentience/runtime_agent.py @@ -86,26 +86,50 @@ async def 
run_step( step: RuntimeStep, ) -> bool: self.runtime.begin_step(step.goal) + emitted = False + ok = False + try: + snap = await self._snapshot_with_ramp(step=step) - snap = await self._snapshot_with_ramp(step=step) - - if await self._should_short_circuit_to_vision(step=step, snap=snap): - ok = await self._vision_executor_attempt(task_goal=task_goal, step=step, snap=snap) - return ok + if await self._should_short_circuit_to_vision(step=step, snap=snap): + ok = await self._vision_executor_attempt(task_goal=task_goal, step=step, snap=snap) + return ok - # 1) Structured executor attempt. - action = self._propose_structured_action(task_goal=task_goal, step=step, snap=snap) - await self._execute_action(action=action, snap=snap) - ok = await self._apply_verifications(step=step) - if ok: - return True + # 1) Structured executor attempt. + action = self._propose_structured_action(task_goal=task_goal, step=step, snap=snap) + await self._execute_action(action=action, snap=snap) + ok = await self._apply_verifications(step=step) + if ok: + return True - # 2) Optional vision executor fallback (bounded). - if step.vision_executor_enabled and step.max_vision_executor_attempts > 0: - ok2 = await self._vision_executor_attempt(task_goal=task_goal, step=step, snap=snap) - return ok2 + # 2) Optional vision executor fallback (bounded). 
+ if step.vision_executor_enabled and step.max_vision_executor_attempts > 0: + ok = await self._vision_executor_attempt(task_goal=task_goal, step=step, snap=snap) + return ok - return False + return False + except Exception as exc: + try: + await self.runtime.emit_step_end( + success=False, + error=str(exc), + outcome="exception", + verify_passed=False, + ) + emitted = True + except Exception: + pass + raise + finally: + if not emitted: + try: + await self.runtime.emit_step_end( + success=ok, + outcome=("ok" if ok else "verification_failed"), + verify_passed=ok, + ) + except Exception: + pass async def _snapshot_with_ramp(self, *, step: RuntimeStep) -> Snapshot: limit = step.snapshot_limit_base diff --git a/sentience/trace_event_builder.py b/sentience/trace_event_builder.py index 8b5b911..f6156c8 100644 --- a/sentience/trace_event_builder.py +++ b/sentience/trace_event_builder.py @@ -94,6 +94,7 @@ def build_step_end_event( verify_data: dict[str, Any], pre_elements: list[dict[str, Any]] | None = None, assertions: list[dict[str, Any]] | None = None, + post_snapshot_digest: str | None = None, ) -> dict[str, Any]: """ Build step_end trace event data. 
@@ -106,6 +107,7 @@ def build_step_end_event( pre_url: URL before action execution post_url: URL after action execution snapshot_digest: Digest of snapshot before action + post_snapshot_digest: Digest of snapshot after action (optional) llm_data: LLM interaction data exec_data: Action execution data verify_data: Verification data @@ -153,6 +155,7 @@ def build_step_end_event( "exec": exec_data, "post": { "url": post_url, + **({"snapshot_digest": post_snapshot_digest} if post_snapshot_digest else {}), }, "verify": final_verify_data, } diff --git a/sentience/visual_agent.py b/sentience/visual_agent.py index 60c3851..12bb323 100644 --- a/sentience/visual_agent.py +++ b/sentience/visual_agent.py @@ -1101,6 +1101,10 @@ async def act( }, } + post_snapshot_digest = ( + self._best_effort_post_snapshot_digest(goal) if self.tracer else None + ) + # Build complete step_end event step_end_data = TraceEventBuilder.build_step_end_event( step_id=step_id, @@ -1110,6 +1114,7 @@ async def act( pre_url=pre_url, post_url=post_url or pre_url, snapshot_digest=snapshot_digest, + post_snapshot_digest=post_snapshot_digest, llm_data={ "response_text": llm_response_text, "response_hash": f"sha256:{self._compute_hash(llm_response_text)}", @@ -2011,6 +2016,10 @@ def act( }, } + post_snapshot_digest = ( + self._best_effort_post_snapshot_digest(goal) if self.tracer else None + ) + # Build complete step_end event step_end_data = TraceEventBuilder.build_step_end_event( step_id=step_id, @@ -2020,6 +2029,7 @@ def act( pre_url=pre_url, post_url=post_url or pre_url, snapshot_digest=snapshot_digest, + post_snapshot_digest=post_snapshot_digest, llm_data={ "response_text": llm_response_text, "response_hash": f"sha256:{self._compute_hash(llm_response_text)}", diff --git a/tests/test_grid_bounds.py b/tests/test_grid_bounds.py index 93bb526..e9c8ca1 100644 --- a/tests/test_grid_bounds.py +++ b/tests/test_grid_bounds.py @@ -348,3 +348,94 @@ def test_sorted_by_grid_id(self): assert result[0].grid_id == 0 
class TestGridInfoModalFields:
    """GridInfo z-index and modal-detection attributes."""

    def test_grid_info_default_values(self):
        """A GridInfo built without the new fields reports 0 / False / 0.0."""
        unit_box = BBox(x=0, y=0, width=100, height=100)
        info = GridInfo(grid_id=0, bbox=unit_box, row_count=1, col_count=1, item_count=1)

        assert info.z_index == 0
        assert info.z_index_max == 0
        assert info.blocks_interaction is False
        assert info.viewport_coverage == 0.0

    def test_grid_info_with_modal_fields(self):
        """Explicit modal-detection values are stored on the GridInfo as given."""
        modal_box = BBox(x=100, y=100, width=500, height=400)
        info = GridInfo(
            grid_id=1,
            bbox=modal_box,
            row_count=2,
            col_count=3,
            item_count=6,
            confidence=0.95,
            z_index=1000,
            z_index_max=1000,
            blocks_interaction=True,
            viewport_coverage=0.25,
        )

        assert info.z_index == 1000
        assert info.z_index_max == 1000
        assert info.blocks_interaction is True
        assert info.viewport_coverage == 0.25


class TestSnapshotModalFields:
    """Snapshot modal-detection attributes."""

    def test_snapshot_without_modal(self):
        """Both modal attributes are None when they are not supplied."""
        snap = Snapshot(status="success", url="https://example.com", elements=[])

        assert snap.modal_detected is None
        assert snap.modal_grids is None

    def test_snapshot_with_modal_detected(self):
        """A supplied modal grid is exposed through ``modal_grids``."""
        overlay = GridInfo(
            grid_id=1,
            bbox=BBox(x=200, y=150, width=600, height=400),
            row_count=1,
            col_count=2,
            item_count=5,
            z_index=1000,
            z_index_max=1000,
            blocks_interaction=True,
            viewport_coverage=0.20,
        )
        snap = Snapshot(
            status="success",
            url="https://example.com",
            elements=[],
            modal_detected=True,
            modal_grids=[overlay],
        )

        assert snap.modal_detected is True
        grids = snap.modal_grids
        assert grids is not None
        assert len(grids) == 1
        first = grids[0]
        assert first.z_index == 1000
        assert first.blocks_interaction is True

    def test_snapshot_modal_false(self):
        """An explicit ``modal_detected=False`` is kept as False, not None."""
        snap = Snapshot(
            status="success",
            url="https://example.com",
            elements=[],
            modal_detected=False,
            modal_grids=None,
        )

        assert snap.modal_detected is False
        assert snap.modal_grids is None