From 43772765263a247f7fef2e6e32bfed32526a37a0 Mon Sep 17 00:00:00 2001 From: SentienceDEV Date: Mon, 19 Jan 2026 16:40:22 -0800 Subject: [PATCH 1/5] snapshot screenshot options config --- sentience/backends/snapshot.py | 5 ++++- sentience/failure_artifacts.py | 36 +++++++++++++++++++++++++++++++--- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/sentience/backends/snapshot.py b/sentience/backends/snapshot.py index 624a03f..b1cda88 100644 --- a/sentience/backends/snapshot.py +++ b/sentience/backends/snapshot.py @@ -392,7 +392,10 @@ async def _snapshot_via_api( # Step 1: Get raw data from local extension (always happens locally) raw_options: dict[str, Any] = {} if options.screenshot is not False: - raw_options["screenshot"] = options.screenshot + if hasattr(options.screenshot, "model_dump"): + raw_options["screenshot"] = options.screenshot.model_dump() + else: + raw_options["screenshot"] = options.screenshot # Call extension to get raw elements raw_result = await _eval_with_navigation_retry( diff --git a/sentience/failure_artifacts.py b/sentience/failure_artifacts.py index 324aa90..b2cb0fa 100644 --- a/sentience/failure_artifacts.py +++ b/sentience/failure_artifacts.py @@ -186,11 +186,41 @@ def _generate_clip_from_frames( ) if result.returncode != 0: + stderr = result.stderr.decode("utf-8", errors="replace")[:500] logger.warning( - f"ffmpeg failed with return code {result.returncode}: " - f"{result.stderr.decode('utf-8', errors='replace')[:500]}" + f"ffmpeg failed with return code {result.returncode}: {stderr}" ) - return False + # Fallback: use glob input (handles non-uniform filenames) + fallback_cmd = [ + "ffmpeg", + "-y", + "-pattern_type", + "glob", + "-i", + frame_pattern, + "-r", + str(fps), + "-pix_fmt", + "yuv420p", + "-c:v", + "libx264", + "-crf", + "23", + str(output_path), + ] + fallback = subprocess.run( + fallback_cmd, + capture_output=True, + timeout=60, + cwd=str(frames_dir), + ) + if fallback.returncode != 0: + fb_stderr = 
fallback.stderr.decode("utf-8", errors="replace")[:500] + logger.warning( + f"ffmpeg fallback failed with return code {fallback.returncode}: {fb_stderr}" + ) + return False + return output_path.exists() return output_path.exists() From eb95eaf4b95120880107d851b07a6a24ba908191 Mon Sep 17 00:00:00 2001 From: SentienceDEV Date: Mon, 19 Jan 2026 18:41:03 -0800 Subject: [PATCH 2/5] fix trace upload issue in exception; ffmpeg fix --- README.md | 2 + sentience/agent.py | 150 +++++++++++++++++++++++++++++++++ sentience/failure_artifacts.py | 76 ++++++++++++++--- 3 files changed, 217 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index f0eab6d..4fa9219 100644 --- a/README.md +++ b/README.md @@ -129,6 +129,8 @@ await runtime.enable_failure_artifacts( await runtime.record_action("CLICK") ``` +**Video clip generation (optional):** To generate MP4 video clips from captured frames, install [ffmpeg](https://ffmpeg.org/) (version 4.0 or later; version 5.1+ recommended for best compatibility). If ffmpeg is not installed, frames are still captured but no video clip is generated. + ### Redaction callback (Phase 3) Provide a user-defined callback to redact snapshots and decide whether to persist frames. The SDK does not implement image/video redaction. diff --git a/sentience/agent.py b/sentience/agent.py index 1907144..83ebf1f 100644 --- a/sentience/agent.py +++ b/sentience/agent.py @@ -205,6 +205,13 @@ def act( # noqa: C901 pre_url=pre_url, ) + # Track data collected during step execution for step_end emission on failure + _step_snap_with_diff: Optional[Snapshot] = None + _step_pre_url: Optional[str] = None + _step_llm_response: Optional[LLMResponse] = None + _step_result: Optional[AgentActionResult] = None + _step_duration_ms: int = 0 + for attempt in range(max_retries + 1): try: # 1. 
OBSERVE: Get refined semantic snapshot @@ -254,6 +261,10 @@ def act( # noqa: C901 error=snap.error, ) + # Track for step_end emission on failure + _step_snap_with_diff = snap_with_diff + _step_pre_url = snap.url + # Update previous snapshot for next comparison self._previous_snapshot = snap @@ -311,6 +322,9 @@ def act( # noqa: C901 # 3. THINK: Query LLM for next action llm_response = self.llm_handler.query_llm(context, goal) + # Track for step_end emission on failure + _step_llm_response = llm_response + # Emit LLM query trace event if tracer is enabled if self.tracer: _safe_tracer_call( @@ -358,6 +372,10 @@ def act( # noqa: C901 cursor=result_dict.get("cursor"), ) + # Track for step_end emission on failure + _step_result = result + _step_duration_ms = duration_ms + # Emit action execution trace event if tracer is enabled if self.tracer: post_url = self.browser.page.url if self.browser.page else None @@ -539,6 +557,63 @@ def act( # noqa: C901 time.sleep(1.0) # Brief delay before retry continue else: + # Emit step_end with whatever data we collected before failure + # This ensures diff_status and other fields are preserved in traces + if self.tracer and _step_snap_with_diff is not None: + post_url = self.browser.page.url if self.browser.page else None + snapshot_digest = f"sha256:{self._compute_hash(f'{_step_pre_url}{_step_snap_with_diff.timestamp}')}" + + # Build pre_elements from snap_with_diff (includes diff_status) + snapshot_event_data = TraceEventBuilder.build_snapshot_event(_step_snap_with_diff) + pre_elements = snapshot_event_data.get("elements", []) + + # Build LLM data if available + llm_data = None + if _step_llm_response: + llm_response_text = _step_llm_response.content + llm_response_hash = f"sha256:{self._compute_hash(llm_response_text)}" + llm_data = { + "response_text": llm_response_text, + "response_hash": llm_response_hash, + "usage": { + "prompt_tokens": _step_llm_response.prompt_tokens or 0, + "completion_tokens": 
_step_llm_response.completion_tokens or 0, + "total_tokens": _step_llm_response.total_tokens or 0, + }, + } + + # Build exec data (failure state) + exec_data = { + "success": False, + "action": _step_result.action if _step_result else "error", + "outcome": str(e), + "duration_ms": _step_duration_ms, + } + + # Build step_end event for failed step + step_end_data = TraceEventBuilder.build_step_end_event( + step_id=step_id, + step_index=self._step_count, + goal=goal, + attempt=attempt, + pre_url=_step_pre_url, + post_url=post_url, + snapshot_digest=snapshot_digest, + llm_data=llm_data, + exec_data=exec_data, + verify_data=None, + pre_elements=pre_elements, + ) + + _safe_tracer_call( + self.tracer, + "emit", + self.verbose, + "step_end", + step_end_data, + step_id=step_id, + ) + # Create error result error_result = AgentActionResult( success=False, @@ -771,6 +846,13 @@ async def act( # noqa: C901 pre_url=pre_url, ) + # Track data collected during step execution for step_end emission on failure + _step_snap_with_diff: Optional[Snapshot] = None + _step_pre_url: Optional[str] = None + _step_llm_response: Optional[LLMResponse] = None + _step_result: Optional[AgentActionResult] = None + _step_duration_ms: int = 0 + for attempt in range(max_retries + 1): try: # 1. OBSERVE: Get refined semantic snapshot @@ -823,6 +905,10 @@ async def act( # noqa: C901 error=snap.error, ) + # Track for step_end emission on failure + _step_snap_with_diff = snap_with_diff + _step_pre_url = snap.url + # Update previous snapshot for next comparison self._previous_snapshot = snap @@ -880,6 +966,9 @@ async def act( # noqa: C901 # 3. 
THINK: Query LLM for next action llm_response = self.llm_handler.query_llm(context, goal) + # Track for step_end emission on failure + _step_llm_response = llm_response + # Emit LLM query trace event if tracer is enabled if self.tracer: _safe_tracer_call( @@ -926,6 +1015,10 @@ async def act( # noqa: C901 message=result_dict.get("message"), ) + # Track for step_end emission on failure + _step_result = result + _step_duration_ms = duration_ms + # Emit action execution trace event if tracer is enabled if self.tracer: post_url = self.browser.page.url if self.browser.page else None @@ -1104,6 +1197,63 @@ async def act( # noqa: C901 await asyncio.sleep(1.0) # Brief delay before retry continue else: + # Emit step_end with whatever data we collected before failure + # This ensures diff_status and other fields are preserved in traces + if self.tracer and _step_snap_with_diff is not None: + post_url = self.browser.page.url if self.browser.page else None + snapshot_digest = f"sha256:{self._compute_hash(f'{_step_pre_url}{_step_snap_with_diff.timestamp}')}" + + # Build pre_elements from snap_with_diff (includes diff_status) + snapshot_event_data = TraceEventBuilder.build_snapshot_event(_step_snap_with_diff) + pre_elements = snapshot_event_data.get("elements", []) + + # Build LLM data if available + llm_data = None + if _step_llm_response: + llm_response_text = _step_llm_response.content + llm_response_hash = f"sha256:{self._compute_hash(llm_response_text)}" + llm_data = { + "response_text": llm_response_text, + "response_hash": llm_response_hash, + "usage": { + "prompt_tokens": _step_llm_response.prompt_tokens or 0, + "completion_tokens": _step_llm_response.completion_tokens or 0, + "total_tokens": _step_llm_response.total_tokens or 0, + }, + } + + # Build exec data (failure state) + exec_data = { + "success": False, + "action": _step_result.action if _step_result else "error", + "outcome": str(e), + "duration_ms": _step_duration_ms, + } + + # Build step_end event for failed 
step + step_end_data = TraceEventBuilder.build_step_end_event( + step_id=step_id, + step_index=self._step_count, + goal=goal, + attempt=attempt, + pre_url=_step_pre_url, + post_url=post_url, + snapshot_digest=snapshot_digest, + llm_data=llm_data, + exec_data=exec_data, + verify_data=None, + pre_elements=pre_elements, + ) + + _safe_tracer_call( + self.tracer, + "emit", + self.verbose, + "step_end", + step_end_data, + step_id=step_id, + ) + # Create error result error_result = AgentActionResult( success=False, diff --git a/sentience/failure_artifacts.py b/sentience/failure_artifacts.py index b2cb0fa..7473449 100644 --- a/sentience/failure_artifacts.py +++ b/sentience/failure_artifacts.py @@ -3,6 +3,7 @@ import gzip import json import logging +import re import shutil import subprocess import tempfile @@ -104,6 +105,26 @@ def _is_ffmpeg_available() -> bool: return False +def _get_ffmpeg_version() -> tuple[int, int] | None: + """Get ffmpeg major and minor version. Returns (major, minor) or None if unavailable.""" + try: + result = subprocess.run( + ["ffmpeg", "-version"], + capture_output=True, + timeout=5, + ) + if result.returncode != 0: + return None + output = result.stdout.decode("utf-8", errors="replace") + # Parse version from "ffmpeg version X.Y.Z ..." 
+ match = re.search(r"ffmpeg version (\d+)\.(\d+)", output) + if match: + return (int(match.group(1)), int(match.group(2))) + return None + except (subprocess.TimeoutExpired, FileNotFoundError, OSError): + return None + + def _generate_clip_from_frames( frames_dir: Path, output_path: Path, @@ -154,10 +175,17 @@ def _generate_clip_from_frames( # -f concat: use concat demuxer # -safe 0: allow unsafe file paths # -i: input file list - # -vsync vfr: variable frame rate + # -fps_mode vfr or -vsync vfr: variable frame rate + # (-fps_mode replaces deprecated -vsync in ffmpeg 5.1+) # -pix_fmt yuv420p: compatibility with most players # -c:v libx264: H.264 codec # -crf 23: quality (lower = better, 23 is default) + + # Detect ffmpeg version to use correct vsync option + # -fps_mode was introduced in ffmpeg 5.1, -vsync deprecated in 7.0 + ffmpeg_version = _get_ffmpeg_version() + use_fps_mode = ffmpeg_version is not None and ffmpeg_version >= (5, 1) + cmd = [ "ffmpeg", "-y", @@ -166,17 +194,40 @@ def _generate_clip_from_frames( "-safe", "0", "-i", - str(list_file), - "-vsync", - "vfr", - "-pix_fmt", - "yuv420p", - "-c:v", - "libx264", - "-crf", - "23", - str(output_path), + "frames_list.txt", # Use relative path since cwd=frames_dir ] + # Add vsync option based on ffmpeg version + if use_fps_mode: + cmd.extend(["-fps_mode", "vfr"]) + else: + cmd.extend(["-vsync", "vfr"]) + cmd.extend( + [ + "-pix_fmt", + "yuv420p", + "-c:v", + "libx264", + "-crf", + "23", + str(output_path), + ] + ) + + # Log the command for debugging + logger.debug(f"Running ffmpeg command: {' '.join(cmd)}") + logger.debug(f"Working directory: {frames_dir}") + logger.debug(f"Frame files found: {len(frame_files)}") + + # Verify files exist before running ffmpeg + if not list_file.exists(): + logger.warning(f"frames_list.txt does not exist: {list_file}") + return False + + # Verify all frame files referenced in the list exist + for frame_file in frame_files: + if not frame_file.exists(): + logger.warning(f"Frame 
file does not exist: {frame_file}") + return False result = subprocess.run( cmd, @@ -187,9 +238,12 @@ def _generate_clip_from_frames( if result.returncode != 0: stderr = result.stderr.decode("utf-8", errors="replace")[:500] + stdout = result.stdout.decode("utf-8", errors="replace")[:200] logger.warning( f"ffmpeg failed with return code {result.returncode}: {stderr}" ) + if stdout: + logger.debug(f"ffmpeg stdout: {stdout}") # Fallback: use glob input (handles non-uniform filenames) fallback_cmd = [ "ffmpeg", From 7c7e93b8ad6bbd9f4b20f84114480163f50b39b8 Mon Sep 17 00:00:00 2001 From: SentienceDEV Date: Mon, 19 Jan 2026 18:41:36 -0800 Subject: [PATCH 3/5] fix trace upload issue in exception; ffmpeg fix --- sentience/agent.py | 24 ++++++++++++++---------- sentience/agent_runtime.py | 2 +- sentience/failure_artifacts.py | 4 +--- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/sentience/agent.py b/sentience/agent.py index 83ebf1f..d4a9c0c 100644 --- a/sentience/agent.py +++ b/sentience/agent.py @@ -206,10 +206,10 @@ def act( # noqa: C901 ) # Track data collected during step execution for step_end emission on failure - _step_snap_with_diff: Optional[Snapshot] = None - _step_pre_url: Optional[str] = None - _step_llm_response: Optional[LLMResponse] = None - _step_result: Optional[AgentActionResult] = None + _step_snap_with_diff: Snapshot | None = None + _step_pre_url: str | None = None + _step_llm_response: LLMResponse | None = None + _step_result: AgentActionResult | None = None _step_duration_ms: int = 0 for attempt in range(max_retries + 1): @@ -564,7 +564,9 @@ def act( # noqa: C901 snapshot_digest = f"sha256:{self._compute_hash(f'{_step_pre_url}{_step_snap_with_diff.timestamp}')}" # Build pre_elements from snap_with_diff (includes diff_status) - snapshot_event_data = TraceEventBuilder.build_snapshot_event(_step_snap_with_diff) + snapshot_event_data = TraceEventBuilder.build_snapshot_event( + _step_snap_with_diff + ) pre_elements = 
snapshot_event_data.get("elements", []) # Build LLM data if available @@ -847,10 +849,10 @@ async def act( # noqa: C901 ) # Track data collected during step execution for step_end emission on failure - _step_snap_with_diff: Optional[Snapshot] = None - _step_pre_url: Optional[str] = None - _step_llm_response: Optional[LLMResponse] = None - _step_result: Optional[AgentActionResult] = None + _step_snap_with_diff: Snapshot | None = None + _step_pre_url: str | None = None + _step_llm_response: LLMResponse | None = None + _step_result: AgentActionResult | None = None _step_duration_ms: int = 0 for attempt in range(max_retries + 1): @@ -1204,7 +1206,9 @@ async def act( # noqa: C901 snapshot_digest = f"sha256:{self._compute_hash(f'{_step_pre_url}{_step_snap_with_diff.timestamp}')}" # Build pre_elements from snap_with_diff (includes diff_status) - snapshot_event_data = TraceEventBuilder.build_snapshot_event(_step_snap_with_diff) + snapshot_event_data = TraceEventBuilder.build_snapshot_event( + _step_snap_with_diff + ) pre_elements = snapshot_event_data.get("elements", []) # Build LLM data if available diff --git a/sentience/agent_runtime.py b/sentience/agent_runtime.py index 399df5c..6dad018 100644 --- a/sentience/agent_runtime.py +++ b/sentience/agent_runtime.py @@ -583,7 +583,7 @@ def assert_done( True if task is complete (assertion passed), False otherwise """ # Convenience wrapper for assert_ with required=True - ok = self.assert_(predicate, label=label, required=True) + ok = self.assertTrue(predicate, label=label, required=True) if ok: self._task_done = True self._task_done_label = label diff --git a/sentience/failure_artifacts.py b/sentience/failure_artifacts.py index 7473449..dc43b05 100644 --- a/sentience/failure_artifacts.py +++ b/sentience/failure_artifacts.py @@ -239,9 +239,7 @@ def _generate_clip_from_frames( if result.returncode != 0: stderr = result.stderr.decode("utf-8", errors="replace")[:500] stdout = result.stdout.decode("utf-8", errors="replace")[:200] - 
logger.warning( - f"ffmpeg failed with return code {result.returncode}: {stderr}" - ) + logger.warning(f"ffmpeg failed with return code {result.returncode}: {stderr}") if stdout: logger.debug(f"ffmpeg stdout: {stdout}") # Fallback: use glob input (handles non-uniform filenames) From 6f30a796c64f3ab34f7d265c2db4042b4037e666 Mon Sep 17 00:00:00 2001 From: SentienceDEV Date: Mon, 19 Jan 2026 18:48:10 -0800 Subject: [PATCH 4/5] revert assertTrue to assert_ --- sentience/agent_runtime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sentience/agent_runtime.py b/sentience/agent_runtime.py index 6dad018..399df5c 100644 --- a/sentience/agent_runtime.py +++ b/sentience/agent_runtime.py @@ -583,7 +583,7 @@ def assert_done( True if task is complete (assertion passed), False otherwise """ # Convenience wrapper for assert_ with required=True - ok = self.assertTrue(predicate, label=label, required=True) + ok = self.assert_(predicate, label=label, required=True) if ok: self._task_done = True self._task_done_label = label From 939fc1bfc27889b84ff582bbee69ff07f46fbf20 Mon Sep 17 00:00:00 2001 From: SentienceDEV Date: Mon, 19 Jan 2026 19:03:33 -0800 Subject: [PATCH 5/5] fix step_end issue for failed run --- sentience/trace_event_builder.py | 3 +- tests/test_trace_event_builder.py | 47 +++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/sentience/trace_event_builder.py
b/sentience/trace_event_builder.py index 272c7e0..44efa57 100644 --- a/sentience/trace_event_builder.py +++ b/sentience/trace_event_builder.py @@ -116,7 +116,8 @@ def build_step_end_event( pre_data["elements"] = pre_elements # Build verify data with assertions if provided - final_verify_data = verify_data.copy() + # Handle None verify_data for failed steps + final_verify_data = verify_data.copy() if verify_data else {} if assertions: # Ensure signals dict exists if "signals" not in final_verify_data: diff --git a/tests/test_trace_event_builder.py b/tests/test_trace_event_builder.py index fc0b5d7..b9b637c 100644 --- a/tests/test_trace_event_builder.py +++ b/tests/test_trace_event_builder.py @@ -273,3 +273,50 @@ def test_build_step_end_event_empty_elements(): # Should have elements field but it's empty assert "elements" in result["pre"] assert len(result["pre"]["elements"]) == 0 + + +def test_build_step_end_event_with_none_verify_data(): + """Test step_end event building when verify_data is None (failed steps). + + This test ensures that failed steps can emit step_end events even when + verify_data is None, which happens when a step fails before verification. 
+ """ + llm_data = { + "response_text": "click(123)", + "response_hash": "sha256:abc123", + "usage": {"prompt_tokens": 100, "completion_tokens": 10, "total_tokens": 110}, + } + + exec_data = { + "success": False, + "action": "error", + "outcome": "Element not found", + "duration_ms": 500, + } + + # verify_data is None for failed steps + result = TraceEventBuilder.build_step_end_event( + step_id="step-1", + step_index=1, + goal="Click the button", + attempt=2, + pre_url="http://example.com/page1", + post_url="http://example.com/page1", + snapshot_digest="sha256:digest123", + llm_data=llm_data, + exec_data=exec_data, + verify_data=None, # None for failed steps + ) + + # Verify basic structure + assert result["v"] == 1 + assert result["step_id"] == "step-1" + assert result["step_index"] == 1 + assert result["attempt"] == 2 + + # Verify exec shows failure + assert result["exec"]["success"] is False + assert result["exec"]["action"] == "error" + + # Verify should be empty dict when verify_data is None + assert result["verify"] == {}