Skip to content

Commit 1d3b41b

Browse files
authored
Merge pull request #159 from SentienceAPI/p0
P0: add baseline safety net testing + assert failure artifacts
2 parents 913b34b + 1a26e19 commit 1d3b41b

File tree

7 files changed

+631
-33
lines changed

7 files changed

+631
-33
lines changed

.github/workflows/test.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,15 @@ jobs:
373373
print('WARNING: Could not find assert_ method call in assert_done')
374374
sys.exit(1)
375375
PYEOF
376+
377+
- name: Phase 0 regression safety net (unit)
378+
shell: bash
379+
run: |
380+
pytest tests/unit/test_agent_runtime_phase0.py -v
381+
382+
- name: Run full test suite
383+
shell: bash
384+
run: |
376385
pytest tests/ -v
377386
env:
378387
CI: true

README.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,37 @@ async def main():
8989
asyncio.run(main())
9090
```
9191

92+
### Failure Artifact Buffer (Phase 1)
93+
94+
Capture a short ring buffer of screenshots and persist them when a required assertion fails.
95+
96+
```python
97+
from sentience.failure_artifacts import FailureArtifactsOptions
98+
99+
await runtime.enable_failure_artifacts(
100+
FailureArtifactsOptions(buffer_seconds=15, capture_on_action=True, fps=0.0)
101+
)
102+
103+
# After each action, record it (best-effort).
104+
await runtime.record_action("CLICK")
105+
```
106+
107+
### Redaction callback (Phase 3)
108+
109+
Provide a user-defined callback to redact snapshots and decide whether to persist frames. The SDK does not implement image/video redaction.
110+
111+
```python
112+
from sentience.failure_artifacts import FailureArtifactsOptions, RedactionContext, RedactionResult
113+
114+
def redact(ctx: RedactionContext) -> RedactionResult:
115+
# Example: drop frames entirely, keep JSON only.
116+
return RedactionResult(drop_frames=True)
117+
118+
await runtime.enable_failure_artifacts(
119+
FailureArtifactsOptions(on_before_persist=redact)
120+
)
121+
```
122+
92123
**See examples:** [`examples/asserts/`](examples/asserts/)
93124

94125
## 🚀 Quick Start: Choose Your Abstraction Level

sentience/agent_runtime.py

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@
7070
from dataclasses import dataclass
7171
from typing import TYPE_CHECKING, Any
7272

73+
from .failure_artifacts import FailureArtifactBuffer, FailureArtifactsOptions
7374
from .models import Snapshot, SnapshotOptions
7475
from .verification import AssertContext, AssertOutcome, Predicate
7576

@@ -138,6 +139,10 @@ def __init__(
138139
# Snapshot state
139140
self.last_snapshot: Snapshot | None = None
140141

142+
# Failure artifacts (Phase 1)
143+
self._artifact_buffer: FailureArtifactBuffer | None = None
144+
self._artifact_timer_task: asyncio.Task | None = None
145+
141146
# Cached URL (updated on snapshot or explicit get_url call)
142147
self._cached_url: str | None = None
143148

@@ -250,6 +255,113 @@ async def snapshot(self, **kwargs: Any) -> Snapshot:
250255
self.last_snapshot = await backend_snapshot(self.backend, options=options)
251256
return self.last_snapshot
252257

258+
async def enable_failure_artifacts(
    self,
    options: FailureArtifactsOptions | None = None,
) -> None:
    """
    Enable failure artifact buffer (Phase 1).

    Creates a fresh ``FailureArtifactBuffer`` for the tracer's run id and,
    when ``options.fps`` is greater than zero, starts a background task
    running ``_artifact_timer_loop``.

    Calling this again replaces the buffer. A timer task left over from an
    earlier call is cancelled first so that only one capture loop is ever
    running.

    Args:
        options: Buffer configuration; defaults to ``FailureArtifactsOptions()``.
    """
    # Stop any capture loop started by a previous enable call. Without this
    # the old task reference is overwritten and the loop can no longer be
    # cancelled through disable_failure_artifacts().
    if self._artifact_timer_task:
        self._artifact_timer_task.cancel()
        self._artifact_timer_task = None

    opts = options or FailureArtifactsOptions()
    self._artifact_buffer = FailureArtifactBuffer(
        run_id=self.tracer.run_id,
        options=opts,
    )
    # fps <= 0 means "no periodic capture"; frames are then only taken on
    # explicit triggers such as record_action().
    if opts.fps > 0:
        self._artifact_timer_task = asyncio.create_task(self._artifact_timer_loop())
272+
273+
def disable_failure_artifacts(self) -> None:
    """
    Stop background frame capture for the failure artifact buffer.

    Cancels the timer task (if one is set) and clears the reference to it.
    The buffer attribute itself is left untouched by this method.
    """
    timer = self._artifact_timer_task
    if not timer:
        # Nothing is running in the background.
        return
    timer.cancel()
    self._artifact_timer_task = None
280+
281+
async def record_action(
282+
self,
283+
action: str,
284+
*,
285+
url: str | None = None,
286+
) -> None:
287+
"""
288+
Record an action in the artifact timeline and capture a frame if enabled.
289+
"""
290+
if not self._artifact_buffer:
291+
return
292+
self._artifact_buffer.record_step(
293+
action=action,
294+
step_id=self.step_id,
295+
step_index=self.step_index,
296+
url=url,
297+
)
298+
if self._artifact_buffer.options.capture_on_action:
299+
await self._capture_artifact_frame()
300+
301+
async def _capture_artifact_frame(self) -> None:
302+
if not self._artifact_buffer:
303+
return
304+
try:
305+
image_bytes = await self.backend.screenshot_png()
306+
except Exception:
307+
return
308+
self._artifact_buffer.add_frame(image_bytes, fmt="png")
309+
310+
async def _artifact_timer_loop(self) -> None:
311+
if not self._artifact_buffer:
312+
return
313+
interval = 1.0 / max(0.001, self._artifact_buffer.options.fps)
314+
try:
315+
while True:
316+
await self._capture_artifact_frame()
317+
await asyncio.sleep(interval)
318+
except asyncio.CancelledError:
319+
return
320+
321+
def finalize_run(self, *, success: bool) -> None:
    """
    Finalize artifact buffer at end of run.

    Does nothing when no buffer is set. Otherwise:

    - On success, the buffer is persisted only when its ``persist_mode`` is
      ``"always"``, and is then cleaned up.
    - On failure, persistence is delegated to
      ``_persist_failure_artifacts`` with reason ``"finalize_failure"``.

    In every case the background capture timer is stopped afterwards, since
    the run is over and no further frames should be taken.

    Args:
        success: Whether the run finished successfully.
    """
    if not self._artifact_buffer:
        return
    try:
        if not success:
            self._persist_failure_artifacts(reason="finalize_failure")
            return
        if self._artifact_buffer.options.persist_mode == "always":
            self._artifact_buffer.persist(
                reason="success",
                status="success",
                snapshot=self.last_snapshot,
                diagnostics=getattr(self.last_snapshot, "diagnostics", None),
                metadata=self._artifact_metadata(),
            )
        self._artifact_buffer.cleanup()
    finally:
        # Stop the periodic capture task even if persisting raised; otherwise
        # it would keep taking screenshots after the run has been finalized.
        self.disable_failure_artifacts()
339+
340+
def _persist_failure_artifacts(self, *, reason: str) -> None:
341+
if not self._artifact_buffer:
342+
return
343+
self._artifact_buffer.persist(
344+
reason=reason,
345+
status="failure",
346+
snapshot=self.last_snapshot,
347+
diagnostics=getattr(self.last_snapshot, "diagnostics", None),
348+
metadata=self._artifact_metadata(),
349+
)
350+
self._artifact_buffer.cleanup()
351+
if self._artifact_buffer.options.persist_mode == "onFail":
352+
self.disable_failure_artifacts()
353+
354+
def _artifact_metadata(self) -> dict[str, Any]:
355+
url = None
356+
if self.last_snapshot is not None:
357+
url = self.last_snapshot.url
358+
elif self._cached_url:
359+
url = self._cached_url
360+
return {
361+
"backend": self.backend.__class__.__name__,
362+
"url": url,
363+
}
364+
253365
def begin_step(self, goal: str, step_index: int | None = None) -> str:
254366
"""
255367
Begin a new step in the verification loop.
@@ -309,6 +421,8 @@ def assert_(
309421
kind="assert",
310422
record_in_step=True,
311423
)
424+
if required and not outcome.passed:
425+
self._persist_failure_artifacts(reason=f"assert_failed:{label}")
312426
return outcome.passed
313427

314428
def check(self, predicate: Predicate, label: str, required: bool = False) -> AssertionHandle:
@@ -619,6 +733,10 @@ async def eventually(
619733
"vision_fallback": True,
620734
},
621735
)
736+
if self.required and not passed:
737+
self.runtime._persist_failure_artifacts(
738+
reason=f"assert_eventually_failed:{self.label}"
739+
)
622740
return passed
623741
except Exception as e:
624742
# If vision fallback fails, fall through to snapshot_exhausted.
@@ -649,6 +767,10 @@ async def eventually(
649767
"exhausted": True,
650768
},
651769
)
770+
if self.required:
771+
self.runtime._persist_failure_artifacts(
772+
reason=f"assert_eventually_failed:{self.label}"
773+
)
652774
return False
653775

654776
if time.monotonic() >= deadline:
@@ -666,6 +788,10 @@ async def eventually(
666788
"timeout": True,
667789
},
668790
)
791+
if self.required:
792+
self.runtime._persist_failure_artifacts(
793+
reason=f"assert_eventually_timeout:{self.label}"
794+
)
669795
return False
670796

671797
await asyncio.sleep(poll_s)
@@ -705,6 +831,10 @@ async def eventually(
705831
record_in_step=True,
706832
extra={"eventually": True, "attempt": attempt, "final": True, "timeout": True},
707833
)
834+
if self.required:
835+
self.runtime._persist_failure_artifacts(
836+
reason=f"assert_eventually_timeout:{self.label}"
837+
)
708838
return False
709839

710840
await asyncio.sleep(poll_s)

0 commit comments

Comments
 (0)