Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions sentience/element_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ def filter_by_importance(
Returns:
Top N elements sorted by importance score
"""
elements = snapshot.elements
# Filter out REMOVED elements - they're not actionable and shouldn't be in LLM context
elements = [el for el in snapshot.elements if el.diff_status != "REMOVED"]
# Elements are already sorted by importance in snapshot
return elements[:max_elements]

Expand All @@ -81,7 +82,8 @@ def filter_by_goal(
Returns:
Filtered list of elements sorted by boosted importance score
"""
elements = snapshot.elements
# Filter out REMOVED elements - they're not actionable and shouldn't be in LLM context
elements = [el for el in snapshot.elements if el.diff_status != "REMOVED"]

# If no goal provided, return all elements (up to limit)
if not goal:
Expand Down
81 changes: 63 additions & 18 deletions sentience/llm_interaction_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
"""

import re
from typing import Optional

from .llm_provider import LLMProvider, LLMResponse
from .models import Snapshot
Expand Down Expand Up @@ -35,7 +34,7 @@ def build_context(self, snap: Snapshot, goal: str | None = None) -> str:
"""
Convert snapshot elements to token-efficient prompt string.

Format: [ID] <role> "text" {cues} @ (x,y) (Imp:score)
Format: [ID] <role> "text" {cues} @ (x,y) size:WxH importance:score [status]

Args:
snap: Snapshot object
Expand All @@ -46,24 +45,50 @@ def build_context(self, snap: Snapshot, goal: str | None = None) -> str:
"""
lines = []
for el in snap.elements:
# Skip REMOVED elements - they're not actionable and shouldn't be in LLM context
if el.diff_status == "REMOVED":
continue
# Extract visual cues
cues = []
cues: list[str] = []
if el.visual_cues.is_primary:
cues.append("PRIMARY")
if el.visual_cues.is_clickable:
cues.append("CLICKABLE")
if el.visual_cues.background_color_name:
cues.append(f"color:{el.visual_cues.background_color_name}")

# Format element line
# Format element line with improved readability
# Render cues as a comma-separated group in braces; empty string when there are no cues
cues_str = f" {{{','.join(cues)}}}" if cues else ""
text_preview = (
(el.text[:50] + "...") if el.text and len(el.text) > 50 else (el.text or "")
)


# Better text handling - show truncation indicator
text_preview = ""
if el.text:
if len(el.text) > 50:
text_preview = f'"{el.text[:50]}..."'
else:
text_preview = f'"{el.text}"'

# Build position and size info
x, y = int(el.bbox.x), int(el.bbox.y)
width, height = int(el.bbox.width), int(el.bbox.height)
position_str = f"@ ({x},{y})"
size_str = f"size:{width}x{height}"

# Build status indicators (only include if relevant)
status_parts = []
if not el.in_viewport:
status_parts.append("not_in_viewport")
if el.is_occluded:
status_parts.append("occluded")
if el.diff_status:
status_parts.append(f"diff:{el.diff_status}")
status_str = f" [{','.join(status_parts)}]" if status_parts else ""

# Format: [ID] <role> "text" {cues} @ (x,y) size:WxH importance:score [status]
lines.append(
f'[{el.id}] <{el.role}> "{text_preview}"{cues_str} '
f"@ ({int(el.bbox.x)},{int(el.bbox.y)}) (Imp:{el.importance})"
f'[{el.id}] <{el.role}> {text_preview}{cues_str} '
f"{position_str} {size_str} importance:{el.importance}{status_str}"
)

return "\n".join(lines)
Expand All @@ -87,24 +112,44 @@ def query_llm(self, dom_context: str, goal: str) -> LLMResponse:
{dom_context}

VISUAL CUES EXPLAINED:
- {{PRIMARY}}: Main call-to-action element on the page
- {{CLICKABLE}}: Element is clickable
- {{color:X}}: Background color name
After the text, you may see visual cues in curly braces like {{CLICKABLE}} or {{PRIMARY,CLICKABLE,color:white}}:
- PRIMARY: Main call-to-action element on the page
- CLICKABLE: Element is clickable/interactive
- color:X: Background color name (e.g., color:white, color:blue)
Multiple cues are comma-separated inside the braces: {{CLICKABLE,color:white}}

ELEMENT FORMAT EXPLAINED:
Each element line follows this format:
[ID] <role> "text" {{cues}} @ (x,y) size:WxH importance:score [status]

Example: [346] <button> "Computer Accessories" {{CLICKABLE,color:white}} @ (664,100) size:150x40 importance:811

Breaking down each part:
- [ID]: The number in brackets is the element ID - use this EXACT number in CLICK/TYPE commands
Example: If you see [346], use CLICK(346) or TYPE(346, "text")
- <role>: Element type (button, link, textbox, etc.)
- "text": Visible text content (truncated with "..." if long)
- {{cues}}: Optional visual cues in curly braces (e.g., {{CLICKABLE}}, {{PRIMARY,CLICKABLE}}, {{CLICKABLE,color:white}})
If no cues, this part is omitted entirely
- @ (x,y): Element position in pixels from top-left corner
- size:WxH: Element dimensions (width x height in pixels)
- importance: Score indicating element relevance (higher = more important)
- [status]: Optional status flags in brackets (not_in_viewport, occluded, diff:ADDED/MODIFIED/etc)

CRITICAL RESPONSE FORMAT:
You MUST respond with ONLY ONE of these exact action formats:
- CLICK(id) - Click element by ID
- TYPE(id, "text") - Type text into element
- CLICK(id) - Click element by ID (use the number from [ID] brackets)
- TYPE(id, "text") - Type text into element (use the number from [ID] brackets)
- PRESS("key") - Press keyboard key (Enter, Escape, Tab, ArrowDown, etc)
- FINISH() - Task complete

DO NOT include any explanation, reasoning, or natural language.
DO NOT use markdown formatting or code blocks.
DO NOT say "The next step is..." or anything similar.

CORRECT Examples:
CLICK(42)
TYPE(15, "magic mouse")
CORRECT Examples (matching element IDs from the list above):
If element is [346] <button> "Click me" → respond: CLICK(346)
If element is [15] <textbox> "Search" → respond: TYPE(15, "magic mouse")
PRESS("Enter")
FINISH()

Expand Down
56 changes: 51 additions & 5 deletions sentience/schemas/trace_v1.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
},
"type": {
"type": "string",
"enum": ["run_start", "step_start", "snapshot_taken", "llm_called", "action_executed", "verification", "recovery", "step_end", "run_end", "error"],
"enum": ["run_start", "step_start", "snapshot", "snapshot_taken", "llm_called", "llm_response", "action", "action_executed", "verification", "recovery", "step_end", "run_end", "error"],
"description": "Event type"
},
"ts": {
Expand Down Expand Up @@ -64,15 +64,61 @@
}
},
{
"description": "snapshot_taken data",
"required": ["step_id", "snapshot_digest"],
"description": "snapshot or snapshot_taken data",
"properties": {
"step_id": {"type": "string"},
"step_id": {"type": ["string", "null"]},
"snapshot_id": {"type": ["string", "null"]},
"snapshot_digest": {"type": "string", "pattern": "^sha256:[0-9a-f]{64}$"},
"snapshot_digest_loose": {"type": "string", "pattern": "^sha256:[0-9a-f]{64}$"},
"url": {"type": ["string", "null"]},
"element_count": {"type": "integer"}
"element_count": {"type": "integer"},
"timestamp": {"type": ["string", "null"]},
"elements": {
"type": "array",
"items": {
"type": "object",
"properties": {
"id": {"type": "integer"},
"role": {"type": "string"},
"text": {"type": ["string", "null"]},
"importance": {"type": "number"},
"importance_score": {"type": "number"},
"bbox": {
"type": "object",
"properties": {
"x": {"type": "number"},
"y": {"type": "number"},
"width": {"type": "number"},
"height": {"type": "number"}
},
"required": ["x", "y", "width", "height"]
},
"visual_cues": {
"type": "object",
"properties": {
"is_primary": {"type": "boolean"},
"is_clickable": {"type": "boolean"},
"background_color_name": {"type": ["string", "null"]}
}
},
"in_viewport": {"type": "boolean"},
"is_occluded": {"type": "boolean"},
"z_index": {"type": "integer"},
"rerank_index": {"type": ["integer", "null"]},
"heuristic_index": {"type": ["integer", "null"]},
"ml_probability": {"type": ["number", "null"]},
"ml_score": {"type": ["number", "null"]},
"diff_status": {
"type": ["string", "null"],
"enum": ["ADDED", "REMOVED", "MODIFIED", "MOVED", null],
"description": "Diff status for Diff Overlay feature. ADDED: new element, REMOVED: element was removed, MODIFIED: element changed, MOVED: element position changed, null: no change"
}
},
"required": ["id", "role", "importance", "bbox", "visual_cues"]
}
},
"screenshot_base64": {"type": ["string", "null"]},
"screenshot_format": {"type": ["string", "null"], "enum": ["png", "jpeg", null]}
}
},
{
Expand Down
2 changes: 1 addition & 1 deletion tests/test_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ def test_agent_build_context():
assert "PRIMARY" in context
assert "CLICKABLE" in context
assert "color:blue" in context
assert "(Imp:900)" in context
assert "importance:900" in context


def test_agent_execute_click_action():
Expand Down
Loading