Skip to content

Commit d7899b4

Browse files
author
Sentience Dev
committed
Merge pull request #112 from SentienceAPI/optimize_llm
optimize LLM agent efficiency
2 parents 0889aba + a93bff8 commit d7899b4

File tree

4 files changed

+119
-26
lines changed

4 files changed

+119
-26
lines changed

sentience/element_filter.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,8 @@ def filter_by_importance(
5757
Returns:
5858
Top N elements sorted by importance score
5959
"""
60-
elements = snapshot.elements
60+
# Filter out REMOVED elements - they're not actionable and shouldn't be in LLM context
61+
elements = [el for el in snapshot.elements if el.diff_status != "REMOVED"]
6162
# Elements are already sorted by importance in snapshot
6263
return elements[:max_elements]
6364

@@ -81,7 +82,8 @@ def filter_by_goal(
8182
Returns:
8283
Filtered list of elements sorted by boosted importance score
8384
"""
84-
elements = snapshot.elements
85+
# Filter out REMOVED elements - they're not actionable and shouldn't be in LLM context
86+
elements = [el for el in snapshot.elements if el.diff_status != "REMOVED"]
8587

8688
# If no goal provided, return all elements (up to limit)
8789
if not goal:

sentience/llm_interaction_handler.py

Lines changed: 63 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
"""
77

88
import re
9-
from typing import Optional
109

1110
from .llm_provider import LLMProvider, LLMResponse
1211
from .models import Snapshot
@@ -35,7 +34,7 @@ def build_context(self, snap: Snapshot, goal: str | None = None) -> str:
3534
"""
3635
Convert snapshot elements to token-efficient prompt string.
3736
38-
Format: [ID] <role> "text" {cues} @ (x,y) (Imp:score)
37+
Format: [ID] <role> "text" {cues} @ (x,y) size:WxH importance:score [status]
3938
4039
Args:
4140
snap: Snapshot object
@@ -46,24 +45,50 @@ def build_context(self, snap: Snapshot, goal: str | None = None) -> str:
4645
"""
4746
lines = []
4847
for el in snap.elements:
48+
# Skip REMOVED elements - they're not actionable and shouldn't be in LLM context
49+
if el.diff_status == "REMOVED":
50+
continue
4951
# Extract visual cues
50-
cues = []
52+
cues: list[str] = []
5153
if el.visual_cues.is_primary:
5254
cues.append("PRIMARY")
5355
if el.visual_cues.is_clickable:
5456
cues.append("CLICKABLE")
5557
if el.visual_cues.background_color_name:
5658
cues.append(f"color:{el.visual_cues.background_color_name}")
5759

58-
# Format element line
60+
# Format element line with improved readability
61+
# cues_str stays empty when there are no cues, so the {cues} part is omitted from the line
5962
cues_str = f" {{{','.join(cues)}}}" if cues else ""
60-
text_preview = (
61-
(el.text[:50] + "...") if el.text and len(el.text) > 50 else (el.text or "")
62-
)
63-
63+
64+
# Quote the text and append "..." when it is cut to 50 characters; leave it empty when the element has no text
65+
text_preview = ""
66+
if el.text:
67+
if len(el.text) > 50:
68+
text_preview = f'"{el.text[:50]}..."'
69+
else:
70+
text_preview = f'"{el.text}"'
71+
72+
# Build position and size info
73+
x, y = int(el.bbox.x), int(el.bbox.y)
74+
width, height = int(el.bbox.width), int(el.bbox.height)
75+
position_str = f"@ ({x},{y})"
76+
size_str = f"size:{width}x{height}"
77+
78+
# Build status indicators (only include if relevant)
79+
status_parts = []
80+
if not el.in_viewport:
81+
status_parts.append("not_in_viewport")
82+
if el.is_occluded:
83+
status_parts.append("occluded")
84+
if el.diff_status:
85+
status_parts.append(f"diff:{el.diff_status}")
86+
status_str = f" [{','.join(status_parts)}]" if status_parts else ""
87+
88+
# Format: [ID] <role> "text" {cues} @ (x,y) size:WxH importance:score [status]
6489
lines.append(
65-
f'[{el.id}] <{el.role}> "{text_preview}"{cues_str} '
66-
f"@ ({int(el.bbox.x)},{int(el.bbox.y)}) (Imp:{el.importance})"
90+
f'[{el.id}] <{el.role}> {text_preview}{cues_str} '
91+
f"{position_str} {size_str} importance:{el.importance}{status_str}"
6792
)
6893

6994
return "\n".join(lines)
@@ -87,24 +112,44 @@ def query_llm(self, dom_context: str, goal: str) -> LLMResponse:
87112
{dom_context}
88113
89114
VISUAL CUES EXPLAINED:
90-
- {{PRIMARY}}: Main call-to-action element on the page
91-
- {{CLICKABLE}}: Element is clickable
92-
- {{color:X}}: Background color name
115+
After the text, you may see visual cues in curly braces like {{CLICKABLE}} or {{PRIMARY,CLICKABLE,color:white}}:
116+
- PRIMARY: Main call-to-action element on the page
117+
- CLICKABLE: Element is clickable/interactive
118+
- color:X: Background color name (e.g., color:white, color:blue)
119+
Multiple cues are comma-separated inside the braces: {{CLICKABLE,color:white}}
120+
121+
ELEMENT FORMAT EXPLAINED:
122+
Each element line follows this format:
123+
[ID] <role> "text" {{cues}} @ (x,y) size:WxH importance:score [status]
124+
125+
Example: [346] <button> "Computer Accessories" {{CLICKABLE,color:white}} @ (664,100) size:150x40 importance:811
126+
127+
Breaking down each part:
128+
- [ID]: The number in brackets is the element ID - use this EXACT number in CLICK/TYPE commands
129+
Example: If you see [346], use CLICK(346) or TYPE(346, "text")
130+
- <role>: Element type (button, link, textbox, etc.)
131+
- "text": Visible text content (truncated with "..." if long)
132+
- {{cues}}: Optional visual cues in curly braces (e.g., {{CLICKABLE}}, {{PRIMARY,CLICKABLE}}, {{CLICKABLE,color:white}})
133+
If no cues, this part is omitted entirely
134+
- @ (x,y): Element position in pixels from top-left corner
135+
- size:WxH: Element dimensions (width x height in pixels)
136+
- importance: Score indicating element relevance (higher = more important)
137+
- [status]: Optional status flags in brackets (not_in_viewport, occluded, diff:ADDED/MODIFIED/etc)
93138
94139
CRITICAL RESPONSE FORMAT:
95140
You MUST respond with ONLY ONE of these exact action formats:
96-
- CLICK(id) - Click element by ID
97-
- TYPE(id, "text") - Type text into element
141+
- CLICK(id) - Click element by ID (use the number from [ID] brackets)
142+
- TYPE(id, "text") - Type text into element (use the number from [ID] brackets)
98143
- PRESS("key") - Press keyboard key (Enter, Escape, Tab, ArrowDown, etc)
99144
- FINISH() - Task complete
100145
101146
DO NOT include any explanation, reasoning, or natural language.
102147
DO NOT use markdown formatting or code blocks.
103148
DO NOT say "The next step is..." or anything similar.
104149
105-
CORRECT Examples:
106-
CLICK(42)
107-
TYPE(15, "magic mouse")
150+
CORRECT Examples (matching element IDs from the list above):
151+
If element is [346] <button> "Click me" → respond: CLICK(346)
152+
If element is [15] <textbox> "Search" → respond: TYPE(15, "magic mouse")
108153
PRESS("Enter")
109154
FINISH()
110155

sentience/schemas/trace_v1.json

Lines changed: 51 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
},
1414
"type": {
1515
"type": "string",
16-
"enum": ["run_start", "step_start", "snapshot_taken", "llm_called", "action_executed", "verification", "recovery", "step_end", "run_end", "error"],
16+
"enum": ["run_start", "step_start", "snapshot", "snapshot_taken", "llm_called", "llm_response", "action", "action_executed", "verification", "recovery", "step_end", "run_end", "error"],
1717
"description": "Event type"
1818
},
1919
"ts": {
@@ -64,15 +64,61 @@
6464
}
6565
},
6666
{
67-
"description": "snapshot_taken data",
68-
"required": ["step_id", "snapshot_digest"],
67+
"description": "snapshot or snapshot_taken data",
6968
"properties": {
70-
"step_id": {"type": "string"},
69+
"step_id": {"type": ["string", "null"]},
7170
"snapshot_id": {"type": ["string", "null"]},
7271
"snapshot_digest": {"type": "string", "pattern": "^sha256:[0-9a-f]{64}$"},
7372
"snapshot_digest_loose": {"type": "string", "pattern": "^sha256:[0-9a-f]{64}$"},
7473
"url": {"type": ["string", "null"]},
75-
"element_count": {"type": "integer"}
74+
"element_count": {"type": "integer"},
75+
"timestamp": {"type": ["string", "null"]},
76+
"elements": {
77+
"type": "array",
78+
"items": {
79+
"type": "object",
80+
"properties": {
81+
"id": {"type": "integer"},
82+
"role": {"type": "string"},
83+
"text": {"type": ["string", "null"]},
84+
"importance": {"type": "number"},
85+
"importance_score": {"type": "number"},
86+
"bbox": {
87+
"type": "object",
88+
"properties": {
89+
"x": {"type": "number"},
90+
"y": {"type": "number"},
91+
"width": {"type": "number"},
92+
"height": {"type": "number"}
93+
},
94+
"required": ["x", "y", "width", "height"]
95+
},
96+
"visual_cues": {
97+
"type": "object",
98+
"properties": {
99+
"is_primary": {"type": "boolean"},
100+
"is_clickable": {"type": "boolean"},
101+
"background_color_name": {"type": ["string", "null"]}
102+
}
103+
},
104+
"in_viewport": {"type": "boolean"},
105+
"is_occluded": {"type": "boolean"},
106+
"z_index": {"type": "integer"},
107+
"rerank_index": {"type": ["integer", "null"]},
108+
"heuristic_index": {"type": ["integer", "null"]},
109+
"ml_probability": {"type": ["number", "null"]},
110+
"ml_score": {"type": ["number", "null"]},
111+
"diff_status": {
112+
"type": ["string", "null"],
113+
"enum": ["ADDED", "REMOVED", "MODIFIED", "MOVED", null],
114+
"description": "Diff status for Diff Overlay feature. ADDED: new element, REMOVED: element was removed, MODIFIED: element changed, MOVED: element position changed, null: no change"
115+
}
116+
},
117+
"required": ["id", "role", "importance", "bbox", "visual_cues"]
118+
}
119+
},
120+
"screenshot_base64": {"type": ["string", "null"]},
121+
"screenshot_format": {"type": ["string", "null"], "enum": ["png", "jpeg", null]}
76122
}
77123
},
78124
{

tests/test_agent.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ def test_agent_build_context():
185185
assert "PRIMARY" in context
186186
assert "CLICKABLE" in context
187187
assert "color:blue" in context
188-
assert "(Imp:900)" in context
188+
assert "importance:900" in context
189189

190190

191191
def test_agent_execute_click_action():

0 commit comments

Comments
 (0)