Merge pull request #112 from SentienceAPI/optimize_llm

Sentience Dev · Sentience Dev · commit d7899b412e3a · 2026-01-03T14:40:49.000-08:00
optimize LLM agent efficiency
diff --git a/sentience/element_filter.py b/sentience/element_filter.py
@@ -57,7 +57,8 @@ def filter_by_importance(
         Returns:
             Top N elements sorted by importance score
         """
-        elements = snapshot.elements
+        # Filter out REMOVED elements - they're not actionable and shouldn't be in LLM context
+        elements = [el for el in snapshot.elements if el.diff_status != "REMOVED"]
         # Elements are already sorted by importance in snapshot
         return elements[:max_elements]
 
@@ -81,7 +82,8 @@ def filter_by_goal(
         Returns:
             Filtered list of elements sorted by boosted importance score
         """
-        elements = snapshot.elements
+        # Filter out REMOVED elements - they're not actionable and shouldn't be in LLM context
+        elements = [el for el in snapshot.elements if el.diff_status != "REMOVED"]
 
         # If no goal provided, return all elements (up to limit)
         if not goal:
diff --git a/sentience/llm_interaction_handler.py b/sentience/llm_interaction_handler.py
@@ -6,7 +6,6 @@
 """
 
 import re
-from typing import Optional
 
 from .llm_provider import LLMProvider, LLMResponse
 from .models import Snapshot
@@ -35,7 +34,7 @@ def build_context(self, snap: Snapshot, goal: str | None = None) -> str:
         """
         Convert snapshot elements to token-efficient prompt string.
 
-        Format: [ID] <role> "text" {cues} @ (x,y) (Imp:score)
+        Format: [ID] <role> "text" {cues} @ (x,y) size:WxH importance:score [status]
 
         Args:
             snap: Snapshot object
@@ -46,24 +45,50 @@ def build_context(self, snap: Snapshot, goal: str | None = None) -> str:
         """
         lines = []
         for el in snap.elements:
+            # Skip REMOVED elements - they're not actionable and shouldn't be in LLM context
+            if el.diff_status == "REMOVED":
+                continue
             # Extract visual cues
-            cues = []
+            cues: list[str] = []
             if el.visual_cues.is_primary:
                 cues.append("PRIMARY")
             if el.visual_cues.is_clickable:
                 cues.append("CLICKABLE")
             if el.visual_cues.background_color_name:
                 cues.append(f"color:{el.visual_cues.background_color_name}")
 
-            # Format element line
+            # Format element line with improved readability
+            # Wrap cues in literal braces; the whole segment is omitted when there are no cues
             cues_str = f" {{{','.join(cues)}}}" if cues else ""
-            text_preview = (
-                (el.text[:50] + "...") if el.text and len(el.text) > 50 else (el.text or "")
-            )
-
+            
+            # Better text handling - show truncation indicator
+            text_preview = ""
+            if el.text:
+                if len(el.text) > 50:
+                    text_preview = f'"{el.text[:50]}..."'
+                else:
+                    text_preview = f'"{el.text}"'
+            
+            # Build position and size info
+            x, y = int(el.bbox.x), int(el.bbox.y)
+            width, height = int(el.bbox.width), int(el.bbox.height)
+            position_str = f"@ ({x},{y})"
+            size_str = f"size:{width}x{height}"
+            
+            # Build status indicators (only include if relevant)
+            status_parts = []
+            if not el.in_viewport:
+                status_parts.append("not_in_viewport")
+            if el.is_occluded:
+                status_parts.append("occluded")
+            if el.diff_status:
+                status_parts.append(f"diff:{el.diff_status}")
+            status_str = f" [{','.join(status_parts)}]" if status_parts else ""
+            
+            # Format: [ID] <role> "text" {cues} @ (x,y) size:WxH importance:score [status]
             lines.append(
-                f'[{el.id}] <{el.role}> "{text_preview}"{cues_str} '
-                f"@ ({int(el.bbox.x)},{int(el.bbox.y)}) (Imp:{el.importance})"
+                f'[{el.id}] <{el.role}> {text_preview}{cues_str} '
+                f"{position_str} {size_str} importance:{el.importance}{status_str}"
             )
 
         return "\n".join(lines)
@@ -87,24 +112,44 @@ def query_llm(self, dom_context: str, goal: str) -> LLMResponse:
 {dom_context}
 
 VISUAL CUES EXPLAINED:
-- {{PRIMARY}}: Main call-to-action element on the page
-- {{CLICKABLE}}: Element is clickable
-- {{color:X}}: Background color name
+After the text, you may see visual cues in curly braces like {{CLICKABLE}} or {{PRIMARY,CLICKABLE,color:white}}:
+- PRIMARY: Main call-to-action element on the page
+- CLICKABLE: Element is clickable/interactive
+- color:X: Background color name (e.g., color:white, color:blue)
+Multiple cues are comma-separated inside the braces: {{CLICKABLE,color:white}}
+
+ELEMENT FORMAT EXPLAINED:
+Each element line follows this format:
+[ID] <role> "text" {{cues}} @ (x,y) size:WxH importance:score [status]
+
+Example: [346] <button> "Computer Accessories" {{CLICKABLE,color:white}} @ (664,100) size:150x40 importance:811
+
+Breaking down each part:
+- [ID]: The number in brackets is the element ID - use this EXACT number in CLICK/TYPE commands
+  Example: If you see [346], use CLICK(346) or TYPE(346, "text")
+- <role>: Element type (button, link, textbox, etc.)
+- "text": Visible text content (truncated with "..." if long)
+- {{cues}}: Optional visual cues in curly braces (e.g., {{CLICKABLE}}, {{PRIMARY,CLICKABLE}}, {{CLICKABLE,color:white}})
+  If no cues, this part is omitted entirely
+- @ (x,y): Element position in pixels from top-left corner
+- size:WxH: Element dimensions (width x height in pixels)
+- importance: Score indicating element relevance (higher = more important)
+- [status]: Optional status flags in brackets (not_in_viewport, occluded, diff:ADDED/MODIFIED/etc)
 
 CRITICAL RESPONSE FORMAT:
 You MUST respond with ONLY ONE of these exact action formats:
-- CLICK(id) - Click element by ID
-- TYPE(id, "text") - Type text into element
+- CLICK(id) - Click element by ID (use the number from [ID] brackets)
+- TYPE(id, "text") - Type text into element (use the number from [ID] brackets)
 - PRESS("key") - Press keyboard key (Enter, Escape, Tab, ArrowDown, etc)
 - FINISH() - Task complete
 
 DO NOT include any explanation, reasoning, or natural language.
 DO NOT use markdown formatting or code blocks.
 DO NOT say "The next step is..." or anything similar.
 
-CORRECT Examples:
-CLICK(42)
-TYPE(15, "magic mouse")
+CORRECT Examples (matching element IDs from the list above):
+If element is [346] <button> "Click me" → respond: CLICK(346)
+If element is [15] <textbox> "Search" → respond: TYPE(15, "magic mouse")
 PRESS("Enter")
 FINISH()
 
diff --git a/sentience/schemas/trace_v1.json b/sentience/schemas/trace_v1.json
@@ -13,7 +13,7 @@
     },
     "type": {
       "type": "string",
-      "enum": ["run_start", "step_start", "snapshot_taken", "llm_called", "action_executed", "verification", "recovery", "step_end", "run_end", "error"],
+      "enum": ["run_start", "step_start", "snapshot", "snapshot_taken", "llm_called", "llm_response", "action", "action_executed", "verification", "recovery", "step_end", "run_end", "error"],
       "description": "Event type"
     },
     "ts": {
@@ -64,15 +64,61 @@
           }
         },
         {
-          "description": "snapshot_taken data",
-          "required": ["step_id", "snapshot_digest"],
+          "description": "snapshot or snapshot_taken data",
           "properties": {
-            "step_id": {"type": "string"},
+            "step_id": {"type": ["string", "null"]},
             "snapshot_id": {"type": ["string", "null"]},
             "snapshot_digest": {"type": "string", "pattern": "^sha256:[0-9a-f]{64}$"},
             "snapshot_digest_loose": {"type": "string", "pattern": "^sha256:[0-9a-f]{64}$"},
             "url": {"type": ["string", "null"]},
-            "element_count": {"type": "integer"}
+            "element_count": {"type": "integer"},
+            "timestamp": {"type": ["string", "null"]},
+            "elements": {
+              "type": "array",
+              "items": {
+                "type": "object",
+                "properties": {
+                  "id": {"type": "integer"},
+                  "role": {"type": "string"},
+                  "text": {"type": ["string", "null"]},
+                  "importance": {"type": "number"},
+                  "importance_score": {"type": "number"},
+                  "bbox": {
+                    "type": "object",
+                    "properties": {
+                      "x": {"type": "number"},
+                      "y": {"type": "number"},
+                      "width": {"type": "number"},
+                      "height": {"type": "number"}
+                    },
+                    "required": ["x", "y", "width", "height"]
+                  },
+                  "visual_cues": {
+                    "type": "object",
+                    "properties": {
+                      "is_primary": {"type": "boolean"},
+                      "is_clickable": {"type": "boolean"},
+                      "background_color_name": {"type": ["string", "null"]}
+                    }
+                  },
+                  "in_viewport": {"type": "boolean"},
+                  "is_occluded": {"type": "boolean"},
+                  "z_index": {"type": "integer"},
+                  "rerank_index": {"type": ["integer", "null"]},
+                  "heuristic_index": {"type": ["integer", "null"]},
+                  "ml_probability": {"type": ["number", "null"]},
+                  "ml_score": {"type": ["number", "null"]},
+                  "diff_status": {
+                    "type": ["string", "null"],
+                    "enum": ["ADDED", "REMOVED", "MODIFIED", "MOVED", null],
+                    "description": "Diff status for Diff Overlay feature. ADDED: new element, REMOVED: element was removed, MODIFIED: element changed, MOVED: element position changed, null: no change"
+                  }
+                },
+                "required": ["id", "role", "importance", "bbox", "visual_cues"]
+              }
+            },
+            "screenshot_base64": {"type": ["string", "null"]},
+            "screenshot_format": {"type": ["string", "null"], "enum": ["png", "jpeg", null]}
           }
         },
         {
diff --git a/tests/test_agent.py b/tests/test_agent.py
@@ -185,7 +185,7 @@ def test_agent_build_context():
     assert "PRIMARY" in context
     assert "CLICKABLE" in context
     assert "color:blue" in context
-    assert "(Imp:900)" in context
+    assert "importance:900" in context
 
 
 def test_agent_execute_click_action():