From 619cb3a1d36b59ec91077ec43559b6e7bdfd80a7 Mon Sep 17 00:00:00 2001 From: rcholic Date: Wed, 24 Dec 2025 11:50:27 -0800 Subject: [PATCH 1/2] improve agent code --- examples/agent_layers_demo.py | 4 +- sentience/__init__.py | 27 +++- sentience/agent.py | 253 ++++++++++++++++++++++++------ sentience/base_agent.py | 115 ++++++++++++++ sentience/conversational_agent.py | 16 +- sentience/models.py | 89 ++++++++++- tests/test_agent.py | 44 ++++-- 7 files changed, 470 insertions(+), 78 deletions(-) create mode 100644 sentience/base_agent.py diff --git a/examples/agent_layers_demo.py b/examples/agent_layers_demo.py index c5432e7..ceab8a5 100644 --- a/examples/agent_layers_demo.py +++ b/examples/agent_layers_demo.py @@ -85,7 +85,9 @@ def demo_layer2_sentience_agent(): print(" Code required: ~10 lines") print(" Technical knowledge: Medium") print(" Flexibility: High") - print(f" Tokens used: {agent.get_token_stats()['total_tokens']}") + # Use new TokenStats dataclass + stats = agent.get_token_stats() + print(f" Tokens used: {stats.total_tokens}") def demo_layer3_conversational_agent(): diff --git a/sentience/__init__.py b/sentience/__init__.py index 31094a3..3384549 100644 --- a/sentience/__init__.py +++ b/sentience/__init__.py @@ -3,7 +3,22 @@ """ from .browser import SentienceBrowser -from .models import Snapshot, Element, BBox, Viewport, ActionResult, WaitResult +from .models import ( + Snapshot, + Element, + BBox, + Viewport, + ActionResult, + WaitResult, + # Agent Layer Models + AgentActionResult, + TokenStats, + ActionHistory, + ActionTokenUsage, + SnapshotOptions, + SnapshotFilter, + ScreenshotConfig +) from .snapshot import snapshot from .query import query, find from .actions import click, type_text, press, click_rect @@ -16,6 +31,7 @@ from .screenshot import screenshot # Agent Layer (Phase 1 & 2) +from .base_agent import BaseAgent from .llm_provider import LLMProvider, LLMResponse, OpenAIProvider, AnthropicProvider, LocalLLMProvider from .agent import SentienceAgent from .conversational_agent import ConversationalAgent @@ -51,6 +67,7 @@ "read", "screenshot", # Agent Layer (Phase 1 & 2) + "BaseAgent", "LLMProvider", "LLMResponse", "OpenAIProvider", @@ -58,5 +75,13 @@ "LocalLLMProvider", "SentienceAgent", "ConversationalAgent", + # Agent Layer Models + "AgentActionResult", + "TokenStats", + "ActionHistory", + "ActionTokenUsage", + "SnapshotOptions", + "SnapshotFilter", + "ScreenshotConfig", ] diff --git a/sentience/agent.py b/sentience/agent.py index 158c4d9..d6d8735 100644 --- a/sentience/agent.py +++ b/sentience/agent.py @@ -5,15 +5,27 @@ import re import time -from typing import Dict, Any, List, Optional +from typing import Dict, Any, List, Optional, Union +from .base_agent import BaseAgent from .llm_provider import LLMProvider, LLMResponse from .browser import SentienceBrowser from .snapshot import snapshot from .actions import click, type_text, press -from .models import Snapshot, Element, ActionResult - - -class SentienceAgent: +from .models import ( + Snapshot, + Element, + ActionResult, + AgentActionResult, + TokenStats, + ActionHistory, + ActionTokenUsage, + SnapshotOptions, + SnapshotFilter, + ScreenshotConfig +) + + +class SentienceAgent(BaseAgent): """ High-level agent that combines Sentience SDK with any LLM provider. @@ -41,7 +53,7 @@ def __init__( self, browser: SentienceBrowser, llm: LLMProvider, - snapshot_limit: int = 50, + default_snapshot_limit: int = 50, verbose: bool = True ): """ @@ -50,19 +62,19 @@ def __init__( Args: browser: SentienceBrowser instance llm: LLM provider (OpenAIProvider, AnthropicProvider, etc.) - snapshot_limit: Maximum elements to include in context (default: 50) + default_snapshot_limit: Default maximum elements to include in context (default: 50) verbose: Print execution logs (default: True) """ self.browser = browser self.llm = llm - self.snapshot_limit = snapshot_limit + self.default_snapshot_limit = default_snapshot_limit self.verbose = verbose # Execution history self.history: List[Dict[str, Any]] = [] - # Token usage tracking - self.token_usage = { + # Token usage tracking (will be converted to TokenStats on get_token_stats()) + self._token_usage_raw = { "total_prompt_tokens": 0, "total_completion_tokens": 0, "total_tokens": 0, @@ -73,23 +85,26 @@ def act( self, goal: str, max_retries: int = 2, - snapshot_options: Optional[Dict[str, Any]] = None - ) -> Dict[str, Any]: + snapshot_options: Optional[SnapshotOptions] = None + ) -> AgentActionResult: """ Execute a high-level goal using observe → think → act loop Args: goal: Natural language instruction (e.g., "Click the Sign In button") max_retries: Number of retries on failure (default: 2) - snapshot_options: Optional snapshot parameters (limit, filter, etc.) + snapshot_options: Optional SnapshotOptions for this specific action Returns: - Result dict with status, action_taken, reasoning, and execution data + AgentActionResult with execution details Example: >>> result = agent.act("Click the search box") - >>> print(result) - {'success': True, 'action': 'click', 'element_id': 42, 'reasoning': '...'} + >>> print(result.success, result.action, result.element_id) + True click 42 + >>> # Backward compatible dict access + >>> print(result["element_id"]) # Works but shows deprecation warning + 42 """ if self.verbose: print(f"\n{'='*70}") @@ -101,16 +116,46 @@ def act( # 1. OBSERVE: Get refined semantic snapshot start_time = time.time() - snap_opts = snapshot_options or {} - snap_opts.setdefault('limit', self.snapshot_limit) + # Use provided options or create default + snap_opts = snapshot_options or SnapshotOptions(limit=self.default_snapshot_limit) - snap = snapshot(self.browser, **snap_opts) + # Convert screenshot config to dict if needed + screenshot_param = snap_opts.screenshot + if isinstance(snap_opts.screenshot, ScreenshotConfig): + screenshot_param = { + 'format': snap_opts.screenshot.format, + 'quality': snap_opts.screenshot.quality + } + + # Call snapshot with converted parameters + snap = snapshot( + self.browser, + screenshot=screenshot_param, + limit=snap_opts.limit, + filter=snap_opts.filter.model_dump() if snap_opts.filter else None, + use_api=snap_opts.use_api + ) if snap.status != "success": raise RuntimeError(f"Snapshot failed: {snap.error}") + # Apply element filtering based on goal + filtered_elements = self.filter_elements(snap, goal) + + # Create filtered snapshot + filtered_snap = Snapshot( + status=snap.status, + timestamp=snap.timestamp, + url=snap.url, + viewport=snap.viewport, + elements=filtered_elements, + screenshot=snap.screenshot, + screenshot_format=snap.screenshot_format, + error=snap.error + ) + # 2. GROUND: Format elements for LLM context - context = self._build_context(snap, goal) + context = self._build_context(filtered_snap, goal) # 3. THINK: Query LLM for next action llm_response = self._query_llm(context, goal) @@ -125,25 +170,38 @@ def act( action_str = llm_response.content.strip() # 4. EXECUTE: Parse and run action - result = self._execute_action(action_str, snap) + result_dict = self._execute_action(action_str, filtered_snap) duration_ms = int((time.time() - start_time) * 1000) - result['duration_ms'] = duration_ms - result['attempt'] = attempt - result['goal'] = goal + + # Create AgentActionResult from execution result + result = AgentActionResult( + success=result_dict["success"], + action=result_dict["action"], + goal=goal, + duration_ms=duration_ms, + attempt=attempt, + element_id=result_dict.get("element_id"), + text=result_dict.get("text"), + key=result_dict.get("key"), + outcome=result_dict.get("outcome"), + url_changed=result_dict.get("url_changed"), + error=result_dict.get("error"), + message=result_dict.get("message") + ) # 5. RECORD: Track history self.history.append({ "goal": goal, "action": action_str, - "result": result, - "success": result.get("success", False), + "result": result.model_dump(), # Store as dict + "success": result.success, "attempt": attempt, "duration_ms": duration_ms }) if self.verbose: - status = "✅" if result.get("success") else "❌" + status = "✅" if result.success else "❌" print(f"{status} Completed in {duration_ms}ms") return result @@ -155,13 +213,23 @@ def act( time.sleep(1.0) # Brief delay before retry continue else: - error_result = { - "success": False, + # Create error result + error_result = AgentActionResult( + success=False, + action="error", + goal=goal, + duration_ms=0, + attempt=attempt, + error=str(e) + ) + self.history.append({ "goal": goal, - "error": str(e), - "attempt": attempt - } - self.history.append(error_result) + "action": "error", + "result": error_result.model_dump(), + "success": False, + "attempt": attempt, + "duration_ms": 0 + }) raise RuntimeError(f"Failed after {max_retries} retries: {e}") def _build_context(self, snap: Snapshot, goal: str) -> str: @@ -178,7 +246,8 @@ def _build_context(self, snap: Snapshot, goal: str) -> str: Formatted element context string """ lines = [] - for el in snap.elements[:self.snapshot_limit]: + # Note: elements are already filtered by filter_elements() in act() + for el in snap.elements: # Extract visual cues cues = [] if el.visual_cues.is_primary: @@ -214,7 +283,7 @@ def _query_llm(self, dom_context: str, goal: str) -> LLMResponse: GOAL: {goal} -VISIBLE ELEMENTS (sorted by importance, max {self.snapshot_limit}): +VISIBLE ELEMENTS (sorted by importance): {dom_context} VISUAL CUES EXPLAINED: @@ -312,44 +381,126 @@ def _track_tokens(self, goal: str, llm_response: LLMResponse): llm_response: LLM response with token usage """ if llm_response.prompt_tokens: - self.token_usage["total_prompt_tokens"] += llm_response.prompt_tokens + self._token_usage_raw["total_prompt_tokens"] += llm_response.prompt_tokens if llm_response.completion_tokens: - self.token_usage["total_completion_tokens"] += llm_response.completion_tokens + self._token_usage_raw["total_completion_tokens"] += llm_response.completion_tokens if llm_response.total_tokens: - self.token_usage["total_tokens"] += llm_response.total_tokens + self._token_usage_raw["total_tokens"] += llm_response.total_tokens - self.token_usage["by_action"].append({ + self._token_usage_raw["by_action"].append({ "goal": goal, - "prompt_tokens": llm_response.prompt_tokens, - "completion_tokens": llm_response.completion_tokens, - "total_tokens": llm_response.total_tokens, + "prompt_tokens": llm_response.prompt_tokens or 0, + "completion_tokens": llm_response.completion_tokens or 0, + "total_tokens": llm_response.total_tokens or 0, "model": llm_response.model_name }) - def get_token_stats(self) -> Dict[str, Any]: + def get_token_stats(self) -> TokenStats: """ Get token usage statistics Returns: - Dictionary with token usage breakdown + TokenStats with token usage breakdown """ - return self.token_usage.copy() - - def get_history(self) -> List[Dict[str, Any]]: + by_action = [ + ActionTokenUsage(**action) + for action in self._token_usage_raw["by_action"] + ] + return TokenStats( + total_prompt_tokens=self._token_usage_raw["total_prompt_tokens"], + total_completion_tokens=self._token_usage_raw["total_completion_tokens"], + total_tokens=self._token_usage_raw["total_tokens"], + by_action=by_action + ) + + def get_history(self) -> List[ActionHistory]: """ Get execution history Returns: - List of all actions taken with results + List of ActionHistory entries """ - return self.history.copy() + return [ActionHistory(**h) for h in self.history] - def clear_history(self): + def clear_history(self) -> None: """Clear execution history and reset token counters""" self.history.clear() - self.token_usage = { + self._token_usage_raw = { "total_prompt_tokens": 0, "total_completion_tokens": 0, "total_tokens": 0, "by_action": [] } + + def filter_elements( + self, + snapshot: Snapshot, + goal: Optional[str] = None + ) -> List[Element]: + """ + Filter elements from snapshot based on goal context. + + This default implementation applies goal-based keyword matching to boost + relevant elements and filters out irrelevant ones. + + Args: + snapshot: Current page snapshot + goal: User's goal (can inform filtering) + + Returns: + Filtered list of elements + """ + elements = snapshot.elements + + # If no goal provided, return all elements (up to limit) + if not goal: + return elements[:self.default_snapshot_limit] + + goal_lower = goal.lower() + + # Extract keywords from goal + keywords = self._extract_keywords(goal_lower) + + # Boost elements matching goal keywords + scored_elements = [] + for el in elements: + score = el.importance + + # Boost if element text matches goal + if el.text and any(kw in el.text.lower() for kw in keywords): + score += 0.3 + + # Boost if role matches goal intent + if "click" in goal_lower and el.visual_cues.is_clickable: + score += 0.2 + if "type" in goal_lower and el.role in ["textbox", "searchbox"]: + score += 0.2 + if "search" in goal_lower: + # Filter out non-interactive elements for search tasks + if el.role in ["link", "img"] and not el.visual_cues.is_primary: + score -= 0.5 + + scored_elements.append((score, el)) + + # Re-sort by boosted score + scored_elements.sort(key=lambda x: x[0], reverse=True) + elements = [el for _, el in scored_elements] + + return elements[:self.default_snapshot_limit] + + def _extract_keywords(self, text: str) -> List[str]: + """ + Extract meaningful keywords from goal text + + Args: + text: Text to extract keywords from + + Returns: + List of keywords + """ + stopwords = { + "the", "a", "an", "and", "or", "but", "in", "on", "at", + "to", "for", "of", "with", "by", "from", "as", "is", "was" + } + words = text.split() + return [w for w in words if w not in stopwords and len(w) > 2] diff --git a/sentience/base_agent.py b/sentience/base_agent.py new file mode 100644 index 0000000..8a15a25 --- /dev/null +++ b/sentience/base_agent.py @@ -0,0 +1,115 @@ +""" +BaseAgent: Abstract base class for all Sentience agents +Defines the interface that all agent implementations must follow +""" + +from abc import ABC, abstractmethod +from typing import List, Optional +from .models import ( + Snapshot, + Element, + AgentActionResult, + TokenStats, + ActionHistory +) + + +class BaseAgent(ABC): + """ + Abstract base class for all Sentience agents. + + Provides a standard interface for: + - Executing natural language goals (act) + - Tracking execution history + - Monitoring token usage + - Filtering elements based on goals + + Subclasses must implement: + - act(): Execute a natural language goal + - get_history(): Return execution history + - get_token_stats(): Return token usage statistics + - clear_history(): Reset history and token counters + + Subclasses can override: + - filter_elements(): Customize element filtering logic + """ + + @abstractmethod + def act( + self, + goal: str, + **kwargs + ) -> AgentActionResult: + """ + Execute a natural language goal using the agent. + + Args: + goal: Natural language instruction (e.g., "Click the login button") + **kwargs: Additional parameters (implementation-specific) + + Returns: + AgentActionResult with execution details + + Raises: + RuntimeError: If execution fails after retries + """ + pass + + @abstractmethod + def get_history(self) -> List[ActionHistory]: + """ + Get the execution history of all actions taken. + + Returns: + List of ActionHistory entries + """ + pass + + @abstractmethod + def get_token_stats(self) -> TokenStats: + """ + Get token usage statistics for the agent session. + + Returns: + TokenStats with cumulative token counts + """ + pass + + @abstractmethod + def clear_history(self) -> None: + """ + Clear execution history and reset token counters. + + This resets the agent to a clean state. + """ + pass + + def filter_elements( + self, + snapshot: Snapshot, + goal: Optional[str] = None + ) -> List[Element]: + """ + Filter elements from a snapshot based on goal context. + + Default implementation returns all elements unchanged. + Subclasses can override to implement custom filtering logic + such as: + - Removing irrelevant elements based on goal keywords + - Boosting importance of matching elements + - Filtering by role, size, or visual properties + + Args: + snapshot: Current page snapshot + goal: User's goal (can inform filtering strategy) + + Returns: + Filtered list of elements (default: all elements) + + Example: + >>> agent = SentienceAgent(browser, llm) + >>> snap = snapshot(browser) + >>> filtered = agent.filter_elements(snap, goal="Click login") + >>> # filtered now contains only relevant elements + """ + return snapshot.elements diff --git a/sentience/conversational_agent.py b/sentience/conversational_agent.py index c3f4e83..6ada4b9 100644 --- a/sentience/conversational_agent.py +++ b/sentience/conversational_agent.py @@ -240,32 +240,32 @@ def _execute_step(self, step: Dict[str, Any]) -> Dict[str, Any]: elif action == "FIND_AND_CLICK": element_desc = params['element_description'] - # Use technical agent to find and click + # Use technical agent to find and click (returns AgentActionResult) result = self.technical_agent.act(f"Click the {element_desc}") return { - "success": result.get('success', False), + "success": result.success, # Use attribute access "action": action, - "data": result + "data": result.model_dump() # Convert to dict for flexibility } elif action == "FIND_AND_TYPE": element_desc = params['element_description'] text = params['text'] - # Use technical agent to find input and type + # Use technical agent to find input and type (returns AgentActionResult) result = self.technical_agent.act(f"Type '{text}' into {element_desc}") return { - "success": result.get('success', False), + "success": result.success, # Use attribute access "action": action, - "data": {"text": text} + "data": {"text": text, "result": result.model_dump()} } elif action == "PRESS_KEY": key = params['key'] result = self.technical_agent.act(f"Press {key} key") return { - "success": result.get('success', False), + "success": result.success, # Use attribute access "action": action, - "data": {"key": key} + "data": {"key": key, "result": result.model_dump()} } elif action == "WAIT": diff --git a/sentience/models.py b/sentience/models.py index 89f50e7..18a2795 100644 --- a/sentience/models.py +++ b/sentience/models.py @@ -3,7 +3,7 @@ """ from pydantic import BaseModel, Field -from typing import Optional, List, Literal +from typing import Optional, List, Literal, Union from datetime import datetime @@ -77,3 +77,90 @@ class WaitResult(BaseModel): duration_ms: int timeout: bool + +# ========== Agent Layer Models ========== + +class ScreenshotConfig(BaseModel): + """Screenshot format configuration""" + format: Literal['png', 'jpeg'] = 'png' + quality: Optional[int] = Field(None, ge=1, le=100) # Only for JPEG (1-100) + + +class SnapshotFilter(BaseModel): + """Filter options for snapshot elements""" + min_area: Optional[int] = Field(None, ge=0) + allowed_roles: Optional[List[str]] = None + min_z_index: Optional[int] = None + + +class SnapshotOptions(BaseModel): + """ + Configuration for snapshot calls. + Matches TypeScript SnapshotOptions interface from sdk-ts/src/snapshot.ts + """ + screenshot: Union[bool, ScreenshotConfig] = False # Union type: boolean or config + limit: int = Field(50, ge=1, le=500) + filter: Optional[SnapshotFilter] = None + use_api: Optional[bool] = None # Force API vs extension + + class Config: + arbitrary_types_allowed = True + + +class AgentActionResult(BaseModel): + """Result of a single agent action (from agent.act())""" + success: bool + action: Literal["click", "type", "press", "finish", "error"] + goal: str + duration_ms: int + attempt: int + + # Optional fields based on action type + element_id: Optional[int] = None + text: Optional[str] = None + key: Optional[str] = None + outcome: Optional[Literal["navigated", "dom_updated", "no_change", "error"]] = None + url_changed: Optional[bool] = None + error: Optional[str] = None + message: Optional[str] = None # For FINISH action + + def __getitem__(self, key): + """ + Support dict-style access for backward compatibility. + This allows existing code using result["success"] to continue working. + """ + import warnings + warnings.warn( + f"Dict-style access result['{key}'] is deprecated. Use result.{key} instead.", + DeprecationWarning, + stacklevel=2 + ) + return getattr(self, key) + + +class ActionTokenUsage(BaseModel): + """Token usage for a single action""" + goal: str + prompt_tokens: int + completion_tokens: int + total_tokens: int + model: str + + +class TokenStats(BaseModel): + """Token usage statistics for an agent session""" + total_prompt_tokens: int + total_completion_tokens: int + total_tokens: int + by_action: List[ActionTokenUsage] + + +class ActionHistory(BaseModel): + """Single history entry from agent execution""" + goal: str + action: str # The raw action string from LLM + result: dict # Will be AgentActionResult but stored as dict for flexibility + success: bool + attempt: int + duration_ms: int + diff --git a/tests/test_agent.py b/tests/test_agent.py index e837487..32de611 100644 --- a/tests/test_agent.py +++ b/tests/test_agent.py @@ -161,14 +161,16 @@ def test_agent_initialization(): browser = create_mock_browser() llm = MockLLMProvider() - agent = SentienceAgent(browser, llm, snapshot_limit=50, verbose=False) + agent = SentienceAgent(browser, llm, default_snapshot_limit=50, verbose=False) assert agent.browser == browser assert agent.llm == llm - assert agent.snapshot_limit == 50 + assert agent.default_snapshot_limit == 50 assert agent.verbose is False assert len(agent.history) == 0 - assert agent.token_usage["total_tokens"] == 0 + # Test new get_token_stats() method + stats = agent.get_token_stats() + assert stats.total_tokens == 0 def test_agent_build_context(): @@ -314,17 +316,23 @@ def test_agent_act_full_cycle(): result = agent.act("Click the button", max_retries=0) + # Test new dataclass return type (with backward compatible dict access) + assert result.success is True + assert result.action == "click" + assert result.element_id == 1 + assert result.goal == "Click the button" + + # Also test backward compatible dict-style access (shows deprecation warning) assert result["success"] is True assert result["action"] == "click" - assert result["element_id"] == 1 - assert result["goal"] == "Click the button" # Check history was recorded assert len(agent.history) == 1 assert agent.history[0]["goal"] == "Click the button" - # Check tokens were tracked - assert agent.token_usage["total_tokens"] > 0 + # Check tokens were tracked using new method + stats = agent.get_token_stats() + assert stats.total_tokens > 0 def test_agent_token_tracking(): @@ -334,17 +342,20 @@ def test_agent_token_tracking(): agent = SentienceAgent(browser, llm, verbose=False) # Simulate multiple actions - response1 = LLMResponse(content="CLICK(1)", prompt_tokens=100, completion_tokens=20, total_tokens=120) - response2 = LLMResponse(content="TYPE(2, \"test\")", prompt_tokens=150, completion_tokens=30, total_tokens=180) + response1 = LLMResponse(content="CLICK(1)", prompt_tokens=100, completion_tokens=20, total_tokens=120, model_name="mock-model") + response2 = LLMResponse(content="TYPE(2, \"test\")", prompt_tokens=150, completion_tokens=30, total_tokens=180, model_name="mock-model") agent._track_tokens("goal 1", response1) agent._track_tokens("goal 2", response2) + # Test new TokenStats dataclass return type stats = agent.get_token_stats() - assert stats["total_prompt_tokens"] == 250 - assert stats["total_completion_tokens"] == 50 - assert stats["total_tokens"] == 300 - assert len(stats["by_action"]) == 2 + assert stats.total_prompt_tokens == 250 + assert stats.total_completion_tokens == 50 + assert stats.total_tokens == 300 + assert len(stats.by_action) == 2 + assert stats.by_action[0].goal == "goal 1" + assert stats.by_action[0].model == "mock-model" def test_agent_clear_history(): @@ -354,13 +365,14 @@ def test_agent_clear_history(): agent = SentienceAgent(browser, llm, verbose=False) # Add some history - agent.history.append({"goal": "test"}) - agent.token_usage["total_tokens"] = 100 + agent.history.append({"goal": "test", "action": "test", "result": {}, "success": True, "attempt": 0, "duration_ms": 0}) + agent._token_usage_raw["total_tokens"] = 100 agent.clear_history() assert len(agent.history) == 0 - assert agent.token_usage["total_tokens"] == 0 + stats = agent.get_token_stats() + assert stats.total_tokens == 0 def test_agent_retry_on_failure(): From ad2efc445091447fcdcc0df616dcdc58d29c676f Mon Sep 17 00:00:00 2001 From: rcholic Date: Wed, 24 Dec 2025 12:09:20 -0800 Subject: [PATCH 2/2] improve agent interface --- .gitattributes | 1 - .github/workflows/release.yml | 23 +-- .github/workflows/sync-extension.yml | 62 +++--- .github/workflows/test.yml | 13 +- .pre-commit-config.yaml | 104 ++++++++++ LICENSE.md | 2 +- MANIFEST.in | 1 - docs/QUERY_DSL.md | 23 +-- examples/agent_layers_demo.py | 36 ++-- examples/basic_agent.py | 12 +- examples/click_rect_demo.py | 40 ++-- examples/hello.py | 23 ++- examples/query_demo.py | 18 +- examples/read_markdown.py | 17 +- examples/semantic_wait_demo.py | 28 +-- examples/test_local_llm_agent.py | 30 +-- examples/wait_and_click.py | 18 +- pyproject.toml | 72 +++++++ pytest.ini | 1 - sentience/__init__.py | 58 +++--- sentience/actions.py | 142 +++++++------ sentience/agent.py | 169 ++++++++-------- sentience/base_agent.py | 24 +-- sentience/browser.py | 90 +++++---- sentience/cli.py | 45 +++-- sentience/conversational_agent.py | 170 +++++++--------- sentience/expect.py | 66 +++--- sentience/extension/content.js | 2 +- sentience/extension/injected_api.js | 40 ++-- sentience/extension/manifest.json | 2 +- sentience/generator.py | 256 +++++++++++------------ sentience/inspector.py | 92 +++++---- sentience/llm_provider.py | 104 ++++------ sentience/models.py | 86 ++++---- sentience/query.py | 98 ++++----- sentience/read.py | 28 ++- sentience/recorder.py | 165 +++++++-------- sentience/screenshot.py | 27 +-- sentience/snapshot.py | 63 +++--- sentience/wait.py | 28 +-- spec/README.md | 3 +- spec/SNAPSHOT_V1.md | 5 +- spec/sdk-types.md | 1 - spec/snapshot.schema.json | 1 - tests/README.md | 1 - tests/__init__.py | 1 - tests/conftest.py | 19 +- tests/test_actions.py | 46 ++--- tests/test_agent.py | 123 +++++------ tests/test_bot.py | 3 +- tests/test_conversational_agent.py | 291 +++++++++++++-------------- tests/test_generator.py | 68 +++---- tests/test_inspector.py | 22 +- tests/test_query.py | 84 ++++---- tests/test_read.py | 19 +- tests/test_recorder.py | 53 ++--- tests/test_screenshot.py | 26 +-- tests/test_smart_selector.py | 21 +- tests/test_snapshot.py | 49 +++-- tests/test_spec_validation.py | 31 +-- tests/test_stealth.py | 56 +++--- tests/test_wait.py | 15 +- 62 files changed, 1697 insertions(+), 1520 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.gitattributes b/.gitattributes index 9f2b770..b3381b9 100644 --- a/.gitattributes +++ b/.gitattributes @@ -11,4 +11,3 @@ *.md text eol=lf *.yml text eol=lf *.yaml text eol=lf - diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index a397e7c..f410d90 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -13,21 +13,21 @@ on: jobs: build-and-publish: runs-on: ubuntu-latest - + steps: - name: Checkout code uses: actions/checkout@v4 - + - name: Set up Python uses: actions/setup-python@v5 with: python-version: '3.11' - + - name: Install build dependencies run: | python -m pip install --upgrade pip pip install build twine - + - name: Extract version from tag or input id: version run: | @@ -39,32 +39,32 @@ jobs: fi echo "version=$VERSION" >> $GITHUB_OUTPUT echo "Version: $VERSION" - + - name: Update version in pyproject.toml run: | VERSION="${{ steps.version.outputs.version }}" sed -i "s/^version = \".*\"/version = \"$VERSION\"/" pyproject.toml - + - name: Update version in __init__.py run: | VERSION="${{ steps.version.outputs.version }}" sed -i "s/^__version__ = \".*\"/__version__ = \"$VERSION\"/" sentience/__init__.py - + - name: Build package run: | python -m build - + - name: Check package run: | twine check dist/* - + - name: Publish to PyPI env: TWINE_USERNAME: __token__ TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} run: | twine upload dist/* - + - name: Create GitHub Release if: github.event_name == 'workflow_dispatch' uses: softprops/action-gh-release@v1 @@ -73,7 +73,7 @@ jobs: name: Release v${{ steps.version.outputs.version }} body: | Release v${{ steps.version.outputs.version }} of sentience-python - + ## Installation ```bash pip install sentience-python==${{ steps.version.outputs.version }} @@ -82,4 +82,3 @@ jobs: prerelease: false env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - diff --git a/.github/workflows/sync-extension.yml b/.github/workflows/sync-extension.yml index 224556f..4746ac3 100644 --- a/.github/workflows/sync-extension.yml +++ b/.github/workflows/sync-extension.yml @@ -19,19 +19,19 @@ jobs: permissions: contents: write pull-requests: write - + steps: - name: Checkout sdk-python uses: actions/checkout@v4 with: token: ${{ secrets.GITHUB_TOKEN }} fetch-depth: 0 # Fetch all history for proper branching - + - name: Set up Python uses: actions/setup-python@v5 with: python-version: '3.11' - + - name: Determine release tag id: release run: | @@ -45,15 +45,15 @@ jobs: HTTP_CODE=$(curl -s -o latest_release.json -w "%{http_code}" \ -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \ "https://api.github.com/repos/${{ secrets.SENTIENCE_CHROME_REPO }}/releases/latest") - + if [ "$HTTP_CODE" != "200" ]; then echo "❌ Failed to fetch latest release. HTTP Code: $HTTP_CODE" cat latest_release.json exit 1 fi - + TAG=$(cat latest_release.json | jq -r '.tag_name // empty') - + # Check if we already processed this tag if git ls-remote --exit-code --heads origin "sync-extension-$TAG"; then echo "Branch for $TAG already exists, skipping." @@ -61,42 +61,42 @@ jobs: exit 0 fi fi - + if [ -z "$TAG" ]; then echo "Could not determine release tag." exit 1 fi - + echo "Syncing tag: $TAG" echo "tag=$TAG" >> $GITHUB_OUTPUT - + - name: Download extension files if: steps.release.outputs.skip != 'true' run: | TAG="${{ steps.release.outputs.tag }}" REPO="${{ secrets.SENTIENCE_CHROME_REPO }}" - + # Setup temp directory mkdir -p extension-temp cd extension-temp - + echo "⬇️ Fetching release info for $TAG from $REPO..." - + # 1. Get Release Info HTTP_CODE=$(curl -s -w "%{http_code}" -o release.json \ -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \ "https://api.github.com/repos/$REPO/releases/tags/$TAG") - + if [ "$HTTP_CODE" != "200" ]; then echo "❌ Failed to fetch release info. HTTP Code: $HTTP_CODE" echo "Response Body:" cat release.json exit 1 fi - + # Check for asset URL ASSET_URL=$(cat release.json | jq -r '.assets[]? | select(.name == "extension-files.tar.gz") | .url') - + if [ -z "$ASSET_URL" ] || [ "$ASSET_URL" == "null" ]; then echo "❌ Critical Error: extension-files.tar.gz not found in release assets!" echo "Available assets:" @@ -107,7 +107,7 @@ jobs: echo "📦 Downloading tarball from asset API endpoint..." # NOTE: For private repos, we must use the API URL (.url) with Accept: application/octet-stream header # Using .browser_download_url often redirects to S3 which breaks auth headers - + HTTP_CODE=$(curl -L -s -w "%{http_code}" -o extension.tar.gz \ -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \ -H "Accept: application/octet-stream" \ @@ -125,7 +125,7 @@ jobs: # 3. Verify File Type before extracting FILE_TYPE=$(file -b --mime-type extension.tar.gz) echo "📄 Downloaded file type: $FILE_TYPE" - + if [[ "$FILE_TYPE" != *"gzip"* ]] && [[ "$FILE_TYPE" != *"octet-stream"* ]]; then echo "❌ Error: Downloaded file is not a gzip archive. It is: $FILE_TYPE" echo "First 100 bytes:" @@ -137,50 +137,50 @@ jobs: echo "📂 Extracting..." tar -xzf extension.tar.gz rm extension.tar.gz - + if [ ! -f "manifest.json" ]; then echo "❌ Error: manifest.json missing after extraction" exit 1 fi - + - name: Update extension files if: steps.release.outputs.skip != 'true' run: | # Target directory in sdk-python (inside the package source) TARGET_DIR="sentience/extension" - + # Ensure target directory exists and is clean rm -rf "$TARGET_DIR" mkdir -p "$TARGET_DIR" - + # Copy files from temp directory cp -r extension-temp/* "$TARGET_DIR/" - + # Verify copy if [ ! -f "$TARGET_DIR/manifest.json" ]; then echo "❌ Failed to copy manifest.json to $TARGET_DIR" exit 1 fi - + # Cleanup rm -rf extension-temp - + echo "✅ Extension files updated in $TARGET_DIR" ls -la "$TARGET_DIR" - + - name: Check for changes if: steps.release.outputs.skip != 'true' id: changes run: | git add sentience/extension/ - + if git diff --staged --quiet; then echo "No changes detected." echo "changed=false" >> $GITHUB_OUTPUT else echo "Changes detected." echo "changed=true" >> $GITHUB_OUTPUT - + # Show staged files echo "📊 Staged file sizes:" git diff --staged --name-only | while read file; do @@ -190,7 +190,7 @@ jobs: fi done fi - + - name: Create Pull Request if: steps.release.outputs.skip != 'true' && steps.changes.outputs.changed == 'true' uses: peter-evans/create-pull-request@v5 @@ -200,14 +200,14 @@ jobs: title: "Sync Extension: ${{ steps.release.outputs.tag }}" body: | This PR syncs extension files from sentience-chrome release ${{ steps.release.outputs.tag }}. - + **Files updated:** - Extension manifest and scripts - WASM binary and bindings - + **Source:** [sentience-chrome release ${{ steps.release.outputs.tag }}](https://github.com/${{ secrets.SENTIENCE_CHROME_REPO }}/releases/tag/${{ steps.release.outputs.tag }}) branch: sync-extension-${{ steps.release.outputs.tag }} delete-branch: true labels: | automated - extension-sync \ No newline at end of file + extension-sync diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 89b9c58..419f2fd 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -13,26 +13,26 @@ jobs: matrix: os: [ubuntu-latest, macos-latest, windows-latest] python-version: ['3.11'] - + steps: - name: Checkout code uses: actions/checkout@v4 - + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - + - name: Install Playwright browsers run: | python -m pip install --upgrade pip pip install playwright playwright install chromium - + - name: Install dependencies run: | pip install -e ".[dev]" - + - name: Build extension (if needed) if: runner.os != 'Windows' shell: bash @@ -42,10 +42,9 @@ jobs: else echo "Extension directory not found, skipping build" fi - + - name: Run tests run: | pytest tests/ -v env: CI: true - diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..7649ba7 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,104 @@ +# Pre-commit hooks for Sentience Python SDK +# See https://pre-commit.com for more information + +repos: + # General file checks + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: trailing-whitespace + exclude: ^tests/fixtures/ + - id: end-of-file-fixer + exclude: ^tests/fixtures/ + - id: check-yaml + - id: check-json + - id: check-added-large-files + args: ['--maxkb=1000'] + - id: check-merge-conflict + - id: check-case-conflict + - id: detect-private-key + - id: debug-statements + - id: mixed-line-ending + args: ['--fix=lf'] + + # Python code formatting with Black + - repo: https://github.com/psf/black + rev: 24.2.0 + hooks: + - id: black + language_version: python3.11 + args: ['--line-length=100'] + exclude: ^(venv/|\.venv/|build/|dist/) + + # Import sorting with isort (compatible with Black) + - repo: https://github.com/pycqa/isort + rev: 5.13.2 + hooks: + - id: isort + args: ['--profile=black', '--line-length=100'] + exclude: ^(venv/|\.venv/|build/|dist/) + + # Flake8 for style guide enforcement + - repo: https://github.com/pycqa/flake8 + rev: 7.0.0 + hooks: + - id: flake8 + args: + - '--max-line-length=100' + - '--extend-ignore=E203,W503,E501' # Black compatibility + - '--exclude=venv,build,dist,.eggs,*.egg' + - '--max-complexity=15' + exclude: ^(venv/|\.venv/|build/|dist/|tests/fixtures/) + + # Type checking with mypy (disabled for now - too strict) + # Uncomment to enable strict type checking + # - repo: https://github.com/pre-commit/mirrors-mypy + # rev: v1.8.0 + # hooks: + # - id: mypy + # additional_dependencies: + # - pydantic>=2.0 + # - types-requests + # args: + # - '--ignore-missing-imports' + # - '--no-strict-optional' + # - '--warn-unused-ignores' + # exclude: ^(tests/|examples/|venv/|\.venv/|build/|dist/) + + # Security checks + - repo: https://github.com/PyCQA/bandit + rev: 1.7.7 + hooks: + - id: bandit + args: ['-c', 'pyproject.toml'] + additional_dependencies: ['bandit[toml]'] + exclude: ^(tests/|venv/|\.venv/) + + # Check for common Python anti-patterns + - repo: https://github.com/asottile/pyupgrade + rev: v3.15.0 + hooks: + - id: pyupgrade + args: ['--py311-plus'] + exclude: ^(venv/|\.venv/|build/|dist/) + +# Configuration for specific files +default_language_version: + python: python3.11 + +# Fail fast on first error +fail_fast: false + +# Files to exclude globally +exclude: | + (?x)^( + venv/.*| + \.venv/.*| + build/.*| + dist/.*| + \.eggs/.*| + .*\.egg-info/.*| + __pycache__/.*| + \.pytest_cache/.*| + \.mypy_cache/.* + )$ diff --git a/LICENSE.md b/LICENSE.md index e1954d9..e9c3b11 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -40,4 +40,4 @@ you refers to the individual or entity agreeing to these terms. **use** means anything you do with the software requiring one of your licenses. -**trademark** means trademarks, service marks, and similar rights. \ No newline at end of file +**trademark** means trademarks, service marks, and similar rights. diff --git a/MANIFEST.in b/MANIFEST.in index ab56f16..921b4e2 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,4 +2,3 @@ include README.md include LICENSE recursive-include spec * recursive-include sentience *.py - diff --git a/docs/QUERY_DSL.md b/docs/QUERY_DSL.md index bb3fd72..c391eaa 100644 --- a/docs/QUERY_DSL.md +++ b/docs/QUERY_DSL.md @@ -23,15 +23,15 @@ from sentience import SentienceBrowser, snapshot, query, find with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + snap = snapshot(browser) - + # Find all buttons buttons = query(snap, "role=button") - + # Find button with specific text sign_in = find(snap, "role=button text~'Sign in'") - + # Find high-importance elements (Pro/Enterprise only) important = query(snap, "importance>500") ``` @@ -312,33 +312,33 @@ snap = snapshot(browser) # Uses server-side API ### 1. Use Semantic Roles -✅ **Good**: `role=button text~'Submit'` +✅ **Good**: `role=button text~'Submit'` ❌ **Avoid**: CSS selectors (not supported) ### 2. Combine Multiple Conditions -✅ **Good**: `role=button clickable=true text~'Sign in'` +✅ **Good**: `role=button clickable=true text~'Sign in'` ❌ **Avoid**: Single condition if multiple elements match ### 3. Use Text Matching Wisely -✅ **Good**: `text~'Sign in'` (case-insensitive, flexible) -✅ **Good**: `text^='Sign'` (prefix matching) +✅ **Good**: `text~'Sign in'` (case-insensitive, flexible) +✅ **Good**: `text^='Sign'` (prefix matching) ❌ **Avoid**: `text='Sign In'` (exact match, too brittle) ### 4. Filter by Visibility -✅ **Good**: `visible=true` or `in_viewport=true` for actionable elements +✅ **Good**: `visible=true` or `in_viewport=true` for actionable elements ✅ **Good**: `is_occluded=false` to exclude covered elements ### 5. Use Importance (Pro/Enterprise) -✅ **Good**: `importance>500` to find high-priority elements +✅ **Good**: `importance>500` to find high-priority elements ✅ **Good**: Combine with role: `role=button importance>800` ### 6. Spatial Filtering -✅ **Good**: `bbox.x>100` to find elements in specific regions +✅ **Good**: `bbox.x>100` to find elements in specific regions ✅ **Good**: Combine with other conditions: `role=button bbox.x>400` ### 7. Query Performance @@ -499,4 +499,3 @@ center = query(snap, "bbox.x>400 bbox.x<600 bbox.y>300 bbox.y<500") - [Examples](../examples/query_demo.py) - [Type Definitions](../../spec/sdk-types.md) - [Snapshot Schema](../../spec/snapshot.schema.json) - diff --git a/examples/agent_layers_demo.py b/examples/agent_layers_demo.py index ceab8a5..f41f84d 100644 --- a/examples/agent_layers_demo.py +++ b/examples/agent_layers_demo.py @@ -9,11 +9,13 @@ """ import os + from dotenv import load_dotenv # Load environment variables load_dotenv() + def demo_layer1_direct_sdk(): """ Layer 1: Direct SDK Usage @@ -21,11 +23,11 @@ def demo_layer1_direct_sdk(): - Requires knowing exact element selectors - 50+ lines of code for typical automation """ - print("\n" + "="*70) + print("\n" + "=" * 70) print("LAYER 1: Direct SDK Usage (Full Control)") - print("="*70) + print("=" * 70) - from sentience import SentienceBrowser, snapshot, find, click, type_text, press + from sentience import SentienceBrowser, click, find, press, snapshot, type_text with SentienceBrowser(headless=False) as browser: # Navigate @@ -61,11 +63,11 @@ def demo_layer2_sentience_agent(): - No need to know selectors - 15 lines of code for typical automation """ - print("\n" + "="*70) + print("\n" + "=" * 70) print("LAYER 2: SentienceAgent (Technical Commands)") - print("="*70) + print("=" * 70) - from sentience import SentienceBrowser, SentienceAgent + from sentience import SentienceAgent, SentienceBrowser from sentience.llm_provider import OpenAIProvider # Initialize @@ -97,11 +99,11 @@ def demo_layer3_conversational_agent(): - Automatic planning and execution - 3 lines of code for typical automation """ - print("\n" + "="*70) + print("\n" + "=" * 70) print("LAYER 3: ConversationalAgent (Natural Language)") - print("="*70) + print("=" * 70) - from sentience import SentienceBrowser, ConversationalAgent + from sentience import ConversationalAgent, SentienceBrowser from sentience.llm_provider import OpenAIProvider # Initialize @@ -127,11 +129,11 @@ def demo_layer3_with_local_llm(): - No API costs - Runs on your hardware """ - print("\n" + "="*70) + print("\n" + "=" * 70) print("LAYER 3: ConversationalAgent with Local LLM (Zero Cost)") - print("="*70) + print("=" * 70) - from sentience import SentienceBrowser, ConversationalAgent + from sentience import ConversationalAgent, SentienceBrowser from sentience.llm_provider import LocalLLMProvider # Initialize with local LLM @@ -139,7 +141,7 @@ def demo_layer3_with_local_llm(): llm = LocalLLMProvider( model_name="Qwen/Qwen2.5-3B-Instruct", device="auto", # Use CUDA if available - load_in_4bit=True # Save memory with quantization + load_in_4bit=True, # Save memory with quantization ) agent = ConversationalAgent(browser, llm, verbose=True) @@ -157,9 +159,9 @@ def demo_comparison(): """ Side-by-side comparison of all layers """ - print("\n" + "="*70) + print("\n" + "=" * 70) print("COMPARISON: All Three Layers") - print("="*70) + print("=" * 70) comparison_table = """ | Feature | Layer 1 (SDK) | Layer 2 (Agent) | Layer 3 (Conversational) | @@ -181,9 +183,9 @@ def demo_comparison(): def main(): """Run all demos""" - print("\n" + "="*70) + print("\n" + "=" * 70) print("SENTIENCE SDK: Multi-Layer Abstraction Demo") - print("="*70) + print("=" * 70) print("\nThis demo shows how to use the SDK at different abstraction levels:") print(" 1. Layer 1: Direct SDK (maximum control)") print(" 2. Layer 2: SentienceAgent (technical commands)") diff --git a/examples/basic_agent.py b/examples/basic_agent.py index a80ed89..a14ac3e 100644 --- a/examples/basic_agent.py +++ b/examples/basic_agent.py @@ -2,9 +2,10 @@ Example: Basic snapshot functionality """ -from sentience import SentienceBrowser, snapshot import os +from sentience import SentienceBrowser, snapshot + def main(): # Get API key from environment variable (optional - uses free tier if not set) @@ -13,19 +14,19 @@ def main(): with SentienceBrowser(api_key=api_key, headless=False) as browser: # Navigate to a test page browser.page.goto("https://example.com", wait_until="domcontentloaded") - + # Take snapshot snap = snapshot(browser) - + print(f"Status: {snap.status}") print(f"URL: {snap.url}") print(f"Elements found: {len(snap.elements)}") - + # Show top 5 elements print("\nTop 5 elements:") for i, el in enumerate(snap.elements[:5], 1): print(f"{i}. [{el.role}] {el.text or '(no text)'} (importance: {el.importance})") - + # Save snapshot snap.save("snapshot_example.json") print("\n✅ Snapshot saved to snapshot_example.json") @@ -33,4 +34,3 @@ def main(): if __name__ == "__main__": main() - diff --git a/examples/click_rect_demo.py b/examples/click_rect_demo.py index 2bf574b..0c2148c 100644 --- a/examples/click_rect_demo.py +++ b/examples/click_rect_demo.py @@ -2,9 +2,10 @@ Example: Using click_rect for coordinate-based clicking with visual feedback """ -from sentience import SentienceBrowser, snapshot, find, click_rect import os +from sentience import SentienceBrowser, click_rect, find, snapshot + def main(): # Get API key from environment variable (optional - uses free tier if not set) @@ -13,63 +14,57 @@ def main(): with SentienceBrowser(api_key=api_key, headless=False) as browser: # Navigate to example.com browser.page.goto("https://example.com", wait_until="domcontentloaded") - + print("=== click_rect Demo ===\n") - + # Example 1: Click using rect dictionary print("1. Clicking at specific coordinates (100, 100) with size 50x30") print(" (You should see a red border highlight for 2 seconds)") result = click_rect(browser, {"x": 100, "y": 100, "w": 50, "h": 30}) print(f" Result: success={result.success}, outcome={result.outcome}") print(f" Duration: {result.duration_ms}ms\n") - + # Wait a bit browser.page.wait_for_timeout(1000) - + # Example 2: Click using element's bbox print("2. Clicking using element's bounding box") snap = snapshot(browser) link = find(snap, "role=link") - + if link: print(f" Found link: '{link.text}' at ({link.bbox.x}, {link.bbox.y})") print(" Clicking at center of element's bbox...") - result = click_rect(browser, { - "x": link.bbox.x, - "y": link.bbox.y, - "w": link.bbox.width, - "h": link.bbox.height - }) + result = click_rect( + browser, + {"x": link.bbox.x, "y": link.bbox.y, "w": link.bbox.width, "h": link.bbox.height}, + ) print(f" Result: success={result.success}, outcome={result.outcome}") print(f" URL changed: {result.url_changed}\n") - + # Navigate back if needed if result.url_changed: browser.page.goto("https://example.com", wait_until="domcontentloaded") browser.page.wait_for_load_state("networkidle") - + # Example 3: Click without highlight (for headless/CI) print("3. Clicking without visual highlight") result = click_rect(browser, {"x": 200, "y": 200, "w": 40, "h": 20}, highlight=False) print(f" Result: success={result.success}\n") - + # Example 4: Custom highlight duration print("4. Clicking with custom highlight duration (3 seconds)") result = click_rect(browser, {"x": 300, "y": 300, "w": 60, "h": 40}, highlight_duration=3.0) print(f" Result: success={result.success}") print(" (Red border should stay visible for 3 seconds)\n") - + # Example 5: Click with snapshot capture print("5. Clicking and capturing snapshot after action") - result = click_rect( - browser, - {"x": 150, "y": 150, "w": 50, "h": 30}, - take_snapshot=True - ) + result = click_rect(browser, {"x": 150, "y": 150, "w": 50, "h": 30}, take_snapshot=True) if result.snapshot_after: print(f" Snapshot captured: {len(result.snapshot_after.elements)} elements found") print(f" URL: {result.snapshot_after.url}\n") - + print("✅ click_rect demo complete!") print("\nNote: click_rect uses Playwright's native mouse.click() for realistic") print("event simulation, triggering hover, focus, mousedown, mouseup sequences.") @@ -77,4 +72,3 @@ def main(): if __name__ == "__main__": main() - diff --git a/examples/hello.py b/examples/hello.py index c8b2d89..6499252 100644 --- a/examples/hello.py +++ b/examples/hello.py @@ -2,9 +2,10 @@ Example: Verify extension bridge is loaded """ -from sentience import SentienceBrowser import os +from sentience import SentienceBrowser + def main(): # Get API key from environment variable (optional - uses free tier if not set) @@ -14,16 +15,18 @@ def main(): with SentienceBrowser(api_key=api_key, headless=False) as browser: # Navigate to a page to ensure extension is active browser.page.goto("https://example.com", wait_until="domcontentloaded") - + # Check if extension API is available - bridge_ok = browser.page.evaluate(""" + bridge_ok = browser.page.evaluate( + """ () => { - return typeof window.sentience !== 'undefined' && + return typeof window.sentience !== 'undefined' && typeof window.sentience.snapshot === 'function'; } - """) + """ + ) print(f"bridge_ok={bridge_ok}") - + if bridge_ok: print("✅ Extension loaded successfully!") # Try a quick snapshot to verify it works @@ -38,7 +41,8 @@ def main(): else: print("❌ Extension not loaded") # Debug info - debug_info = browser.page.evaluate(""" + debug_info = browser.page.evaluate( + """ () => { return { sentience_defined: typeof window.sentience !== 'undefined', @@ -46,14 +50,15 @@ def main(): snapshot_defined: typeof window.sentience?.snapshot !== 'undefined' }; } - """) + """ + ) print(f"Debug info: {debug_info}") except Exception as e: print(f"❌ Error: {e}") import traceback + traceback.print_exc() if __name__ == "__main__": main() - diff --git a/examples/query_demo.py b/examples/query_demo.py index 46b4692..4ae4295 100644 --- a/examples/query_demo.py +++ b/examples/query_demo.py @@ -2,9 +2,10 @@ Example: Query engine demonstration """ -from sentience import SentienceBrowser, snapshot, query, find import os +from sentience import SentienceBrowser, find, query, snapshot + def main(): # Get API key from environment variable (optional - uses free tier if not set) @@ -13,31 +14,31 @@ def main(): with SentienceBrowser(api_key=api_key, headless=False) as browser: # Navigate to a page with links browser.page.goto("https://example.com", wait_until="domcontentloaded") - + snap = snapshot(browser) - + # Query examples print("=== Query Examples ===\n") - + # Find all buttons buttons = query(snap, "role=button") print(f"Found {len(buttons)} buttons") - + # Find all links links = query(snap, "role=link") print(f"Found {len(links)} links") - + # Find clickable elements clickables = query(snap, "clickable=true") print(f"Found {len(clickables)} clickable elements") - + # Find element with text containing "More" more_link = find(snap, "text~'More'") if more_link: print(f"\nFound 'More' link: {more_link.text} (id: {more_link.id})") else: print("\nNo 'More' link found") - + # Complex query: clickable links clickable_links = query(snap, "role=link clickable=true") print(f"\nFound {len(clickable_links)} clickable links") @@ -45,4 +46,3 @@ def main(): if __name__ == "__main__": main() - diff --git a/examples/read_markdown.py b/examples/read_markdown.py index 3da6f4f..d2cd669 100644 --- a/examples/read_markdown.py +++ b/examples/read_markdown.py @@ -5,10 +5,12 @@ and convert it to high-quality markdown using markdownify. """ -from sentience import SentienceBrowser, read -from markdownify import markdownify import os +from markdownify import markdownify + +from sentience import SentienceBrowser, read + def main(): # Get API key from environment variable (optional - uses free tier if not set) @@ -18,23 +20,23 @@ def main(): with SentienceBrowser(api_key=api_key, headless=True) as browser: # Navigate to a page browser.page.goto("https://example.com", wait_until="domcontentloaded") - + # Method 1: Get raw HTML (default) and convert with markdownify print("=== Method 1: Raw HTML + markdownify (Recommended) ===") result = read(browser) # format="raw" is default html_content = result["content"] - + # Convert to markdown using markdownify (better quality) markdown = markdownify( html_content, heading_style="ATX", # Use # for headings bullets="-", # Use - for lists - strip=['script', 'style', 'nav', 'footer', 'header'], # Strip unwanted tags + strip=["script", "style", "nav", "footer", "header"], # Strip unwanted tags ) print(f"Markdown length: {len(markdown)} characters") print(markdown[:500]) # Print first 500 chars print("\n") - + # Method 2: Get high-quality markdown directly (uses markdownify internally) print("=== Method 2: Direct markdown (High-quality via markdownify) ===") result = read(browser, format="markdown") @@ -42,7 +44,7 @@ def main(): print(f"Markdown length: {len(high_quality_markdown)} characters") print(high_quality_markdown[:500]) # Print first 500 chars print("\n") - + # Method 3: Get plain text print("=== Method 3: Plain text ===") result = read(browser, format="text") @@ -53,4 +55,3 @@ def main(): if __name__ == "__main__": main() - diff --git a/examples/semantic_wait_demo.py b/examples/semantic_wait_demo.py index b738826..f072117 100644 --- a/examples/semantic_wait_demo.py +++ b/examples/semantic_wait_demo.py @@ -3,9 +3,10 @@ Demonstrates waiting for elements using semantic selectors """ -from sentience import SentienceBrowser, wait_for, click import os +from sentience import SentienceBrowser, click, wait_for + def main(): # Get API key from environment variable (optional - uses free tier if not set) @@ -14,9 +15,9 @@ def main(): with SentienceBrowser(api_key=api_key, headless=False) as browser: # Navigate to example.com browser.page.goto("https://example.com", wait_until="domcontentloaded") - + print("=== Semantic wait_for Demo ===\n") - + # Example 1: Wait for element by role print("1. Waiting for link element (role=link)") wait_result = wait_for(browser, "role=link", timeout=5.0) @@ -26,7 +27,7 @@ def main(): else: print(f" ❌ Not found (timeout: {wait_result.timeout})") print() - + # Example 2: Wait for element by role and text print("2. Waiting for link with specific text") wait_result = wait_for(browser, "role=link text~'Example'", timeout=5.0) @@ -36,7 +37,7 @@ def main(): else: print(" ❌ Not found") print() - + # Example 3: Wait for clickable element print("3. Waiting for clickable element") wait_result = wait_for(browser, "clickable=true", timeout=5.0) @@ -48,7 +49,7 @@ def main(): else: print(" ❌ Not found") print() - + # Example 4: Wait for element with importance threshold print("4. Waiting for important element (importance > 100)") wait_result = wait_for(browser, "importance>100", timeout=5.0) @@ -59,20 +60,22 @@ def main(): else: print(" ❌ Not found") print() - + # Example 5: Wait and then click print("5. Wait for element, then click it") wait_result = wait_for(browser, "role=link", timeout=5.0) if wait_result.found: print(" ✅ Found element, clicking...") click_result = click(browser, wait_result.element.id) - print(f" Click result: success={click_result.success}, outcome={click_result.outcome}") + print( + f" Click result: success={click_result.success}, outcome={click_result.outcome}" + ) if click_result.url_changed: print(f" ✅ Navigation occurred: {browser.page.url}") else: print(" ❌ Element not found, cannot click") print() - + # Example 6: Using local extension (fast polling) print("6. Using local extension with auto-optimized interval") print(" When use_api=False, interval auto-adjusts to 0.25s (fast)") @@ -81,7 +84,7 @@ def main(): print(f" ✅ Found after {wait_result.duration_ms}ms") print(" (Used local extension, polled every 0.25 seconds)") print() - + # Example 7: Using remote API (slower polling) print("7. Using remote API with auto-optimized interval") print(" When use_api=True, interval auto-adjusts to 1.5s (network-friendly)") @@ -93,7 +96,7 @@ def main(): else: print(" ⚠️ Skipped (no API key set)") print() - + # Example 8: Custom interval override print("8. Custom interval override (manual control)") print(" You can still specify custom interval if needed") @@ -102,7 +105,7 @@ def main(): print(f" ✅ Found after {wait_result.duration_ms}ms") print(" (Custom interval: 0.5 seconds)") print() - + print("✅ Semantic wait_for demo complete!") print("\nNote: wait_for uses the semantic query DSL to find elements.") print("This is more robust than CSS selectors because it understands") @@ -111,4 +114,3 @@ def main(): if __name__ == "__main__": main() - diff --git a/examples/test_local_llm_agent.py b/examples/test_local_llm_agent.py index f2a5c1b..8dcf1ca 100644 --- a/examples/test_local_llm_agent.py +++ b/examples/test_local_llm_agent.py @@ -5,11 +5,12 @@ from sentience.llm_provider import LocalLLMProvider + def test_local_llm_basic(): """Test basic LLM response generation""" - print("="*70) + print("=" * 70) print("Testing LocalLLMProvider with Qwen2.5-3B-Instruct") - print("="*70) + print("=" * 70) # Initialize local LLM # Using the model from your local cache @@ -17,26 +18,28 @@ def test_local_llm_basic(): model_name="Qwen/Qwen2.5-3B-Instruct", device="auto", # Will use CUDA if available, else CPU load_in_4bit=False, # Set to True to save memory - torch_dtype="auto" + torch_dtype="auto", ) - print("\n" + "="*70) + print("\n" + "=" * 70) print("Test 1: Simple question") - print("="*70) + print("=" * 70) response = llm.generate( system_prompt="You are a helpful web automation assistant.", user_prompt="What is 2+2?", max_new_tokens=50, - temperature=0.1 + temperature=0.1, ) print(f"Response: {response.content}") - print(f"Tokens: {response.total_tokens} (prompt: {response.prompt_tokens}, completion: {response.completion_tokens})") + print( + f"Tokens: {response.total_tokens} (prompt: {response.prompt_tokens}, completion: {response.completion_tokens})" + ) - print("\n" + "="*70) + print("\n" + "=" * 70) print("Test 2: Action parsing (for agent)") - print("="*70) + print("=" * 70) system_prompt = """You are an AI web automation agent. @@ -62,10 +65,7 @@ def test_local_llm_basic(): user_prompt = "What is the next step to achieve the goal?" response = llm.generate( - system_prompt=system_prompt, - user_prompt=user_prompt, - max_new_tokens=20, - temperature=0.0 + system_prompt=system_prompt, user_prompt=user_prompt, max_new_tokens=20, temperature=0.0 ) print(f"Agent Response: {response.content}") @@ -77,9 +77,9 @@ def test_local_llm_basic(): else: print(f"\n⚠️ Response may need adjustment: {response.content}") - print("\n" + "="*70) + print("\n" + "=" * 70) print("LocalLLMProvider Test Complete!") - print("="*70) + print("=" * 70) if __name__ == "__main__": diff --git a/examples/wait_and_click.py b/examples/wait_and_click.py index d2576dc..5a1cb84 100644 --- a/examples/wait_and_click.py +++ b/examples/wait_and_click.py @@ -2,9 +2,10 @@ Example: Wait for element and click """ -from sentience import SentienceBrowser, snapshot, find, wait_for, click, expect import os +from sentience import SentienceBrowser, click, expect, find, snapshot, wait_for + def main(): # Get API key from environment variable (optional - uses free tier if not set) @@ -13,16 +14,16 @@ def main(): with SentienceBrowser(api_key=api_key, headless=False) as browser: # Navigate to example.com browser.page.goto("https://example.com", wait_until="domcontentloaded") - + # Take initial snapshot snap = snapshot(browser) - + # Find a link link = find(snap, "role=link") - + if link: print(f"Found link: {link.text} (id: {link.id})") - + # Click it result = click(browser, link.id) print(f"Click result: success={result.success}, outcome={result.outcome}") @@ -30,17 +31,17 @@ def main(): print(f"New URL: {browser.page.url}") else: print("No link found") - + # Example: Wait for element using wait_for print("\n=== Wait Example ===") browser.page.goto("https://example.com", wait_until="domcontentloaded") - + wait_result = wait_for(browser, "role=link", timeout=5.0) if wait_result.found: print(f"✅ Found element after {wait_result.duration_ms}ms") else: print(f"❌ Element not found (timeout: {wait_result.timeout})") - + # Example: Expect assertion print("\n=== Expect Example ===") try: @@ -52,4 +53,3 @@ def main(): if __name__ == "__main__": main() - diff --git a/pyproject.toml b/pyproject.toml index aba6a07..989d440 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,3 +53,75 @@ python_files = ["test_*.py"] python_classes = ["Test*"] python_functions = ["test_*"] asyncio_mode = "auto" + +# Black configuration +[tool.black] +line-length = 100 +target-version = ['py311'] +include = '\.pyi?$' +extend-exclude = ''' +/( + # directories + \.eggs + | \.git + | \.hg + | \.mypy_cache + | \.tox + | \.venv + | venv + | build + | dist +)/ +''' + +# isort configuration (compatible with Black) +[tool.isort] +profile = "black" +line_length = 100 +skip_gitignore = true +known_first_party = ["sentience"] +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +use_parentheses = true +ensure_newline_before_comments = true + +# Flake8 configuration +[tool.flake8] +max-line-length = 100 +extend-ignore = ["E203", "W503", "E501"] +exclude = [ + ".git", + "__pycache__", + "venv", + ".venv", + "build", + "dist", + ".eggs", + "*.egg-info", +] +max-complexity = 15 + +# Mypy configuration +[tool.mypy] +python_version = "3.11" +warn_return_any = true +warn_unused_configs = true +ignore_missing_imports = true +no_implicit_optional = true +warn_redundant_casts = true +warn_unused_ignores = true +check_untyped_defs = false +disallow_untyped_defs = false +exclude = [ + "venv", + ".venv", + "build", + "dist", + "tests", +] + +# Bandit configuration (security linting) +[tool.bandit] +exclude_dirs = ["tests", "venv", ".venv", "build", "dist"] +skips = ["B101", "B601"] # Skip assert_used and shell injection (we use them carefully) diff --git a/pytest.ini b/pytest.ini index d912f99..67c1eed 100644 --- a/pytest.ini +++ b/pytest.ini @@ -4,4 +4,3 @@ python_files = test_*.py python_classes = Test* python_functions = test_* asyncio_mode = auto - diff --git a/sentience/__init__.py b/sentience/__init__.py index 3384549..e6c5ea2 100644 --- a/sentience/__init__.py +++ b/sentience/__init__.py @@ -2,39 +2,44 @@ Sentience Python SDK - AI Agent Browser Automation """ +from .actions import click, click_rect, press, type_text +from .agent import SentienceAgent + +# Agent Layer (Phase 1 & 2) +from .base_agent import BaseAgent from .browser import SentienceBrowser -from .models import ( - Snapshot, - Element, - BBox, - Viewport, - ActionResult, - WaitResult, - # Agent Layer Models - AgentActionResult, - TokenStats, +from .conversational_agent import ConversationalAgent +from .expect import expect +from .generator import ScriptGenerator, generate +from .inspector import Inspector, inspect +from .llm_provider import ( + AnthropicProvider, + LLMProvider, + LLMResponse, + LocalLLMProvider, + OpenAIProvider, +) +from .models import ( # Agent Layer Models ActionHistory, + ActionResult, ActionTokenUsage, - SnapshotOptions, + AgentActionResult, + BBox, + Element, + ScreenshotConfig, + Snapshot, SnapshotFilter, - ScreenshotConfig + SnapshotOptions, + TokenStats, + Viewport, + WaitResult, ) -from .snapshot import snapshot -from .query import query, find -from .actions import click, type_text, press, click_rect -from .wait import wait_for -from .expect import expect -from .inspector import Inspector, inspect -from .recorder import Recorder, Trace, TraceStep, record -from .generator import ScriptGenerator, generate +from .query import find, query from .read import read +from .recorder import Recorder, Trace, TraceStep, record from .screenshot import screenshot - -# Agent Layer (Phase 1 & 2) -from .base_agent import BaseAgent -from .llm_provider import LLMProvider, LLMResponse, OpenAIProvider, AnthropicProvider, LocalLLMProvider -from .agent import SentienceAgent -from .conversational_agent import ConversationalAgent +from .snapshot import snapshot +from .wait import wait_for __version__ = "0.10.7" @@ -84,4 +89,3 @@ "SnapshotFilter", "ScreenshotConfig", ] - diff --git a/sentience/actions.py b/sentience/actions.py index 6f646b1..2ce8a3f 100644 --- a/sentience/actions.py +++ b/sentience/actions.py @@ -3,9 +3,10 @@ """ import time -from typing import Optional, Dict, Any +from typing import Any, Dict, Optional + from .browser import SentienceBrowser -from .models import ActionResult, Snapshot, BBox +from .models import ActionResult, BBox, Snapshot from .snapshot import snapshot @@ -17,23 +18,23 @@ def click( ) -> ActionResult: """ Click an element by ID using hybrid approach (mouse simulation by default) - + Args: browser: SentienceBrowser instance element_id: Element ID from snapshot use_mouse: If True, use Playwright's mouse.click() at element center (hybrid approach). If False, use JS-based window.sentience.click() (legacy). take_snapshot: Whether to take snapshot after action - + Returns: ActionResult """ if not browser.page: raise RuntimeError("Browser not started. Call browser.start() first.") - + start_time = time.time() url_before = browser.page.url - + if use_mouse: # Hybrid approach: Get element bbox from snapshot, calculate center, use mouse.click() try: @@ -43,7 +44,7 @@ def click( if el.id == element_id: element = el break - + if element: # Calculate center of element bbox center_x = element.bbox.x + element.bbox.width / 2 @@ -93,17 +94,17 @@ def click( } """, element_id, - ) - + ) + # Wait a bit for navigation/DOM updates try: browser.page.wait_for_timeout(500) except Exception: # Navigation might have happened, context destroyed pass - + duration_ms = int((time.time() - start_time) * 1000) - + # Check if URL changed (handle navigation gracefully) try: url_after = browser.page.url @@ -112,54 +113,60 @@ def click( # Context destroyed due to navigation - assume URL changed url_after = url_before url_changed = True - + # Determine outcome - outcome: Optional[str] = None + outcome: str | None = None if url_changed: outcome = "navigated" elif success: outcome = "dom_updated" else: outcome = "error" - + # Optional snapshot after - snapshot_after: Optional[Snapshot] = None + snapshot_after: Snapshot | None = None if take_snapshot: try: snapshot_after = snapshot(browser) except Exception: # Navigation might have destroyed context pass - + return ActionResult( success=success, duration_ms=duration_ms, outcome=outcome, url_changed=url_changed, snapshot_after=snapshot_after, - error=None if success else {"code": "click_failed", "reason": "Element not found or not clickable"}, + error=( + None + if success + else {"code": "click_failed", "reason": "Element not found or not clickable"} + ), ) -def type_text(browser: SentienceBrowser, element_id: int, text: str, take_snapshot: bool = False) -> ActionResult: +def type_text( + browser: SentienceBrowser, element_id: int, text: str, take_snapshot: bool = False +) -> ActionResult: """ Type text into an element (focus then input) - + Args: browser: SentienceBrowser instance element_id: Element ID from snapshot text: Text to type take_snapshot: Whether to take snapshot after action - + Returns: ActionResult """ if not browser.page: raise RuntimeError("Browser not started. Call browser.start() first.") - + start_time = time.time() url_before = browser.page.url - + # Focus element first using extension registry focused = browser.page.evaluate( """ @@ -174,7 +181,7 @@ def type_text(browser: SentienceBrowser, element_id: int, text: str, take_snapsh """, element_id, ) - + if not focused: return ActionResult( success=False, @@ -182,20 +189,20 @@ def type_text(browser: SentienceBrowser, element_id: int, text: str, take_snapsh outcome="error", error={"code": "focus_failed", "reason": "Element not found"}, ) - + # Type using Playwright keyboard browser.page.keyboard.type(text) - + duration_ms = int((time.time() - start_time) * 1000) url_after = browser.page.url url_changed = url_before != url_after - + outcome = "navigated" if url_changed else "dom_updated" - - snapshot_after: Optional[Snapshot] = None + + snapshot_after: Snapshot | None = None if take_snapshot: snapshot_after = snapshot(browser) - + return ActionResult( success=True, duration_ms=duration_ms, @@ -208,37 +215,37 @@ def type_text(browser: SentienceBrowser, element_id: int, text: str, take_snapsh def press(browser: SentienceBrowser, key: str, take_snapshot: bool = False) -> ActionResult: """ Press a keyboard key - + Args: browser: SentienceBrowser instance key: Key to press (e.g., "Enter", "Escape", "Tab") take_snapshot: Whether to take snapshot after action - + Returns: ActionResult """ if not browser.page: raise RuntimeError("Browser not started. Call browser.start() first.") - + start_time = time.time() url_before = browser.page.url - + # Press key using Playwright browser.page.keyboard.press(key) - + # Wait a bit for navigation/DOM updates browser.page.wait_for_timeout(500) - + duration_ms = int((time.time() - start_time) * 1000) url_after = browser.page.url url_changed = url_before != url_after - + outcome = "navigated" if url_changed else "dom_updated" - - snapshot_after: Optional[Snapshot] = None + + snapshot_after: Snapshot | None = None if take_snapshot: snapshot_after = snapshot(browser) - + return ActionResult( success=True, duration_ms=duration_ms, @@ -248,10 +255,12 @@ def press(browser: SentienceBrowser, key: str, take_snapshot: bool = False) -> A ) -def _highlight_rect(browser: SentienceBrowser, rect: Dict[str, float], duration_sec: float = 2.0) -> None: +def _highlight_rect( + browser: SentienceBrowser, rect: dict[str, float], duration_sec: float = 2.0 +) -> None: """ Highlight a rectangle with a red border overlay - + Args: browser: SentienceBrowser instance rect: Dictionary with x, y, width (w), height (h) keys @@ -259,10 +268,10 @@ def _highlight_rect(browser: SentienceBrowser, rect: Dict[str, float], duration_ """ if not browser.page: return - + # Create a unique ID for this highlight highlight_id = f"sentience_highlight_{int(time.time() * 1000)}" - + # Combine all arguments into a single object for Playwright args = { "rect": { @@ -274,7 +283,7 @@ def _highlight_rect(browser: SentienceBrowser, rect: Dict[str, float], duration_ "highlightId": highlight_id, "durationSec": duration_sec, } - + # Inject CSS and create overlay element browser.page.evaluate( """ @@ -295,9 +304,9 @@ def _highlight_rect(browser: SentienceBrowser, rect: Dict[str, float], duration_ overlay.style.zIndex = '999999'; overlay.style.backgroundColor = 'rgba(255, 0, 0, 0.1)'; overlay.style.transition = 'opacity 0.3s ease-out'; - + document.body.appendChild(overlay); - + // Remove after duration setTimeout(() => { overlay.style.opacity = '0'; @@ -315,7 +324,7 @@ def _highlight_rect(browser: SentienceBrowser, rect: Dict[str, float], duration_ def click_rect( browser: SentienceBrowser, - rect: Dict[str, float], + rect: dict[str, float], highlight: bool = True, highlight_duration: float = 2.0, take_snapshot: bool = False, @@ -324,17 +333,17 @@ def click_rect( Click at the center of a rectangle using Playwright's native mouse simulation. This uses a hybrid approach: calculates center coordinates and uses mouse.click() for realistic event simulation (triggers hover, focus, mousedown, mouseup). - + Args: browser: SentienceBrowser instance rect: Dictionary with x, y, width (w), height (h) keys, or BBox object highlight: Whether to show a red border highlight when clicking (default: True) highlight_duration: How long to show the highlight in seconds (default: 2.0) take_snapshot: Whether to take snapshot after action - + Returns: ActionResult - + Example: >>> click_rect(browser, {"x": 100, "y": 200, "w": 50, "h": 30}) >>> # Or using BBox object @@ -344,7 +353,7 @@ def click_rect( """ if not browser.page: raise RuntimeError("Browser not started. Call browser.start() first.") - + # Handle BBox object or dict if isinstance(rect, BBox): x = rect.x @@ -356,7 +365,7 @@ def click_rect( y = rect.get("y", 0) w = rect.get("w") or rect.get("width", 0) h = rect.get("h") or rect.get("height", 0) - + if w <= 0 or h <= 0: return ActionResult( success=False, @@ -364,20 +373,20 @@ def click_rect( outcome="error", error={"code": "invalid_rect", "reason": "Rectangle width and height must be positive"}, ) - + start_time = time.time() url_before = browser.page.url - + # Calculate center of rectangle center_x = x + w / 2 center_y = y + h / 2 - + # Show highlight before clicking (if enabled) if highlight: _highlight_rect(browser, {"x": x, "y": y, "w": w, "h": h}, highlight_duration) # Small delay to ensure highlight is visible browser.page.wait_for_timeout(50) - + # Use Playwright's native mouse click for realistic simulation # This triggers hover, focus, mousedown, mouseup sequences try: @@ -386,34 +395,37 @@ def click_rect( except Exception as e: success = False error_msg = str(e) - + # Wait a bit for navigation/DOM updates browser.page.wait_for_timeout(500) - + duration_ms = int((time.time() - start_time) * 1000) url_after = browser.page.url url_changed = url_before != url_after - + # Determine outcome - outcome: Optional[str] = None + outcome: str | None = None if url_changed: outcome = "navigated" elif success: outcome = "dom_updated" else: outcome = "error" - + # Optional snapshot after - snapshot_after: Optional[Snapshot] = None + snapshot_after: Snapshot | None = None if take_snapshot: snapshot_after = snapshot(browser) - + return ActionResult( success=success, duration_ms=duration_ms, outcome=outcome, url_changed=url_changed, snapshot_after=snapshot_after, - error=None if success else {"code": "click_failed", "reason": error_msg if not success else "Click failed"}, + error=( + None + if success + else {"code": "click_failed", "reason": error_msg if not success else "Click failed"} + ), ) - diff --git a/sentience/agent.py b/sentience/agent.py index d6d8735..bd27326 100644 --- a/sentience/agent.py +++ b/sentience/agent.py @@ -5,24 +5,23 @@ import re import time -from typing import Dict, Any, List, Optional, Union +from typing import Any, Dict, List, Optional, Union + +from .actions import click, press, type_text from .base_agent import BaseAgent -from .llm_provider import LLMProvider, LLMResponse from .browser import SentienceBrowser -from .snapshot import snapshot -from .actions import click, type_text, press +from .llm_provider import LLMProvider, LLMResponse from .models import ( - Snapshot, - Element, - ActionResult, - AgentActionResult, - TokenStats, ActionHistory, ActionTokenUsage, + AgentActionResult, + Element, + ScreenshotConfig, + Snapshot, SnapshotOptions, - SnapshotFilter, - ScreenshotConfig + TokenStats, ) +from .snapshot import snapshot class SentienceAgent(BaseAgent): @@ -54,7 +53,7 @@ def __init__( browser: SentienceBrowser, llm: LLMProvider, default_snapshot_limit: int = 50, - verbose: bool = True + verbose: bool = True, ): """ Initialize Sentience Agent @@ -71,21 +70,18 @@ def __init__( self.verbose = verbose # Execution history - self.history: List[Dict[str, Any]] = [] + self.history: list[dict[str, Any]] = [] # Token usage tracking (will be converted to TokenStats on get_token_stats()) self._token_usage_raw = { "total_prompt_tokens": 0, "total_completion_tokens": 0, "total_tokens": 0, - "by_action": [] + "by_action": [], } def act( - self, - goal: str, - max_retries: int = 2, - snapshot_options: Optional[SnapshotOptions] = None + self, goal: str, max_retries: int = 2, snapshot_options: SnapshotOptions | None = None ) -> AgentActionResult: """ Execute a high-level goal using observe → think → act loop @@ -123,8 +119,8 @@ def act( screenshot_param = snap_opts.screenshot if isinstance(snap_opts.screenshot, ScreenshotConfig): screenshot_param = { - 'format': snap_opts.screenshot.format, - 'quality': snap_opts.screenshot.quality + "format": snap_opts.screenshot.format, + "quality": snap_opts.screenshot.quality, } # Call snapshot with converted parameters @@ -133,7 +129,7 @@ def act( screenshot=screenshot_param, limit=snap_opts.limit, filter=snap_opts.filter.model_dump() if snap_opts.filter else None, - use_api=snap_opts.use_api + use_api=snap_opts.use_api, ) if snap.status != "success": @@ -151,7 +147,7 @@ def act( elements=filtered_elements, screenshot=snap.screenshot, screenshot_format=snap.screenshot_format, - error=snap.error + error=snap.error, ) # 2. GROUND: Format elements for LLM context @@ -187,18 +183,20 @@ def act( outcome=result_dict.get("outcome"), url_changed=result_dict.get("url_changed"), error=result_dict.get("error"), - message=result_dict.get("message") + message=result_dict.get("message"), ) # 5. RECORD: Track history - self.history.append({ - "goal": goal, - "action": action_str, - "result": result.model_dump(), # Store as dict - "success": result.success, - "attempt": attempt, - "duration_ms": duration_ms - }) + self.history.append( + { + "goal": goal, + "action": action_str, + "result": result.model_dump(), # Store as dict + "success": result.success, + "attempt": attempt, + "duration_ms": duration_ms, + } + ) if self.verbose: status = "✅" if result.success else "❌" @@ -220,16 +218,18 @@ def act( goal=goal, duration_ms=0, attempt=attempt, - error=str(e) + error=str(e), + ) + self.history.append( + { + "goal": goal, + "action": "error", + "result": error_result.model_dump(), + "success": False, + "attempt": attempt, + "duration_ms": 0, + } ) - self.history.append({ - "goal": goal, - "action": "error", - "result": error_result.model_dump(), - "success": False, - "attempt": attempt, - "duration_ms": 0 - }) raise RuntimeError(f"Failed after {max_retries} retries: {e}") def _build_context(self, snap: Snapshot, goal: str) -> str: @@ -259,10 +259,12 @@ def _build_context(self, snap: Snapshot, goal: str) -> str: # Format element line cues_str = f" {{{','.join(cues)}}}" if cues else "" - text_preview = (el.text[:50] + "...") if el.text and len(el.text) > 50 else (el.text or "") + text_preview = ( + (el.text[:50] + "...") if el.text and len(el.text) > 50 else (el.text or "") + ) lines.append( - f"[{el.id}] <{el.role}> \"{text_preview}\"{cues_str} " + f'[{el.id}] <{el.role}> "{text_preview}"{cues_str} ' f"@ ({int(el.bbox.x)},{int(el.bbox.y)}) (Imp:{el.importance})" ) @@ -311,7 +313,7 @@ def _query_llm(self, dom_context: str, goal: str) -> LLMResponse: return self.llm.generate(system_prompt, user_prompt, temperature=0.0) - def _execute_action(self, action_str: str, snap: Snapshot) -> Dict[str, Any]: + def _execute_action(self, action_str: str, snap: Snapshot) -> dict[str, Any]: """ Parse action string and execute SDK call @@ -323,7 +325,7 @@ def _execute_action(self, action_str: str, snap: Snapshot) -> Dict[str, Any]: Execution result dictionary """ # Parse CLICK(42) - if match := re.match(r'CLICK\s*\(\s*(\d+)\s*\)', action_str, re.IGNORECASE): + if match := re.match(r"CLICK\s*\(\s*(\d+)\s*\)", action_str, re.IGNORECASE): element_id = int(match.group(1)) result = click(self.browser, element_id) return { @@ -331,11 +333,13 @@ def _execute_action(self, action_str: str, snap: Snapshot) -> Dict[str, Any]: "action": "click", "element_id": element_id, "outcome": result.outcome, - "url_changed": result.url_changed + "url_changed": result.url_changed, } # Parse TYPE(42, "hello world") - elif match := re.match(r'TYPE\s*\(\s*(\d+)\s*,\s*["\']([^"\']*)["\']\s*\)', action_str, re.IGNORECASE): + elif match := re.match( + r'TYPE\s*\(\s*(\d+)\s*,\s*["\']([^"\']*)["\']\s*\)', action_str, re.IGNORECASE + ): element_id = int(match.group(1)) text = match.group(2) result = type_text(self.browser, element_id, text) @@ -344,7 +348,7 @@ def _execute_action(self, action_str: str, snap: Snapshot) -> Dict[str, Any]: "action": "type", "element_id": element_id, "text": text, - "outcome": result.outcome + "outcome": result.outcome, } # Parse PRESS("Enter") @@ -355,21 +359,17 @@ def _execute_action(self, action_str: str, snap: Snapshot) -> Dict[str, Any]: "success": result.success, "action": "press", "key": key, - "outcome": result.outcome + "outcome": result.outcome, } # Parse FINISH() - elif re.match(r'FINISH\s*\(\s*\)', action_str, re.IGNORECASE): - return { - "success": True, - "action": "finish", - "message": "Task marked as complete" - } + elif re.match(r"FINISH\s*\(\s*\)", action_str, re.IGNORECASE): + return {"success": True, "action": "finish", "message": "Task marked as complete"} else: raise ValueError( f"Unknown action format: {action_str}\n" - f"Expected: CLICK(id), TYPE(id, \"text\"), PRESS(\"key\"), or FINISH()" + f'Expected: CLICK(id), TYPE(id, "text"), PRESS("key"), or FINISH()' ) def _track_tokens(self, goal: str, llm_response: LLMResponse): @@ -387,13 +387,15 @@ def _track_tokens(self, goal: str, llm_response: LLMResponse): if llm_response.total_tokens: self._token_usage_raw["total_tokens"] += llm_response.total_tokens - self._token_usage_raw["by_action"].append({ - "goal": goal, - "prompt_tokens": llm_response.prompt_tokens or 0, - "completion_tokens": llm_response.completion_tokens or 0, - "total_tokens": llm_response.total_tokens or 0, - "model": llm_response.model_name - }) + self._token_usage_raw["by_action"].append( + { + "goal": goal, + "prompt_tokens": llm_response.prompt_tokens or 0, + "completion_tokens": llm_response.completion_tokens or 0, + "total_tokens": llm_response.total_tokens or 0, + "model": llm_response.model_name, + } + ) def get_token_stats(self) -> TokenStats: """ @@ -402,18 +404,15 @@ def get_token_stats(self) -> TokenStats: Returns: TokenStats with token usage breakdown """ - by_action = [ - ActionTokenUsage(**action) - for action in self._token_usage_raw["by_action"] - ] + by_action = [ActionTokenUsage(**action) for action in self._token_usage_raw["by_action"]] return TokenStats( total_prompt_tokens=self._token_usage_raw["total_prompt_tokens"], total_completion_tokens=self._token_usage_raw["total_completion_tokens"], total_tokens=self._token_usage_raw["total_tokens"], - by_action=by_action + by_action=by_action, ) - def get_history(self) -> List[ActionHistory]: + def get_history(self) -> list[ActionHistory]: """ Get execution history @@ -429,14 +428,10 @@ def clear_history(self) -> None: "total_prompt_tokens": 0, "total_completion_tokens": 0, "total_tokens": 0, - "by_action": [] + "by_action": [], } - def filter_elements( - self, - snapshot: Snapshot, - goal: Optional[str] = None - ) -> List[Element]: + def filter_elements(self, snapshot: Snapshot, goal: str | None = None) -> list[Element]: """ Filter elements from snapshot based on goal context. @@ -454,7 +449,7 @@ def filter_elements( # If no goal provided, return all elements (up to limit) if not goal: - return elements[:self.default_snapshot_limit] + return elements[: self.default_snapshot_limit] goal_lower = goal.lower() @@ -486,9 +481,9 @@ def filter_elements( scored_elements.sort(key=lambda x: x[0], reverse=True) elements = [el for _, el in scored_elements] - return elements[:self.default_snapshot_limit] + return elements[: self.default_snapshot_limit] - def _extract_keywords(self, text: str) -> List[str]: + def _extract_keywords(self, text: str) -> list[str]: """ Extract meaningful keywords from goal text @@ -499,8 +494,24 @@ def _extract_keywords(self, text: str) -> List[str]: List of keywords """ stopwords = { - "the", "a", "an", "and", "or", "but", "in", "on", "at", - "to", "for", "of", "with", "by", "from", "as", "is", "was" + "the", + "a", + "an", + "and", + "or", + "but", + "in", + "on", + "at", + "to", + "for", + "of", + "with", + "by", + "from", + "as", + "is", + "was", } words = text.split() return [w for w in words if w not in stopwords and len(w) > 2] diff --git a/sentience/base_agent.py b/sentience/base_agent.py index 8a15a25..07ce76f 100644 --- a/sentience/base_agent.py +++ b/sentience/base_agent.py @@ -4,14 +4,8 @@ """ from abc import ABC, abstractmethod -from typing import List, Optional -from .models import ( - Snapshot, - Element, - AgentActionResult, - TokenStats, - ActionHistory -) + +from .models import ActionHistory, AgentActionResult, Element, Snapshot, TokenStats class BaseAgent(ABC): @@ -35,11 +29,7 @@ class BaseAgent(ABC): """ @abstractmethod - def act( - self, - goal: str, - **kwargs - ) -> AgentActionResult: + def act(self, goal: str, **kwargs) -> AgentActionResult: """ Execute a natural language goal using the agent. @@ -56,7 +46,7 @@ def act( pass @abstractmethod - def get_history(self) -> List[ActionHistory]: + def get_history(self) -> list[ActionHistory]: """ Get the execution history of all actions taken. @@ -84,11 +74,7 @@ def clear_history(self) -> None: """ pass - def filter_elements( - self, - snapshot: Snapshot, - goal: Optional[str] = None - ) -> List[Element]: + def filter_elements(self, snapshot: Snapshot, goal: str | None = None) -> list[Element]: """ Filter elements from a snapshot based on goal context. diff --git a/sentience/browser.py b/sentience/browser.py index 3b2d9ad..cbeff56 100644 --- a/sentience/browser.py +++ b/sentience/browser.py @@ -3,16 +3,17 @@ """ import os -import tempfile import shutil +import tempfile import time from pathlib import Path -from typing import Optional -from playwright.sync_api import sync_playwright, BrowserContext, Page, Playwright + +from playwright.sync_api import BrowserContext, Page, Playwright, sync_playwright # Import stealth for bot evasion (optional - graceful fallback if not available) try: from playwright_stealth import stealth_sync + STEALTH_AVAILABLE = True except ImportError: STEALTH_AVAILABLE = False @@ -20,16 +21,16 @@ class SentienceBrowser: """Main browser session with Sentience extension loaded""" - + def __init__( self, - api_key: Optional[str] = None, - api_url: Optional[str] = None, - headless: Optional[bool] = None + api_key: str | None = None, + api_url: str | None = None, + headless: bool | None = None, ): """ Initialize Sentience browser - + Args: api_key: Optional API key for server-side processing (Pro/Enterprise tiers) If None, uses free tier (local extension only) @@ -46,32 +47,32 @@ def __init__( self.api_url = "https://api.sentienceapi.com" else: self.api_url = api_url - + # Determine headless mode if headless is None: # Default to False for local dev, True for CI self.headless = os.environ.get("CI", "").lower() == "true" else: self.headless = headless - - self.playwright: Optional[Playwright] = None - self.context: Optional[BrowserContext] = None - self.page: Optional[Page] = None - self._extension_path: Optional[str] = None - + + self.playwright: Playwright | None = None + self.context: BrowserContext | None = None + self.page: Page | None = None + self._extension_path: str | None = None + def start(self) -> None: """Launch browser with extension loaded""" # Get extension source path (relative to project root/package) # Handle both development (src/) and installed package cases - + # 1. Try relative to this file (installed package structure) # sentience/browser.py -> sentience/extension/ package_ext_path = Path(__file__).parent / "extension" - + # 2. Try development root (if running from source repo) # sentience/browser.py -> ../sentience-chrome dev_ext_path = Path(__file__).parent.parent.parent / "sentience-chrome" - + if package_ext_path.exists() and (package_ext_path / "manifest.json").exists(): extension_source = package_ext_path elif dev_ext_path.exists() and (dev_ext_path / "manifest.json").exists(): @@ -95,7 +96,7 @@ def start(self) -> None: args = [ f"--disable-extensions-except={self._extension_path}", f"--load-extension={self._extension_path}", - "--disable-blink-features=AutomationControlled", # Hides 'navigator.webdriver' + "--disable-blink-features=AutomationControlled", # Hides 'navigator.webdriver' "--no-sandbox", "--disable-infobars", ] @@ -103,20 +104,20 @@ def start(self) -> None: # Handle headless mode correctly for extensions # 'headless=True' DOES NOT support extensions in standard Chrome # We must use 'headless="new"' (Chrome 112+) or run visible - launch_headless_arg = False # Default to visible + # launch_headless_arg = False # Default to visible if self.headless: - args.append("--headless=new") # Use new headless mode via args - + args.append("--headless=new") # Use new headless mode via args + # Launch persistent context (required for extensions) # Note: We pass headless=False to launch_persistent_context because we handle # headless mode via the --headless=new arg above. This is a Playwright workaround. self.context = self.playwright.chromium.launch_persistent_context( - user_data_dir="", # Ephemeral temp dir - headless=False, # IMPORTANT: See note above + user_data_dir="", # Ephemeral temp dir + headless=False, # IMPORTANT: See note above args=args, viewport={"width": 1280, "height": 800}, # Remove "HeadlessChrome" from User Agent automatically - user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" + user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", ) self.page = self.context.pages[0] if self.context.pages else self.context.new_page() @@ -124,7 +125,7 @@ def start(self) -> None: # Apply stealth if available if STEALTH_AVAILABLE: stealth_sync(self.page) - + # Wait a moment for extension to initialize time.sleep(0.5) @@ -132,20 +133,22 @@ def goto(self, url: str) -> None: """Navigate to a URL and ensure extension is ready""" if not self.page: raise RuntimeError("Browser not started. Call start() first.") - + self.page.goto(url, wait_until="domcontentloaded") - + # Wait for extension to be ready (injected into page) if not self._wait_for_extension(): # Gather diagnostic info before failing try: - diag = self.page.evaluate("""() => ({ + diag = self.page.evaluate( + """() => ({ sentience_defined: typeof window.sentience !== 'undefined', registry_defined: typeof window.sentience_registry !== 'undefined', snapshot_defined: window.sentience && typeof window.sentience.snapshot === 'function', extension_id: document.documentElement.dataset.sentienceExtensionId || 'not set', url: window.location.href - })""") + })""" + ) except Exception as e: diag = f"Failed to get diagnostics: {str(e)}" @@ -162,16 +165,17 @@ def _wait_for_extension(self, timeout_sec: float = 5.0) -> bool: """Poll for window.sentience to be available""" start_time = time.time() last_error = None - + while time.time() - start_time < timeout_sec: try: # Check if API exists and WASM is ready (optional check for _wasmModule) - result = self.page.evaluate("""() => { + result = self.page.evaluate( + """() => { if (typeof window.sentience === 'undefined') { return { ready: false, reason: 'window.sentience undefined' }; } // Check if WASM loaded (if exposed) or if basic API works - // Note: injected_api.js defines window.sentience immediately, + // Note: injected_api.js defines window.sentience immediately, // but _wasmModule might take a few ms to load. if (window.sentience._wasmModule === null) { // It's defined but WASM isn't linked yet @@ -181,8 +185,9 @@ def _wait_for_extension(self, timeout_sec: float = 5.0) -> bool: // Just verify the API structure is correct return { ready: true }; } - """) - + """ + ) + if isinstance(result, dict): if result.get("ready"): return True @@ -190,16 +195,17 @@ def _wait_for_extension(self, timeout_sec: float = 5.0) -> bool: except Exception as e: # Continue waiting on errors last_error = f"Evaluation error: {str(e)}" - + time.sleep(0.3) - + # Log the last error for debugging if last_error: import warnings + warnings.warn(f"Extension wait timeout. Last status: {last_error}") - + return False - + def close(self) -> None: """Close browser and cleanup""" if self.context: @@ -208,12 +214,12 @@ def close(self) -> None: self.playwright.stop() if self._extension_path and os.path.exists(self._extension_path): shutil.rmtree(self._extension_path) - + def __enter__(self): """Context manager entry""" self.start() return self - + def __exit__(self, exc_type, exc_val, exc_tb): """Context manager exit""" - self.close() \ No newline at end of file + self.close() diff --git a/sentience/cli.py b/sentience/cli.py index b5144c5..088e819 100644 --- a/sentience/cli.py +++ b/sentience/cli.py @@ -5,11 +5,11 @@ import argparse import sys from pathlib import Path + from .browser import SentienceBrowser -from .inspector import inspect -from .recorder import record from .generator import ScriptGenerator -from .recorder import Trace +from .inspector import inspect +from .recorder import Trace, record def cmd_inspect(args): @@ -19,10 +19,11 @@ def cmd_inspect(args): browser.start() print("✅ Inspector started. Hover elements to see info, click to see full details.") print("Press Ctrl+C to stop.") - + with inspect(browser): # Keep running until interrupted import time + try: while True: time.sleep(1) @@ -37,22 +38,23 @@ def cmd_record(args): browser = SentienceBrowser(headless=False) try: browser.start() - + # Navigate to start URL if provided if args.url: browser.page.goto(args.url) browser.page.wait_for_load_state("networkidle") - + print("✅ Recording started. Perform actions in the browser.") print("Press Ctrl+C to stop and save trace.") - + with record(browser, capture_snapshots=args.snapshots) as rec: # Add mask patterns if provided for pattern in args.mask or []: rec.add_mask_pattern(pattern) - + # Keep running until interrupted import time + try: while True: time.sleep(1) @@ -69,10 +71,10 @@ def cmd_gen(args): """Generate script from trace""" # Load trace trace = Trace.load(args.trace) - + # Generate script generator = ScriptGenerator(trace) - + if args.lang == "py": code = generator.generate_python() output = args.output or "generated.py" @@ -84,7 +86,7 @@ def cmd_gen(args): else: print(f"❌ Unsupported language: {args.lang}") sys.exit(1) - + print(f"✅ Generated {args.lang.upper()} script: {output}") @@ -92,35 +94,38 @@ def main(): """Main CLI entry point""" parser = argparse.ArgumentParser(description="Sentience SDK CLI") subparsers = parser.add_subparsers(dest="command", help="Commands") - + # Inspect command inspect_parser = subparsers.add_parser("inspect", help="Start inspector mode") inspect_parser.set_defaults(func=cmd_inspect) - + # Record command record_parser = subparsers.add_parser("record", help="Start recording mode") record_parser.add_argument("--url", help="Start URL") record_parser.add_argument("--output", "-o", help="Output trace file", default="trace.json") - record_parser.add_argument("--snapshots", action="store_true", help="Capture snapshots at each step") - record_parser.add_argument("--mask", action="append", help="Pattern to mask in recorded text (e.g., password)") + record_parser.add_argument( + "--snapshots", action="store_true", help="Capture snapshots at each step" + ) + record_parser.add_argument( + "--mask", action="append", help="Pattern to mask in recorded text (e.g., password)" + ) record_parser.set_defaults(func=cmd_record) - + # Generate command gen_parser = subparsers.add_parser("gen", help="Generate script from trace") gen_parser.add_argument("trace", help="Trace JSON file") gen_parser.add_argument("--lang", choices=["py", "ts"], default="py", help="Output language") gen_parser.add_argument("--output", "-o", help="Output script file") gen_parser.set_defaults(func=cmd_gen) - + args = parser.parse_args() - + if not args.command: parser.print_help() sys.exit(1) - + args.func(args) if __name__ == "__main__": main() - diff --git a/sentience/conversational_agent.py b/sentience/conversational_agent.py index 6ada4b9..27a6943 100644 --- a/sentience/conversational_agent.py +++ b/sentience/conversational_agent.py @@ -5,12 +5,13 @@ import json import time -from typing import Dict, Any, List, Optional -from .llm_provider import LLMProvider, LLMResponse -from .browser import SentienceBrowser +from typing import Any, Dict, List, Optional + from .agent import SentienceAgent -from .snapshot import snapshot +from .browser import SentienceBrowser +from .llm_provider import LLMProvider, LLMResponse from .models import Snapshot +from .snapshot import snapshot class ConversationalAgent: @@ -28,12 +29,7 @@ class ConversationalAgent: The top result is from amazon.com selling the Apple Magic Mouse 2 for $79." """ - def __init__( - self, - browser: SentienceBrowser, - llm: LLMProvider, - verbose: bool = True - ): + def __init__(self, browser: SentienceBrowser, llm: LLMProvider, verbose: bool = True): """ Initialize conversational agent @@ -50,12 +46,12 @@ def __init__( self.technical_agent = SentienceAgent(browser, llm, verbose=False) # Conversation history and context - self.conversation_history: List[Dict[str, Any]] = [] - self.execution_context: Dict[str, Any] = { + self.conversation_history: list[dict[str, Any]] = [] + self.execution_context: dict[str, Any] = { "current_url": None, "last_action": None, "discovered_elements": [], - "session_data": {} + "session_data": {}, } def execute(self, user_input: str) -> str: @@ -85,16 +81,16 @@ def execute(self, user_input: str) -> str: if self.verbose: print(f"\n📋 Execution Plan:") - for i, step in enumerate(plan['steps'], 1): + for i, step in enumerate(plan["steps"], 1): print(f" {i}. {step['description']}") # Step 2: Execute each step execution_results = [] - for step in plan['steps']: + for step in plan["steps"]: step_result = self._execute_step(step) execution_results.append(step_result) - if not step_result.get('success', False): + if not step_result.get("success", False): # Early exit on failure if self.verbose: print(f"⚠️ Step failed: {step['description']}") @@ -106,13 +102,15 @@ def execute(self, user_input: str) -> str: duration_ms = int((time.time() - start_time) * 1000) # Step 4: Update conversation history - self.conversation_history.append({ - "user_input": user_input, - "plan": plan, - "results": execution_results, - "response": response, - "duration_ms": duration_ms - }) + self.conversation_history.append( + { + "user_input": user_input, + "plan": plan, + "results": execution_results, + "response": response, + "duration_ms": duration_ms, + } + ) if self.verbose: print(f"\n🤖 Agent: {response}") @@ -120,7 +118,7 @@ def execute(self, user_input: str) -> str: return response - def _create_plan(self, user_input: str) -> Dict[str, Any]: + def _create_plan(self, user_input: str) -> dict[str, Any]: """ Use LLM to break down user input into atomic executable steps @@ -178,10 +176,7 @@ def _create_plan(self, user_input: str) -> Dict[str, Any]: try: response = self.llm.generate( - system_prompt, - user_prompt, - json_mode=self.llm.supports_json_mode(), - temperature=0.0 + system_prompt, user_prompt, json_mode=self.llm.supports_json_mode(), temperature=0.0 ) # Parse JSON response @@ -199,13 +194,13 @@ def _create_plan(self, user_input: str) -> Dict[str, Any]: { "action": "FIND_AND_CLICK", "description": user_input, - "parameters": {"element_description": user_input} + "parameters": {"element_description": user_input}, } ], - "expected_outcome": "Complete user request" + "expected_outcome": "Complete user request", } - def _execute_step(self, step: Dict[str, Any]) -> Dict[str, Any]: + def _execute_step(self, step: dict[str, Any]) -> dict[str, Any]: """ Execute a single atomic step from the plan @@ -215,70 +210,62 @@ def _execute_step(self, step: Dict[str, Any]) -> Dict[str, Any]: Returns: Execution result with success status and data """ - action = step['action'] - params = step.get('parameters', {}) + action = step["action"] + params = step.get("parameters", {}) if self.verbose: print(f"\n⚙️ Executing: {step['description']}") try: if action == "NAVIGATE": - url = params['url'] + url = params["url"] # Add https:// if missing - if not url.startswith(('http://', 'https://')): - url = 'https://' + url + if not url.startswith(("http://", "https://")): + url = "https://" + url self.browser.page.goto(url, wait_until="domcontentloaded") - self.execution_context['current_url'] = url + self.execution_context["current_url"] = url time.sleep(1) # Brief wait for page to settle - return { - "success": True, - "action": action, - "data": {"url": url} - } + return {"success": True, "action": action, "data": {"url": url}} elif action == "FIND_AND_CLICK": - element_desc = params['element_description'] + element_desc = params["element_description"] # Use technical agent to find and click (returns AgentActionResult) result = self.technical_agent.act(f"Click the {element_desc}") return { "success": result.success, # Use attribute access "action": action, - "data": result.model_dump() # Convert to dict for flexibility + "data": result.model_dump(), # Convert to dict for flexibility } elif action == "FIND_AND_TYPE": - element_desc = params['element_description'] - text = params['text'] + element_desc = params["element_description"] + text = params["text"] # Use technical agent to find input and type (returns AgentActionResult) result = self.technical_agent.act(f"Type '{text}' into {element_desc}") return { "success": result.success, # Use attribute access "action": action, - "data": {"text": text, "result": result.model_dump()} + "data": {"text": text, "result": result.model_dump()}, } elif action == "PRESS_KEY": - key = params['key'] + key = params["key"] result = self.technical_agent.act(f"Press {key} key") return { "success": result.success, # Use attribute access "action": action, - "data": {"key": key, "result": result.model_dump()} + "data": {"key": key, "result": result.model_dump()}, } elif action == "WAIT": - duration = params.get('duration', 2.0) + duration = params.get("duration", 2.0) time.sleep(duration) - return { - "success": True, - "action": action, - "data": {"duration": duration} - } + return {"success": True, "action": action, "data": {"duration": duration}} elif action == "EXTRACT_INFO": - info_type = params['info_type'] + info_type = params["info_type"] # Get current page snapshot and extract info snap = snapshot(self.browser, limit=50) @@ -288,17 +275,17 @@ def _execute_step(self, step: Dict[str, Any]) -> Dict[str, Any]: return { "success": True, "action": action, - "data": {"extracted": extracted, "info_type": info_type} + "data": {"extracted": extracted, "info_type": info_type}, } elif action == "VERIFY": - condition = params['condition'] + condition = params["condition"] # Verify condition using current page state is_verified = self._verify_condition(condition) return { "success": is_verified, "action": action, - "data": {"condition": condition, "verified": is_verified} + "data": {"condition": condition, "verified": is_verified}, } else: @@ -307,13 +294,9 @@ def _execute_step(self, step: Dict[str, Any]) -> Dict[str, Any]: except Exception as e: if self.verbose: print(f"❌ Step failed: {e}") - return { - "success": False, - "action": action, - "error": str(e) - } + return {"success": False, "action": action, "error": str(e)} - def _extract_information(self, snap: Snapshot, info_type: str) -> Dict[str, Any]: + def _extract_information(self, snap: Snapshot, info_type: str) -> dict[str, Any]: """ Extract specific information from snapshot using LLM @@ -325,10 +308,12 @@ def _extract_information(self, snap: Snapshot, info_type: str) -> Dict[str, Any] Extracted information dictionary """ # Build context from snapshot - elements_text = "\n".join([ - f"[{el.id}] {el.role}: {el.text} (importance: {el.importance})" - for el in snap.elements[:30] # Top 30 elements - ]) + elements_text = "\n".join( + [ + f"[{el.id}] {el.role}: {el.text} (importance: {el.importance})" + for el in snap.elements[:30] # Top 30 elements + ] + ) system_prompt = f"""Extract {info_type} from the following page elements. @@ -348,9 +333,7 @@ def _extract_information(self, snap: Snapshot, info_type: str) -> Dict[str, Any] try: response = self.llm.generate( - system_prompt, - user_prompt, - json_mode=self.llm.supports_json_mode() + system_prompt, user_prompt, json_mode=self.llm.supports_json_mode() ) return json.loads(response.content) except: @@ -370,10 +353,7 @@ def _verify_condition(self, condition: str) -> bool: snap = snapshot(self.browser, limit=30) # Build context - elements_text = "\n".join([ - f"{el.role}: {el.text}" - for el in snap.elements[:20] - ]) + elements_text = "\n".join([f"{el.role}: {el.text}" for el in snap.elements[:20]]) system_prompt = f"""Verify if the following condition is met based on page elements. @@ -388,21 +368,14 @@ def _verify_condition(self, condition: str) -> bool: "reasoning": "explanation" }}""" - response = self.llm.generate( - system_prompt, - "", - json_mode=self.llm.supports_json_mode() - ) + response = self.llm.generate(system_prompt, "", json_mode=self.llm.supports_json_mode()) result = json.loads(response.content) - return result.get('verified', False) + return result.get("verified", False) except: return False def _synthesize_response( - self, - user_input: str, - plan: Dict[str, Any], - execution_results: List[Dict[str, Any]] + self, user_input: str, plan: dict[str, Any], execution_results: list[dict[str, Any]] ) -> str: """ Synthesize a natural language response from execution results @@ -416,14 +389,14 @@ def _synthesize_response( Human-readable response string """ # Build summary of what happened - successful_steps = [r for r in execution_results if r.get('success')] - failed_steps = [r for r in execution_results if not r.get('success')] + successful_steps = [r for r in execution_results if r.get("success")] + failed_steps = [r for r in execution_results if not r.get("success")] # Extract key data extracted_data = [] for result in execution_results: - if result.get('action') == 'EXTRACT_INFO': - extracted_data.append(result.get('data', {}).get('extracted', {})) + if result.get("action") == "EXTRACT_INFO": + extracted_data.append(result.get("data", {}).get("extracted", {})) # Use LLM to create natural response system_prompt = """You are a helpful assistant that summarizes web automation results @@ -439,12 +412,12 @@ def _synthesize_response( results_summary = { "user_request": user_input, - "plan_intent": plan.get('intent'), + "plan_intent": plan.get("intent"), "total_steps": len(execution_results), "successful_steps": len(successful_steps), "failed_steps": len(failed_steps), "extracted_data": extracted_data, - "final_url": self.browser.page.url if self.browser.page else None + "final_url": self.browser.page.url if self.browser.page else None, } user_prompt = f"""Summarize these automation results in 1-3 natural sentences: @@ -497,12 +470,9 @@ def get_summary(self) -> str: session_data = { "total_interactions": len(self.conversation_history), "actions": [ - { - "request": h['user_input'], - "outcome": h['response'] - } + {"request": h["user_input"], "outcome": h["response"]} for h in self.conversation_history - ] + ], } user_prompt = f"Summarize this session:\n{json.dumps(session_data, indent=2)}" @@ -510,8 +480,8 @@ def get_summary(self) -> str: try: summary = self.llm.generate(system_prompt, user_prompt) return summary.content.strip() - except: - return f"Session with {len(self.conversation_history)} interactions completed." + except Exception as ex: + return f"Session with {len(self.conversation_history)} interactions completed with exception: {ex}" def clear_history(self): """Clear conversation history""" @@ -521,5 +491,5 @@ def clear_history(self): "current_url": None, "last_action": None, "discovered_elements": [], - "session_data": {} + "session_data": {}, } diff --git a/sentience/expect.py b/sentience/expect.py index ac0d9e4..01de429 100644 --- a/sentience/expect.py +++ b/sentience/expect.py @@ -3,99 +3,91 @@ """ import time -from typing import Union, Optional +from typing import Optional, Union + from .browser import SentienceBrowser from .models import Element -from .wait import wait_for from .query import query +from .wait import wait_for class Expectation: """Assertion helper for element expectations""" - - def __init__(self, browser: SentienceBrowser, selector: Union[str, dict]): + + def __init__(self, browser: SentienceBrowser, selector: str | dict): self.browser = browser self.selector = selector - + def to_be_visible(self, timeout: float = 10.0) -> Element: """Assert element is visible (exists and in viewport)""" result = wait_for(self.browser, self.selector, timeout=timeout) - + if not result.found: - raise AssertionError( - f"Element not found: {self.selector} (timeout: {timeout}s)" - ) - + raise AssertionError(f"Element not found: {self.selector} (timeout: {timeout}s)") + element = result.element if not element.in_viewport: - raise AssertionError( - f"Element found but not visible in viewport: {self.selector}" - ) - + raise AssertionError(f"Element found but not visible in viewport: {self.selector}") + return element - + def to_exist(self, timeout: float = 10.0) -> Element: """Assert element exists""" result = wait_for(self.browser, self.selector, timeout=timeout) - + if not result.found: - raise AssertionError( - f"Element does not exist: {self.selector} (timeout: {timeout}s)" - ) - + raise AssertionError(f"Element does not exist: {self.selector} (timeout: {timeout}s)") + return result.element - + def to_have_text(self, expected_text: str, timeout: float = 10.0) -> Element: """Assert element has specific text""" result = wait_for(self.browser, self.selector, timeout=timeout) - + if not result.found: - raise AssertionError( - f"Element not found: {self.selector} (timeout: {timeout}s)" - ) - + raise AssertionError(f"Element not found: {self.selector} (timeout: {timeout}s)") + element = result.element if not element.text or expected_text not in element.text: raise AssertionError( f"Element text mismatch. Expected '{expected_text}', got '{element.text}'" ) - + return element - + def to_have_count(self, expected_count: int, timeout: float = 10.0) -> None: """Assert selector matches exactly N elements""" from .snapshot import snapshot - + start_time = time.time() while time.time() - start_time < timeout: snap = snapshot(self.browser) matches = query(snap, self.selector) - + if len(matches) == expected_count: return - + time.sleep(0.25) - + # Final check snap = snapshot(self.browser) matches = query(snap, self.selector) actual_count = len(matches) - + raise AssertionError( f"Element count mismatch. Expected {expected_count}, got {actual_count}" ) -def expect(browser: SentienceBrowser, selector: Union[str, dict]) -> Expectation: +def expect(browser: SentienceBrowser, selector: str | dict) -> Expectation: """ Create expectation helper for assertions - + Args: browser: SentienceBrowser instance selector: String DSL or dict query - + Returns: Expectation helper """ return Expectation(browser, selector) - diff --git a/sentience/extension/content.js b/sentience/extension/content.js index de24fa5..6955acd 100644 --- a/sentience/extension/content.js +++ b/sentience/extension/content.js @@ -19,4 +19,4 @@ window.addEventListener('message', (event) => { }, '*'); } ); -}); \ No newline at end of file +}); diff --git a/sentience/extension/injected_api.js b/sentience/extension/injected_api.js index 941478e..fbd0aac 100644 --- a/sentience/extension/injected_api.js +++ b/sentience/extension/injected_api.js @@ -3,7 +3,7 @@ // 1. Get Extension ID (Wait for content.js to set it) const getExtensionId = () => document.documentElement.dataset.sentienceExtensionId; let extId = getExtensionId(); - + // Safety poller for async loading race conditions if (!extId) { await new Promise(resolve => { @@ -39,7 +39,7 @@ return NodeFilter.FILTER_ACCEPT; } }; - + const walker = document.createTreeWalker(root, NodeFilter.SHOW_ELEMENT, filter); while(walker.nextNode()) { const node = walker.currentNode; @@ -100,13 +100,13 @@ // Fast center-point check const cx = rect.x + rect.width / 2; const cy = rect.y + rect.height / 2; - + // If point is off-screen, elementFromPoint returns null, assume NOT occluded for safety if (cx < 0 || cx > window.innerWidth || cy < 0 || cy > window.innerHeight) return false; const topEl = document.elementFromPoint(cx, cy); if (!topEl) return false; - + // It's visible if the top element is us, or contains us, or we contain it return !(el === topEl || el.contains(topEl) || topEl.contains(el)); } @@ -131,7 +131,7 @@ function getRawHTML(root) { const sourceRoot = root || document.body; const clone = sourceRoot.cloneNode(true); - + // Remove unwanted elements by tag name (simple and reliable) const unwantedTags = ['nav', 'footer', 'header', 'script', 'style', 'noscript', 'iframe', 'svg']; unwantedTags.forEach(tag => { @@ -157,7 +157,7 @@ while (node = walker.nextNode()) { const tag = node.tagName.toLowerCase(); if (tag === 'head' || tag === 'title') continue; - + const style = window.getComputedStyle(node); if (style.display === 'none' || style.visibility === 'hidden' || (node.offsetWidth === 0 && node.offsetHeight === 0)) { @@ -222,11 +222,11 @@ function convertToMarkdown(root) { // Get cleaned HTML first const rawHTML = getRawHTML(root); - + // Create a temporary container to parse the HTML const tempDiv = document.createElement('div'); tempDiv.innerHTML = rawHTML; - + let markdown = ''; let insideLink = false; // Track if we're inside an tag @@ -279,7 +279,7 @@ } walk(tempDiv); - + // Cleanup: remove excessive newlines return markdown.replace(/\n{3,}/g, '\n\n').trim(); } @@ -299,17 +299,17 @@ const style = window.getComputedStyle(node); if (style.display === 'none' || style.visibility === 'hidden') return; - + // Block level elements get a newline const isBlock = style.display === 'block' || style.display === 'flex' || node.tagName === 'P' || node.tagName === 'DIV'; if (isBlock) text += ' '; - + if (node.shadowRoot) { Array.from(node.shadowRoot.childNodes).forEach(walk); } else { node.childNodes.forEach(walk); } - + if (isBlock) text += '\n'; } } @@ -331,7 +331,7 @@ }; await module.default(undefined, imports); wasmModule = module; - + // Verify functions are available if (!wasmModule.analyze_page) { console.error('[SentienceAPI.com] available'); @@ -354,16 +354,16 @@ const rawData = []; // Remove textMap as we include text in rawData window.sentience_registry = []; - + const nodes = getAllElements(); - + nodes.forEach((el, idx) => { if (!el.getBoundingClientRect) return; const rect = el.getBoundingClientRect(); if (rect.width < 5 || rect.height < 5) return; window.sentience_registry[idx] = el; - + // Calculate properties for Fat Payload const textVal = getText(el); const inView = isInViewport(rect); @@ -453,7 +453,7 @@ // Prune raw elements using WASM before sending to API // This prevents 413 errors on large sites (Amazon: 5000+ -> ~200-400) const prunedRawData = wasmModule.prune_for_api(rawData); - + // Clean up null/undefined fields in raw_elements as well const cleanedRawElements = cleanElement(prunedRawData); @@ -469,7 +469,7 @@ read: (options = {}) => { const format = options.format || 'raw'; // 'raw', 'text', or 'markdown' let content; - + if (format === 'raw') { // Return raw HTML suitable for Turndown or other Node.js libraries content = getRawHTML(document.body); @@ -480,7 +480,7 @@ // Default to text content = convertToText(document.body); } - + return { status: "success", url: window.location.href, @@ -497,4 +497,4 @@ return false; } }; -})(); \ No newline at end of file +})(); diff --git a/sentience/extension/manifest.json b/sentience/extension/manifest.json index a27f969..105341b 100644 --- a/sentience/extension/manifest.json +++ b/sentience/extension/manifest.json @@ -27,4 +27,4 @@ "content_security_policy": { "extension_pages": "script-src 'self' 'wasm-unsafe-eval'; object-src 'self'" } -} \ No newline at end of file +} diff --git a/sentience/generator.py b/sentience/generator.py index 90c74d1..01fd12a 100644 --- a/sentience/generator.py +++ b/sentience/generator.py @@ -4,199 +4,203 @@ import json from typing import List, Optional -from .recorder import Trace, TraceStep + from .query import find +from .recorder import Trace, TraceStep class ScriptGenerator: """Generates Python or TypeScript code from a trace""" - + def __init__(self, trace: Trace): self.trace = trace - + def generate_python(self) -> str: """Generate Python script from trace""" lines = [ '"""', - f'Generated script from trace: {self.trace.start_url}', - f'Created: {self.trace.created_at}', + f"Generated script from trace: {self.trace.start_url}", + f"Created: {self.trace.created_at}", '"""', - '', - 'from sentience import SentienceBrowser, snapshot, find, click, type_text, press', - '', - 'def main():', - ' with SentienceBrowser(headless=False) as browser:', + "", + "from sentience import SentienceBrowser, snapshot, find, click, type_text, press", + "", + "def main():", + " with SentienceBrowser(headless=False) as browser:", f' browser.page.goto("{self.trace.start_url}")', ' browser.page.wait_for_load_state("networkidle")', - '', + "", ] - + for step in self.trace.steps: - lines.extend(self._generate_python_step(step, indent=' ')) - - lines.extend([ - '', - 'if __name__ == "__main__":', - ' main()', - ]) - - return '\n'.join(lines) - + lines.extend(self._generate_python_step(step, indent=" ")) + + lines.extend( + [ + "", + 'if __name__ == "__main__":', + " main()", + ] + ) + + return "\n".join(lines) + def generate_typescript(self) -> str: """Generate TypeScript script from trace""" lines = [ - '/**', - f' * Generated script from trace: {self.trace.start_url}', - f' * Created: {self.trace.created_at}', - ' */', - '', + "/**", + f" * Generated script from trace: {self.trace.start_url}", + f" * Created: {self.trace.created_at}", + " */", + "", "import { SentienceBrowser, snapshot, find, click, typeText, press } from './src';", - '', - 'async function main() {', - ' const browser = new SentienceBrowser(undefined, false);', - '', - ' try {', - ' await browser.start();', - f' await browser.getPage().goto(\'{self.trace.start_url}\');', - ' await browser.getPage().waitForLoadState(\'networkidle\');', - '', + "", + "async function main() {", + " const browser = new SentienceBrowser(undefined, false);", + "", + " try {", + " await browser.start();", + f" await browser.getPage().goto('{self.trace.start_url}');", + " await browser.getPage().waitForLoadState('networkidle');", + "", ] - + for step in self.trace.steps: - lines.extend(self._generate_typescript_step(step, indent=' ')) - - lines.extend([ - ' } finally {', - ' await browser.close();', - ' }', - '}', - '', - 'main().catch(console.error);', - ]) - - return '\n'.join(lines) - - def _generate_python_step(self, step: TraceStep, indent: str = '') -> List[str]: + lines.extend(self._generate_typescript_step(step, indent=" ")) + + lines.extend( + [ + " } finally {", + " await browser.close();", + " }", + "}", + "", + "main().catch(console.error);", + ] + ) + + return "\n".join(lines) + + def _generate_python_step(self, step: TraceStep, indent: str = "") -> list[str]: """Generate Python code for a single step""" lines = [] - - if step.type == 'navigation': - lines.append(f'{indent}# Navigate to {step.url}') + + if step.type == "navigation": + lines.append(f"{indent}# Navigate to {step.url}") lines.append(f'{indent}browser.page.goto("{step.url}")') lines.append(f'{indent}browser.page.wait_for_load_state("networkidle")') - - elif step.type == 'click': + + elif step.type == "click": if step.selector: # Use semantic selector - lines.append(f'{indent}# Click: {step.selector}') - lines.append(f'{indent}snap = snapshot(browser)') + lines.append(f"{indent}# Click: {step.selector}") + lines.append(f"{indent}snap = snapshot(browser)") lines.append(f'{indent}element = find(snap, "{step.selector}")') - lines.append(f'{indent}if element:') - lines.append(f'{indent} click(browser, element.id)') - lines.append(f'{indent}else:') + lines.append(f"{indent}if element:") + lines.append(f"{indent} click(browser, element.id)") + lines.append(f"{indent}else:") lines.append(f'{indent} raise Exception("Element not found: {step.selector}")') elif step.element_id is not None: # Fallback to element ID - lines.append(f'{indent}# TODO: replace with semantic selector') - lines.append(f'{indent}click(browser, {step.element_id})') - lines.append('') - - elif step.type == 'type': + lines.append(f"{indent}# TODO: replace with semantic selector") + lines.append(f"{indent}click(browser, {step.element_id})") + lines.append("") + + elif step.type == "type": if step.selector: - lines.append(f'{indent}# Type into: {step.selector}') - lines.append(f'{indent}snap = snapshot(browser)') + lines.append(f"{indent}# Type into: {step.selector}") + lines.append(f"{indent}snap = snapshot(browser)") lines.append(f'{indent}element = find(snap, "{step.selector}")') - lines.append(f'{indent}if element:') + lines.append(f"{indent}if element:") lines.append(f'{indent} type_text(browser, element.id, "{step.text}")') - lines.append(f'{indent}else:') + lines.append(f"{indent}else:") lines.append(f'{indent} raise Exception("Element not found: {step.selector}")') elif step.element_id is not None: - lines.append(f'{indent}# TODO: replace with semantic selector') + lines.append(f"{indent}# TODO: replace with semantic selector") lines.append(f'{indent}type_text(browser, {step.element_id}, "{step.text}")') - lines.append('') - - elif step.type == 'press': - lines.append(f'{indent}# Press key: {step.key}') + lines.append("") + + elif step.type == "press": + lines.append(f"{indent}# Press key: {step.key}") lines.append(f'{indent}press(browser, "{step.key}")') - lines.append('') - + lines.append("") + return lines - - def _generate_typescript_step(self, step: TraceStep, indent: str = '') -> List[str]: + + def _generate_typescript_step(self, step: TraceStep, indent: str = "") -> list[str]: """Generate TypeScript code for a single step""" lines = [] - - if step.type == 'navigation': - lines.append(f'{indent}// Navigate to {step.url}') - lines.append(f'{indent}await browser.getPage().goto(\'{step.url}\');') - lines.append(f'{indent}await browser.getPage().waitForLoadState(\'networkidle\');') - - elif step.type == 'click': + + if step.type == "navigation": + lines.append(f"{indent}// Navigate to {step.url}") + lines.append(f"{indent}await browser.getPage().goto('{step.url}');") + lines.append(f"{indent}await browser.getPage().waitForLoadState('networkidle');") + + elif step.type == "click": if step.selector: - lines.append(f'{indent}// Click: {step.selector}') - lines.append(f'{indent}const snap = await snapshot(browser);') - lines.append(f'{indent}const element = find(snap, \'{step.selector}\');') - lines.append(f'{indent}if (element) {{') - lines.append(f'{indent} await click(browser, element.id);') - lines.append(f'{indent}}} else {{') - lines.append(f'{indent} throw new Error(\'Element not found: {step.selector}\');') - lines.append(f'{indent}}}') + lines.append(f"{indent}// Click: {step.selector}") + lines.append(f"{indent}const snap = await snapshot(browser);") + lines.append(f"{indent}const element = find(snap, '{step.selector}');") + lines.append(f"{indent}if (element) {{") + lines.append(f"{indent} await click(browser, element.id);") + lines.append(f"{indent}}} else {{") + lines.append(f"{indent} throw new Error('Element not found: {step.selector}');") + lines.append(f"{indent}}}") elif step.element_id is not None: - lines.append(f'{indent}// TODO: replace with semantic selector') - lines.append(f'{indent}await click(browser, {step.element_id});') - lines.append('') - - elif step.type == 'type': + lines.append(f"{indent}// TODO: replace with semantic selector") + lines.append(f"{indent}await click(browser, {step.element_id});") + lines.append("") + + elif step.type == "type": if step.selector: - lines.append(f'{indent}// Type into: {step.selector}') - lines.append(f'{indent}const snap = await snapshot(browser);') - lines.append(f'{indent}const element = find(snap, \'{step.selector}\');') - lines.append(f'{indent}if (element) {{') - lines.append(f'{indent} await typeText(browser, element.id, \'{step.text}\');') - lines.append(f'{indent}}} else {{') - lines.append(f'{indent} throw new Error(\'Element not found: {step.selector}\');') - lines.append(f'{indent}}}') + lines.append(f"{indent}// Type into: {step.selector}") + lines.append(f"{indent}const snap = await snapshot(browser);") + lines.append(f"{indent}const element = find(snap, '{step.selector}');") + lines.append(f"{indent}if (element) {{") + lines.append(f"{indent} await typeText(browser, element.id, '{step.text}');") + lines.append(f"{indent}}} else {{") + lines.append(f"{indent} throw new Error('Element not found: {step.selector}');") + lines.append(f"{indent}}}") elif step.element_id is not None: - lines.append(f'{indent}// TODO: replace with semantic selector') - lines.append(f'{indent}await typeText(browser, {step.element_id}, \'{step.text}\');') - lines.append('') - - elif step.type == 'press': - lines.append(f'{indent}// Press key: {step.key}') - lines.append(f'{indent}await press(browser, \'{step.key}\');') - lines.append('') - + lines.append(f"{indent}// TODO: replace with semantic selector") + lines.append(f"{indent}await typeText(browser, {step.element_id}, '{step.text}');") + lines.append("") + + elif step.type == "press": + lines.append(f"{indent}// Press key: {step.key}") + lines.append(f"{indent}await press(browser, '{step.key}');") + lines.append("") + return lines - + def save_python(self, filepath: str) -> None: """Generate and save Python script""" code = self.generate_python() - with open(filepath, 'w') as f: + with open(filepath, "w") as f: f.write(code) - + def save_typescript(self, filepath: str) -> None: """Generate and save TypeScript script""" code = self.generate_typescript() - with open(filepath, 'w') as f: + with open(filepath, "w") as f: f.write(code) -def generate(trace: Trace, language: str = 'py') -> str: +def generate(trace: Trace, language: str = "py") -> str: """ Generate script from trace - + Args: trace: Trace object language: 'py' or 'ts' - + Returns: Generated code as string """ generator = ScriptGenerator(trace) - if language == 'py': + if language == "py": return generator.generate_python() - elif language == 'ts': + elif language == "ts": return generator.generate_typescript() else: raise ValueError(f"Unsupported language: {language}. Use 'py' or 'ts'") - diff --git a/sentience/inspector.py b/sentience/inspector.py index eb99bef..c6abe91 100644 --- a/sentience/inspector.py +++ b/sentience/inspector.py @@ -3,42 +3,44 @@ """ from typing import Optional + from .browser import SentienceBrowser -from .snapshot import snapshot from .query import find +from .snapshot import snapshot class Inspector: """Inspector for debugging - shows element info on hover/click""" - + def __init__(self, browser: SentienceBrowser): self.browser = browser self._active = False - self._last_element_id: Optional[int] = None - + self._last_element_id: int | None = None + def start(self) -> None: """Start inspection mode - prints element info on mouse move/click""" if not self.browser.page: raise RuntimeError("Browser not started. Call browser.start() first.") - + self._active = True - + # Inject inspector script into page - self.browser.page.evaluate(""" + self.browser.page.evaluate( + """ (() => { // Remove existing inspector if any if (window.__sentience_inspector_active) { return; } - + window.__sentience_inspector_active = true; window.__sentience_last_element_id = null; - + // Get element at point function getElementAtPoint(x, y) { const el = document.elementFromPoint(x, y); if (!el) return null; - + // Find element in registry if (window.sentience_registry) { for (let i = 0; i < window.sentience_registry.length; i++) { @@ -49,46 +51,46 @@ def start(self) -> None: } return null; } - + // Mouse move handler function handleMouseMove(e) { if (!window.__sentience_inspector_active) return; - + const elementId = getElementAtPoint(e.clientX, e.clientY); if (elementId === null || elementId === window.__sentience_last_element_id) { return; } - + window.__sentience_last_element_id = elementId; - + // Get element info from snapshot if available if (window.sentience && window.sentience_registry) { const el = window.sentience_registry[elementId]; if (el) { const rect = el.getBoundingClientRect(); - const text = el.getAttribute('aria-label') || - el.value || - el.placeholder || - el.alt || + const text = el.getAttribute('aria-label') || + el.value || + el.placeholder || + el.alt || (el.innerText || '').substring(0, 50); - + const role = el.getAttribute('role') || el.tagName.toLowerCase(); - + console.log(`[Sentience Inspector] Element #${elementId}: role=${role}, text="${text}", bbox=(${Math.round(rect.x)}, ${Math.round(rect.y)}, ${Math.round(rect.width)}, ${Math.round(rect.height)})`); } } } - + // Click handler function handleClick(e) { if (!window.__sentience_inspector_active) return; - + e.preventDefault(); e.stopPropagation(); - + const elementId = getElementAtPoint(e.clientX, e.clientY); if (elementId === null) return; - + // Get full element info if (window.sentience && window.sentience_registry) { const el = window.sentience_registry[elementId]; @@ -98,10 +100,10 @@ def start(self) -> None: id: elementId, tag: el.tagName.toLowerCase(), role: el.getAttribute('role') || 'generic', - text: el.getAttribute('aria-label') || - el.value || - el.placeholder || - el.alt || + text: el.getAttribute('aria-label') || + el.value || + el.placeholder || + el.alt || (el.innerText || '').substring(0, 100), bbox: { x: Math.round(rect.x), @@ -116,9 +118,9 @@ def start(self) -> None: type: el.type || null } }; - + console.log('[Sentience Inspector] Clicked element:', JSON.stringify(info, null, 2)); - + // Also try to get from snapshot if available window.sentience.snapshot({ limit: 100 }).then(snap => { const element = snap.elements.find(el => el.id === elementId); @@ -129,43 +131,46 @@ def start(self) -> None: } } } - + // Add event listeners document.addEventListener('mousemove', handleMouseMove, true); document.addEventListener('click', handleClick, true); - + // Store cleanup function window.__sentience_inspector_cleanup = () => { document.removeEventListener('mousemove', handleMouseMove, true); document.removeEventListener('click', handleClick, true); window.__sentience_inspector_active = false; }; - + console.log('[Sentience Inspector] ✅ Inspection mode active. Hover elements to see info, click to see full details.'); })(); - """) - + """ + ) + def stop(self) -> None: """Stop inspection mode""" if not self.browser.page: return - + self._active = False - + # Cleanup inspector - self.browser.page.evaluate(""" + self.browser.page.evaluate( + """ () => { if (window.__sentience_inspector_cleanup) { window.__sentience_inspector_cleanup(); } } - """) - + """ + ) + def __enter__(self): """Context manager entry""" self.start() return self - + def __exit__(self, exc_type, exc_val, exc_tb): """Context manager exit""" self.stop() @@ -174,12 +179,11 @@ def __exit__(self, exc_type, exc_val, exc_tb): def inspect(browser: SentienceBrowser) -> Inspector: """ Create an inspector instance - + Args: browser: SentienceBrowser instance - + Returns: Inspector instance """ return Inspector(browser) - diff --git a/sentience/llm_provider.py b/sentience/llm_provider.py index a8bf8a0..1c4200f 100644 --- a/sentience/llm_provider.py +++ b/sentience/llm_provider.py @@ -4,19 +4,20 @@ """ from abc import ABC, abstractmethod -from typing import Optional, Dict, Any from dataclasses import dataclass +from typing import Any, Dict, Optional @dataclass class LLMResponse: """Standardized LLM response across all providers""" + content: str - prompt_tokens: Optional[int] = None - completion_tokens: Optional[int] = None - total_tokens: Optional[int] = None - model_name: Optional[str] = None - finish_reason: Optional[str] = None + prompt_tokens: int | None = None + completion_tokens: int | None = None + total_tokens: int | None = None + model_name: str | None = None + finish_reason: str | None = None class LLMProvider(ABC): @@ -32,12 +33,7 @@ class LLMProvider(ABC): """ @abstractmethod - def generate( - self, - system_prompt: str, - user_prompt: str, - **kwargs - ) -> LLMResponse: + def generate(self, system_prompt: str, user_prompt: str, **kwargs) -> LLMResponse: """ Generate a response from the LLM @@ -86,10 +82,10 @@ class OpenAIProvider(LLMProvider): def __init__( self, - api_key: Optional[str] = None, + api_key: str | None = None, model: str = "gpt-4o", - base_url: Optional[str] = None, - organization: Optional[str] = None + base_url: str | None = None, + organization: str | None = None, ): """ Initialize OpenAI provider @@ -103,15 +99,9 @@ def __init__( try: from openai import OpenAI except ImportError: - raise ImportError( - "OpenAI package not installed. Install with: pip install openai" - ) + raise ImportError("OpenAI package not installed. Install with: pip install openai") - self.client = OpenAI( - api_key=api_key, - base_url=base_url, - organization=organization - ) + self.client = OpenAI(api_key=api_key, base_url=base_url, organization=organization) self._model_name = model def generate( @@ -119,9 +109,9 @@ def generate( system_prompt: str, user_prompt: str, temperature: float = 0.0, - max_tokens: Optional[int] = None, + max_tokens: int | None = None, json_mode: bool = False, - **kwargs + **kwargs, ) -> LLMResponse: """ Generate response using OpenAI API @@ -170,7 +160,7 @@ def generate( completion_tokens=usage.completion_tokens if usage else None, total_tokens=usage.total_tokens if usage else None, model_name=response.model, - finish_reason=choice.finish_reason + finish_reason=choice.finish_reason, ) def supports_json_mode(self) -> bool: @@ -194,11 +184,7 @@ class AnthropicProvider(LLMProvider): >>> print(response.content) """ - def __init__( - self, - api_key: Optional[str] = None, - model: str = "claude-3-5-sonnet-20241022" - ): + def __init__(self, api_key: str | None = None, model: str = "claude-3-5-sonnet-20241022"): """ Initialize Anthropic provider @@ -222,7 +208,7 @@ def generate( user_prompt: str, temperature: float = 0.0, max_tokens: int = 1024, - **kwargs + **kwargs, ) -> LLMResponse: """ Generate response using Anthropic API @@ -242,7 +228,7 @@ def generate( "model": self._model_name, "max_tokens": max_tokens, "temperature": temperature, - "messages": [{"role": "user", "content": user_prompt}] + "messages": [{"role": "user", "content": user_prompt}], } if system_prompt: @@ -258,11 +244,15 @@ def generate( return LLMResponse( content=content, - prompt_tokens=response.usage.input_tokens if hasattr(response, 'usage') else None, - completion_tokens=response.usage.output_tokens if hasattr(response, 'usage') else None, - total_tokens=(response.usage.input_tokens + response.usage.output_tokens) if hasattr(response, 'usage') else None, + prompt_tokens=response.usage.input_tokens if hasattr(response, "usage") else None, + completion_tokens=response.usage.output_tokens if hasattr(response, "usage") else None, + total_tokens=( + (response.usage.input_tokens + response.usage.output_tokens) + if hasattr(response, "usage") + else None + ), model_name=response.model, - finish_reason=response.stop_reason + finish_reason=response.stop_reason, ) def supports_json_mode(self) -> bool: @@ -291,7 +281,7 @@ def __init__( device: str = "auto", load_in_4bit: bool = False, load_in_8bit: bool = False, - torch_dtype: str = "auto" + torch_dtype: str = "auto", ): """ Initialize local LLM using HuggingFace Transformers @@ -310,7 +300,7 @@ def __init__( """ try: import torch - from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig + from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig except ImportError: raise ImportError( "transformers and torch required for local LLM. " @@ -320,10 +310,7 @@ def __init__( self._model_name = model_name # Load tokenizer - self.tokenizer = AutoTokenizer.from_pretrained( - model_name, - trust_remote_code=True - ) + self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) # Set padding token if not present if self.tokenizer.pad_token is None: @@ -336,7 +323,7 @@ def __init__( load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16, bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type="nf4" + bnb_4bit_quant_type="nf4", ) elif load_in_8bit: quantization_config = BitsAndBytesConfig(load_in_8bit=True) @@ -354,7 +341,7 @@ def __init__( torch_dtype=dtype if quantization_config is None else None, device_map=device, trust_remote_code=True, - low_cpu_mem_usage=True + low_cpu_mem_usage=True, ) self.model.eval() @@ -365,7 +352,7 @@ def generate( max_new_tokens: int = 512, temperature: float = 0.1, top_p: float = 0.9, - **kwargs + **kwargs, ) -> LLMResponse: """ Generate response using local model @@ -393,11 +380,9 @@ def generate( messages.append({"role": "user", "content": user_prompt}) # Use model's native chat template if available - if hasattr(self.tokenizer, 'apply_chat_template'): + if hasattr(self.tokenizer, "apply_chat_template"): formatted_prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True + messages, tokenize=False, add_generation_prompt=True ) else: # Fallback formatting @@ -407,13 +392,11 @@ def generate( formatted_prompt += f"User: {user_prompt}\n\nAssistant:" # Tokenize - inputs = self.tokenizer( - formatted_prompt, - return_tensors="pt", - truncation=True - ).to(self.model.device) + inputs = self.tokenizer(formatted_prompt, return_tensors="pt", truncation=True).to( + self.model.device + ) - input_length = inputs['input_ids'].shape[1] + input_length = inputs["input_ids"].shape[1] # Generate with torch.no_grad(): @@ -425,22 +408,19 @@ def generate( do_sample=do_sample, pad_token_id=self.tokenizer.pad_token_id, eos_token_id=self.tokenizer.eos_token_id, - **kwargs + **kwargs, ) # Decode only the new tokens generated_tokens = outputs[0][input_length:] - response_text = self.tokenizer.decode( - generated_tokens, - skip_special_tokens=True - ).strip() + response_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip() return LLMResponse( content=response_text, prompt_tokens=input_length, completion_tokens=len(generated_tokens), total_tokens=input_length + len(generated_tokens), - model_name=self._model_name + model_name=self._model_name, ) def supports_json_mode(self) -> bool: diff --git a/sentience/models.py b/sentience/models.py index 18a2795..fc7bb98 100644 --- a/sentience/models.py +++ b/sentience/models.py @@ -2,13 +2,14 @@ Pydantic models for Sentience SDK - matches spec/snapshot.schema.json """ +from typing import List, Literal, Optional, Union + from pydantic import BaseModel, Field -from typing import Optional, List, Literal, Union -from datetime import datetime class BBox(BaseModel): """Bounding box coordinates""" + x: float y: float width: float @@ -17,22 +18,25 @@ class BBox(BaseModel): class Viewport(BaseModel): """Viewport dimensions""" + width: float height: float class VisualCues(BaseModel): """Visual analysis cues""" + is_primary: bool - background_color_name: Optional[str] = None + background_color_name: str | None = None is_clickable: bool class Element(BaseModel): """Element from snapshot""" + id: int role: str - text: Optional[str] = None + text: str | None = None importance: int bbox: BBox visual_cues: VisualCues @@ -43,54 +47,61 @@ class Element(BaseModel): class Snapshot(BaseModel): """Snapshot response from extension""" + status: Literal["success", "error"] - timestamp: Optional[str] = None + timestamp: str | None = None url: str - viewport: Optional[Viewport] = None - elements: List[Element] - screenshot: Optional[str] = None - screenshot_format: Optional[Literal["png", "jpeg"]] = None - error: Optional[str] = None - requires_license: Optional[bool] = None + viewport: Viewport | None = None + elements: list[Element] + screenshot: str | None = None + screenshot_format: Literal["png", "jpeg"] | None = None + error: str | None = None + requires_license: bool | None = None def save(self, filepath: str) -> None: """Save snapshot as JSON file""" import json - with open(filepath, 'w') as f: + + with open(filepath, "w") as f: json.dump(self.model_dump(), f, indent=2) class ActionResult(BaseModel): """Result of an action (click, type, press)""" + success: bool duration_ms: int - outcome: Optional[Literal["navigated", "dom_updated", "no_change", "error"]] = None - url_changed: Optional[bool] = None - snapshot_after: Optional[Snapshot] = None - error: Optional[dict] = None + outcome: Literal["navigated", "dom_updated", "no_change", "error"] | None = None + url_changed: bool | None = None + snapshot_after: Snapshot | None = None + error: dict | None = None class WaitResult(BaseModel): """Result of wait_for operation""" + found: bool - element: Optional[Element] = None + element: Element | None = None duration_ms: int timeout: bool # ========== Agent Layer Models ========== + class ScreenshotConfig(BaseModel): """Screenshot format configuration""" - format: Literal['png', 'jpeg'] = 'png' - quality: Optional[int] = Field(None, ge=1, le=100) # Only for JPEG (1-100) + + format: Literal["png", "jpeg"] = "png" + quality: int | None = Field(None, ge=1, le=100) # Only for JPEG (1-100) class SnapshotFilter(BaseModel): """Filter options for snapshot elements""" - min_area: Optional[int] = Field(None, ge=0) - allowed_roles: Optional[List[str]] = None - min_z_index: Optional[int] = None + + min_area: int | None = Field(None, ge=0) + allowed_roles: list[str] | None = None + min_z_index: int | None = None class SnapshotOptions(BaseModel): @@ -98,10 +109,11 @@ class SnapshotOptions(BaseModel): Configuration for snapshot calls. Matches TypeScript SnapshotOptions interface from sdk-ts/src/snapshot.ts """ - screenshot: Union[bool, ScreenshotConfig] = False # Union type: boolean or config + + screenshot: bool | ScreenshotConfig = False # Union type: boolean or config limit: int = Field(50, ge=1, le=500) - filter: Optional[SnapshotFilter] = None - use_api: Optional[bool] = None # Force API vs extension + filter: SnapshotFilter | None = None + use_api: bool | None = None # Force API vs extension class Config: arbitrary_types_allowed = True @@ -109,6 +121,7 @@ class Config: class AgentActionResult(BaseModel): """Result of a single agent action (from agent.act())""" + success: bool action: Literal["click", "type", "press", "finish", "error"] goal: str @@ -116,13 +129,13 @@ class AgentActionResult(BaseModel): attempt: int # Optional fields based on action type - element_id: Optional[int] = None - text: Optional[str] = None - key: Optional[str] = None - outcome: Optional[Literal["navigated", "dom_updated", "no_change", "error"]] = None - url_changed: Optional[bool] = None - error: Optional[str] = None - message: Optional[str] = None # For FINISH action + element_id: int | None = None + text: str | None = None + key: str | None = None + outcome: Literal["navigated", "dom_updated", "no_change", "error"] | None = None + url_changed: bool | None = None + error: str | None = None + message: str | None = None # For FINISH action def __getitem__(self, key): """ @@ -130,16 +143,18 @@ def __getitem__(self, key): This allows existing code using result["success"] to continue working. """ import warnings + warnings.warn( f"Dict-style access result['{key}'] is deprecated. Use result.{key} instead.", DeprecationWarning, - stacklevel=2 + stacklevel=2, ) return getattr(self, key) class ActionTokenUsage(BaseModel): """Token usage for a single action""" + goal: str prompt_tokens: int completion_tokens: int @@ -149,18 +164,19 @@ class ActionTokenUsage(BaseModel): class TokenStats(BaseModel): """Token usage statistics for an agent session""" + total_prompt_tokens: int total_completion_tokens: int total_tokens: int - by_action: List[ActionTokenUsage] + by_action: list[ActionTokenUsage] class ActionHistory(BaseModel): """Single history entry from agent execution""" + goal: str action: str # The raw action string from LLM result: dict # Will be AgentActionResult but stored as dict for flexibility success: bool attempt: int duration_ms: int - diff --git a/sentience/query.py b/sentience/query.py index b835ac5..141fee5 100644 --- a/sentience/query.py +++ b/sentience/query.py @@ -3,14 +3,15 @@ """ import re -from typing import List, Optional, Union, Dict, Any -from .models import Snapshot, Element +from typing import Any, Dict, List, Optional, Union +from .models import Element, Snapshot -def parse_selector(selector: str) -> Dict[str, Any]: + +def parse_selector(selector: str) -> dict[str, Any]: """ Parse string DSL selector into structured query - + Examples: "role=button text~'Sign in'" "role=textbox name~'email'" @@ -20,20 +21,20 @@ def parse_selector(selector: str) -> Dict[str, Any]: "text^='Sign'" "text$='in'" """ - query: Dict[str, Any] = {} - + query: dict[str, Any] = {} + # Match patterns like: key=value, key~'value', key!="value", key>123, key^='prefix', key$='suffix' # Updated regex to support: =, !=, ~, ^=, $=, >, >=, <, <= # Supports dot notation: attr.id, css.color # Note: Handle ^= and $= first (before single char operators) to avoid regex conflicts # Pattern matches: key, operator (including ^= and $=), and value (quoted or unquoted) - pattern = r'([\w.]+)(\^=|\$=|>=|<=|!=|[=~<>])((?:\'[^\']+\'|\"[^\"]+\"|[^\s]+))' + pattern = r"([\w.]+)(\^=|\$=|>=|<=|!=|[=~<>])((?:\'[^\']+\'|\"[^\"]+\"|[^\s]+))" matches = re.findall(pattern, selector) - + for key, op, value in matches: # Remove quotes from value - value = value.strip().strip('"\'') - + value = value.strip().strip("\"'") + # Handle numeric comparisons is_numeric = False try: @@ -41,27 +42,27 @@ def parse_selector(selector: str) -> Dict[str, Any]: is_numeric = True except ValueError: pass - - if op == '!=': + + if op == "!=": if key == "role": query["role_exclude"] = value elif key == "clickable": query["clickable"] = False elif key == "visible": query["visible"] = False - elif op == '~': + elif op == "~": # Substring match (case-insensitive) if key == "text" or key == "name": query["text_contains"] = value - elif op == '^=': + elif op == "^=": # Prefix match if key == "text" or key == "name": query["text_prefix"] = value - elif op == '$=': + elif op == "$=": # Suffix match if key == "text" or key == "name": query["text_suffix"] = value - elif op == '>': + elif op == ">": # Greater than if is_numeric: if key == "importance": @@ -72,7 +73,7 @@ def parse_selector(selector: str) -> Dict[str, Any]: query["z_index_min"] = numeric_value + 0.0001 elif key.startswith("attr.") or key.startswith("css."): query[f"{key}_gt"] = value - elif op == '>=': + elif op == ">=": # Greater than or equal if is_numeric: if key == "importance": @@ -83,7 +84,7 @@ def parse_selector(selector: str) -> Dict[str, Any]: query["z_index_min"] = numeric_value elif key.startswith("attr.") or key.startswith("css."): query[f"{key}_gte"] = value - elif op == '<': + elif op == "<": # Less than if is_numeric: if key == "importance": @@ -94,7 +95,7 @@ def parse_selector(selector: str) -> Dict[str, Any]: query["z_index_max"] = numeric_value - 0.0001 elif key.startswith("attr.") or key.startswith("css."): query[f"{key}_lt"] = value - elif op == '<=': + elif op == "<=": # Less than or equal if is_numeric: if key == "importance": @@ -105,7 +106,7 @@ def parse_selector(selector: str) -> Dict[str, Any]: query["z_index_max"] = numeric_value elif key.startswith("attr.") or key.startswith("css."): query[f"{key}_lte"] = value - elif op == '=': + elif op == "=": # Exact match if key == "role": query["role"] = value @@ -131,66 +132,66 @@ def parse_selector(selector: str) -> Dict[str, Any]: if "css" not in query: query["css"] = {} query["css"][css_key] = value - + return query -def match_element(element: Element, query: Dict[str, Any]) -> bool: +def match_element(element: Element, query: dict[str, Any]) -> bool: """Check if element matches query criteria""" - + # Role exact match if "role" in query: if element.role != query["role"]: return False - + # Role exclusion if "role_exclude" in query: if element.role == query["role_exclude"]: return False - + # Clickable if "clickable" in query: if element.visual_cues.is_clickable != query["clickable"]: return False - + # Visible (using in_viewport and !is_occluded) if "visible" in query: is_visible = element.in_viewport and not element.is_occluded if is_visible != query["visible"]: return False - + # Tag (not yet in Element model, but prepare for future) if "tag" in query: # For now, this will always fail since tag is not in Element model # This is a placeholder for future implementation pass - + # Text exact match if "text" in query: if not element.text or element.text != query["text"]: return False - + # Text contains (case-insensitive) if "text_contains" in query: if not element.text: return False if query["text_contains"].lower() not in element.text.lower(): return False - + # Text prefix match if "text_prefix" in query: if not element.text: return False if not element.text.lower().startswith(query["text_prefix"].lower()): return False - + # Text suffix match if "text_suffix" in query: if not element.text: return False if not element.text.lower().endswith(query["text_suffix"].lower()): return False - + # Importance filtering if "importance" in query: if element.importance != query["importance"]: @@ -201,7 +202,7 @@ def match_element(element: Element, query: Dict[str, Any]) -> bool: if "importance_max" in query: if element.importance > query["importance_max"]: return False - + # BBox filtering (spatial) if "bbox.x_min" in query: if element.bbox.x < query["bbox.x_min"]: @@ -227,7 +228,7 @@ def match_element(element: Element, query: Dict[str, Any]) -> bool: if "bbox.height_max" in query: if element.bbox.height > query["bbox.height_max"]: return False - + # Z-index filtering if "z_index_min" in query: if element.z_index < query["z_index_min"]: @@ -235,40 +236,40 @@ def match_element(element: Element, query: Dict[str, Any]) -> bool: if "z_index_max" in query: if element.z_index > query["z_index_max"]: return False - + # In viewport filtering if "in_viewport" in query: if element.in_viewport != query["in_viewport"]: return False - + # Occlusion filtering if "is_occluded" in query: if element.is_occluded != query["is_occluded"]: return False - + # Attribute filtering (dot notation: attr.id="submit-btn") if "attr" in query: # This requires DOM access, which is not available in the Element model # This is a placeholder for future implementation when we add DOM access pass - + # CSS property filtering (dot notation: css.color="red") if "css" in query: # This requires DOM access, which is not available in the Element model # This is a placeholder for future implementation when we add DOM access pass - + return True -def query(snapshot: Snapshot, selector: Union[str, Dict[str, Any]]) -> List[Element]: +def query(snapshot: Snapshot, selector: str | dict[str, Any]) -> list[Element]: """ Query elements from snapshot using semantic selector - + Args: snapshot: Snapshot object selector: String DSL (e.g., "role=button text~'Sign in'") or dict query - + Returns: List of matching elements, sorted by importance (descending) """ @@ -277,27 +278,26 @@ def query(snapshot: Snapshot, selector: Union[str, Dict[str, Any]]) -> List[Elem query_dict = parse_selector(selector) else: query_dict = selector - + # Filter elements matches = [el for el in snapshot.elements if match_element(el, query_dict)] - + # Sort by importance (descending) matches.sort(key=lambda el: el.importance, reverse=True) - + return matches -def find(snapshot: Snapshot, selector: Union[str, Dict[str, Any]]) -> Optional[Element]: +def find(snapshot: Snapshot, selector: str | dict[str, Any]) -> Element | None: """ Find single element matching selector (best match by importance) - + Args: snapshot: Snapshot object selector: String DSL or dict query - + Returns: Best matching element or None """ results = query(snapshot, selector) return results[0] if results else None - diff --git a/sentience/read.py b/sentience/read.py index b9d2bc4..33fc8a0 100644 --- a/sentience/read.py +++ b/sentience/read.py @@ -3,6 +3,7 @@ """ from typing import Literal + from .browser import SentienceBrowser @@ -13,14 +14,14 @@ def read( ) -> dict: """ Read page content as raw HTML, text, or markdown - + Args: browser: SentienceBrowser instance output_format: Output format - "raw" (default, returns HTML for external processing), "text" (plain text), or "markdown" (lightweight or enhanced markdown). enhance_markdown: If True and output_format is "markdown", uses markdownify for better conversion. If False, uses the extension's lightweight markdown converter. - + Returns: dict with: - status: "success" or "error" @@ -29,23 +30,23 @@ def read( - content: Page content as string - length: Content length in characters - error: Error message if status is "error" - + Examples: # Get raw HTML (default) - can be used with markdownify for better conversion result = read(browser) html_content = result["content"] - + # Get high-quality markdown (uses markdownify internally) result = read(browser, output_format="markdown") markdown = result["content"] - + # Get plain text result = read(browser, output_format="text") text = result["content"] """ if not browser.page: raise RuntimeError("Browser not started. Call browser.start() first.") - + if output_format == "markdown" and enhance_markdown: # Get raw HTML from the extension first raw_html_result = browser.page.evaluate( @@ -56,12 +57,13 @@ def read( """, {"format": "raw"}, ) - + if raw_html_result.get("status") == "success": html_content = raw_html_result["content"] try: # Use markdownify for enhanced markdown conversion - from markdownify import markdownify, MarkdownifyError + from markdownify import MarkdownifyError, markdownify + markdown_content = markdownify(html_content, heading_style="ATX", wrap=True) return { "status": "success", @@ -71,11 +73,15 @@ def read( "length": len(markdown_content), } except ImportError: - print("Warning: 'markdownify' not installed. Install with 'pip install markdownify' for enhanced markdown. Falling back to extension's markdown.") + print( + "Warning: 'markdownify' not installed. Install with 'pip install markdownify' for enhanced markdown. Falling back to extension's markdown." + ) except MarkdownifyError as e: print(f"Warning: markdownify failed ({e}), falling back to extension's markdown.") except Exception as e: - print(f"Warning: An unexpected error occurred with markdownify ({e}), falling back to extension's markdown.") + print( + f"Warning: An unexpected error occurred with markdownify ({e}), falling back to extension's markdown." + ) # If not enhanced markdown, or fallback, call extension with requested format result = browser.page.evaluate( @@ -86,5 +92,5 @@ def read( """, {"format": output_format}, ) - + return result diff --git a/sentience/recorder.py b/sentience/recorder.py index 2d9baaf..7f0a84f 100644 --- a/sentience/recorder.py +++ b/sentience/recorder.py @@ -4,26 +4,27 @@ import json from datetime import datetime -from typing import List, Optional, Dict, Any +from typing import Any, Dict, List, Optional + from .browser import SentienceBrowser -from .models import Snapshot, Element -from .snapshot import snapshot +from .models import Element, Snapshot from .query import find +from .snapshot import snapshot class TraceStep: """A single step in a trace""" - + def __init__( self, ts: int, type: str, - selector: Optional[str] = None, - element_id: Optional[int] = None, - text: Optional[str] = None, - key: Optional[str] = None, - url: Optional[str] = None, - snapshot: Optional[Snapshot] = None, + selector: str | None = None, + element_id: int | None = None, + text: str | None = None, + key: str | None = None, + url: str | None = None, + snapshot: Snapshot | None = None, ): self.ts = ts self.type = type @@ -33,8 +34,8 @@ def __init__( self.key = key self.url = url self.snapshot = snapshot - - def to_dict(self) -> Dict[str, Any]: + + def to_dict(self) -> dict[str, Any]: """Convert to dictionary for JSON serialization""" result = { "ts": self.ts, @@ -57,44 +58,48 @@ def to_dict(self) -> Dict[str, Any]: class Trace: """Trace of user actions""" - + def __init__(self, start_url: str): self.version = "1.0.0" self.created_at = datetime.now().isoformat() self.start_url = start_url - self.steps: List[TraceStep] = [] + self.steps: list[TraceStep] = [] self._start_time = datetime.now() - + def add_step(self, step: TraceStep) -> None: """Add a step to the trace""" self.steps.append(step) - + def add_navigation(self, url: str) -> None: """Add navigation step""" ts = int((datetime.now() - self._start_time).total_seconds() * 1000) step = TraceStep(ts=ts, type="navigation", url=url) self.add_step(step) - - def add_click(self, element_id: int, selector: Optional[str] = None) -> None: + + def add_click(self, element_id: int, selector: str | None = None) -> None: """Add click step""" ts = int((datetime.now() - self._start_time).total_seconds() * 1000) step = TraceStep(ts=ts, type="click", element_id=element_id, selector=selector) self.add_step(step) - - def add_type(self, element_id: int, text: str, selector: Optional[str] = None, mask: bool = False) -> None: + + def add_type( + self, element_id: int, text: str, selector: str | None = None, mask: bool = False + ) -> None: """Add type step""" ts = int((datetime.now() - self._start_time).total_seconds() * 1000) # Mask sensitive data if requested masked_text = "***" if mask else text - step = TraceStep(ts=ts, type="type", element_id=element_id, text=masked_text, selector=selector) + step = TraceStep( + ts=ts, type="type", element_id=element_id, text=masked_text, selector=selector + ) self.add_step(step) - + def add_press(self, key: str) -> None: """Add press key step""" ts = int((datetime.now() - self._start_time).total_seconds() * 1000) step = TraceStep(ts=ts, type="press", key=key) self.add_step(step) - + def save(self, filepath: str) -> None: """Save trace to JSON file""" data = { @@ -103,25 +108,25 @@ def save(self, filepath: str) -> None: "start_url": self.start_url, "steps": [step.to_dict() for step in self.steps], } - with open(filepath, 'w') as f: + with open(filepath, "w") as f: json.dump(data, f, indent=2) - + @classmethod - def load(cls, filepath: str) -> 'Trace': + def load(cls, filepath: str) -> "Trace": """Load trace from JSON file""" - with open(filepath, 'r') as f: + with open(filepath) as f: data = json.load(f) - + trace = cls(data["start_url"]) trace.version = data["version"] trace.created_at = data["created_at"] - + for step_data in data["steps"]: snapshot_data = step_data.get("snapshot") snapshot_obj = None if snapshot_data: snapshot_obj = Snapshot(**snapshot_data) - + step = TraceStep( ts=step_data["ts"], type=step_data["type"], @@ -133,60 +138,60 @@ def load(cls, filepath: str) -> 'Trace': snapshot=snapshot_obj, ) trace.steps.append(step) - + return trace class Recorder: """Recorder for capturing user actions""" - + def __init__(self, browser: SentienceBrowser, capture_snapshots: bool = False): self.browser = browser self.capture_snapshots = capture_snapshots - self.trace: Optional[Trace] = None + self.trace: Trace | None = None self._active = False - self._mask_patterns: List[str] = [] # Patterns to mask (e.g., "password", "email") - + self._mask_patterns: list[str] = [] # Patterns to mask (e.g., "password", "email") + def start(self) -> None: """Start recording""" if not self.browser.page: raise RuntimeError("Browser not started. Call browser.start() first.") - + self._active = True start_url = self.browser.page.url self.trace = Trace(start_url) - + # Set up event listeners in the browser self._setup_listeners() - + def stop(self) -> None: """Stop recording""" self._active = False self._cleanup_listeners() - + def add_mask_pattern(self, pattern: str) -> None: """Add a pattern to mask in recorded text (e.g., "password", "email")""" self._mask_patterns.append(pattern.lower()) - + def _should_mask(self, text: str) -> bool: """Check if text should be masked""" text_lower = text.lower() return any(pattern in text_lower for pattern in self._mask_patterns) - + def _setup_listeners(self) -> None: """Set up event listeners to capture actions""" # Note: We'll capture actions through the SDK methods rather than DOM events # This is cleaner and more reliable pass - + def _cleanup_listeners(self) -> None: """Clean up event listeners""" pass - - def _infer_selector(self, element_id: int) -> Optional[str]: + + def _infer_selector(self, element_id: int) -> str | None: """ Infer a semantic selector for an element - + Uses heuristics to build a robust selector: - role=... text~"..." - If text empty: use name/aria-label/placeholder @@ -196,24 +201,24 @@ def _infer_selector(self, element_id: int) -> Optional[str]: try: # Take a snapshot to get element info snap = snapshot(self.browser) - + # Find the element in the snapshot element = None for el in snap.elements: if el.id == element_id: element = el break - + if not element: return None - + # Build candidate selector parts = [] - + # Add role - if element.role and element.role != 'generic': + if element.role and element.role != "generic": parts.append(f"role={element.role}") - + # Add text if available if element.text: # Use contains match for text @@ -222,7 +227,8 @@ def _infer_selector(self, element_id: int) -> Optional[str]: else: # Try to get name/aria-label/placeholder from DOM try: - el = self.browser.page.evaluate(f""" + el = self.browser.page.evaluate( + f""" () => {{ const el = window.sentience_registry[{element_id}]; if (!el) return null; @@ -232,30 +238,31 @@ def _infer_selector(self, element_id: int) -> Optional[str]: placeholder: el.placeholder || null }}; }} - """) - + """ + ) + if el: - if el.get('name'): + if el.get("name"): parts.append(f'name="{el["name"]}"') - elif el.get('ariaLabel'): + elif el.get("ariaLabel"): parts.append(f'text~"{el["ariaLabel"]}"') - elif el.get('placeholder'): + elif el.get("placeholder"): parts.append(f'text~"{el["placeholder"]}"') except Exception: pass - + # Add clickable if relevant if element.visual_cues.is_clickable: parts.append("clickable=true") - + if not parts: return None - + selector = " ".join(parts) - + # Validate selector - should match exactly 1 element matches = [el for el in snap.elements if self._match_element(el, selector)] - + if len(matches) == 1: return selector elif len(matches) > 1: @@ -265,32 +272,33 @@ def _infer_selector(self, element_id: int) -> Optional[str]: else: # Selector doesn't match - return None (will use element_id) return None - + except Exception: return None - + def _match_element(self, element: Element, selector: str) -> bool: """Simple selector matching (basic implementation)""" # This is a simplified version - in production, use the full query engine - from .query import parse_selector, match_element + from .query import match_element, parse_selector + try: query_dict = parse_selector(selector) return match_element(element, query_dict) except Exception: return False - + def record_navigation(self, url: str) -> None: """Record a navigation event""" if self._active and self.trace: self.trace.add_navigation(url) - - def record_click(self, element_id: int, selector: Optional[str] = None) -> None: + + def record_click(self, element_id: int, selector: str | None = None) -> None: """Record a click event with smart selector inference""" if self._active and self.trace: # If no selector provided, try to infer one if selector is None: selector = self._infer_selector(element_id) - + # Optionally capture snapshot if self.capture_snapshots: try: @@ -308,33 +316,33 @@ def record_click(self, element_id: int, selector: Optional[str] = None) -> None: self.trace.add_click(element_id, selector) else: self.trace.add_click(element_id, selector) - - def record_type(self, element_id: int, text: str, selector: Optional[str] = None) -> None: + + def record_type(self, element_id: int, text: str, selector: str | None = None) -> None: """Record a type event with smart selector inference""" if self._active and self.trace: # If no selector provided, try to infer one if selector is None: selector = self._infer_selector(element_id) - + mask = self._should_mask(text) self.trace.add_type(element_id, text, selector, mask=mask) - + def record_press(self, key: str) -> None: """Record a key press event""" if self._active and self.trace: self.trace.add_press(key) - + def save(self, filepath: str) -> None: """Save trace to file""" if not self.trace: raise RuntimeError("No trace to save. Start recording first.") self.trace.save(filepath) - + def __enter__(self): """Context manager entry""" self.start() return self - + def __exit__(self, exc_type, exc_val, exc_tb): """Context manager exit""" self.stop() @@ -343,13 +351,12 @@ def __exit__(self, exc_type, exc_val, exc_tb): def record(browser: SentienceBrowser, capture_snapshots: bool = False) -> Recorder: """ Create a recorder instance - + Args: browser: SentienceBrowser instance capture_snapshots: Whether to capture snapshots at each step - + Returns: Recorder instance """ return Recorder(browser, capture_snapshots=capture_snapshots) - diff --git a/sentience/screenshot.py b/sentience/screenshot.py index 4a34d26..b5ce7fe 100644 --- a/sentience/screenshot.py +++ b/sentience/screenshot.py @@ -2,52 +2,53 @@ Screenshot functionality - standalone screenshot capture """ -from typing import Optional, Literal, Dict, Any +from typing import Any, Dict, Literal, Optional + from .browser import SentienceBrowser def screenshot( browser: SentienceBrowser, format: Literal["png", "jpeg"] = "png", - quality: Optional[int] = None, + quality: int | None = None, ) -> str: """ Capture screenshot of current page - + Args: browser: SentienceBrowser instance format: Image format - "png" or "jpeg" quality: JPEG quality (1-100), only used for JPEG format - + Returns: Base64-encoded screenshot data URL (e.g., "data:image/png;base64,...") - + Raises: RuntimeError: If browser not started ValueError: If quality is invalid for JPEG """ if not browser.page: raise RuntimeError("Browser not started. Call browser.start() first.") - + if format == "jpeg" and quality is not None: if not (1 <= quality <= 100): raise ValueError("Quality must be between 1 and 100 for JPEG format") - + # Use Playwright's screenshot with base64 encoding - screenshot_options: Dict[str, Any] = { + screenshot_options: dict[str, Any] = { "type": format, } - + if format == "jpeg" and quality is not None: screenshot_options["quality"] = quality - + # Capture screenshot as base64 # Playwright returns bytes when encoding is not specified, so we encode manually import base64 + image_bytes = browser.page.screenshot(**screenshot_options) - base64_data = base64.b64encode(image_bytes).decode('utf-8') - + base64_data = base64.b64encode(image_bytes).decode("utf-8") + # Return as data URL mime_type = "image/png" if format == "png" else "image/jpeg" return f"data:{mime_type};base64,{base64_data}" - diff --git a/sentience/snapshot.py b/sentience/snapshot.py index f0bc780..377af51 100644 --- a/sentience/snapshot.py +++ b/sentience/snapshot.py @@ -2,23 +2,25 @@ Snapshot functionality - calls window.sentience.snapshot() or server-side API """ -from typing import Optional, Dict, Any import json +from typing import Any, Dict, Optional + import requests + from .browser import SentienceBrowser from .models import Snapshot def snapshot( browser: SentienceBrowser, - screenshot: Optional[bool] = None, - limit: Optional[int] = None, - filter: Optional[Dict[str, Any]] = None, - use_api: Optional[bool] = None, + screenshot: bool | None = None, + limit: int | None = None, + filter: dict[str, Any] | None = None, + use_api: bool | None = None, ) -> Snapshot: """ Take a snapshot of the current page - + Args: browser: SentienceBrowser instance screenshot: Whether to capture screenshot (bool or dict with format/quality) @@ -26,13 +28,13 @@ def snapshot( filter: Filter options (min_area, allowed_roles, min_z_index) use_api: Force use of server-side API if True, local extension if False. If None, uses API if api_key is set, otherwise uses local extension. - + Returns: Snapshot object """ # Determine if we should use server-side API should_use_api = use_api if use_api is not None else (browser.api_key is not None) - + if should_use_api and browser.api_key: # Use server-side API (Pro/Enterprise tier) return _snapshot_via_api(browser, screenshot, limit, filter) @@ -43,23 +45,23 @@ def snapshot( def _snapshot_via_extension( browser: SentienceBrowser, - screenshot: Optional[bool], - limit: Optional[int], - filter: Optional[Dict[str, Any]], + screenshot: bool | None, + limit: int | None, + filter: dict[str, Any] | None, ) -> Snapshot: """Take snapshot using local extension (Free tier)""" if not browser.page: raise RuntimeError("Browser not started. Call browser.start() first.") - + # Build options - options: Dict[str, Any] = {} + options: dict[str, Any] = {} if screenshot is not None: options["screenshot"] = screenshot if limit is not None: options["limit"] = limit if filter is not None: options["filter"] = filter - + # Call extension API result = browser.page.evaluate( """ @@ -69,7 +71,7 @@ def _snapshot_via_extension( """, options, ) - + # Validate and parse with Pydantic snapshot_obj = Snapshot(**result) return snapshot_obj @@ -77,25 +79,25 @@ def _snapshot_via_extension( def _snapshot_via_api( browser: SentienceBrowser, - screenshot: Optional[bool], - limit: Optional[int], - filter: Optional[Dict[str, Any]], + screenshot: bool | None, + limit: int | None, + filter: dict[str, Any] | None, ) -> Snapshot: """Take snapshot using server-side API (Pro/Enterprise tier)""" if not browser.page: raise RuntimeError("Browser not started. Call browser.start() first.") - + if not browser.api_key: raise ValueError("API key required for server-side processing") - + if not browser.api_url: raise ValueError("API URL required for server-side processing") - + # Step 1: Get raw data from local extension (always happens locally) - raw_options: Dict[str, Any] = {} + raw_options: dict[str, Any] = {} if screenshot is not None: raw_options["screenshot"] = screenshot - + raw_result = browser.page.evaluate( """ (options) => { @@ -104,7 +106,7 @@ def _snapshot_via_api( """, raw_options, ) - + # Step 2: Send to server for smart ranking/filtering # Use raw_elements (raw data) instead of elements (processed data) # Server validates API key and applies proprietary ranking logic @@ -115,14 +117,14 @@ def _snapshot_via_api( "options": { "limit": limit, "filter": filter, - } + }, } - + headers = { "Authorization": f"Bearer {browser.api_key}", "Content-Type": "application/json", } - + try: response = requests.post( f"{browser.api_url}/v1/snapshot", @@ -131,9 +133,9 @@ def _snapshot_via_api( timeout=30, ) response.raise_for_status() - + api_result = response.json() - + # Merge API result with local data (screenshot, etc.) snapshot_data = { "status": api_result.get("status", "success"), @@ -145,8 +147,7 @@ def _snapshot_via_api( "screenshot_format": raw_result.get("screenshot_format"), "error": api_result.get("error"), } - + return Snapshot(**snapshot_data) except requests.exceptions.RequestException as e: raise RuntimeError(f"API request failed: {e}") - diff --git a/sentience/wait.py b/sentience/wait.py index 7cd2221..3b458eb 100644 --- a/sentience/wait.py +++ b/sentience/wait.py @@ -3,23 +3,24 @@ """ import time -from typing import Union, Optional +from typing import Optional, Union + from .browser import SentienceBrowser from .models import WaitResult -from .snapshot import snapshot from .query import find +from .snapshot import snapshot def wait_for( browser: SentienceBrowser, - selector: Union[str, dict], + selector: str | dict, timeout: float = 10.0, - interval: Optional[float] = None, - use_api: Optional[bool] = None, + interval: float | None = None, + use_api: bool | None = None, ) -> WaitResult: """ Wait for element matching selector to appear - + Args: browser: SentienceBrowser instance selector: String DSL or dict query @@ -29,7 +30,7 @@ def wait_for( - 1.5s for remote API (use_api=True or default, network latency) use_api: Force use of server-side API if True, local extension if False. If None, uses API if api_key is set, otherwise uses local extension. - + Returns: WaitResult """ @@ -41,16 +42,16 @@ def wait_for( interval = 1.5 # Longer interval for API calls (network latency) else: interval = 0.25 # Shorter interval for local extension (fast) - + start_time = time.time() - + while time.time() - start_time < timeout: # Take snapshot (may be local extension or remote API) snap = snapshot(browser, use_api=use_api) - + # Try to find element element = find(snap, selector) - + if element: duration_ms = int((time.time() - start_time) * 1000) return WaitResult( @@ -59,10 +60,10 @@ def wait_for( duration_ms=duration_ms, timeout=False, ) - + # Wait before next poll time.sleep(interval) - + # Timeout duration_ms = int((time.time() - start_time) * 1000) return WaitResult( @@ -71,4 +72,3 @@ def wait_for( duration_ms=duration_ms, timeout=True, ) - diff --git a/spec/README.md b/spec/README.md index 008a18b..b2276f5 100644 --- a/spec/README.md +++ b/spec/README.md @@ -67,6 +67,5 @@ validate(snapshot_data); --- -**Last Updated**: Day 1 Implementation +**Last Updated**: Day 1 Implementation **Status**: ✅ Stable - diff --git a/spec/SNAPSHOT_V1.md b/spec/SNAPSHOT_V1.md index 389df5c..b6617eb 100644 --- a/spec/SNAPSHOT_V1.md +++ b/spec/SNAPSHOT_V1.md @@ -1,7 +1,7 @@ # Sentience Snapshot API Contract v1 -**Version**: 1.0.0 -**Last Updated**: [Current Date] +**Version**: 1.0.0 +**Last Updated**: [Current Date] **Status**: Stable This document defines the **single source of truth** for the snapshot data structure returned by `window.sentience.snapshot()`. Both Python and TypeScript SDKs must implement this contract exactly. @@ -205,4 +205,3 @@ Both Python and TypeScript SDKs must: - `snapshot.schema.json` - JSON Schema validation - Extension implementation: `sentience-chrome/injected_api.js` - WASM implementation: `sentience-chrome/src/lib.rs` - diff --git a/spec/sdk-types.md b/spec/sdk-types.md index 0b9c1df..8e9a7a1 100644 --- a/spec/sdk-types.md +++ b/spec/sdk-types.md @@ -256,4 +256,3 @@ class Trace(BaseModel): - SDKs should handle missing optional fields gracefully - Default values should match extension behavior - Type coercion should be minimal (prefer validation errors) - diff --git a/spec/snapshot.schema.json b/spec/snapshot.schema.json index 3a1ab6b..a8125ae 100644 --- a/spec/snapshot.schema.json +++ b/spec/snapshot.schema.json @@ -145,4 +145,3 @@ } } } - diff --git a/tests/README.md b/tests/README.md index e73d2be..fa56745 100644 --- a/tests/README.md +++ b/tests/README.md @@ -144,4 +144,3 @@ Make sure the package is installed in development mode: ```bash pip install -e ".[dev]" ``` - diff --git a/tests/__init__.py b/tests/__init__.py index e7991ee..d4839a6 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,2 +1 @@ # Tests package - diff --git a/tests/conftest.py b/tests/conftest.py index 5d6b57b..07e179a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,10 +1,12 @@ """ Pytest configuration and fixtures for Sentience SDK tests """ + import os -import pytest from pathlib import Path +import pytest + def pytest_configure(config): """Register custom markers""" @@ -31,7 +33,7 @@ def extension_available(): # parent.parent.parent = Sentience/ (project root) repo_root = Path(__file__).parent.parent.parent extension_source = repo_root / "sentience-chrome" - + # Also check for required extension files if extension_source.exists(): required_files = ["manifest.json", "content.js", "injected_api.js"] @@ -39,13 +41,11 @@ def extension_available(): if pkg_dir.exists(): # Check if WASM files exist wasm_files = ["sentience_core.js", "sentience_core_bg.wasm"] - all_exist = all( - (extension_source / f).exists() for f in required_files - ) and all( + all_exist = all((extension_source / f).exists() for f in required_files) and all( (pkg_dir / f).exists() for f in wasm_files ) return all_exist - + return False @@ -54,14 +54,11 @@ def skip_if_no_extension(request, extension_available): """Automatically skip tests that require extension if it's not available""" # Check if test is marked as requiring extension marker = request.node.get_closest_marker("requires_extension") - + if marker and not extension_available: # In CI, skip silently # Otherwise, show a helpful message if os.getenv("CI"): pytest.skip("Extension not available in CI environment") else: - pytest.skip( - "Extension not found. Build it first: cd ../sentience-chrome && ./build.sh" - ) - + pytest.skip("Extension not found. Build it first: cd ../sentience-chrome && ./build.sh") diff --git a/tests/test_actions.py b/tests/test_actions.py index 869f5a7..2b731cf 100644 --- a/tests/test_actions.py +++ b/tests/test_actions.py @@ -3,7 +3,8 @@ """ import pytest -from sentience import SentienceBrowser, snapshot, find, click, type_text, press, click_rect, BBox + +from sentience import BBox, SentienceBrowser, click, click_rect, find, press, snapshot, type_text def test_click(): @@ -11,10 +12,10 @@ def test_click(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + snap = snapshot(browser) link = find(snap, "role=link") - + if link: result = click(browser, link.id) assert result.success is True @@ -28,11 +29,11 @@ def test_type_text(): # Use a page with a text input browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + # Find textbox if available snap = snapshot(browser) textbox = find(snap, "role=textbox") - + if textbox: result = type_text(browser, textbox.id, "hello") assert result.success is True @@ -44,7 +45,7 @@ def test_press(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + result = press(browser, "Enter") assert result.success is True assert result.duration_ms > 0 @@ -55,7 +56,7 @@ def test_click_rect(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + # Click at a specific rectangle (top-left area) result = click_rect(browser, {"x": 100, "y": 100, "w": 50, "h": 30}) assert result.success is True @@ -68,18 +69,16 @@ def test_click_rect_with_bbox(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + # Get an element and click its bbox snap = snapshot(browser) link = find(snap, "role=link") - + if link: - result = click_rect(browser, { - "x": link.bbox.x, - "y": link.bbox.y, - "w": link.bbox.width, - "h": link.bbox.height - }) + result = click_rect( + browser, + {"x": link.bbox.x, "y": link.bbox.y, "w": link.bbox.width, "h": link.bbox.height}, + ) assert result.success is True assert result.duration_ms > 0 @@ -89,7 +88,7 @@ def test_click_rect_without_highlight(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + result = click_rect(browser, {"x": 100, "y": 100, "w": 50, "h": 30}, highlight=False) assert result.success is True assert result.duration_ms > 0 @@ -100,13 +99,13 @@ def test_click_rect_invalid_rect(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + # Invalid: zero width result = click_rect(browser, {"x": 100, "y": 100, "w": 0, "h": 30}) assert result.success is False assert result.error is not None assert result.error["code"] == "invalid_rect" - + # Invalid: negative height result = click_rect(browser, {"x": 100, "y": 100, "w": 50, "h": -10}) assert result.success is False @@ -119,7 +118,7 @@ def test_click_rect_with_snapshot(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + result = click_rect(browser, {"x": 100, "y": 100, "w": 50, "h": 30}, take_snapshot=True) assert result.success is True assert result.snapshot_after is not None @@ -132,10 +131,10 @@ def test_click_hybrid_approach(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + snap = snapshot(browser) link = find(snap, "role=link") - + if link: # Test hybrid approach (mouse.click at center) result = click(browser, link.id, use_mouse=True) @@ -150,10 +149,10 @@ def test_click_js_approach(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + snap = snapshot(browser) link = find(snap, "role=link") - + if link: # Test JS-based click (legacy approach) result = click(browser, link.id, use_mouse=False) @@ -161,4 +160,3 @@ def test_click_js_approach(): assert result.duration_ms > 0 # Navigation may happen, which is expected for links assert result.outcome in ["navigated", "dom_updated"] - diff --git a/tests/test_agent.py b/tests/test_agent.py index 32de611..cb68823 100644 --- a/tests/test_agent.py +++ b/tests/test_agent.py @@ -3,11 +3,13 @@ Tests LLM providers and SentienceAgent without requiring browser """ +from unittest.mock import MagicMock, Mock, patch + import pytest -from unittest.mock import Mock, MagicMock, patch -from sentience.llm_provider import LLMProvider, LLMResponse, OpenAIProvider, AnthropicProvider + from sentience.agent import SentienceAgent -from sentience.models import Snapshot, Element, BBox, VisualCues, Viewport +from sentience.llm_provider import AnthropicProvider, LLMProvider, LLMResponse, OpenAIProvider +from sentience.models import BBox, Element, Snapshot, Viewport, VisualCues class MockLLMProvider(LLMProvider): @@ -19,11 +21,7 @@ def __init__(self, responses=None): self.calls = [] def generate(self, system_prompt: str, user_prompt: str, **kwargs): - self.calls.append({ - "system": system_prompt, - "user": user_prompt, - "kwargs": kwargs - }) + self.calls.append({"system": system_prompt, "user": user_prompt, "kwargs": kwargs}) if self.responses: response = self.responses[self.call_count % len(self.responses)] @@ -37,7 +35,7 @@ def generate(self, system_prompt: str, user_prompt: str, **kwargs): prompt_tokens=100, completion_tokens=20, total_tokens=120, - model_name="mock-model" + model_name="mock-model", ) def supports_json_mode(self) -> bool: @@ -50,6 +48,7 @@ def model_name(self) -> str: # ========== LLM Provider Tests ========== + def test_llm_response_dataclass(): """Test LLMResponse dataclass creation""" response = LLMResponse( @@ -57,7 +56,7 @@ def test_llm_response_dataclass(): prompt_tokens=100, completion_tokens=20, total_tokens=120, - model_name="gpt-4o" + model_name="gpt-4o", ) assert response.content == "CLICK(42)" @@ -69,7 +68,7 @@ def test_llm_response_dataclass(): def test_mock_llm_provider(): """Test mock LLM provider""" - provider = MockLLMProvider(responses=["CLICK(1)", "TYPE(2, \"test\")"]) + provider = MockLLMProvider(responses=["CLICK(1)", 'TYPE(2, "test")']) # First call response1 = provider.generate("system", "user") @@ -78,7 +77,7 @@ def test_mock_llm_provider(): # Second call response2 = provider.generate("system", "user") - assert response2.content == "TYPE(2, \"test\")" + assert response2.content == 'TYPE(2, "test")' assert provider.call_count == 2 # Check calls were recorded @@ -104,6 +103,7 @@ def test_anthropic_provider_init(): # ========== SentienceAgent Tests ========== + def create_mock_browser(): """Create mock browser for testing""" browser = Mock() @@ -122,13 +122,11 @@ def create_mock_snapshot(): importance=900, bbox=BBox(x=100, y=200, width=80, height=30), visual_cues=VisualCues( - is_primary=True, - is_clickable=True, - background_color_name="blue" + is_primary=True, is_clickable=True, background_color_name="blue" ), in_viewport=True, is_occluded=False, - z_index=10 + z_index=10, ), Element( id=2, @@ -136,15 +134,11 @@ def create_mock_snapshot(): text="", importance=850, bbox=BBox(x=100, y=100, width=200, height=40), - visual_cues=VisualCues( - is_primary=False, - is_clickable=True, - background_color_name=None - ), + visual_cues=VisualCues(is_primary=False, is_clickable=True, background_color_name=None), in_viewport=True, is_occluded=False, - z_index=5 - ) + z_index=5, + ), ] return Snapshot( @@ -152,7 +146,7 @@ def create_mock_snapshot(): timestamp="2024-12-24T10:00:00Z", url="https://example.com", viewport=Viewport(width=1920, height=1080), - elements=elements + elements=elements, ) @@ -203,13 +197,11 @@ def test_agent_execute_click_action(): snap = create_mock_snapshot() # Mock click function - with patch('sentience.agent.click') as mock_click: + with patch("sentience.agent.click") as mock_click: from sentience.models import ActionResult + mock_click.return_value = ActionResult( - success=True, - duration_ms=150, - outcome="dom_updated", - url_changed=False + success=True, duration_ms=150, outcome="dom_updated", url_changed=False ) result = agent._execute_action("CLICK(1)", snap) @@ -229,13 +221,10 @@ def test_agent_execute_type_action(): snap = create_mock_snapshot() # Mock type_text function - with patch('sentience.agent.type_text') as mock_type: + with patch("sentience.agent.type_text") as mock_type: from sentience.models import ActionResult - mock_type.return_value = ActionResult( - success=True, - duration_ms=200, - outcome="dom_updated" - ) + + mock_type.return_value = ActionResult(success=True, duration_ms=200, outcome="dom_updated") result = agent._execute_action('TYPE(2, "hello world")', snap) @@ -255,13 +244,10 @@ def test_agent_execute_press_action(): snap = create_mock_snapshot() # Mock press function - with patch('sentience.agent.press') as mock_press: + with patch("sentience.agent.press") as mock_press: from sentience.models import ActionResult - mock_press.return_value = ActionResult( - success=True, - duration_ms=50, - outcome="dom_updated" - ) + + mock_press.return_value = ActionResult(success=True, duration_ms=50, outcome="dom_updated") result = agent._execute_action('PRESS("Enter")', snap) @@ -303,16 +289,15 @@ def test_agent_act_full_cycle(): agent = SentienceAgent(browser, llm, verbose=False) # Mock snapshot and click - with patch('sentience.agent.snapshot') as mock_snapshot, \ - patch('sentience.agent.click') as mock_click: + with ( + patch("sentience.agent.snapshot") as mock_snapshot, + patch("sentience.agent.click") as mock_click, + ): from sentience.models import ActionResult + mock_snapshot.return_value = create_mock_snapshot() - mock_click.return_value = ActionResult( - success=True, - duration_ms=150, - outcome="dom_updated" - ) + mock_click.return_value = ActionResult(success=True, duration_ms=150, outcome="dom_updated") result = agent.act("Click the button", max_retries=0) @@ -342,8 +327,20 @@ def test_agent_token_tracking(): agent = SentienceAgent(browser, llm, verbose=False) # Simulate multiple actions - response1 = LLMResponse(content="CLICK(1)", prompt_tokens=100, completion_tokens=20, total_tokens=120, model_name="mock-model") - response2 = LLMResponse(content="TYPE(2, \"test\")", prompt_tokens=150, completion_tokens=30, total_tokens=180, model_name="mock-model") + response1 = LLMResponse( + content="CLICK(1)", + prompt_tokens=100, + completion_tokens=20, + total_tokens=120, + model_name="mock-model", + ) + response2 = LLMResponse( + content='TYPE(2, "test")', + prompt_tokens=150, + completion_tokens=30, + total_tokens=180, + model_name="mock-model", + ) agent._track_tokens("goal 1", response1) agent._track_tokens("goal 2", response2) @@ -365,7 +362,16 @@ def test_agent_clear_history(): agent = SentienceAgent(browser, llm, verbose=False) # Add some history - agent.history.append({"goal": "test", "action": "test", "result": {}, "success": True, "attempt": 0, "duration_ms": 0}) + agent.history.append( + { + "goal": "test", + "action": "test", + "result": {}, + "success": True, + "attempt": 0, + "duration_ms": 0, + } + ) agent._token_usage_raw["total_tokens"] = 100 agent.clear_history() @@ -382,8 +388,10 @@ def test_agent_retry_on_failure(): agent = SentienceAgent(browser, llm, verbose=False) # Mock snapshot and click (click will fail) - with patch('sentience.agent.snapshot') as mock_snapshot, \ - patch('sentience.agent.click') as mock_click: + with ( + patch("sentience.agent.snapshot") as mock_snapshot, + patch("sentience.agent.click") as mock_click, + ): mock_snapshot.return_value = create_mock_snapshot() # Simulate click failure @@ -404,11 +412,14 @@ def test_agent_action_parsing_variations(): snap = create_mock_snapshot() - with patch('sentience.agent.click') as mock_click, \ - patch('sentience.agent.type_text') as mock_type, \ - patch('sentience.agent.press') as mock_press: + with ( + patch("sentience.agent.click") as mock_click, + patch("sentience.agent.type_text") as mock_type, + patch("sentience.agent.press") as mock_press, + ): from sentience.models import ActionResult + mock_result = ActionResult(success=True, duration_ms=100, outcome="dom_updated") mock_click.return_value = mock_result mock_type.return_value = mock_result diff --git a/tests/test_bot.py b/tests/test_bot.py index 323aa9c..647b281 100644 --- a/tests/test_bot.py +++ b/tests/test_bot.py @@ -1,5 +1,6 @@ from sentience.browser import SentienceBrowser + def test_bot(): browser = SentienceBrowser() browser.start() @@ -10,4 +11,4 @@ def test_bot(): if __name__ == "__main__": - test_bot() \ No newline at end of file + test_bot() diff --git a/tests/test_conversational_agent.py b/tests/test_conversational_agent.py index 7411be9..3f1f585 100644 --- a/tests/test_conversational_agent.py +++ b/tests/test_conversational_agent.py @@ -3,12 +3,14 @@ Tests natural language interface without requiring browser """ -import pytest import json -from unittest.mock import Mock, MagicMock, patch +from unittest.mock import MagicMock, Mock, patch + +import pytest + from sentience.conversational_agent import ConversationalAgent from sentience.llm_provider import LLMProvider, LLMResponse -from sentience.models import Snapshot, Element, BBox, VisualCues, Viewport +from sentience.models import BBox, Element, Snapshot, Viewport, VisualCues class MockLLMProvider(LLMProvider): @@ -20,28 +22,28 @@ def __init__(self, responses=None): self.calls = [] def generate(self, system_prompt: str, user_prompt: str, **kwargs): - self.calls.append({ - "system": system_prompt, - "user": user_prompt, - "kwargs": kwargs - }) + self.calls.append({"system": system_prompt, "user": user_prompt, "kwargs": kwargs}) # Determine response based on content if "planning assistant" in system_prompt.lower(): # Return plan - response = self.responses.get('plan', self._default_plan()) + response = self.responses.get("plan", self._default_plan()) elif "extract" in system_prompt.lower(): # Return extraction result - response = self.responses.get('extract', '{"found": true, "data": {}, "summary": "Info extracted"}') + response = self.responses.get( + "extract", '{"found": true, "data": {}, "summary": "Info extracted"}' + ) elif "verify" in system_prompt.lower(): # Return verification result - response = self.responses.get('verify', '{"verified": true, "reasoning": "Condition met"}') + response = self.responses.get( + "verify", '{"verified": true, "reasoning": "Condition met"}' + ) elif "summarize" in system_prompt.lower(): # Return summary - response = self.responses.get('summary', "Task completed successfully") + response = self.responses.get("summary", "Task completed successfully") else: # Default technical agent response - response = self.responses.get('action', "CLICK(1)") + response = self.responses.get("action", "CLICK(1)") self.call_count += 1 @@ -50,21 +52,23 @@ def generate(self, system_prompt: str, user_prompt: str, **kwargs): prompt_tokens=100, completion_tokens=20, total_tokens=120, - model_name="mock-model" + model_name="mock-model", ) def _default_plan(self): - return json.dumps({ - "intent": "Test intent", - "steps": [ - { - "action": "NAVIGATE", - "description": "Go to test.com", - "parameters": {"url": "https://test.com"} - } - ], - "expected_outcome": "Success" - }) + return json.dumps( + { + "intent": "Test intent", + "steps": [ + { + "action": "NAVIGATE", + "description": "Go to test.com", + "parameters": {"url": "https://test.com"}, + } + ], + "expected_outcome": "Success", + } + ) def supports_json_mode(self) -> bool: return True @@ -93,13 +97,11 @@ def create_mock_snapshot(): importance=900, bbox=BBox(x=100, y=200, width=80, height=30), visual_cues=VisualCues( - is_primary=True, - is_clickable=True, - background_color_name="blue" + is_primary=True, is_clickable=True, background_color_name="blue" ), in_viewport=True, is_occluded=False, - z_index=10 + z_index=10, ) ] @@ -108,12 +110,13 @@ def create_mock_snapshot(): timestamp="2024-12-24T10:00:00Z", url="https://test.com", viewport=Viewport(width=1920, height=1080), - elements=elements + elements=elements, ) # ========== ConversationalAgent Tests ========== + def test_conversational_agent_initialization(): """Test ConversationalAgent initialization""" browser = create_mock_browser() @@ -132,46 +135,48 @@ def test_create_plan(): """Test plan creation from natural language""" browser = create_mock_browser() - plan_json = json.dumps({ - "intent": "Search for magic mouse", - "steps": [ - { - "action": "NAVIGATE", - "description": "Go to google.com", - "parameters": {"url": "https://google.com"} - }, - { - "action": "FIND_AND_CLICK", - "description": "Click search box", - "parameters": {"element_description": "search box"} - } - ], - "expected_outcome": "Search initiated" - }) + plan_json = json.dumps( + { + "intent": "Search for magic mouse", + "steps": [ + { + "action": "NAVIGATE", + "description": "Go to google.com", + "parameters": {"url": "https://google.com"}, + }, + { + "action": "FIND_AND_CLICK", + "description": "Click search box", + "parameters": {"element_description": "search box"}, + }, + ], + "expected_outcome": "Search initiated", + } + ) - llm = MockLLMProvider(responses={'plan': plan_json}) + llm = MockLLMProvider(responses={"plan": plan_json}) agent = ConversationalAgent(browser, llm, verbose=False) plan = agent._create_plan("Search for magic mouse on google") - assert plan['intent'] == "Search for magic mouse" - assert len(plan['steps']) == 2 - assert plan['steps'][0]['action'] == "NAVIGATE" - assert plan['steps'][1]['action'] == "FIND_AND_CLICK" + assert plan["intent"] == "Search for magic mouse" + assert len(plan["steps"]) == 2 + assert plan["steps"][0]["action"] == "NAVIGATE" + assert plan["steps"][1]["action"] == "FIND_AND_CLICK" def test_create_plan_json_fallback(): """Test plan creation with invalid JSON fallback""" browser = create_mock_browser() - llm = MockLLMProvider(responses={'plan': 'INVALID JSON{'}) + llm = MockLLMProvider(responses={"plan": "INVALID JSON{"}) agent = ConversationalAgent(browser, llm, verbose=False) plan = agent._create_plan("Click button") # Should fall back to simple plan - assert 'intent' in plan - assert 'steps' in plan - assert len(plan['steps']) > 0 + assert "intent" in plan + assert "steps" in plan + assert len(plan["steps"]) > 0 def test_execute_navigate_step(): @@ -183,13 +188,13 @@ def test_execute_navigate_step(): step = { "action": "NAVIGATE", "description": "Go to google.com", - "parameters": {"url": "google.com"} # Without https:// + "parameters": {"url": "google.com"}, # Without https:// } result = agent._execute_step(step) - assert result['success'] is True - assert result['action'] == "NAVIGATE" + assert result["success"] is True + assert result["action"] == "NAVIGATE" browser.page.goto.assert_called_once() # Should have added https:// assert "https://google.com" in str(browser.page.goto.call_args) @@ -198,30 +203,29 @@ def test_execute_navigate_step(): def test_execute_find_and_click_step(): """Test FIND_AND_CLICK step execution""" browser = create_mock_browser() - llm = MockLLMProvider(responses={'action': 'CLICK(1)'}) + llm = MockLLMProvider(responses={"action": "CLICK(1)"}) agent = ConversationalAgent(browser, llm, verbose=False) step = { "action": "FIND_AND_CLICK", "description": "Click the button", - "parameters": {"element_description": "button"} + "parameters": {"element_description": "button"}, } # Patch at the agent module level where it's imported - with patch('sentience.agent.snapshot') as mock_snapshot, \ - patch('sentience.agent.click') as mock_click: + with ( + patch("sentience.agent.snapshot") as mock_snapshot, + patch("sentience.agent.click") as mock_click, + ): from sentience.models import ActionResult + mock_snapshot.return_value = create_mock_snapshot() - mock_click.return_value = ActionResult( - success=True, - duration_ms=150, - outcome="dom_updated" - ) + mock_click.return_value = ActionResult(success=True, duration_ms=150, outcome="dom_updated") result = agent._execute_step(step) - assert result['action'] == "FIND_AND_CLICK" + assert result["action"] == "FIND_AND_CLICK" # Technical agent should have been called assert len(agent.technical_agent.history) > 0 @@ -229,34 +233,30 @@ def test_execute_find_and_click_step(): def test_execute_find_and_type_step(): """Test FIND_AND_TYPE step execution""" browser = create_mock_browser() - llm = MockLLMProvider(responses={'action': 'TYPE(1, "test")'}) + llm = MockLLMProvider(responses={"action": 'TYPE(1, "test")'}) agent = ConversationalAgent(browser, llm, verbose=False) step = { "action": "FIND_AND_TYPE", "description": "Type into search box", - "parameters": { - "element_description": "search box", - "text": "magic mouse" - } + "parameters": {"element_description": "search box", "text": "magic mouse"}, } # Patch at the agent module level where it's imported - with patch('sentience.agent.snapshot') as mock_snapshot, \ - patch('sentience.agent.type_text') as mock_type: + with ( + patch("sentience.agent.snapshot") as mock_snapshot, + patch("sentience.agent.type_text") as mock_type, + ): from sentience.models import ActionResult + mock_snapshot.return_value = create_mock_snapshot() - mock_type.return_value = ActionResult( - success=True, - duration_ms=200, - outcome="dom_updated" - ) + mock_type.return_value = ActionResult(success=True, duration_ms=200, outcome="dom_updated") result = agent._execute_step(step) - assert result['action'] == "FIND_AND_TYPE" - assert result['data']['text'] == "magic mouse" + assert result["action"] == "FIND_AND_TYPE" + assert result["data"]["text"] == "magic mouse" def test_execute_wait_step(): @@ -268,92 +268,83 @@ def test_execute_wait_step(): step = { "action": "WAIT", "description": "Wait for page to load", - "parameters": {"duration": 0.1} # Short wait for testing + "parameters": {"duration": 0.1}, # Short wait for testing } result = agent._execute_step(step) - assert result['success'] is True - assert result['action'] == "WAIT" - assert result['data']['duration'] == 0.1 + assert result["success"] is True + assert result["action"] == "WAIT" + assert result["data"]["duration"] == 0.1 def test_execute_extract_info_step(): """Test EXTRACT_INFO step execution""" browser = create_mock_browser() - extract_response = json.dumps({ - "found": True, - "data": {"price": "$79"}, - "summary": "Found price information" - }) + extract_response = json.dumps( + {"found": True, "data": {"price": "$79"}, "summary": "Found price information"} + ) - llm = MockLLMProvider(responses={'extract': extract_response}) + llm = MockLLMProvider(responses={"extract": extract_response}) agent = ConversationalAgent(browser, llm, verbose=False) step = { "action": "EXTRACT_INFO", "description": "Extract price", - "parameters": {"info_type": "product price"} + "parameters": {"info_type": "product price"}, } - with patch('sentience.conversational_agent.snapshot') as mock_snapshot: + with patch("sentience.conversational_agent.snapshot") as mock_snapshot: mock_snapshot.return_value = create_mock_snapshot() result = agent._execute_step(step) - assert result['success'] is True - assert result['action'] == "EXTRACT_INFO" - assert result['data']['extracted']['found'] is True + assert result["success"] is True + assert result["action"] == "EXTRACT_INFO" + assert result["data"]["extracted"]["found"] is True def test_execute_verify_step(): """Test VERIFY step execution""" browser = create_mock_browser() - verify_response = json.dumps({ - "verified": True, - "reasoning": "Page contains results" - }) + verify_response = json.dumps({"verified": True, "reasoning": "Page contains results"}) - llm = MockLLMProvider(responses={'verify': verify_response}) + llm = MockLLMProvider(responses={"verify": verify_response}) agent = ConversationalAgent(browser, llm, verbose=False) step = { "action": "VERIFY", "description": "Verify results", - "parameters": {"condition": "page contains search results"} + "parameters": {"condition": "page contains search results"}, } - with patch('sentience.conversational_agent.snapshot') as mock_snapshot: + with patch("sentience.conversational_agent.snapshot") as mock_snapshot: mock_snapshot.return_value = create_mock_snapshot() result = agent._execute_step(step) - assert result['success'] is True - assert result['action'] == "VERIFY" - assert result['data']['verified'] is True + assert result["success"] is True + assert result["action"] == "VERIFY" + assert result["data"]["verified"] is True def test_synthesize_response(): """Test natural language response synthesis""" browser = create_mock_browser() - llm = MockLLMProvider(responses={ - 'summary': "I navigated to google.com and found the search results you requested." - }) + llm = MockLLMProvider( + responses={ + "summary": "I navigated to google.com and found the search results you requested." + } + ) agent = ConversationalAgent(browser, llm, verbose=False) - plan = { - "intent": "Search for magic mouse", - "steps": [], - "expected_outcome": "Success" - } + plan = {"intent": "Search for magic mouse", "steps": [], "expected_outcome": "Success"} - execution_results = [ - {"success": True, "action": "NAVIGATE"} - ] + execution_results = [{"success": True, "action": "NAVIGATE"}] response = agent._synthesize_response("Search for magic mouse", plan, execution_results) @@ -365,22 +356,23 @@ def test_execute_full_workflow(): """Test full execute() workflow""" browser = create_mock_browser() - plan_json = json.dumps({ - "intent": "Navigate to test site", - "steps": [ - { - "action": "NAVIGATE", - "description": "Go to test.com", - "parameters": {"url": "https://test.com"} - } - ], - "expected_outcome": "Navigation complete" - }) + plan_json = json.dumps( + { + "intent": "Navigate to test site", + "steps": [ + { + "action": "NAVIGATE", + "description": "Go to test.com", + "parameters": {"url": "https://test.com"}, + } + ], + "expected_outcome": "Navigation complete", + } + ) - llm = MockLLMProvider(responses={ - 'plan': plan_json, - 'summary': "Successfully navigated to test.com" - }) + llm = MockLLMProvider( + responses={"plan": plan_json, "summary": "Successfully navigated to test.com"} + ) agent = ConversationalAgent(browser, llm, verbose=False) @@ -388,23 +380,16 @@ def test_execute_full_workflow(): assert isinstance(response, str) assert len(agent.conversation_history) == 1 - assert agent.conversation_history[0]['user_input'] == "Go to test.com" + assert agent.conversation_history[0]["user_input"] == "Go to test.com" def test_chat_method(): """Test chat() method as alias for execute()""" browser = create_mock_browser() - plan_json = json.dumps({ - "intent": "Test", - "steps": [], - "expected_outcome": "Done" - }) + plan_json = json.dumps({"intent": "Test", "steps": [], "expected_outcome": "Done"}) - llm = MockLLMProvider(responses={ - 'plan': plan_json, - 'summary': "Task complete" - }) + llm = MockLLMProvider(responses={"plan": plan_json, "summary": "Task complete"}) agent = ConversationalAgent(browser, llm, verbose=False) @@ -418,22 +403,18 @@ def test_get_summary(): """Test session summary generation""" browser = create_mock_browser() - llm = MockLLMProvider(responses={ - 'plan': '{"intent": "test", "steps": [], "expected_outcome": "done"}', - 'summary': "Session completed with 2 interactions" - }) + llm = MockLLMProvider( + responses={ + "plan": '{"intent": "test", "steps": [], "expected_outcome": "done"}', + "summary": "Session completed with 2 interactions", + } + ) agent = ConversationalAgent(browser, llm, verbose=False) # Add some history - agent.conversation_history.append({ - "user_input": "Test 1", - "response": "Done 1" - }) - agent.conversation_history.append({ - "user_input": "Test 2", - "response": "Done 2" - }) + agent.conversation_history.append({"user_input": "Test 1", "response": "Done 1"}) + agent.conversation_history.append({"user_input": "Test 2", "response": "Done 2"}) summary = agent.get_summary() diff --git a/tests/test_generator.py b/tests/test_generator.py index dd6bb43..2b1bf72 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -2,12 +2,14 @@ Tests for script generator functionality """ -import pytest -import tempfile import os +import tempfile + +import pytest + from sentience import SentienceBrowser, record -from sentience.recorder import Trace, TraceStep from sentience.generator import ScriptGenerator, generate +from sentience.recorder import Trace, TraceStep def test_generator_python(): @@ -15,16 +17,16 @@ def test_generator_python(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + with record(browser) as rec: rec.record_navigation("https://example.com") rec.record_click(1, "role=button text~'Click'") rec.record_type(2, "hello", "role=textbox") rec.record_press("Enter") - + generator = ScriptGenerator(rec.trace) code = generator.generate_python() - + # Verify code contains expected elements assert "from sentience import" in code assert "def main():" in code @@ -40,14 +42,14 @@ def test_generator_typescript(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + with record(browser) as rec: rec.record_navigation("https://example.com") rec.record_click(1, "role=button") - + generator = ScriptGenerator(rec.trace) code = generator.generate_typescript() - + # Verify code contains expected elements assert "import" in code assert "async function main()" in code @@ -60,19 +62,19 @@ def test_generator_save_python(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + with record(browser) as rec: rec.record_click(1) - + generator = ScriptGenerator(rec.trace) - - with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f: + + with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f: temp_path = f.name - + try: generator.save_python(temp_path) assert os.path.exists(temp_path) - + with open(temp_path) as f: code = f.read() assert "from sentience import" in code @@ -85,19 +87,19 @@ def test_generator_save_typescript(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + with record(browser) as rec: rec.record_click(1) - + generator = ScriptGenerator(rec.trace) - - with tempfile.NamedTemporaryFile(mode='w', suffix='.ts', delete=False) as f: + + with tempfile.NamedTemporaryFile(mode="w", suffix=".ts", delete=False) as f: temp_path = f.name - + try: generator.save_typescript(temp_path) assert os.path.exists(temp_path) - + with open(temp_path) as f: code = f.read() assert "import" in code @@ -110,21 +112,16 @@ def test_generator_without_selector(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + # Create a trace manually with a step that has no selector # (The recorder automatically infers selectors, so we create the step directly) trace = Trace("https://example.com") - step = TraceStep( - ts=0, - type="click", - element_id=1, - selector=None # Explicitly no selector - ) + step = TraceStep(ts=0, type="click", element_id=1, selector=None) # Explicitly no selector trace.add_step(step) - + generator = ScriptGenerator(trace) code = generator.generate_python() - + # Should include TODO comment for missing selector assert "TODO: replace with semantic selector" in code assert "click(browser, 1)" in code @@ -135,15 +132,14 @@ def test_generate_helper(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + with record(browser) as rec: rec.record_click(1) - + # Test Python generation - py_code = generate(rec.trace, 'py') + py_code = generate(rec.trace, "py") assert "from sentience import" in py_code - + # Test TypeScript generation - ts_code = generate(rec.trace, 'ts') + ts_code = generate(rec.trace, "ts") assert "import" in ts_code - diff --git a/tests/test_inspector.py b/tests/test_inspector.py index 6bbaece..f3d8786 100644 --- a/tests/test_inspector.py +++ b/tests/test_inspector.py @@ -3,6 +3,7 @@ """ import pytest + from sentience import SentienceBrowser, inspect @@ -11,16 +12,16 @@ def test_inspector_start_stop(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + inspector = inspect(browser) inspector.start() - + # Verify inspector is active active = browser.page.evaluate("window.__sentience_inspector_active === true") assert active is True - + inspector.stop() - + # Verify inspector is stopped active = browser.page.evaluate("window.__sentience_inspector_active === true") assert active is False @@ -31,12 +32,12 @@ def test_inspector_context_manager(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + with inspect(browser): # Verify inspector is active active = browser.page.evaluate("window.__sentience_inspector_active === true") assert active is True - + # Verify inspector is stopped after context exit active = browser.page.evaluate("window.__sentience_inspector_active === true") assert active is False @@ -47,12 +48,12 @@ def test_inspector_mouse_move_detection(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + with inspect(browser): # Simulate mouse move browser.page.mouse.move(100, 100) browser.page.wait_for_timeout(100) - + # Inspector should be active (we can't easily test console output) active = browser.page.evaluate("window.__sentience_inspector_active === true") assert active is True @@ -63,13 +64,12 @@ def test_inspector_click_detection(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + with inspect(browser): # Simulate click browser.page.mouse.click(100, 100) browser.page.wait_for_timeout(100) - + # Inspector should be active active = browser.page.evaluate("window.__sentience_inspector_active === true") assert active is True - diff --git a/tests/test_query.py b/tests/test_query.py index d581713..f9c71d4 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -2,9 +2,9 @@ Tests for query engine """ -from sentience import SentienceBrowser, snapshot, query, find -from sentience.query import parse_selector, match_element -from sentience.models import Element, BBox, VisualCues +from sentience import SentienceBrowser, find, query, snapshot +from sentience.models import BBox, Element, VisualCues +from sentience.query import match_element, parse_selector def test_parse_selector(): @@ -12,53 +12,53 @@ def test_parse_selector(): # Simple role q = parse_selector("role=button") assert q["role"] == "button" - + # Text contains q = parse_selector("text~'Sign in'") assert q["text_contains"] == "Sign in" - + # Clickable q = parse_selector("clickable=true") assert q["clickable"] is True - + # Combined q = parse_selector("role=button text~'Submit'") assert q["role"] == "button" assert q["text_contains"] == "Submit" - + # Negation q = parse_selector("role!=link") assert q["role_exclude"] == "link" - + # New operators: prefix and suffix q = parse_selector("text^='Sign'") assert q["text_prefix"] == "Sign" - + q = parse_selector("text$='in'") assert q["text_suffix"] == "in" - + # Numeric comparisons: importance q = parse_selector("importance>500") assert "importance_min" in q assert q["importance_min"] > 500 - + q = parse_selector("importance>=500") assert q["importance_min"] == 500 - + q = parse_selector("importance<1000") assert "importance_max" in q assert q["importance_max"] < 1000 - + q = parse_selector("importance<=1000") assert q["importance_max"] == 1000 - + # Visible field q = parse_selector("visible=true") assert q["visible"] is True - + q = parse_selector("visible=false") assert q["visible"] is False - + # Tag field (placeholder for future) q = parse_selector("tag=button") assert q["tag"] == "button" @@ -77,27 +77,27 @@ def test_match_element(): is_occluded=False, z_index=10, ) - + # Role match assert match_element(element, {"role": "button"}) is True assert match_element(element, {"role": "link"}) is False - + # Text contains assert match_element(element, {"text_contains": "Sign"}) is True assert match_element(element, {"text_contains": "Logout"}) is False - + # Text prefix assert match_element(element, {"text_prefix": "Sign"}) is True assert match_element(element, {"text_prefix": "Login"}) is False - + # Text suffix assert match_element(element, {"text_suffix": "In"}) is True assert match_element(element, {"text_suffix": "Out"}) is False - + # Clickable assert match_element(element, {"clickable": True}) is True assert match_element(element, {"clickable": False}) is False - + # Visible (using in_viewport and !is_occluded) assert match_element(element, {"visible": True}) is True element_occluded = Element( @@ -113,25 +113,25 @@ def test_match_element(): ) assert match_element(element_occluded, {"visible": True}) is False assert match_element(element_occluded, {"visible": False}) is True - + # Importance filtering assert match_element(element, {"importance_min": 50}) is True assert match_element(element, {"importance_min": 150}) is False assert match_element(element, {"importance_max": 150}) is True assert match_element(element, {"importance_max": 50}) is False - + # BBox filtering assert match_element(element, {"bbox.x_min": -10}) is True assert match_element(element, {"bbox.x_min": 10}) is False assert match_element(element, {"bbox.width_min": 50}) is True assert match_element(element, {"bbox.width_min": 150}) is False - + # Z-index filtering assert match_element(element, {"z_index_min": 5}) is True assert match_element(element, {"z_index_min": 15}) is False assert match_element(element, {"z_index_max": 15}) is True assert match_element(element, {"z_index_max": 5}) is False - + # In viewport filtering assert match_element(element, {"in_viewport": True}) is True element_off_screen = Element( @@ -147,7 +147,7 @@ def test_match_element(): ) assert match_element(element_off_screen, {"in_viewport": False}) is True assert match_element(element_off_screen, {"in_viewport": True}) is False - + # Occlusion filtering assert match_element(element, {"is_occluded": False}) is True assert match_element(element_occluded, {"is_occluded": True}) is True @@ -158,14 +158,14 @@ def test_query_integration(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + snap = snapshot(browser) - + # Query for links links = query(snap, "role=link") assert len(links) > 0 assert all(el.role == "link" for el in links) - + # Query for clickable clickables = query(snap, "clickable=true") assert len(clickables) > 0 @@ -177,9 +177,9 @@ def test_find_integration(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + snap = snapshot(browser) - + # Find first link link = find(snap, "role=link") if link: @@ -225,49 +225,49 @@ def test_query_advanced_operators(): z_index=1, ), ] - + from sentience.models import Snapshot + snap = Snapshot( status="success", url="https://example.com", elements=elements, ) - + # Test importance filtering high_importance = query(snap, "importance>500") assert len(high_importance) == 1 assert high_importance[0].id == 1 - + low_importance = query(snap, "importance<300") assert len(low_importance) == 1 assert low_importance[0].id == 3 - + # Test prefix matching sign_prefix = query(snap, "text^='Sign'") assert len(sign_prefix) == 2 assert all("Sign" in el.text for el in sign_prefix) - + # Test suffix matching in_suffix = query(snap, "text$='In'") assert len(in_suffix) == 1 assert in_suffix[0].text == "Sign In" - + # Test BBox filtering right_side = query(snap, "bbox.x>100") assert len(right_side) == 1 assert right_side[0].id == 2 - + # Test combined queries combined = query(snap, "role=button importance>500") assert len(combined) == 1 assert combined[0].id == 1 - + # Test visible filtering visible = query(snap, "visible=true") assert len(visible) == 3 # All are visible - + # Test z-index filtering high_z = query(snap, "z_index>5") assert len(high_z) == 1 assert high_z[0].id == 1 - diff --git a/tests/test_read.py b/tests/test_read.py index 7d414a8..699144f 100644 --- a/tests/test_read.py +++ b/tests/test_read.py @@ -10,9 +10,9 @@ def test_read_text(): with SentienceBrowser(headless=True) as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + result = read(browser, output_format="text") - + assert result["status"] == "success" assert result["format"] == "text" assert "content" in result @@ -26,9 +26,9 @@ def test_read_markdown(): with SentienceBrowser(headless=True) as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + result = read(browser, output_format="markdown") - + assert result["status"] == "success" assert result["format"] == "markdown" assert "content" in result @@ -42,23 +42,22 @@ def test_read_markdown_enhanced(): with SentienceBrowser(headless=True) as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + # Test with enhancement (default) result_enhanced = read(browser, output_format="markdown", enhance_markdown=True) - + assert result_enhanced["status"] == "success" assert result_enhanced["format"] == "markdown" assert len(result_enhanced["content"]) > 0 - + # Test without enhancement result_basic = read(browser, output_format="markdown", enhance_markdown=False) - + assert result_basic["status"] == "success" assert result_basic["format"] == "markdown" assert len(result_basic["content"]) > 0 - + # Enhanced markdown should be different (and likely better formatted) # Note: They might be similar for simple pages, but enhanced should handle more cases assert isinstance(result_enhanced["content"], str) assert isinstance(result_basic["content"], str) - diff --git a/tests/test_recorder.py b/tests/test_recorder.py index 21a0875..0d061a8 100644 --- a/tests/test_recorder.py +++ b/tests/test_recorder.py @@ -2,10 +2,12 @@ Tests for recorder functionality """ -import pytest -import tempfile import os -from sentience import SentienceBrowser, record, Trace, TraceStep +import tempfile + +import pytest + +from sentience import SentienceBrowser, Trace, TraceStep, record from sentience.recorder import Recorder @@ -14,12 +16,12 @@ def test_recorder_start_stop(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + rec = record(browser) rec.start() assert rec._active is True assert rec.trace is not None - + rec.stop() assert rec._active is False @@ -29,11 +31,11 @@ def test_recorder_context_manager(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + with record(browser) as rec: assert rec._active is True assert rec.trace is not None - + assert rec._active is False @@ -42,10 +44,10 @@ def test_recorder_navigation(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + with record(browser) as rec: rec.record_navigation("https://example.com/page2") - + assert len(rec.trace.steps) == 1 assert rec.trace.steps[0].type == "navigation" assert rec.trace.steps[0].url == "https://example.com/page2" @@ -56,10 +58,10 @@ def test_recorder_click(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + with record(browser) as rec: rec.record_click(42, "role=button text~'Click'") - + assert len(rec.trace.steps) == 1 assert rec.trace.steps[0].type == "click" assert rec.trace.steps[0].element_id == 42 @@ -71,10 +73,10 @@ def test_recorder_type(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + with record(browser) as rec: rec.record_type(10, "hello world", "role=textbox") - + assert len(rec.trace.steps) == 1 assert rec.trace.steps[0].type == "type" assert rec.trace.steps[0].element_id == 10 @@ -87,11 +89,11 @@ def test_recorder_type_masking(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + with record(browser) as rec: rec.add_mask_pattern("password") rec.record_type(10, "mypassword123", "role=textbox") - + assert len(rec.trace.steps) == 1 assert rec.trace.steps[0].text == "***" # Should be masked @@ -101,10 +103,10 @@ def test_recorder_press(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + with record(browser) as rec: rec.record_press("Enter") - + assert len(rec.trace.steps) == 1 assert rec.trace.steps[0].type == "press" assert rec.trace.steps[0].key == "Enter" @@ -115,20 +117,20 @@ def test_trace_save_load(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + with record(browser) as rec: rec.record_navigation("https://example.com") rec.record_click(1, "role=button") rec.record_type(2, "text", "role=textbox") - + # Save trace - with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: temp_path = f.name - + try: rec.save(temp_path) assert os.path.exists(temp_path) - + # Load trace loaded_trace = Trace.load(temp_path) assert loaded_trace.version == "1.0.0" @@ -145,16 +147,15 @@ def test_trace_format(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + with record(browser) as rec: rec.record_click(1) - + trace_dict = rec.trace.steps[0].to_dict() - + # Verify required fields assert "ts" in trace_dict assert "type" in trace_dict assert trace_dict["type"] == "click" assert "element_id" in trace_dict assert trace_dict["element_id"] == 1 - diff --git a/tests/test_screenshot.py b/tests/test_screenshot.py index cea8521..8d6740b 100644 --- a/tests/test_screenshot.py +++ b/tests/test_screenshot.py @@ -3,6 +3,7 @@ """ import base64 + from sentience import SentienceBrowser, screenshot @@ -11,11 +12,11 @@ def test_screenshot_png(): with SentienceBrowser(headless=True) as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + data_url = screenshot(browser, format="png") - + assert data_url.startswith("data:image/png;base64,") - + # Decode and verify it's valid base64 base64_data = data_url.split(",")[1] image_data = base64.b64decode(base64_data) @@ -27,11 +28,11 @@ def test_screenshot_jpeg(): with SentienceBrowser(headless=True) as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + data_url = screenshot(browser, format="jpeg", quality=80) - + assert data_url.startswith("data:image/jpeg;base64,") - + # Decode and verify it's valid base64 base64_data = data_url.split(",")[1] image_data = base64.b64decode(base64_data) @@ -43,28 +44,27 @@ def test_screenshot_default(): with SentienceBrowser(headless=True) as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + data_url = screenshot(browser) - + assert data_url.startswith("data:image/png;base64,") def test_screenshot_quality_validation(): """Test JPEG quality validation""" import pytest - + with SentienceBrowser(headless=True) as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + # Valid quality screenshot(browser, format="jpeg", quality=50) # Should not raise - + # Invalid quality - too low with pytest.raises(ValueError, match="Quality must be between 1 and 100"): screenshot(browser, format="jpeg", quality=0) - + # Invalid quality - too high with pytest.raises(ValueError, match="Quality must be between 1 and 100"): screenshot(browser, format="jpeg", quality=101) - diff --git a/tests/test_smart_selector.py b/tests/test_smart_selector.py index 66da62d..2bd770d 100644 --- a/tests/test_smart_selector.py +++ b/tests/test_smart_selector.py @@ -3,6 +3,7 @@ """ import pytest + from sentience import SentienceBrowser, record, snapshot @@ -11,16 +12,16 @@ def test_smart_selector_inference(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + # Take snapshot to get element snap = snapshot(browser) if len(snap.elements) > 0: element = snap.elements[0] - + with record(browser) as rec: # Record click without providing selector rec.record_click(element.id) - + # Should have inferred a selector step = rec.trace.steps[0] # Selector may or may not be inferred depending on element properties @@ -33,7 +34,7 @@ def test_smart_selector_with_text(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + snap = snapshot(browser) # Find element with text element_with_text = None @@ -41,11 +42,11 @@ def test_smart_selector_with_text(): if el.text and len(el.text) > 0: element_with_text = el break - + if element_with_text: with record(browser) as rec: rec.record_click(element_with_text.id) - + step = rec.trace.steps[0] # If selector was inferred, it should include text if step.selector: @@ -57,20 +58,20 @@ def test_smart_selector_validation(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + snap = snapshot(browser) if len(snap.elements) > 0: element = snap.elements[0] - + with record(browser) as rec: rec.record_click(element.id) - + step = rec.trace.steps[0] # If selector was inferred and validated, it should match the element if step.selector: # Verify selector would match the element from sentience.query import query + matches = query(snap, step.selector) # Should match at least the original element assert any(el.id == element.id for el in matches) - diff --git a/tests/test_snapshot.py b/tests/test_snapshot.py index 0b58a0b..50033c8 100644 --- a/tests/test_snapshot.py +++ b/tests/test_snapshot.py @@ -3,6 +3,7 @@ """ import pytest + from sentience import SentienceBrowser, snapshot from sentience.models import Snapshot @@ -13,15 +14,28 @@ def test_snapshot_basic(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + snap = snapshot(browser) - + assert snap.status == "success" assert snap.url == "https://example.com/" assert len(snap.elements) > 0 assert all(el.id >= 0 for el in snap.elements) - assert all(el.role in ["button", "link", "textbox", "searchbox", "checkbox", "radio", "combobox", "image", "generic"] - for el in snap.elements) + assert all( + el.role + in [ + "button", + "link", + "textbox", + "searchbox", + "checkbox", + "radio", + "combobox", + "image", + "generic", + ] + for el in snap.elements + ) @pytest.mark.requires_extension @@ -31,20 +45,20 @@ def test_snapshot_roundtrip(): sites = [ "https://example.com", ] - + for site in sites: with SentienceBrowser() as browser: browser.page.goto(site) browser.page.wait_for_load_state("networkidle") - + # Wait a bit more for dynamic content and extension processing browser.page.wait_for_timeout(1000) - + snap = snapshot(browser) - + assert snap.status == "success" assert snap.url is not None - + # Most pages should have at least some elements # But we'll be lenient - at least verify structure is valid if len(snap.elements) > 0: @@ -62,23 +76,23 @@ def test_snapshot_roundtrip(): @pytest.mark.requires_extension def test_snapshot_save(): """Test snapshot save functionality""" - import tempfile - import os import json - + import os + import tempfile + with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + snap = snapshot(browser) - + # Save to temp file - with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: temp_path = f.name - + try: snap.save(temp_path) - + # Verify file exists and is valid JSON assert os.path.exists(temp_path) with open(temp_path) as f: @@ -87,4 +101,3 @@ def test_snapshot_save(): assert "elements" in data finally: os.unlink(temp_path) - diff --git a/tests/test_spec_validation.py b/tests/test_spec_validation.py index a414d3c..74129d8 100644 --- a/tests/test_spec_validation.py +++ b/tests/test_spec_validation.py @@ -3,8 +3,10 @@ """ import json -import pytest from pathlib import Path + +import pytest + from sentience import SentienceBrowser, snapshot from sentience.models import Snapshot as SnapshotModel @@ -16,7 +18,7 @@ def load_schema(): # parent.parent = sdk-python/ repo_root = Path(__file__).parent.parent schema_path = repo_root / "spec" / "snapshot.schema.json" - + with open(schema_path) as f: return json.load(f) @@ -24,19 +26,19 @@ def load_schema(): def validate_against_schema(data: dict, schema: dict) -> list: """Simple schema validation (basic checks)""" errors = [] - + # Check required fields required = schema.get("required", []) for field in required: if field not in data: errors.append(f"Missing required field: {field}") - + # Check status enum if "status" in data: allowed = schema["properties"]["status"]["enum"] if data["status"] not in allowed: errors.append(f"Invalid status: {data['status']}, must be one of {allowed}") - + # Check elements array if "elements" in data: if not isinstance(data["elements"], list): @@ -45,18 +47,18 @@ def validate_against_schema(data: dict, schema: dict) -> list: # Check element structure element_schema = schema["definitions"]["Element"] element_required = element_schema.get("required", []) - + for i, el in enumerate(data["elements"][:5]): # Check first 5 for field in element_required: if field not in el: errors.append(f"Element {i} missing required field: {field}") - + # Check role enum if "role" in el: allowed_roles = element_schema["properties"]["role"]["enum"] if el["role"] not in allowed_roles: errors.append(f"Element {i} has invalid role: {el['role']}") - + return errors @@ -64,24 +66,23 @@ def validate_against_schema(data: dict, schema: dict) -> list: def test_snapshot_matches_spec(): """Test that snapshot response matches spec schema""" schema = load_schema() - + with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + snap = snapshot(browser) - + # Convert to dict data = snap.model_dump() - + # Validate errors = validate_against_schema(data, schema) - + if errors: pytest.fail(f"Schema validation errors:\n" + "\n".join(errors)) - + # Also verify Pydantic model validation assert isinstance(snap, SnapshotModel) assert snap.status in ["success", "error"] assert len(snap.elements) > 0 - diff --git a/tests/test_stealth.py b/tests/test_stealth.py index 18854dc..9b4235f 100644 --- a/tests/test_stealth.py +++ b/tests/test_stealth.py @@ -24,27 +24,27 @@ def test_stealth_features(): print("=" * 60) print("Bot Evasion / Stealth Mode Test") print("=" * 60) - + browser = SentienceBrowser() - + try: browser.start() page = browser.page - + print("\n1. Testing navigator.webdriver...") webdriver_value = page.evaluate("() => navigator.webdriver") if webdriver_value is False or webdriver_value is None: print(f" ✅ navigator.webdriver = {webdriver_value} (stealth working)") else: print(f" ❌ navigator.webdriver = {webdriver_value} (detectable)") - + print("\n2. Testing window.chrome...") chrome_exists = page.evaluate("() => typeof window.chrome !== 'undefined'") if chrome_exists: print(f" ✅ window.chrome exists (stealth working)") else: print(f" ❌ window.chrome does not exist (detectable)") - + print("\n3. Testing user-agent...") user_agent = page.evaluate("() => navigator.userAgent") print(f" User-Agent: {user_agent}") @@ -52,15 +52,15 @@ def test_stealth_features(): print(" ✅ User-Agent looks realistic (no HeadlessChrome)") else: print(" ⚠️ User-Agent may be detectable") - + print("\n4. Testing viewport...") viewport = page.evaluate("() => ({ width: window.innerWidth, height: window.innerHeight })") print(f" Viewport: {viewport['width']}x{viewport['height']}") - if viewport['width'] >= 1920 and viewport['height'] >= 1080: + if viewport["width"] >= 1920 and viewport["height"] >= 1080: print(" ✅ Viewport is realistic (1920x1080 or larger)") else: print(" ⚠️ Viewport may be smaller than expected") - + print("\n5. Testing navigator.plugins...") plugins_count = page.evaluate("() => navigator.plugins.length") print(f" Plugins count: {plugins_count}") @@ -68,32 +68,35 @@ def test_stealth_features(): print(" ✅ Plugins present (stealth working)") else: print(" ⚠️ No plugins (may be detectable)") - + print("\n6. Testing permissions API...") try: - permissions_works = page.evaluate(""" + permissions_works = page.evaluate( + """ () => { if (navigator.permissions && navigator.permissions.query) { return true; } return false; } - """) + """ + ) if permissions_works: print(" ✅ Permissions API is patched") else: print(" ⚠️ Permissions API may not be patched") except Exception as e: print(f" ⚠️ Could not test permissions: {e}") - + print("\n7. Testing against bot detection site...") try: # Navigate to a bot detection test site page.goto("https://bot.sannysoft.com/", wait_until="domcontentloaded", timeout=10000) page.wait_for_timeout(2000) # Wait for page to load - + # Check if we're detected - detection_results = page.evaluate(""" + detection_results = page.evaluate( + """ () => { const results = {}; // Check webdriver @@ -106,29 +109,30 @@ def test_stealth_features(): results.languages = navigator.languages.length; return results; } - """) - + """ + ) + print(f" Detection results: {detection_results}") - + # Count how many stealth features are working stealth_score = 0 - if detection_results.get('webdriver') is False: + if detection_results.get("webdriver") is False: stealth_score += 1 - if detection_results.get('chrome') is True: + if detection_results.get("chrome") is True: stealth_score += 1 - if detection_results.get('plugins', 0) > 0: + if detection_results.get("plugins", 0) > 0: stealth_score += 1 - + print(f" Stealth score: {stealth_score}/3") if stealth_score >= 2: print(" ✅ Most stealth features working") else: print(" ⚠️ Some stealth features may not be working") - + except Exception as e: print(f" ⚠️ Could not test against bot detection site: {e}") print(" (This is okay - site may be down or blocked)") - + print("\n" + "=" * 60) print("Test Summary") print("=" * 60) @@ -136,12 +140,13 @@ def test_stealth_features(): print("⚠️ Note: Bot detection is a cat-and-mouse game.") print(" No solution is 100% effective against all detection systems.") print("=" * 60) - + return True - + except Exception as e: print(f"\n❌ Test failed: {e}") import traceback + traceback.print_exc() return False finally: @@ -151,4 +156,3 @@ def test_stealth_features(): if __name__ == "__main__": success = test_stealth_features() sys.exit(0 if success else 1) - diff --git a/tests/test_wait.py b/tests/test_wait.py index fc50823..9600b27 100644 --- a/tests/test_wait.py +++ b/tests/test_wait.py @@ -2,9 +2,11 @@ Tests for wait functionality """ -import pytest import os -from sentience import SentienceBrowser, wait_for, expect + +import pytest + +from sentience import SentienceBrowser, expect, wait_for def test_wait_for(): @@ -13,7 +15,7 @@ def test_wait_for(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + result = wait_for(browser, "role=link", timeout=5.0) assert result.found is True assert result.element is not None @@ -26,7 +28,7 @@ def test_wait_for_timeout(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + # Wait for non-existent element result = wait_for(browser, "role=button text~'NonExistentButton'", timeout=1.0) assert result.found is False @@ -38,7 +40,7 @@ def test_expect_to_exist(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + element = expect(browser, "role=link").to_exist(timeout=5.0) assert element is not None assert element.role == "link" @@ -49,8 +51,7 @@ def test_expect_to_be_visible(): with SentienceBrowser() as browser: browser.page.goto("https://example.com") browser.page.wait_for_load_state("networkidle") - + element = expect(browser, "role=link").to_be_visible(timeout=5.0) assert element is not None assert element.in_viewport is True -