Skip to content
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,24 @@ pip install transformers torch # For local LLMs
pip install -e .
```

## 🧭 Manual driver CLI

Use the interactive CLI to open a page, inspect clickables, and drive actions:

```bash
sentience driver --url https://example.com
```

Commands:
- `open <url>`
- `state [limit]`
- `click <element_id>`
- `type <element_id> <text>`
- `press <key>`
- `screenshot [path]`
- `help`
- `close`

## Jest for AI Web Agent

### Semantic snapshots and assertions that let agents act, verify, and know when they're done.
Expand Down
11 changes: 9 additions & 2 deletions sentience/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,11 @@
click_rect,
press,
scroll_to,
search,
search_async,
select_option,
send_keys,
send_keys_async,
submit,
type_text,
uncheck,
Expand Down Expand Up @@ -51,7 +55,7 @@

# Agent Layer (Phase 1 & 2)
from .base_agent import BaseAgent
from .browser import SentienceBrowser
from .browser import AsyncSentienceBrowser, SentienceBrowser
from .captcha import CaptchaContext, CaptchaHandlingError, CaptchaOptions, CaptchaResolution
from .captcha_strategies import ExternalSolver, HumanHandoffSolver, VisionSolver

Expand Down Expand Up @@ -86,6 +90,7 @@
Snapshot,
SnapshotFilter,
SnapshotOptions,
StepHookContext,
StorageState,
TextContext,
TextMatch,
Expand All @@ -101,13 +106,14 @@
from .ordinal import OrdinalIntent, boost_ordinal_elements, detect_ordinal_intent, select_by_ordinal
from .overlay import clear_overlay, show_overlay
from .query import find, query
from .read import read
from .read import extract, extract_async, read
from .recorder import Recorder, Trace, TraceStep, record
from .runtime_agent import RuntimeAgent, RuntimeStep, StepVerification
from .screenshot import screenshot
from .sentience_methods import AgentAction, SentienceMethod
from .snapshot import snapshot
from .text_search import find_text_rect
from .tools import BackendCapabilities, ToolContext, ToolRegistry, ToolSpec, register_default_tools
from .tracer_factory import SENTIENCE_API_URL, create_tracer
from .tracing import JsonlTraceSink, TraceEvent, Tracer, TraceSink

Expand Down Expand Up @@ -186,6 +192,7 @@
"backend_wait_for_stable",
# Core SDK
"SentienceBrowser",
"AsyncSentienceBrowser",
"Snapshot",
"Element",
"BBox",
Expand Down
230 changes: 229 additions & 1 deletion sentience/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@
import asyncio
import time
from pathlib import Path
from urllib.parse import quote_plus

from .browser import AsyncSentienceBrowser, SentienceBrowser
from .browser_evaluator import BrowserEvaluator
from .cursor_policy import CursorPolicy, build_human_cursor_path
from .models import ActionResult, BBox, Snapshot
from .models import ActionResult, BBox, Snapshot, SnapshotOptions
from .sentience_methods import SentienceMethod
from .snapshot import snapshot, snapshot_async

Expand Down Expand Up @@ -709,6 +710,146 @@ def press(browser: SentienceBrowser, key: str, take_snapshot: bool = False) -> A
)


def _normalize_key_token(token: str) -> str:
lookup = {
"CMD": "Meta",
"COMMAND": "Meta",
"CTRL": "Control",
"CONTROL": "Control",
"ALT": "Alt",
"OPTION": "Alt",
"SHIFT": "Shift",
"ESC": "Escape",
"ESCAPE": "Escape",
"ENTER": "Enter",
"RETURN": "Enter",
"TAB": "Tab",
"SPACE": "Space",
}
upper = token.strip().upper()
return lookup.get(upper, token.strip())


def _parse_key_sequence(sequence: str) -> list[str]:
    """
    Split a key-sequence string into a list of individual key presses.

    Tokens are separated by commas and/or whitespace. A token may be wrapped
    in braces ("{ENTER}", "{CTRL+L}"); the braces are removed. A token that
    contains "+" is treated as a combination: each part is normalized with
    _normalize_key_token() and the parts are re-joined with "+".

    The plus key itself is supported: a lone "+" yields "+", and a token
    ending in "++" (e.g. "CTRL++") keeps a trailing literal "+" as the last
    key of the combination. Empty brace tokens ("{}") are skipped so that no
    empty key name is ever returned.

    Args:
        sequence: Raw key sequence, e.g. "CMD+H", "CTRL+SHIFT+P, {ENTER}"

    Returns:
        List of normalized key or key-combination strings, in input order.
    """
    parts: list[str] = []
    # str.split() with no separator never yields empty or whitespace-padded
    # tokens, so no extra strip/emptiness check is needed at this point.
    for raw in sequence.replace(",", " ").split():
        if raw.startswith("{") and raw.endswith("}"):
            raw = raw[1:-1]
        if not raw:
            # "{}" carries no key; skip it rather than emit an empty key name.
            continue
        if "+" not in raw:
            parts.append(_normalize_key_token(raw))
            continue
        tokens = [_normalize_key_token(tok) for tok in raw.split("+") if tok]
        if raw == "+" or raw.endswith("++"):
            # Splitting on "+" discards a literal plus key; restore it as the
            # final key of the combination.
            tokens.append("+")
        parts.append("+".join(tokens))
    return parts


def send_keys(
    browser: SentienceBrowser,
    sequence: str,
    take_snapshot: bool = False,
    delay_ms: int = 50,
) -> ActionResult:
    """
    Press every key or key combination in a sequence, one after another.

    The sequence (e.g. "CMD+H", "CTRL+SHIFT+P") is split on commas/spaces;
    brace-wrapped tokens such as "{ENTER}" or "{CTRL+L}" are accepted too.

    Args:
        browser: SentienceBrowser instance
        sequence: Key sequence string to send
        take_snapshot: Whether to take snapshot after the keys are sent
        delay_ms: Pause after each key press, in milliseconds (no pause when <= 0)

    Returns:
        ActionResult with success=True; outcome is "navigated" when the page
        URL differs after the key presses, otherwise "dom_updated".

    Raises:
        RuntimeError: If the browser has no page
        ValueError: If the sequence contains no keys
    """
    if not browser.page:
        raise RuntimeError("Browser not started. Call browser.start() first.")

    began = time.time()
    origin_url = browser.page.url

    key_presses = _parse_key_sequence(sequence)
    if not key_presses:
        raise ValueError("send_keys sequence is empty")

    pause_between = delay_ms > 0
    for key_press in key_presses:
        browser.page.keyboard.press(key_press)
        if pause_between:
            browser.page.wait_for_timeout(delay_ms)

    # Timing covers only the key presses, not the optional snapshot below.
    elapsed_ms = int((time.time() - began) * 1000)
    navigated = origin_url != browser.page.url
    if navigated:
        result_outcome = "navigated"
    else:
        result_outcome = "dom_updated"

    after: Snapshot | None = snapshot(browser) if take_snapshot else None

    return ActionResult(
        success=True,
        duration_ms=elapsed_ms,
        outcome=result_outcome,
        url_changed=navigated,
        snapshot_after=after,
    )


def _build_search_url(query: str, engine: str) -> str:
q = quote_plus(query)
key = engine.strip().lower()
if key in {"duckduckgo", "ddg"}:
return f"https://duckduckgo.com/?q={q}"
if key in {"google.com", "google"}:
return f"https://www.google.com/search?q={q}"
if key in {"google"}:
return f"https://www.google.com/search?q={q}"
if key in {"bing"}:
return f"https://www.bing.com/search?q={q}"
raise ValueError(f"unsupported search engine: {engine}")


def search(
    browser: SentienceBrowser,
    query: str,
    engine: str = "duckduckgo",
    take_snapshot: bool = False,
    snapshot_options: SnapshotOptions | None = None,
) -> ActionResult:
    """
    Open the search results page for a query on the chosen search engine.

    Builds the results URL, passes it to browser.goto(), then waits for the
    page's "networkidle" load state before reporting the result.

    Args:
        browser: SentienceBrowser instance
        query: Search query string
        engine: Search engine name (duckduckgo, google, google.com, bing)
        take_snapshot: Whether to take snapshot after navigation
        snapshot_options: Snapshot options passed to snapshot() when take_snapshot is True.

    Returns:
        ActionResult with success=True; outcome is "navigated" when the page
        URL differs after the navigation, otherwise "dom_updated".

    Raises:
        RuntimeError: If the browser has no page
        ValueError: If the query is blank or the engine is not supported
    """
    if not browser.page:
        raise RuntimeError("Browser not started. Call browser.start() first.")
    if not query.strip():
        raise ValueError("search query is empty")

    began = time.time()
    origin_url = browser.page.url
    target_url = _build_search_url(query, engine)
    browser.goto(target_url)
    browser.page.wait_for_load_state("networkidle")

    # Timing covers navigation and load wait, not the optional snapshot below.
    elapsed_ms = int((time.time() - began) * 1000)
    navigated = origin_url != browser.page.url
    if navigated:
        result_outcome = "navigated"
    else:
        result_outcome = "dom_updated"

    after: Snapshot | None = None
    if take_snapshot:
        after = snapshot(browser, snapshot_options)

    return ActionResult(
        success=True,
        duration_ms=elapsed_ms,
        outcome=result_outcome,
        url_changed=navigated,
        snapshot_after=after,
    )


def scroll_to(
browser: SentienceBrowser,
element_id: int,
Expand Down Expand Up @@ -1698,6 +1839,93 @@ async def press_async(
)


async def send_keys_async(
    browser: AsyncSentienceBrowser,
    sequence: str,
    take_snapshot: bool = False,
    delay_ms: int = 50,
) -> ActionResult:
    """
    Async version of send_keys(): press each key or combination in a sequence.

    Args:
        browser: AsyncSentienceBrowser instance
        sequence: Key sequence string to send (e.g. "CMD+H", "{ENTER}")
        take_snapshot: Whether to take snapshot after the keys are sent
        delay_ms: Pause after each key press, in milliseconds (no pause when <= 0)

    Returns:
        ActionResult with success=True; outcome is "navigated" when the page
        URL differs after the key presses, otherwise "dom_updated".

    Raises:
        RuntimeError: If the browser has no page
        ValueError: If the sequence contains no keys
    """
    if not browser.page:
        raise RuntimeError("Browser not started. Call await browser.start() first.")

    began = time.time()
    origin_url = browser.page.url

    key_presses = _parse_key_sequence(sequence)
    if not key_presses:
        raise ValueError("send_keys sequence is empty")

    pause_between = delay_ms > 0
    for key_press in key_presses:
        await browser.page.keyboard.press(key_press)
        if pause_between:
            await browser.page.wait_for_timeout(delay_ms)

    # Timing covers only the key presses, not the optional snapshot below.
    elapsed_ms = int((time.time() - began) * 1000)
    navigated = origin_url != browser.page.url
    if navigated:
        result_outcome = "navigated"
    else:
        result_outcome = "dom_updated"

    after: Snapshot | None = None
    if take_snapshot:
        after = await snapshot_async(browser)

    return ActionResult(
        success=True,
        duration_ms=elapsed_ms,
        outcome=result_outcome,
        url_changed=navigated,
        snapshot_after=after,
    )


async def search_async(
    browser: AsyncSentienceBrowser,
    query: str,
    engine: str = "duckduckgo",
    take_snapshot: bool = False,
    snapshot_options: SnapshotOptions | None = None,
) -> ActionResult:
    """
    Async version of search(): open the search results page for a query.

    Builds the results URL, awaits browser.goto() with it, then awaits the
    page's "networkidle" load state before reporting the result.

    Args:
        browser: AsyncSentienceBrowser instance
        query: Search query string
        engine: Search engine name (duckduckgo, google, google.com, bing)
        take_snapshot: Whether to take snapshot after navigation
        snapshot_options: Snapshot options passed to snapshot_async() when take_snapshot is True.

    Returns:
        ActionResult with success=True; outcome is "navigated" when the page
        URL differs after the navigation, otherwise "dom_updated".

    Raises:
        RuntimeError: If the browser has no page
        ValueError: If the query is blank or the engine is not supported
    """
    if not browser.page:
        raise RuntimeError("Browser not started. Call await browser.start() first.")
    if not query.strip():
        raise ValueError("search query is empty")

    began = time.time()
    origin_url = browser.page.url
    target_url = _build_search_url(query, engine)
    await browser.goto(target_url)
    await browser.page.wait_for_load_state("networkidle")

    # Timing covers navigation and load wait, not the optional snapshot below.
    elapsed_ms = int((time.time() - began) * 1000)
    navigated = origin_url != browser.page.url
    if navigated:
        result_outcome = "navigated"
    else:
        result_outcome = "dom_updated"

    after: Snapshot | None = None
    if take_snapshot:
        after = await snapshot_async(browser, snapshot_options)

    return ActionResult(
        success=True,
        duration_ms=elapsed_ms,
        outcome=result_outcome,
        url_changed=navigated,
        snapshot_after=after,
    )


async def scroll_to_async(
browser: AsyncSentienceBrowser,
element_id: int,
Expand Down
Loading
Loading