Skip to content

Commit 133aa6c

Browse files
authored
Merge pull request #187 from SentienceAPI/parity_win1
Phase 1: Tabs + JS evaluator + Lifecycle hooks + cli
2 parents 876c007 + c0d3f64 commit 133aa6c

29 files changed

+2421
-113
lines changed

README.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,24 @@ pip install transformers torch # For local LLMs
2222
pip install -e .
2323
```
2424

25+
## 🧭 Manual driver CLI
26+
27+
Use the interactive CLI to open a page, inspect clickables, and drive actions:
28+
29+
```bash
30+
sentience driver --url https://example.com
31+
```
32+
33+
Commands:
34+
- `open <url>`
35+
- `state [limit]`
36+
- `click <element_id>`
37+
- `type <element_id> <text>`
38+
- `press <key>`
39+
- `screenshot [path]`
40+
- `help`
41+
- `close`
42+
2543
## Jest for AI Web Agent
2644

2745
### Semantic snapshots and assertions that let agents act, verify, and know when they're done.

sentience/__init__.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,11 @@
1919
click_rect,
2020
press,
2121
scroll_to,
22+
search,
23+
search_async,
2224
select_option,
25+
send_keys,
26+
send_keys_async,
2327
submit,
2428
type_text,
2529
uncheck,
@@ -51,7 +55,7 @@
5155

5256
# Agent Layer (Phase 1 & 2)
5357
from .base_agent import BaseAgent
54-
from .browser import SentienceBrowser
58+
from .browser import AsyncSentienceBrowser, SentienceBrowser
5559
from .captcha import CaptchaContext, CaptchaHandlingError, CaptchaOptions, CaptchaResolution
5660
from .captcha_strategies import ExternalSolver, HumanHandoffSolver, VisionSolver
5761

@@ -86,6 +90,7 @@
8690
Snapshot,
8791
SnapshotFilter,
8892
SnapshotOptions,
93+
StepHookContext,
8994
StorageState,
9095
TextContext,
9196
TextMatch,
@@ -101,13 +106,14 @@
101106
from .ordinal import OrdinalIntent, boost_ordinal_elements, detect_ordinal_intent, select_by_ordinal
102107
from .overlay import clear_overlay, show_overlay
103108
from .query import find, query
104-
from .read import read
109+
from .read import extract, extract_async, read
105110
from .recorder import Recorder, Trace, TraceStep, record
106111
from .runtime_agent import RuntimeAgent, RuntimeStep, StepVerification
107112
from .screenshot import screenshot
108113
from .sentience_methods import AgentAction, SentienceMethod
109114
from .snapshot import snapshot
110115
from .text_search import find_text_rect
116+
from .tools import BackendCapabilities, ToolContext, ToolRegistry, ToolSpec, register_default_tools
111117
from .tracer_factory import SENTIENCE_API_URL, create_tracer
112118
from .tracing import JsonlTraceSink, TraceEvent, Tracer, TraceSink
113119

@@ -186,6 +192,7 @@
186192
"backend_wait_for_stable",
187193
# Core SDK
188194
"SentienceBrowser",
195+
"AsyncSentienceBrowser",
189196
"Snapshot",
190197
"Element",
191198
"BBox",

sentience/actions.py

Lines changed: 229 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,12 @@
55
import asyncio
66
import time
77
from pathlib import Path
8+
from urllib.parse import quote_plus
89

910
from .browser import AsyncSentienceBrowser, SentienceBrowser
1011
from .browser_evaluator import BrowserEvaluator
1112
from .cursor_policy import CursorPolicy, build_human_cursor_path
12-
from .models import ActionResult, BBox, Snapshot
13+
from .models import ActionResult, BBox, Snapshot, SnapshotOptions
1314
from .sentience_methods import SentienceMethod
1415
from .snapshot import snapshot, snapshot_async
1516

@@ -709,6 +710,146 @@ def press(browser: SentienceBrowser, key: str, take_snapshot: bool = False) -> A
709710
)
710711

711712

713+
def _normalize_key_token(token: str) -> str:
    """
    Translate a user-facing key alias into the key name sent to the keyboard.

    Aliases are matched case-insensitively after trimming surrounding
    whitespace (e.g. "cmd" -> "Meta", " Return " -> "Enter"). A token that is
    not a known alias is returned trimmed, with its original casing intact.
    """
    aliases = dict(
        CMD="Meta",
        COMMAND="Meta",
        CTRL="Control",
        CONTROL="Control",
        ALT="Alt",
        OPTION="Alt",
        SHIFT="Shift",
        ESC="Escape",
        ESCAPE="Escape",
        ENTER="Enter",
        RETURN="Enter",
        TAB="Tab",
        SPACE="Space",
    )
    cleaned = token.strip()
    canonical = aliases.get(cleaned.upper())
    if canonical is None:
        # Not an alias: pass the caller's spelling through untouched.
        return cleaned
    return canonical
731+
732+
733+
def _parse_key_sequence(sequence: str) -> list[str]:
    """
    Split a key-sequence string into the individual key presses it describes.

    Entries are separated by commas and/or whitespace. An entry may be wrapped
    in braces ("{ENTER}", "{CTRL+L}"); the braces are removed. Within an entry,
    "+" joins the keys of a combo, and every key name is passed through
    _normalize_key_token().

    A literal plus key is preserved: "+" yields "+", and "CTRL++" yields
    "Control++" (Control together with the plus key). Entries that contain no
    key at all, such as "{}", are skipped.

    Args:
        sequence: Raw sequence, e.g. "CMD+H", "TAB, TAB, {ENTER}".

    Returns:
        One string per key press, in input order. Empty when the sequence
        contains no keys.
    """
    parts: list[str] = []
    # str.split() with no argument never yields empty or whitespace-padded
    # tokens, so no per-token strip/emptiness check is needed.
    for raw in sequence.replace(",", " ").split():
        if raw.startswith("{") and raw.endswith("}"):
            raw = raw[1:-1]

        names = [tok for tok in raw.split("+") if tok]
        # Dropping empty fragments above also drops a literal "+" key: a lone
        # "+" or a trailing "++" means the plus key itself, so put it back.
        if raw == "+" or raw.endswith("++"):
            names.append("+")
        if not names:
            # Nothing pressable (e.g. "{}"); never emit an empty key name.
            continue

        parts.append("+".join(_normalize_key_token(name) for name in names))
    return parts
747+
748+
749+
def send_keys(
    browser: SentienceBrowser,
    sequence: str,
    take_snapshot: bool = False,
    delay_ms: int = 50,
) -> ActionResult:
    """
    Press every key (or key combo) listed in ``sequence``, in order.

    Examples of accepted sequences: "CMD+H", "CTRL+SHIFT+P", "TAB, TAB ENTER".
    Entries are separated by commas/spaces and may be brace-wrapped, such as
    "{ENTER}" or "{CTRL+L}".

    Args:
        browser: SentienceBrowser instance
        sequence: Key sequence to send
        take_snapshot: Whether to take snapshot after the keys were sent
        delay_ms: Wait after each key press, in milliseconds (skipped when <= 0)

    Returns:
        ActionResult with success=True. The outcome is "navigated" when the
        page URL differs from the one seen before the first key press,
        otherwise "dom_updated".

    Raises:
        RuntimeError: If the browser has no page.
        ValueError: If the sequence contains no keys.
    """
    if not browser.page:
        raise RuntimeError("Browser not started. Call browser.start() first.")

    started_at = time.time()
    initial_url = browser.page.url

    key_presses = _parse_key_sequence(sequence)
    if not key_presses:
        raise ValueError("send_keys sequence is empty")

    pause_after_key = delay_ms > 0
    for key_press in key_presses:
        browser.page.keyboard.press(key_press)
        if pause_after_key:
            browser.page.wait_for_timeout(delay_ms)

    # Timing and URL comparison are captured before the optional snapshot so
    # the snapshot cost is not counted in duration_ms.
    elapsed_ms = int((time.time() - started_at) * 1000)
    navigated = initial_url != browser.page.url

    return ActionResult(
        success=True,
        duration_ms=elapsed_ms,
        outcome="navigated" if navigated else "dom_updated",
        url_changed=navigated,
        snapshot_after=snapshot(browser) if take_snapshot else None,
    )
791+
792+
793+
def _build_search_url(query: str, engine: str) -> str:
    """
    Build the results-page URL for ``query`` on the given search engine.

    Args:
        query: Search text; it is URL-encoded with quote_plus().
        engine: Engine name, matched case-insensitively after trimming
            whitespace. Accepted: "duckduckgo", "ddg", "google", "google.com",
            "bing".

    Returns:
        Absolute https URL of the engine's results page for the query.

    Raises:
        ValueError: If ``engine`` is not one of the accepted names.
    """
    # One entry per accepted alias; each value is the URL up to the query string.
    endpoints = {
        "duckduckgo": "https://duckduckgo.com/",
        "ddg": "https://duckduckgo.com/",
        "google": "https://www.google.com/search",
        "google.com": "https://www.google.com/search",
        "bing": "https://www.bing.com/search",
    }
    q = quote_plus(query)
    endpoint = endpoints.get(engine.strip().lower())
    if endpoint is None:
        raise ValueError(f"unsupported search engine: {engine}")
    return f"{endpoint}?q={q}"
805+
806+
807+
def search(
    browser: SentienceBrowser,
    query: str,
    engine: str = "duckduckgo",
    take_snapshot: bool = False,
    snapshot_options: SnapshotOptions | None = None,
) -> ActionResult:
    """
    Open the search results page for ``query`` on the chosen engine.

    Args:
        browser: SentienceBrowser instance
        query: Search query string
        engine: Search engine name (duckduckgo, google, google.com, bing)
        take_snapshot: Whether to take snapshot after navigation
        snapshot_options: Snapshot options passed to snapshot() when take_snapshot is True.

    Returns:
        ActionResult with success=True. The outcome is "navigated" when the
        page URL differs from the one seen before the navigation, otherwise
        "dom_updated".

    Raises:
        RuntimeError: If the browser has no page.
        ValueError: If the query is blank or the engine is not supported.
    """
    if not browser.page:
        raise RuntimeError("Browser not started. Call browser.start() first.")
    if not query.strip():
        raise ValueError("search query is empty")

    started_at = time.time()
    initial_url = browser.page.url

    browser.goto(_build_search_url(query, engine))
    browser.page.wait_for_load_state("networkidle")

    # Timing and URL comparison are captured before the optional snapshot so
    # the snapshot cost is not counted in duration_ms.
    elapsed_ms = int((time.time() - started_at) * 1000)
    navigated = initial_url != browser.page.url

    return ActionResult(
        success=True,
        duration_ms=elapsed_ms,
        outcome="navigated" if navigated else "dom_updated",
        url_changed=navigated,
        snapshot_after=snapshot(browser, snapshot_options) if take_snapshot else None,
    )
851+
852+
712853
def scroll_to(
713854
browser: SentienceBrowser,
714855
element_id: int,
@@ -1698,6 +1839,93 @@ async def press_async(
16981839
)
16991840

17001841

1842+
async def send_keys_async(
    browser: AsyncSentienceBrowser,
    sequence: str,
    take_snapshot: bool = False,
    delay_ms: int = 50,
) -> ActionResult:
    """
    Press every key (or key combo) listed in ``sequence``, in order (async).

    Args:
        browser: AsyncSentienceBrowser instance
        sequence: Key sequence to send, e.g. "CMD+H" or "TAB, {ENTER}"
        take_snapshot: Whether to take snapshot after the keys were sent
        delay_ms: Wait after each key press, in milliseconds (skipped when <= 0)

    Returns:
        ActionResult with success=True. The outcome is "navigated" when the
        page URL differs from the one seen before the first key press,
        otherwise "dom_updated".

    Raises:
        RuntimeError: If the browser has no page.
        ValueError: If the sequence contains no keys.
    """
    if not browser.page:
        raise RuntimeError("Browser not started. Call await browser.start() first.")

    started_at = time.time()
    initial_url = browser.page.url

    key_presses = _parse_key_sequence(sequence)
    if not key_presses:
        raise ValueError("send_keys sequence is empty")

    pause_after_key = delay_ms > 0
    for key_press in key_presses:
        await browser.page.keyboard.press(key_press)
        if pause_after_key:
            await browser.page.wait_for_timeout(delay_ms)

    # Timing and URL comparison are captured before the optional snapshot so
    # the snapshot cost is not counted in duration_ms.
    elapsed_ms = int((time.time() - started_at) * 1000)
    navigated = initial_url != browser.page.url

    captured: Snapshot | None = None
    if take_snapshot:
        captured = await snapshot_async(browser)

    return ActionResult(
        success=True,
        duration_ms=elapsed_ms,
        outcome="navigated" if navigated else "dom_updated",
        url_changed=navigated,
        snapshot_after=captured,
    )
1881+
1882+
1883+
async def search_async(
    browser: AsyncSentienceBrowser,
    query: str,
    engine: str = "duckduckgo",
    take_snapshot: bool = False,
    snapshot_options: SnapshotOptions | None = None,
) -> ActionResult:
    """
    Open the search results page for ``query`` on the chosen engine (async).

    Args:
        browser: AsyncSentienceBrowser instance
        query: Search query string
        engine: Search engine name (duckduckgo, google, google.com, bing)
        take_snapshot: Whether to take snapshot after navigation
        snapshot_options: Snapshot options passed to snapshot_async() when take_snapshot is True.

    Returns:
        ActionResult with success=True. The outcome is "navigated" when the
        page URL differs from the one seen before the navigation, otherwise
        "dom_updated".

    Raises:
        RuntimeError: If the browser has no page.
        ValueError: If the query is blank or the engine is not supported.
    """
    if not browser.page:
        raise RuntimeError("Browser not started. Call await browser.start() first.")
    if not query.strip():
        raise ValueError("search query is empty")

    started_at = time.time()
    initial_url = browser.page.url

    await browser.goto(_build_search_url(query, engine))
    await browser.page.wait_for_load_state("networkidle")

    # Timing and URL comparison are captured before the optional snapshot so
    # the snapshot cost is not counted in duration_ms.
    elapsed_ms = int((time.time() - started_at) * 1000)
    navigated = initial_url != browser.page.url

    captured: Snapshot | None = None
    if take_snapshot:
        captured = await snapshot_async(browser, snapshot_options)

    return ActionResult(
        success=True,
        duration_ms=elapsed_ms,
        outcome="navigated" if navigated else "dom_updated",
        url_changed=navigated,
        snapshot_after=captured,
    )
1927+
1928+
17011929
async def scroll_to_async(
17021930
browser: AsyncSentienceBrowser,
17031931
element_id: int,

0 commit comments

Comments
 (0)