Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,37 @@ with SentienceBrowser(headless=False) as browser:

---

## 🆕 What's New (2026-01-06)

### Human-like Typing
Add realistic delays between keystrokes to mimic human typing:
```python
from sentience import type_text

# Type instantly (default)
type_text(browser, element_id, "Hello World")

# Type with human-like delay (~10ms between keystrokes)
type_text(browser, element_id, "Hello World", delay_ms=10)
```

### Scroll to Element
Scroll elements into view (smooth animation by default; instant scrolling and vertical alignment are configurable):
```python
from sentience import snapshot, find, scroll_to

snap = snapshot(browser)
button = find(snap, 'role=button text~"Submit"')

# Scroll element into view with smooth animation
scroll_to(browser, button.id)

# Scroll instantly, aligning the element with the top of the viewport
scroll_to(browser, button.id, behavior='instant', block='start')
```

---

<details>
<summary><h2>💼 Real-World Example: Amazon Shopping Bot</h2></summary>

Expand Down
3 changes: 2 additions & 1 deletion sentience/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Sentience Python SDK - AI Agent Browser Automation
"""

from .actions import click, click_rect, press, type_text
from .actions import click, click_rect, press, scroll_to, type_text
from .agent import SentienceAgent, SentienceAgentAsync
from .agent_config import AgentConfig

Expand Down Expand Up @@ -90,6 +90,7 @@
"click",
"type_text",
"press",
"scroll_to",
"click_rect",
"wait_for",
"expect",
Expand Down
210 changes: 204 additions & 6 deletions sentience/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,11 @@ def click( # noqa: C901


def type_text(
browser: SentienceBrowser, element_id: int, text: str, take_snapshot: bool = False
browser: SentienceBrowser,
element_id: int,
text: str,
take_snapshot: bool = False,
delay_ms: float = 0,
) -> ActionResult:
"""
Type text into an element (focus then input)
Expand All @@ -144,9 +148,16 @@ def type_text(
element_id: Element ID from snapshot
text: Text to type
take_snapshot: Whether to take snapshot after action
delay_ms: Delay between keystrokes in milliseconds for human-like typing (default: 0)

Returns:
ActionResult

Example:
>>> # Type instantly (default behavior)
>>> type_text(browser, element_id, "Hello World")
>>> # Type with human-like delay (~10ms between keystrokes)
>>> type_text(browser, element_id, "Hello World", delay_ms=10)
"""
if not browser.page:
raise RuntimeError("Browser not started. Call browser.start() first.")
Expand Down Expand Up @@ -177,8 +188,8 @@ def type_text(
error={"code": "focus_failed", "reason": "Element not found"},
)

# Type using Playwright keyboard
browser.page.keyboard.type(text)
# Type using Playwright keyboard with optional delay between keystrokes
browser.page.keyboard.type(text, delay=delay_ms)

duration_ms = int((time.time() - start_time) * 1000)
url_after = browser.page.url
Expand Down Expand Up @@ -242,6 +253,94 @@ def press(browser: SentienceBrowser, key: str, take_snapshot: bool = False) -> A
)


def scroll_to(
    browser: SentienceBrowser,
    element_id: int,
    behavior: str = "smooth",
    block: str = "center",
    take_snapshot: bool = False,
) -> ActionResult:
    """
    Scroll an element into view

    Looks the element up by its snapshot ID in the page's
    ``window.sentience_registry`` and calls ``scrollIntoView()`` on it, then
    pauses briefly so the scroll can settle before the result is built.

    Args:
        browser: SentienceBrowser instance
        element_id: Element ID from snapshot to scroll into view
        behavior: Scroll behavior - 'smooth', 'instant', or 'auto' (default: 'smooth')
        block: Vertical alignment - 'start', 'center', 'end', or 'nearest' (default: 'center')
        take_snapshot: Whether to take snapshot after action

    Returns:
        ActionResult. On failure (registry missing, element not in the registry,
        or element without ``scrollIntoView``) ``success`` is False and ``error``
        carries the code ``scroll_failed``. On success ``outcome`` is
        ``"navigated"`` if the page URL changed during the action, otherwise
        ``"dom_updated"``.

    Raises:
        RuntimeError: If the browser has not been started.

    Example:
        >>> snap = snapshot(browser)
        >>> button = find(snap, 'role=button text~"Submit"')
        >>> if button:
        ...     # Scroll element into view with smooth animation
        ...     scroll_to(browser, button.id)
        ...     # Scroll instantly, aligning the element with the top of the viewport
        ...     scroll_to(browser, button.id, behavior='instant', block='start')
    """
    if not browser.page:
        raise RuntimeError("Browser not started. Call browser.start() first.")

    start_time = time.time()
    url_before = browser.page.url

    # Scroll element into view using the element registry.
    # The registry lookup is guarded: if window.sentience_registry is absent,
    # indexing it would throw inside the page and surface as an evaluate()
    # exception instead of the structured "scroll_failed" result below.
    scrolled = browser.page.evaluate(
        """
        (args) => {
            const registry = window.sentience_registry;
            const el = registry ? registry[args.id] : undefined;
            if (el && el.scrollIntoView) {
                el.scrollIntoView({
                    behavior: args.behavior,
                    block: args.block,
                    inline: 'nearest'
                });
                return true;
            }
            return false;
        }
        """,
        {"id": element_id, "behavior": behavior, "block": block},
    )

    if not scrolled:
        return ActionResult(
            success=False,
            duration_ms=int((time.time() - start_time) * 1000),
            outcome="error",
            error={"code": "scroll_failed", "reason": "Element not found or not scrollable"},
        )

    # Wait a bit for scroll to complete: smooth scrolling is animated, so it
    # gets a longer pause than the other behaviors.
    wait_time = 500 if behavior == "smooth" else 100
    browser.page.wait_for_timeout(wait_time)

    # Duration includes the settle wait above, but not the optional snapshot.
    duration_ms = int((time.time() - start_time) * 1000)
    url_after = browser.page.url
    url_changed = url_before != url_after

    outcome = "navigated" if url_changed else "dom_updated"

    snapshot_after: Snapshot | None = None
    if take_snapshot:
        snapshot_after = snapshot(browser)

    return ActionResult(
        success=True,
        duration_ms=duration_ms,
        outcome=outcome,
        url_changed=url_changed,
        snapshot_after=snapshot_after,
    )


def _highlight_rect(
browser: SentienceBrowser, rect: dict[str, float], duration_sec: float = 2.0
) -> None:
Expand Down Expand Up @@ -553,7 +652,11 @@ async def click_async(


async def type_text_async(
browser: AsyncSentienceBrowser, element_id: int, text: str, take_snapshot: bool = False
browser: AsyncSentienceBrowser,
element_id: int,
text: str,
take_snapshot: bool = False,
delay_ms: float = 0,
) -> ActionResult:
"""
Type text into an element (async)
Expand All @@ -563,9 +666,16 @@ async def type_text_async(
element_id: Element ID from snapshot
text: Text to type
take_snapshot: Whether to take snapshot after action
delay_ms: Delay between keystrokes in milliseconds for human-like typing (default: 0)

Returns:
ActionResult

Example:
>>> # Type instantly (default behavior)
>>> await type_text_async(browser, element_id, "Hello World")
>>> # Type with human-like delay (~10ms between keystrokes)
>>> await type_text_async(browser, element_id, "Hello World", delay_ms=10)
"""
if not browser.page:
raise RuntimeError("Browser not started. Call await browser.start() first.")
Expand Down Expand Up @@ -596,8 +706,8 @@ async def type_text_async(
error={"code": "focus_failed", "reason": "Element not found"},
)

# Type using Playwright keyboard
await browser.page.keyboard.type(text)
# Type using Playwright keyboard with optional delay between keystrokes
await browser.page.keyboard.type(text, delay=delay_ms)

duration_ms = int((time.time() - start_time) * 1000)
url_after = browser.page.url
Expand Down Expand Up @@ -663,6 +773,94 @@ async def press_async(
)


async def scroll_to_async(
    browser: AsyncSentienceBrowser,
    element_id: int,
    behavior: str = "smooth",
    block: str = "center",
    take_snapshot: bool = False,
) -> ActionResult:
    """
    Scroll an element into view (async)

    Looks the element up by its snapshot ID in the page's
    ``window.sentience_registry`` and calls ``scrollIntoView()`` on it, then
    pauses briefly so the scroll can settle before the result is built.

    Args:
        browser: AsyncSentienceBrowser instance
        element_id: Element ID from snapshot to scroll into view
        behavior: Scroll behavior - 'smooth', 'instant', or 'auto' (default: 'smooth')
        block: Vertical alignment - 'start', 'center', 'end', or 'nearest' (default: 'center')
        take_snapshot: Whether to take snapshot after action

    Returns:
        ActionResult. On failure (registry missing, element not in the registry,
        or element without ``scrollIntoView``) ``success`` is False and ``error``
        carries the code ``scroll_failed``. On success ``outcome`` is
        ``"navigated"`` if the page URL changed during the action, otherwise
        ``"dom_updated"``.

    Raises:
        RuntimeError: If the browser has not been started.

    Example:
        >>> snap = await snapshot_async(browser)
        >>> button = find(snap, 'role=button text~"Submit"')
        >>> if button:
        ...     # Scroll element into view with smooth animation
        ...     await scroll_to_async(browser, button.id)
        ...     # Scroll instantly, aligning the element with the top of the viewport
        ...     await scroll_to_async(browser, button.id, behavior='instant', block='start')
    """
    if not browser.page:
        raise RuntimeError("Browser not started. Call await browser.start() first.")

    start_time = time.time()
    url_before = browser.page.url

    # Scroll element into view using the element registry.
    # The registry lookup is guarded: if window.sentience_registry is absent,
    # indexing it would throw inside the page and surface as an evaluate()
    # exception instead of the structured "scroll_failed" result below.
    scrolled = await browser.page.evaluate(
        """
        (args) => {
            const registry = window.sentience_registry;
            const el = registry ? registry[args.id] : undefined;
            if (el && el.scrollIntoView) {
                el.scrollIntoView({
                    behavior: args.behavior,
                    block: args.block,
                    inline: 'nearest'
                });
                return true;
            }
            return false;
        }
        """,
        {"id": element_id, "behavior": behavior, "block": block},
    )

    if not scrolled:
        return ActionResult(
            success=False,
            duration_ms=int((time.time() - start_time) * 1000),
            outcome="error",
            error={"code": "scroll_failed", "reason": "Element not found or not scrollable"},
        )

    # Wait a bit for scroll to complete: smooth scrolling is animated, so it
    # gets a longer pause than the other behaviors.
    wait_time = 500 if behavior == "smooth" else 100
    await browser.page.wait_for_timeout(wait_time)

    # Duration includes the settle wait above, but not the optional snapshot.
    duration_ms = int((time.time() - start_time) * 1000)
    url_after = browser.page.url
    url_changed = url_before != url_after

    outcome = "navigated" if url_changed else "dom_updated"

    snapshot_after: Snapshot | None = None
    if take_snapshot:
        snapshot_after = await snapshot_async(browser)

    return ActionResult(
        success=True,
        duration_ms=duration_ms,
        outcome=outcome,
        url_changed=url_changed,
        snapshot_after=snapshot_after,
    )


async def _highlight_rect_async(
browser: AsyncSentienceBrowser, rect: dict[str, float], duration_sec: float = 2.0
) -> None:
Expand Down
9 changes: 8 additions & 1 deletion sentience/async_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,13 @@

# ========== Actions (Phase 1) ==========
# Re-export async action functions from actions.py
from sentience.actions import click_async, click_rect_async, press_async, type_text_async
from sentience.actions import (
click_async,
click_rect_async,
press_async,
scroll_to_async,
type_text_async,
)

# ========== Phase 2C: Agent Layer ==========
# Re-export async agent classes from agent.py and base_agent.py
Expand Down Expand Up @@ -76,6 +82,7 @@
"click_async", # Re-exported from actions.py
"type_text_async", # Re-exported from actions.py
"press_async", # Re-exported from actions.py
"scroll_to_async", # Re-exported from actions.py
"click_rect_async", # Re-exported from actions.py
# Phase 2A: Core Utilities
"wait_for_async", # Re-exported from wait.py
Expand Down
6 changes: 3 additions & 3 deletions sentience/extension/background.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,14 @@ async function handleSnapshotProcessing(rawData, options = {}) {
const startTime = performance.now();
try {
if (!Array.isArray(rawData)) throw new Error("rawData must be an array");
if (rawData.length > 1e4 && (rawData = rawData.slice(0, 1e4)), await initWASM(),
if (rawData.length > 1e4 && (rawData = rawData.slice(0, 1e4)), await initWASM(),
!wasmReady) throw new Error("WASM module not initialized");
let analyzedElements, prunedRawData;
try {
const wasmPromise = new Promise((resolve, reject) => {
try {
let result;
result = options.limit || options.filter ? analyze_page_with_options(rawData, options) : analyze_page(rawData),
result = options.limit || options.filter ? analyze_page_with_options(rawData, options) : analyze_page(rawData),
resolve(result);
} catch (e) {
reject(e);
Expand Down Expand Up @@ -101,4 +101,4 @@ initWASM().catch(err => {}), chrome.runtime.onMessage.addListener((request, send
event.preventDefault();
}), self.addEventListener("unhandledrejection", event => {
event.preventDefault();
});
});
Loading
Loading