diff --git a/sentience/__init__.py b/sentience/__init__.py index 31ccf6a..c94fd41 100644 --- a/sentience/__init__.py +++ b/sentience/__init__.py @@ -107,7 +107,7 @@ from .ordinal import OrdinalIntent, boost_ordinal_elements, detect_ordinal_intent, select_by_ordinal from .overlay import clear_overlay, show_overlay from .query import find, query -from .read import extract, extract_async, read +from .read import extract, extract_async, read, read_best_effort from .recorder import Recorder, Trace, TraceStep, record from .runtime_agent import RuntimeAgent, RuntimeStep, StepVerification from .screenshot import screenshot @@ -220,6 +220,7 @@ "ScriptGenerator", "generate", "read", + "read_best_effort", "screenshot", "show_overlay", "clear_overlay", diff --git a/sentience/agent_runtime.py b/sentience/agent_runtime.py index e983204..6129e13 100644 --- a/sentience/agent_runtime.py +++ b/sentience/agent_runtime.py @@ -445,6 +445,29 @@ def _is_captcha_detected(self, snapshot: Snapshot) -> bool: captcha = getattr(snapshot.diagnostics, "captcha", None) if snapshot.diagnostics else None if not captcha or not getattr(captcha, "detected", False): return False + # IMPORTANT: Many sites load CAPTCHA libraries proactively. We only want to + # block execution when there's evidence it's actually *present/active*. + # If we block on low-signal detections (e.g. just a recaptcha script tag), + # interactive runs will “do nothing” and time out. + evidence = getattr(captcha, "evidence", None) + if evidence is not None: + def _list(name: str) -> list[str]: + try: + v = getattr(evidence, name, None) + except Exception: + v = None + if v is None and isinstance(evidence, dict): + v = evidence.get(name) + if not v: + return [] + return [str(x) for x in v if x is not None] + + iframe_hits = _list("iframe_src_hits") + url_hits = _list("url_hits") + text_hits = _list("text_hits") + # If we only saw selector/script hints, treat as non-blocking. + if not iframe_hits and not url_hits and not text_hits: + return False confidence = getattr(captcha, "confidence", 0.0) return confidence >= self._captcha_options.min_confidence diff --git a/sentience/async_api.py b/sentience/async_api.py index 362d947..f7ed012 100644 --- a/sentience/async_api.py +++ b/sentience/async_api.py @@ -52,8 +52,8 @@ from sentience.query import find, query # ========== Phase 2B: Supporting Utilities ========== -# Re-export async read function from read.py -from sentience.read import read_async +# Re-export async read functions from read.py +from sentience.read import read_async, read_best_effort_async # ========== Phase 2D: Developer Tools ========== # Re-export async recorder and inspector from their modules @@ -90,6 +90,7 @@ "find_text_rect_async", # Re-exported from text_search.py # Phase 2B: Supporting Utilities "read_async", # Re-exported from read.py + "read_best_effort_async", # Re-exported from read.py "show_overlay_async", # Re-exported from overlay.py "clear_overlay_async", # Re-exported from overlay.py "expect_async", # Re-exported from expect.py diff --git a/sentience/backends/actions.py b/sentience/backends/actions.py index a0bd168..01fa6aa 100644 --- a/sentience/backends/actions.py +++ b/sentience/backends/actions.py @@ -121,6 +121,7 @@ async def type_text( text: str, target: BBox | dict[str, float] | tuple[float, float] | None = None, clear_first: bool = False, + delay_ms: float | None = None, ) -> ActionResult: """ Type text, optionally clicking a target first. @@ -159,8 +160,8 @@ async def type_text( await backend.eval("document.execCommand('selectAll')") await asyncio.sleep(0.02) - # Type the text - await backend.type_text(text) + # Type the text (optional human-like delay) + await backend.type_text(text, delay_ms=delay_ms) duration_ms = int((time.time() - start_time) * 1000) return ActionResult( diff --git a/sentience/backends/cdp_backend.py b/sentience/backends/cdp_backend.py index 9c6741c..897e195 100644 --- a/sentience/backends/cdp_backend.py +++ b/sentience/backends/cdp_backend.py @@ -342,8 +342,10 @@ async def wheel( }, ) - async def type_text(self, text: str) -> None: + async def type_text(self, text: str, delay_ms: float | None = None) -> None: """Type text using keyboard input.""" + # Preserve historical default (~10ms) unless caller overrides. + per_char_delay_s = 0.01 if delay_ms is None else max(0.0, float(delay_ms) / 1000.0) for char in text: # Key down await self._transport.send( @@ -372,8 +374,9 @@ async def type_text(self, text: str) -> None: }, ) - # Small delay between characters - await asyncio.sleep(0.01) + # Delay between characters (human-like typing when requested) + if per_char_delay_s: + await asyncio.sleep(per_char_delay_s) async def wait_ready_state( self, diff --git a/sentience/backends/playwright_backend.py b/sentience/backends/playwright_backend.py index 47b9960..847cb0f 100644 --- a/sentience/backends/playwright_backend.py +++ b/sentience/backends/playwright_backend.py @@ -315,9 +315,10 @@ async def wheel( await self._page.mouse.wheel(0, delta_y) - async def type_text(self, text: str) -> None: + async def type_text(self, text: str, delay_ms: float | None = None) -> None: """Type text using keyboard input.""" - await self._page.keyboard.type(text) + delay = 0 if delay_ms is None else max(0, float(delay_ms)) + await self._page.keyboard.type(text, delay=delay) async def wait_ready_state( self, diff --git a/sentience/backends/protocol.py b/sentience/backends/protocol.py index 3c5517f..aa76fda 100644 --- a/sentience/backends/protocol.py +++ b/sentience/backends/protocol.py @@ -188,7 +188,7 @@ async def wheel( """ ... - async def type_text(self, text: str) -> None: + async def type_text(self, text: str, delay_ms: float | None = None) -> None: """ Type text using keyboard input. @@ -196,6 +196,8 @@ async def type_text(self, text: str) -> None: Args: text: Text to type + delay_ms: Optional delay between keystrokes in milliseconds. + If None, backend default behavior is used. """ ... diff --git a/sentience/llm_provider.py b/sentience/llm_provider.py index e220e75..cb25603 100644 --- a/sentience/llm_provider.py +++ b/sentience/llm_provider.py @@ -343,6 +343,14 @@ def __init__( base_url: str = "https://api.deepinfra.com/v1/openai", ): api_key = get_api_key_from_env(["DEEPINFRA_TOKEN", "DEEPINFRA_API_KEY"], api_key) + # IMPORTANT: If we pass api_key=None to the OpenAI SDK client, it may + # implicitly fall back to OPENAI_API_KEY from the environment. + # That leads to confusing 401s against DeepInfra with an OpenAI key. + if not api_key: + raise RuntimeError( + "DeepInfra API key is missing. Set DEEPINFRA_API_KEY (or DEEPINFRA_TOKEN), " + "or pass api_key=... to DeepInfraProvider." + ) super().__init__(api_key=api_key, model=model, base_url=base_url) diff --git a/sentience/read.py b/sentience/read.py index 8245f03..e11299f 100644 --- a/sentience/read.py +++ b/sentience/read.py @@ -2,6 +2,7 @@ Read page content - supports raw HTML, text, and markdown formats """ +import os import json import re from typing import Any, Literal @@ -9,10 +10,168 @@ from pydantic import BaseModel, ValidationError from .browser import AsyncSentienceBrowser, SentienceBrowser +from .browser_evaluator import BrowserEvaluator from .llm_provider import LLMProvider from .models import ExtractResult, ReadResult +_READ_EVAL_JS = r""" +(options) => { + const fmt = (options && options.format) ? options.format : "raw"; + try { + const api = + (typeof globalThis !== "undefined" && globalThis && globalThis.sentience) + ? globalThis.sentience + : (typeof window !== "undefined" && window && window.sentience) + ? window.sentience + : null; + + if (!api || typeof api.read !== "function") { + return { + status: "error", + url: (typeof location !== "undefined" && location && location.href) ? location.href : "", + format: fmt, + content: "", + length: 0, + error: "sentience extension not available (window.sentience.read missing)" + }; + } + + const res = api.read(options || { format: fmt }); + if (!res || typeof res !== "object") { + return { + status: "error", + url: (typeof location !== "undefined" && location && location.href) ? location.href : "", + format: fmt, + content: "", + length: 0, + error: "sentience.read returned non-object" + }; + } + + // Normalize to the ReadResult schema expected by SDK consumers. + // If the extension returns an error without an explicit status, treat it as error. + if (!res.status) res.status = (res.error ? "error" : "success"); + if (!res.url) res.url = (typeof location !== "undefined" && location && location.href) ? location.href : ""; + if (!res.format) res.format = fmt; + if (typeof res.content !== "string") res.content = String(res.content ?? ""); + if (typeof res.length !== "number") res.length = res.content.length; + if (!("error" in res)) res.error = null; + return res; + } catch (e) { + const msg = + (e && (e.stack || e.message)) ? (e.stack || e.message) : String(e); + return { + status: "error", + url: (typeof location !== "undefined" && location && location.href) ? location.href : "", + format: fmt, + content: "", + length: 0, + error: msg + }; + } +} +""" + + +def _looks_empty_content(content: str) -> bool: + # Some pages can legitimately be short, but for "read" the empty/near-empty + # case is almost always an integration failure (extension returned ""/"\n"/" "). + if content is None: + return True + if not isinstance(content, str): + content = str(content) + return len(content.strip()) == 0 + + +def _debug_read(msg: str) -> None: + if os.environ.get("SENTIENCE_DEBUG_READ", "").strip(): + print(f"[sentience][read] {msg}") + + +def _fallback_read_from_page_sync( + page, + *, + output_format: Literal["raw", "text", "markdown"], +) -> ReadResult | None: + """ + Fallback reader that does NOT rely on the extension. + Uses Playwright primitives directly. + """ + try: + url = getattr(page, "url", "") or "" + if output_format == "raw": + html = page.content() + if not isinstance(html, str) or _looks_empty_content(html): + return None + return ReadResult(status="success", url=url, format="raw", content=html, length=len(html)) + + if output_format == "text": + text = page.evaluate( + "() => (document && document.body) ? (document.body.innerText || '') : ''" + ) + if not isinstance(text, str) or _looks_empty_content(text): + return None + return ReadResult(status="success", url=url, format="text", content=text, length=len(text)) + + if output_format == "markdown": + try: + from markdownify import markdownify # type: ignore + except Exception: + return None + html = page.content() + if not isinstance(html, str) or _looks_empty_content(html): + return None + md = markdownify(html, heading_style="ATX", wrap=True) + if not isinstance(md, str) or _looks_empty_content(md): + return None + return ReadResult(status="success", url=url, format="markdown", content=md, length=len(md)) + except Exception: + return None + return None + + +async def _fallback_read_from_page_async( + page, + *, + output_format: Literal["raw", "text", "markdown"], +) -> ReadResult | None: + """ + Async variant of `_fallback_read_from_page_sync`. + """ + try: + url = getattr(page, "url", "") or "" + if output_format == "raw": + html = await page.content() + if not isinstance(html, str) or _looks_empty_content(html): + return None + return ReadResult(status="success", url=url, format="raw", content=html, length=len(html)) + + if output_format == "text": + text = await page.evaluate( + "() => (document && document.body) ? (document.body.innerText || '') : ''" + ) + if not isinstance(text, str) or _looks_empty_content(text): + return None + return ReadResult(status="success", url=url, format="text", content=text, length=len(text)) + + if output_format == "markdown": + try: + from markdownify import markdownify # type: ignore + except Exception: + return None + html = await page.content() + if not isinstance(html, str) or _looks_empty_content(html): + return None + md = markdownify(html, heading_style="ATX", wrap=True) + if not isinstance(md, str) or _looks_empty_content(md): + return None + return ReadResult(status="success", url=url, format="markdown", content=md, length=len(md)) + except Exception: + return None + return None + + def read( browser: SentienceBrowser, output_format: Literal["raw", "text", "markdown"] = "raw", @@ -53,14 +212,17 @@ def read( if not browser.page: raise RuntimeError("Browser not started. Call browser.start() first.") + # Best-effort: wait for extension injection, like snapshot/text_search do. + # This prevents transient "window.sentience undefined" right after navigation. + try: + BrowserEvaluator.wait_for_extension(browser.page, timeout_ms=5000) + except Exception: + pass + if output_format == "markdown" and enhance_markdown: # Get raw HTML from the extension first raw_html_result = browser.page.evaluate( - """ - (options) => { - return window.sentience.read(options); - } - """, + _READ_EVAL_JS, {"format": "raw"}, ) @@ -68,9 +230,20 @@ def read( html_content = raw_html_result["content"] try: # Use markdownify for enhanced markdown conversion - from markdownify import MarkdownifyError, markdownify + from markdownify import markdownify # type: ignore + try: + # Some markdownify versions don't expose MarkdownifyError. + from markdownify import MarkdownifyError # type: ignore + except Exception: # pragma: no cover + MarkdownifyError = Exception # type: ignore[misc,assignment] markdown_content = markdownify(html_content, heading_style="ATX", wrap=True) + if _looks_empty_content(markdown_content): + # Extension returned empty/near-empty HTML; try Playwright fallback. + fb = _fallback_read_from_page_sync(browser.page, output_format="markdown") + if fb is not None: + _debug_read("fallback=playwright reason=empty_markdown_from_extension") + return fb return ReadResult( status="success", url=raw_html_result["url"], @@ -88,19 +261,59 @@ def read( print( f"Warning: An unexpected error occurred with markdownify ({e}), falling back to extension's markdown." ) + else: + # Extension raw read failed; try Playwright fallback for markdown if possible. + fb = _fallback_read_from_page_sync(browser.page, output_format="markdown") + if fb is not None: + _debug_read("fallback=playwright reason=extension_raw_failed format=markdown") + return fb # If not enhanced markdown, or fallback, call extension with requested format result = browser.page.evaluate( - """ - (options) => { - return window.sentience.read(options); - } - """, + _READ_EVAL_JS, {"format": output_format}, ) # Convert dict result to ReadResult model - return ReadResult(**result) + rr = ReadResult(**result) + if rr.status == "success" and _looks_empty_content(rr.content): + fb = _fallback_read_from_page_sync(browser.page, output_format=output_format) + if fb is not None: + _debug_read( + f"fallback=playwright reason=empty_content_from_extension format={output_format}" + ) + return fb + # If we couldn't fallback, treat near-empty as error so callers don't + # mistakenly treat it as a successful read. + return ReadResult( + status="error", + url=rr.url, + format=rr.format, + content=rr.content, + length=rr.length, + error="empty_content", + ) + return rr + + +def read_best_effort( + browser: SentienceBrowser, + output_format: Literal["raw", "text", "markdown"] = "raw", + enhance_markdown: bool = True, +) -> ReadResult: + """ + Best-effort read. + + This function exists to give callers a stable API contract: + - Prefer the extension-backed `read()` path (when available) + - If the extension returns `success` but empty/near-empty content, fallback to + Playwright primitives (page.content()/innerText and HTML→markdownify for markdown) + + Today, `read()` already implements this best-effort behavior; this wrapper + is intentionally thin so we can extend the fallback chain without changing + semantics for callers that want explicit "best effort" behavior. + """ + return read(browser, output_format=output_format, enhance_markdown=enhance_markdown) async def read_async( @@ -143,14 +356,16 @@ async def read_async( if not browser.page: raise RuntimeError("Browser not started. Call await browser.start() first.") + # Best-effort: wait for extension injection, like snapshot/text_search do. + try: + await BrowserEvaluator.wait_for_extension_async(browser.page, timeout_ms=5000) + except Exception: + pass + if output_format == "markdown" and enhance_markdown: # Get raw HTML from the extension first raw_html_result = await browser.page.evaluate( - """ - (options) => { - return window.sentience.read(options); - } - """, + _READ_EVAL_JS, {"format": "raw"}, ) @@ -158,9 +373,20 @@ async def read_async( html_content = raw_html_result["content"] try: # Use markdownify for enhanced markdown conversion - from markdownify import MarkdownifyError, markdownify + from markdownify import markdownify # type: ignore + try: + from markdownify import MarkdownifyError # type: ignore + except Exception: # pragma: no cover + MarkdownifyError = Exception # type: ignore[misc,assignment] markdown_content = markdownify(html_content, heading_style="ATX", wrap=True) + if _looks_empty_content(markdown_content): + fb = await _fallback_read_from_page_async( + browser.page, output_format="markdown" + ) + if fb is not None: + _debug_read("fallback=playwright reason=empty_markdown_from_extension") + return fb return ReadResult( status="success", url=raw_html_result["url"], @@ -178,19 +404,46 @@ async def read_async( print( f"Warning: An unexpected error occurred with markdownify ({e}), falling back to extension's markdown." ) + else: + fb = await _fallback_read_from_page_async(browser.page, output_format="markdown") + if fb is not None: + _debug_read("fallback=playwright reason=extension_raw_failed format=markdown") + return fb # If not enhanced markdown, or fallback, call extension with requested format result = await browser.page.evaluate( - """ - (options) => { - return window.sentience.read(options); - } - """, + _READ_EVAL_JS, {"format": output_format}, ) - # Convert dict result to ReadResult model - return ReadResult(**result) + rr = ReadResult(**result) + if rr.status == "success" and _looks_empty_content(rr.content): + fb = await _fallback_read_from_page_async(browser.page, output_format=output_format) + if fb is not None: + _debug_read(f"fallback=playwright reason=empty_content_from_extension format={output_format}") + return fb + return ReadResult( + status="error", + url=rr.url, + format=rr.format, + content=rr.content, + length=rr.length, + error="empty_content", + ) + return rr + + +async def read_best_effort_async( + browser: AsyncSentienceBrowser, + output_format: Literal["raw", "text", "markdown"] = "raw", + enhance_markdown: bool = True, +) -> ReadResult: + """ + Async best-effort read. See `read_best_effort()` for semantics. + """ + return await read_async( + browser, output_format=output_format, enhance_markdown=enhance_markdown + ) def _extract_json_payload(text: str) -> dict[str, Any]: @@ -221,7 +474,7 @@ def extract( schema_desc = "" if schema is not None: schema_desc = json.dumps(schema.model_json_schema(), ensure_ascii=False) - system = "You extract structured data from markdown content. " "Return only JSON. No prose." + system = "You extract structured data from markdown content. Return only JSON. No prose." user = f"QUERY:\n{query}\n\nSCHEMA:\n{schema_desc}\n\nCONTENT:\n{content}" response = llm.generate(system, user) raw = response.content.strip() @@ -255,7 +508,7 @@ async def extract_async( schema_desc = "" if schema is not None: schema_desc = json.dumps(schema.model_json_schema(), ensure_ascii=False) - system = "You extract structured data from markdown content. " "Return only JSON. No prose." + system = "You extract structured data from markdown content. Return only JSON. No prose." user = f"QUERY:\n{query}\n\nSCHEMA:\n{schema_desc}\n\nCONTENT:\n{content}" response = await llm.generate_async(system, user) raw = response.content.strip() diff --git a/tests/test_backends.py b/tests/test_backends.py index 4f7da32..43c153f 100644 --- a/tests/test_backends.py +++ b/tests/test_backends.py @@ -771,7 +771,7 @@ async def test_type_text(self) -> None: backend = PlaywrightBackend(mock_page) await backend.type_text("Hello") - mock_keyboard.type.assert_called_once_with("Hello") + mock_keyboard.type.assert_called_once_with("Hello", delay=0) @pytest.mark.asyncio async def test_screenshot_png(self) -> None: diff --git a/tests/test_read.py b/tests/test_read.py index 328eea1..578cb80 100644 --- a/tests/test_read.py +++ b/tests/test_read.py @@ -2,6 +2,8 @@ Tests for read functionality """ +from typing import Any + from sentience import SentienceBrowser, read @@ -61,3 +63,37 @@ def test_read_markdown_enhanced(): # Note: They might be similar for simple pages, but enhanced should handle more cases assert isinstance(result_enhanced.content, str) assert isinstance(result_basic.content, str) + + +def test_read_falls_back_when_extension_returns_success_but_empty(monkeypatch): + """ + Regression test: some sites can produce extension-backed reads with + status='success' but empty/near-empty content (e.g. '\\n'). + + In that case, SDK should fall back to Playwright primitives (page.content()). + """ + with SentienceBrowser(headless=True) as browser: + browser.page.goto("https://example.com") + browser.page.wait_for_load_state("networkidle") + + def fake_evaluate(_script: Any, arg: Any = None): + fmt = "raw" + if isinstance(arg, dict) and "format" in arg: + fmt = arg["format"] + # Mimic buggy extension response: success but empty + return { + "status": "success", + "url": browser.page.url, + "format": fmt, + "content": "\n", + "length": 1, + "error": None, + } + + monkeypatch.setattr(browser.page, "evaluate", fake_evaluate) + + result = read(browser, output_format="raw", enhance_markdown=False) + assert result.status == "success" + assert result.format == "raw" + assert result.length > 100 + assert "