Merge pull request #193 from SentienceAPI/tweaking_fixes3

rcholic · web-flow · commit 534f0f9cae41 · 2026-01-24T21:35:23.000-08:00
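Tweak SDK methods based on WebBench findings: export read_best_effort, gate CAPTCHA blocking on strong evidence, add type_text delay_ms, require a DeepInfra API key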
diff --git a/sentience/__init__.py b/sentience/__init__.py
@@ -107,7 +107,7 @@
 from .ordinal import OrdinalIntent, boost_ordinal_elements, detect_ordinal_intent, select_by_ordinal
 from .overlay import clear_overlay, show_overlay
 from .query import find, query
-from .read import extract, extract_async, read
+from .read import extract, extract_async, read, read_best_effort
 from .recorder import Recorder, Trace, TraceStep, record
 from .runtime_agent import RuntimeAgent, RuntimeStep, StepVerification
 from .screenshot import screenshot
@@ -220,6 +220,7 @@
     "ScriptGenerator",
     "generate",
     "read",
+    "read_best_effort",
     "screenshot",
     "show_overlay",
     "clear_overlay",
diff --git a/sentience/agent_runtime.py b/sentience/agent_runtime.py
@@ -445,6 +445,29 @@ def _is_captcha_detected(self, snapshot: Snapshot) -> bool:
         captcha = getattr(snapshot.diagnostics, "captcha", None) if snapshot.diagnostics else None
         if not captcha or not getattr(captcha, "detected", False):
             return False
+        # IMPORTANT: Many sites load CAPTCHA libraries proactively. We only want to
+        # block execution when there's evidence it's actually *present/active*.
+        # If we block on low-signal detections (e.g. just a recaptcha script tag),
+        # interactive runs will “do nothing” and time out.
+        evidence = getattr(captcha, "evidence", None)
+        if evidence is not None:
+            def _list(name: str) -> list[str]:
+                try:
+                    v = getattr(evidence, name, None)
+                except Exception:
+                    v = None
+                if v is None and isinstance(evidence, dict):
+                    v = evidence.get(name)
+                if not v:
+                    return []
+                return [str(x) for x in v if x is not None]
+
+            iframe_hits = _list("iframe_src_hits")
+            url_hits = _list("url_hits")
+            text_hits = _list("text_hits")
+            # If we only saw selector/script hints, treat as non-blocking.
+            if not iframe_hits and not url_hits and not text_hits:
+                return False
         confidence = getattr(captcha, "confidence", 0.0)
         return confidence >= self._captcha_options.min_confidence
 
diff --git a/sentience/async_api.py b/sentience/async_api.py
@@ -52,8 +52,8 @@
 from sentience.query import find, query
 
 # ========== Phase 2B: Supporting Utilities ==========
-# Re-export async read function from read.py
-from sentience.read import read_async
+# Re-export async read functions from read.py
+from sentience.read import read_async, read_best_effort_async
 
 # ========== Phase 2D: Developer Tools ==========
 # Re-export async recorder and inspector from their modules
@@ -90,6 +90,7 @@
     "find_text_rect_async",  # Re-exported from text_search.py
     # Phase 2B: Supporting Utilities
     "read_async",  # Re-exported from read.py
+    "read_best_effort_async",  # Re-exported from read.py
     "show_overlay_async",  # Re-exported from overlay.py
     "clear_overlay_async",  # Re-exported from overlay.py
     "expect_async",  # Re-exported from expect.py
diff --git a/sentience/backends/actions.py b/sentience/backends/actions.py
@@ -121,6 +121,7 @@ async def type_text(
     text: str,
     target: BBox | dict[str, float] | tuple[float, float] | None = None,
     clear_first: bool = False,
+    delay_ms: float | None = None,
 ) -> ActionResult:
     """
     Type text, optionally clicking a target first.
@@ -159,8 +160,8 @@ async def type_text(
             await backend.eval("document.execCommand('selectAll')")
             await asyncio.sleep(0.02)
 
-        # Type the text
-        await backend.type_text(text)
+        # Type the text (optional human-like delay)
+        await backend.type_text(text, delay_ms=delay_ms)
 
         duration_ms = int((time.time() - start_time) * 1000)
         return ActionResult(
diff --git a/sentience/backends/cdp_backend.py b/sentience/backends/cdp_backend.py
@@ -342,8 +342,10 @@ async def wheel(
             },
         )
 
-    async def type_text(self, text: str) -> None:
+    async def type_text(self, text: str, delay_ms: float | None = None) -> None:
         """Type text using keyboard input."""
+        # Preserve historical default (~10ms) unless caller overrides.
+        per_char_delay_s = 0.01 if delay_ms is None else max(0.0, float(delay_ms) / 1000.0)
         for char in text:
             # Key down
             await self._transport.send(
@@ -372,8 +374,9 @@ async def type_text(self, text: str) -> None:
                 },
             )
 
-            # Small delay between characters
-            await asyncio.sleep(0.01)
+            # Delay between characters (human-like typing when requested)
+            if per_char_delay_s:
+                await asyncio.sleep(per_char_delay_s)
 
     async def wait_ready_state(
         self,
diff --git a/sentience/backends/playwright_backend.py b/sentience/backends/playwright_backend.py
@@ -315,9 +315,10 @@ async def wheel(
 
         await self._page.mouse.wheel(0, delta_y)
 
-    async def type_text(self, text: str) -> None:
+    async def type_text(self, text: str, delay_ms: float | None = None) -> None:
         """Type text using keyboard input."""
-        await self._page.keyboard.type(text)
+        delay = 0 if delay_ms is None else max(0, float(delay_ms))
+        await self._page.keyboard.type(text, delay=delay)
 
     async def wait_ready_state(
         self,
diff --git a/sentience/backends/protocol.py b/sentience/backends/protocol.py
@@ -188,14 +188,16 @@ async def wheel(
         """
         ...
 
-    async def type_text(self, text: str) -> None:
+    async def type_text(self, text: str, delay_ms: float | None = None) -> None:
         """
         Type text using keyboard input.
 
         Uses CDP Input.dispatchKeyEvent for each character.
 
         Args:
             text: Text to type
+            delay_ms: Optional per-keystroke delay in milliseconds (negatives clamp to 0).
+                      If None, each backend uses its own default (CDP ~10 ms, Playwright 0).
         """
         ...
 
diff --git a/sentience/llm_provider.py b/sentience/llm_provider.py
@@ -343,6 +343,14 @@ def __init__(
         base_url: str = "https://api.deepinfra.com/v1/openai",
     ):
         api_key = get_api_key_from_env(["DEEPINFRA_TOKEN", "DEEPINFRA_API_KEY"], api_key)
+        # IMPORTANT: If we pass api_key=None to the OpenAI SDK client, it may
+        # implicitly fall back to OPENAI_API_KEY from the environment.
+        # That leads to confusing 401s against DeepInfra with an OpenAI key.
+        if not api_key:
+            raise RuntimeError(
+                "DeepInfra API key is missing. Set DEEPINFRA_API_KEY (or DEEPINFRA_TOKEN), "
+                "or pass api_key=... to DeepInfraProvider."
+            )
         super().__init__(api_key=api_key, model=model, base_url=base_url)
 
 
diff --git a/sentience/read.py b/sentience/read.py
diff --git a/tests/test_backends.py b/tests/test_backends.py
diff --git a/tests/test_read.py b/tests/test_read.py