From e86ee9e31d2de48476072b3ff569c2bbe568f735 Mon Sep 17 00:00:00 2001 From: rcholic Date: Sun, 21 Dec 2025 18:16:22 -0800 Subject: [PATCH 1/2] handle markdown read --- README.md | 23 ++++++++ examples/read_markdown.py | 53 ++++++++++++++++++ sentience/__init__.py | 2 + sentience/read.py | 111 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 189 insertions(+) create mode 100644 examples/read_markdown.py create mode 100644 sentience/read.py diff --git a/README.md b/README.md index 2e8dd94..81d7cad 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,28 @@ with SentienceBrowser(headless=False) as browser: - `.to_have_text(text)` - `.to_have_count(n)` +### Content Reading +- `read(browser, format="raw|text|markdown")` - Read page content + - **Default format: `"raw"`** - Returns HTML suitable for Turndown/markdownify + - `format="raw"` - Get cleaned HTML + - `format="markdown"` - Get high-quality markdown (uses markdownify internally) + - `format="text"` - Get plain text + + **Examples:** + ```python + from sentience import read + + # Get raw HTML (default) + result = read(browser) + html = result["content"] + + # Get high-quality markdown (uses markdownify automatically) + result = read(browser, format="markdown") + markdown = result["content"] + ``` + + See `examples/read_markdown.py` for complete examples. + ## Examples See `examples/` directory: @@ -72,6 +94,7 @@ See `examples/` directory: - `basic_agent.py` - Basic snapshot - `query_demo.py` - Query engine - `wait_and_click.py` - Wait and actions +- `read_markdown.py` - Reading page content and converting to markdown ## Testing diff --git a/examples/read_markdown.py b/examples/read_markdown.py new file mode 100644 index 0000000..e83602d --- /dev/null +++ b/examples/read_markdown.py @@ -0,0 +1,53 @@ +""" +Example: Reading page content and converting to markdown + +This example shows how to use the read() function to get page content +and convert it to high-quality markdown using markdownify. 
from sentience import SentienceBrowser, read
from markdownify import markdownify


def main():
    """Demonstrate the three ways of reading a page with ``read()``.

    Opens https://example.com in a headless ``SentienceBrowser``, waits for
    the network to go idle, then prints a size line and a 500-character
    preview for:

    1. raw HTML from ``read()`` converted locally with ``markdownify``,
    2. markdown requested directly via ``format="markdown"``,
    3. plain text via ``format="text"``.
    """

    def show(kind, body):
        # Shared report: total size first, then the first 500 characters.
        print(f"{kind} length: {len(body)} characters")
        print(body[:500])

    with SentienceBrowser(headless=True) as browser:
        page = browser.page
        page.goto("https://example.com")
        page.wait_for_load_state("networkidle")

        # Method 1: default format is "raw"; the conversion happens on our
        # side so the markdownify options can be tuned by the caller.
        print("=== Method 1: Raw HTML + markdownify (Recommended) ===")
        raw_html = read(browser)["content"]
        converted = markdownify(
            raw_html,
            heading_style="ATX",  # "#"-style headings
            bullets="-",  # "-" as the list marker
            strip=["script", "style", "nav", "footer", "header"],
        )
        show("Markdown", converted)
        print("\n")

        # Method 2: let read() produce the markdown itself.
        print("=== Method 2: Direct markdown (High-quality via markdownify) ===")
        direct_markdown = read(browser, format="markdown")["content"]
        show("Markdown", direct_markdown)
        print("\n")

        # Method 3: plain text only.
        print("=== Method 3: Plain text ===")
        plain_text = read(browser, format="text")["content"]
        show("Text", plain_text)


if __name__ == "__main__":
    main()
"""
Read page content - supports raw HTML, text, and markdown formats
"""

from typing import TYPE_CHECKING, Literal

if TYPE_CHECKING:
    # Used only in the annotation of read(); no runtime import is needed.
    from .browser import SentienceBrowser

# JavaScript evaluated in the page: forwards the options object to the
# extension's injected read() API and returns whatever it produces.
_READ_SCRIPT = """
(options) => {
    return window.sentience.read(options);
}
"""

# Tags removed from the HTML before it is converted to markdown.
_STRIPPED_TAGS = ("script", "style", "nav", "footer", "header", "noscript")


def _extension_read(browser, fmt):
    """Run ``window.sentience.read({format: fmt})`` in the page and return its result as-is."""
    return browser.page.evaluate(_READ_SCRIPT, {"format": fmt})


def read(
    browser: "SentienceBrowser",
    format: Literal["raw", "text", "markdown"] = "raw",  # noqa: A002
) -> dict:
    """
    Read page content as raw HTML, text, or markdown

    "raw" and "text" are returned exactly as the extension produces them.
    "markdown" reads the raw HTML and converts it with markdownify; when
    markdownify is not installed, the extension's own markdown output is
    returned instead.

    Args:
        browser: SentienceBrowser instance
        format: Output format - "raw" (default, returns HTML for Turndown/markdownify),
                "text" (plain text), or "markdown" (high-quality markdown via markdownify)

    Returns:
        dict with:
        - status: "success" or "error"
        - url: Current page URL
        - format: "raw", "text", or "markdown"
        - content: Page content as string
        - length: Content length in characters
        - error: Error message if status is "error"

    Raises:
        RuntimeError: If the browser has no page (browser not started).

    Examples:
        # Get raw HTML (default) - can be used with markdownify for better conversion
        result = read(browser)
        html_content = result["content"]

        # Get high-quality markdown (uses markdownify internally)
        result = read(browser, format="markdown")
        markdown = result["content"]

        # Get plain text
        result = read(browser, format="text")
        text = result["content"]
    """
    if not browser.page:
        raise RuntimeError("Browser not started. Call browser.start() first.")

    if format != "markdown":
        # "raw" and "text" are handled entirely by the extension.
        return _extension_read(browser, format)

    # Probe for the optional dependency before touching the page, so a missing
    # markdownify costs one extension call instead of a discarded raw read.
    # Only the import is guarded: an ImportError raised during conversion must
    # not be mistaken for "markdownify is not installed".
    try:
        from markdownify import markdownify as md
    except ImportError:
        return _extension_read(browser, "markdown")

    raw_result = _extension_read(browser, "raw")
    if raw_result.get("status") != "success":
        # Hand the extension's own error report back to the caller untouched.
        return raw_result

    try:
        markdown_content = md(
            raw_result["content"],
            heading_style="ATX",  # Use # for headings
            bullets="-",  # Use - for lists
            strip=list(_STRIPPED_TAGS),
        )
    except (KeyError, ValueError, TypeError, AttributeError, RecursionError) as e:
        # KeyError: the extension reported success without a "content" field.
        # RecursionError: assumed possible for very deeply nested HTML if the
        # converter recurses over the tree - TODO confirm against markdownify.
        return {
            "status": "error",
            "url": raw_result.get("url", ""),
            "format": "markdown",
            "content": "",
            "length": 0,
            "error": f"Markdown conversion failed: {e}",
        }

    return {
        "status": "success",
        # .get() keeps the success path consistent with the error path above.
        "url": raw_result.get("url", ""),
        "format": "markdown",
        "content": markdown_content,
        "length": len(markdown_content),
    }
b/.github/workflows/sync-extension.yml index 3aae02a..e87622e 100644 --- a/.github/workflows/sync-extension.yml +++ b/.github/workflows/sync-extension.yml @@ -62,17 +62,27 @@ jobs: mkdir -p extension-temp cd extension-temp - # Download each file from release + # Download individual files from release (reliable method) + echo "📁 Downloading individual files from release..." curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \ "https://api.github.com/repos/$REPO/releases/tags/$TAG" | \ - jq -r '.assets[] | select(.name | endswith(".js") or endswith(".wasm") or endswith(".json") or endswith(".d.ts")) | .browser_download_url' | \ - while read url; do - filename=$(basename "$url") - curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" "$url" -o "$filename" + jq -r '.assets[] | select(.name | endswith(".js") or endswith(".wasm") or endswith(".json") or endswith(".d.ts")) | "\(.browser_download_url)|\(.name)"' | \ + while IFS='|' read -r url name; do + if [ -n "$url" ] && [ "$url" != "null" ] && [ -n "$name" ]; then + # Preserve directory structure from asset name + # If name contains '/', create directories + dir=$(dirname "$name") + if [ "$dir" != "." ]; then + mkdir -p "$dir" + fi + echo " Downloading $name..." 
+ curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" "$url" -o "$name" + fi done - # Alternative: Download from release archive if available - # Or use the extension-package artifact + # Verify downloaded files + echo "📋 Downloaded files:" + ls -la - name: Copy extension files if: steps.release.outputs.skip != 'true' @@ -80,16 +90,63 @@ jobs: # Create extension directory structure mkdir -p sentience/extension/pkg - # Copy extension files - cp extension-temp/manifest.json sentience/extension/ 2>/dev/null || echo "manifest.json not found in release" - cp extension-temp/content.js sentience/extension/ 2>/dev/null || echo "content.js not found in release" - cp extension-temp/background.js sentience/extension/ 2>/dev/null || echo "background.js not found in release" - cp extension-temp/injected_api.js sentience/extension/ 2>/dev/null || echo "injected_api.js not found in release" + # Copy extension files (handle both root and extension-package/ subdirectory) + # Check root first, then extension-package/ subdirectory + if [ -f "extension-temp/manifest.json" ]; then + cp extension-temp/manifest.json sentience/extension/ + elif [ -f "extension-temp/extension-package/manifest.json" ]; then + cp extension-temp/extension-package/manifest.json sentience/extension/ + else + echo "⚠️ manifest.json not found" + fi + + if [ -f "extension-temp/content.js" ]; then + cp extension-temp/content.js sentience/extension/ + elif [ -f "extension-temp/extension-package/content.js" ]; then + cp extension-temp/extension-package/content.js sentience/extension/ + else + echo "⚠️ content.js not found" + fi + + if [ -f "extension-temp/background.js" ]; then + cp extension-temp/background.js sentience/extension/ + elif [ -f "extension-temp/extension-package/background.js" ]; then + cp extension-temp/extension-package/background.js sentience/extension/ + else + echo "⚠️ background.js not found" + fi + + if [ -f "extension-temp/injected_api.js" ]; then + cp 
extension-temp/injected_api.js sentience/extension/ + elif [ -f "extension-temp/extension-package/injected_api.js" ]; then + cp extension-temp/extension-package/injected_api.js sentience/extension/ + else + echo "⚠️ injected_api.js not found" + fi + + # Copy WASM files (check both locations) + if [ -f "extension-temp/pkg/sentience_core.js" ]; then + cp extension-temp/pkg/sentience_core.js sentience/extension/pkg/ + elif [ -f "extension-temp/extension-package/pkg/sentience_core.js" ]; then + cp extension-temp/extension-package/pkg/sentience_core.js sentience/extension/pkg/ + else + echo "⚠️ sentience_core.js not found" + fi - # Copy WASM files - cp extension-temp/pkg/sentience_core.js sentience/extension/pkg/ 2>/dev/null || echo "sentience_core.js not found" - cp extension-temp/pkg/sentience_core_bg.wasm sentience/extension/pkg/ 2>/dev/null || echo "sentience_core_bg.wasm not found" - cp extension-temp/pkg/*.d.ts sentience/extension/pkg/ 2>/dev/null || echo "Type definitions not found" + if [ -f "extension-temp/pkg/sentience_core_bg.wasm" ]; then + cp extension-temp/pkg/sentience_core_bg.wasm sentience/extension/pkg/ + elif [ -f "extension-temp/extension-package/pkg/sentience_core_bg.wasm" ]; then + cp extension-temp/extension-package/pkg/sentience_core_bg.wasm sentience/extension/pkg/ + else + echo "⚠️ sentience_core_bg.wasm not found" + fi + + # Copy TypeScript definitions + if [ -d "extension-temp/pkg" ]; then + cp extension-temp/pkg/*.d.ts sentience/extension/pkg/ 2>/dev/null || echo "⚠️ Type definitions not found" + elif [ -d "extension-temp/extension-package/pkg" ]; then + cp extension-temp/extension-package/pkg/*.d.ts sentience/extension/pkg/ 2>/dev/null || echo "⚠️ Type definitions not found" + fi - name: Check for changes if: steps.release.outputs.skip != 'true'