diff --git a/.github/workflows/sync-extension.yml b/.github/workflows/sync-extension.yml index d64f72b..c1f8247 100644 --- a/.github/workflows/sync-extension.yml +++ b/.github/workflows/sync-extension.yml @@ -63,36 +63,25 @@ jobs: mkdir -p extension-temp cd extension-temp - # First, try to download the zip archive if available - ZIP_URL=$(curl -s -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \ + # Download individual files from release (reliable method - no zip) + echo "📁 Downloading individual files from release..." + curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \ "https://api.github.com/repos/$REPO/releases/tags/$TAG" | \ - jq -r '.assets[] | select(.name == "extension-package.zip") | .browser_download_url') - - if [ -n "$ZIP_URL" ] && [ "$ZIP_URL" != "null" ]; then - echo "📦 Downloading extension-package.zip..." - curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" "$ZIP_URL" -o extension-package.zip - unzip -q extension-package.zip -d . - # Files should now be in extension-temp/extension-package/ or extension-temp/ - if [ -d "extension-package" ]; then - mv extension-package/* . 2>/dev/null || true - rmdir extension-package 2>/dev/null || true - fi - else - echo "📁 Downloading individual files from release..." - # Download each file from release - curl -s -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \ - "https://api.github.com/repos/$REPO/releases/tags/$TAG" | \ - jq -r '.assets[] | select(.name | endswith(".js") or endswith(".wasm") or endswith(".json") or endswith(".d.ts")) | .browser_download_url' | \ - while read url; do - if [ -n "$url" ] && [ "$url" != "null" ]; then - filename=$(basename "$url") - echo " Downloading $filename..." 
- curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" "$url" -o "$filename" + jq -r '.assets[] | select(.name | endswith(".js") or endswith(".wasm") or endswith(".json") or endswith(".d.ts")) | "\(.browser_download_url)|\(.name)"' | \ + while IFS='|' read -r url name; do + if [ -n "$url" ] && [ "$url" != "null" ] && [ -n "$name" ]; then + # Preserve directory structure from asset name + # If name contains '/', create directories + dir=$(dirname "$name") + if [ "$dir" != "." ]; then + mkdir -p "$dir" fi - done - fi + echo " Downloading $name..." + curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" "$url" -o "$name" + fi + done - # Verify files were downloaded + # Verify downloaded files echo "📋 Downloaded files:" ls -la @@ -102,25 +91,53 @@ jobs: # Create extension directory structure mkdir -p sentience/extension/pkg - # Copy extension files (check both root and pkg subdirectory) - cp extension-temp/manifest.json sentience/extension/ 2>/dev/null || echo "⚠️ manifest.json not found in release" - cp extension-temp/content.js sentience/extension/ 2>/dev/null || echo "⚠️ content.js not found in release" - cp extension-temp/background.js sentience/extension/ 2>/dev/null || echo "⚠️ background.js not found in release" - cp extension-temp/injected_api.js sentience/extension/ 2>/dev/null || echo "⚠️ injected_api.js not found in release" + # Copy extension files (handle both root and extension-package/ subdirectory) + # Check root first, then extension-package/ subdirectory + if [ -f "extension-temp/manifest.json" ]; then + cp extension-temp/manifest.json sentience/extension/ + elif [ -f "extension-temp/extension-package/manifest.json" ]; then + cp extension-temp/extension-package/manifest.json sentience/extension/ + else + echo "⚠️ manifest.json not found" + fi + + if [ -f "extension-temp/content.js" ]; then + cp extension-temp/content.js sentience/extension/ + elif [ -f "extension-temp/extension-package/content.js" ]; then + cp 
extension-temp/extension-package/content.js sentience/extension/ + else + echo "⚠️ content.js not found" + fi + + if [ -f "extension-temp/background.js" ]; then + cp extension-temp/background.js sentience/extension/ + elif [ -f "extension-temp/extension-package/background.js" ]; then + cp extension-temp/extension-package/background.js sentience/extension/ + else + echo "⚠️ background.js not found" + fi + + if [ -f "extension-temp/injected_api.js" ]; then + cp extension-temp/injected_api.js sentience/extension/ + elif [ -f "extension-temp/extension-package/injected_api.js" ]; then + cp extension-temp/extension-package/injected_api.js sentience/extension/ + else + echo "⚠️ injected_api.js not found" + fi - # Copy WASM files (check both root and pkg subdirectory) + # Copy WASM files (check both locations) if [ -f "extension-temp/pkg/sentience_core.js" ]; then cp extension-temp/pkg/sentience_core.js sentience/extension/pkg/ - elif [ -f "extension-temp/sentience_core.js" ]; then - cp extension-temp/sentience_core.js sentience/extension/pkg/ + elif [ -f "extension-temp/extension-package/pkg/sentience_core.js" ]; then + cp extension-temp/extension-package/pkg/sentience_core.js sentience/extension/pkg/ else echo "⚠️ sentience_core.js not found" fi if [ -f "extension-temp/pkg/sentience_core_bg.wasm" ]; then cp extension-temp/pkg/sentience_core_bg.wasm sentience/extension/pkg/ - elif [ -f "extension-temp/sentience_core_bg.wasm" ]; then - cp extension-temp/sentience_core_bg.wasm sentience/extension/pkg/ + elif [ -f "extension-temp/extension-package/pkg/sentience_core_bg.wasm" ]; then + cp extension-temp/extension-package/pkg/sentience_core_bg.wasm sentience/extension/pkg/ else echo "⚠️ sentience_core_bg.wasm not found" fi @@ -128,8 +145,8 @@ jobs: # Copy TypeScript definitions if [ -d "extension-temp/pkg" ]; then cp extension-temp/pkg/*.d.ts sentience/extension/pkg/ 2>/dev/null || echo "⚠️ Type definitions not found" - elif [ -d "extension-temp" ]; then - cp 
extension-temp/*.d.ts sentience/extension/pkg/ 2>/dev/null || echo "⚠️ Type definitions not found" + elif [ -d "extension-temp/extension-package/pkg" ]; then + cp extension-temp/extension-package/pkg/*.d.ts sentience/extension/pkg/ 2>/dev/null || echo "⚠️ Type definitions not found" fi # Verify copied files @@ -156,9 +173,9 @@ jobs: if: steps.release.outputs.skip != 'true' && steps.changes.outputs.changed == 'true' uses: peter-evans/create-pull-request@v5 with: - # Use GITHUB_TOKEN (built-in) if repository allows PR creation, otherwise use PR_TOKEN (PAT) + # Use PR_TOKEN (a PAT; needed for repos with org restrictions); there is no fallback to GITHUB_TOKEN # To use PAT: create secret named PR_TOKEN with a Personal Access Token that has 'repo' scope - token: ${{ secrets.PR_TOKEN || secrets.GITHUB_TOKEN }} + token: ${{ secrets.PR_TOKEN }} commit-message: "chore: sync extension files from sentience-chrome ${{ steps.release.outputs.tag }}" title: "Sync Extension: ${{ steps.release.outputs.tag }}" body: | diff --git a/README.md b/README.md index 52dadf7..a3de603 100644 --- a/README.md +++ b/README.md @@ -73,6 +73,28 @@ with SentienceBrowser(headless=False) as browser: - `.to_have_text(text)` - `.to_have_count(n)` +### Content Reading +- `read(browser, output_format="raw|text|markdown")` - Read page content + - **Default format: `"raw"`** - Returns HTML suitable for Turndown/markdownify + - `output_format="raw"` - Get cleaned HTML + - `output_format="markdown"` - Get high-quality markdown (uses markdownify internally) + - `output_format="text"` - Get plain text + + **Examples:** + ```python + from sentience import read + + # Get raw HTML (default) + result = read(browser) + html = result["content"] + + # Get high-quality markdown (uses markdownify automatically) + result = read(browser, output_format="markdown") + markdown = result["content"] + ``` + + See `examples/read_markdown.py` for complete examples. 
+ ## Examples See `examples/` directory: @@ -80,6 +102,7 @@ See `examples/` directory: - `basic_agent.py` - Basic snapshot - `query_demo.py` - Query engine - `wait_and_click.py` - Wait and actions +- `read_markdown.py` - Reading page content and converting to markdown ### Content Reading Example diff --git a/examples/read_markdown.py b/examples/read_markdown.py new file mode 100644 index 0000000..e83602d --- /dev/null +++ b/examples/read_markdown.py @@ -0,0 +1,53 @@ +""" +Example: Reading page content and converting to markdown + +This example shows how to use the read() function to get page content +and convert it to high-quality markdown using markdownify. +""" + +from sentience import SentienceBrowser, read +from markdownify import markdownify + + +def main(): + # Initialize browser + with SentienceBrowser(headless=True) as browser: + # Navigate to a page + browser.page.goto("https://example.com") + browser.page.wait_for_load_state("networkidle") + + # Method 1: Get raw HTML (default) and convert with markdownify + print("=== Method 1: Raw HTML + markdownify (Recommended) ===") + result = read(browser) # format="raw" is default + html_content = result["content"] + + # Convert to markdown using markdownify (better quality) + markdown = markdownify( + html_content, + heading_style="ATX", # Use # for headings + bullets="-", # Use - for lists + strip=['script', 'style', 'nav', 'footer', 'header'], # Strip unwanted tags + ) + print(f"Markdown length: {len(markdown)} characters") + print(markdown[:500]) # Print first 500 chars + print("\n") + + # Method 2: Get high-quality markdown directly (uses markdownify internally) + print("=== Method 2: Direct markdown (High-quality via markdownify) ===") + result = read(browser, format="markdown") + high_quality_markdown = result["content"] + print(f"Markdown length: {len(high_quality_markdown)} characters") + print(high_quality_markdown[:500]) # Print first 500 chars + print("\n") + + # Method 3: Get plain text + print("=== 
def _extension_read(page, fmt):
    """Ask the in-page Sentience extension for the page content in format *fmt*."""
    return page.evaluate(
        """
        (options) => {
            return window.sentience.read(options);
        }
        """,
        {"format": fmt},
    )


def _markdown_via_markdownify(page):
    """Convert the extension's raw HTML with markdownify.

    Returns a result dict on success, or None when the caller should fall
    back to the extension's own markdown converter (raw read failed,
    markdownify is not installed, or the conversion raised).
    """
    raw_result = _extension_read(page, "raw")
    if raw_result.get("status") != "success":
        return None

    try:
        # Imported lazily so markdownify stays an optional dependency.
        # Only `markdownify` is imported: requesting any other name that the
        # package does not export would raise ImportError here and silently
        # disable the enhanced path.
        from markdownify import markdownify
    except ImportError:
        print("Warning: 'markdownify' not installed. Install with 'pip install markdownify' for enhanced markdown. Falling back to extension's markdown.")
        return None

    try:
        markdown_content = markdownify(raw_result["content"], heading_style="ATX", wrap=True)
    except Exception as e:
        # Deliberate best-effort boundary: any conversion failure is reported
        # and the caller falls back to the extension's converter.
        print(f"Warning: markdownify failed ({e}), falling back to extension's markdown.")
        return None

    return {
        "status": "success",
        "url": raw_result.get("url"),
        "format": "markdown",
        "content": markdown_content,
        "length": len(markdown_content),
    }


def read(
    browser: "SentienceBrowser",
    output_format: Literal["raw", "text", "markdown"] = "raw",
    enhance_markdown: bool = True,
    *,
    format: Literal["raw", "text", "markdown", None] = None,
) -> dict:
    """
    Read page content as raw HTML, text, or markdown

    Args:
        browser: SentienceBrowser instance
        output_format: Output format - "raw" (default, returns HTML for external processing),
                       "text" (plain text), or "markdown".
        enhance_markdown: If True and the format is "markdown", converts the extension's
                          raw HTML with markdownify. If False, or if markdownify is
                          unavailable or fails, uses the extension's own markdown converter.
        format: Keyword-only alias for output_format, kept so existing calls such as
                read(browser, format="markdown") (as used in the README and
                examples/read_markdown.py) keep working. Takes precedence when given.

    Returns:
        dict with:
            - status: "success" or "error"
            - url: Current page URL
            - format: "raw", "text", or "markdown"
            - content: Page content as string
            - length: Content length in characters
            - error: Error message if status is "error"

    Raises:
        RuntimeError: If the browser has no page (browser not started).

    Examples:
        # Get raw HTML (default) - can be used with markdownify for better conversion
        result = read(browser)
        html_content = result["content"]

        # Get markdown (converted with markdownify when it is installed)
        result = read(browser, output_format="markdown")
        markdown = result["content"]

        # Get plain text
        result = read(browser, output_format="text")
        text = result["content"]
    """
    if not browser.page:
        raise RuntimeError("Browser not started. Call browser.start() first.")

    # Honour the legacy keyword; the parameter shadows the builtin only to
    # stay compatible with the previous public signature.
    if format is not None:
        output_format = format

    if output_format == "markdown" and enhance_markdown:
        enhanced = _markdown_via_markdownify(browser.page)
        if enhanced is not None:
            return enhanced

    # Not enhanced markdown, or the enhanced path fell back: let the
    # extension produce the requested format directly.
    return _extension_read(browser.page, output_format)