Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 58 additions & 41 deletions .github/workflows/sync-extension.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,36 +63,25 @@ jobs:
mkdir -p extension-temp
cd extension-temp

# First, try to download the zip archive if available
ZIP_URL=$(curl -s -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \
# Download individual files from release (reliable method - no zip)
echo "📁 Downloading individual files from release..."
curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \
"https://api.github.com/repos/$REPO/releases/tags/$TAG" | \
jq -r '.assets[] | select(.name == "extension-package.zip") | .browser_download_url')

if [ -n "$ZIP_URL" ] && [ "$ZIP_URL" != "null" ]; then
echo "📦 Downloading extension-package.zip..."
curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" "$ZIP_URL" -o extension-package.zip
unzip -q extension-package.zip -d .
# Files should now be in extension-temp/extension-package/ or extension-temp/
if [ -d "extension-package" ]; then
mv extension-package/* . 2>/dev/null || true
rmdir extension-package 2>/dev/null || true
fi
else
echo "📁 Downloading individual files from release..."
# Download each file from release
curl -s -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \
"https://api.github.com/repos/$REPO/releases/tags/$TAG" | \
jq -r '.assets[] | select(.name | endswith(".js") or endswith(".wasm") or endswith(".json") or endswith(".d.ts")) | .browser_download_url' | \
while read url; do
if [ -n "$url" ] && [ "$url" != "null" ]; then
filename=$(basename "$url")
echo " Downloading $filename..."
curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" "$url" -o "$filename"
jq -r '.assets[] | select(.name | endswith(".js") or endswith(".wasm") or endswith(".json") or endswith(".d.ts")) | "\(.browser_download_url)|\(.name)"' | \
while IFS='|' read -r url name; do
if [ -n "$url" ] && [ "$url" != "null" ] && [ -n "$name" ]; then
# Preserve directory structure from asset name
# If name contains '/', create directories
dir=$(dirname "$name")
if [ "$dir" != "." ]; then
mkdir -p "$dir"
fi
done
fi
echo " Downloading $name..."
curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" "$url" -o "$name"
fi
done

# Verify files were downloaded
# Verify downloaded files
echo "📋 Downloaded files:"
ls -la

Expand All @@ -102,34 +91,62 @@ jobs:
# Create extension directory structure
mkdir -p sentience/extension/pkg

# Copy extension files (check both root and pkg subdirectory)
cp extension-temp/manifest.json sentience/extension/ 2>/dev/null || echo "⚠️ manifest.json not found in release"
cp extension-temp/content.js sentience/extension/ 2>/dev/null || echo "⚠️ content.js not found in release"
cp extension-temp/background.js sentience/extension/ 2>/dev/null || echo "⚠️ background.js not found in release"
cp extension-temp/injected_api.js sentience/extension/ 2>/dev/null || echo "⚠️ injected_api.js not found in release"
# Copy extension files out of extension-temp into sentience/extension.
# Release assets may land in the root of extension-temp, in pkg/, or under an
# extension-package/ subdirectory, so each file is looked up in several
# candidate directories, in priority order.

# copy_first_found NAME DEST DIR...
# Copies NAME from the first DIR that holds it as a regular file into DEST.
# Prints a warning when no DIR has it. Always returns 0 on a miss so that a
# missing optional file never aborts a step running under `bash -e`.
copy_first_found() {
  local name=$1 dest=$2 dir
  shift 2
  for dir in "$@"; do
    if [ -f "$dir/$name" ]; then
      cp -- "$dir/$name" "$dest/"
      return 0
    fi
  done
  echo "⚠️ $name not found"
}

# copy_type_definitions DEST DIR...
# Copies every *.d.ts from the first DIR that actually contains one into DEST.
# Checking for matching files (not just for the directory) keeps later
# candidates reachable: extension-temp itself always exists at this point, so
# a plain directory test on it would shadow every candidate listed after it.
copy_type_definitions() {
  local dest=$1 dir file found
  shift
  for dir in "$@"; do
    found=0
    for file in "$dir"/*.d.ts; do
      # An unmatched glob stays literal; skip it instead of failing in cp.
      [ -f "$file" ] || continue
      cp -- "$file" "$dest/"
      found=1
    done
    if [ "$found" -eq 1 ]; then
      return 0
    fi
  done
  echo "⚠️ Type definitions not found"
}

# Extension files: root first, then extension-package/ subdirectory
for f in manifest.json content.js background.js injected_api.js; do
  copy_first_found "$f" sentience/extension \
    extension-temp \
    extension-temp/extension-package
done

# WASM files: pkg/ first, then root, then extension-package/pkg/
for f in sentience_core.js sentience_core_bg.wasm; do
  copy_first_found "$f" sentience/extension/pkg \
    extension-temp/pkg \
    extension-temp \
    extension-temp/extension-package/pkg
done

# TypeScript definitions: same lookup order as the WASM files
copy_type_definitions sentience/extension/pkg \
  extension-temp/pkg \
  extension-temp \
  extension-temp/extension-package/pkg

# Verify copied files
Expand All @@ -156,9 +173,9 @@ jobs:
if: steps.release.outputs.skip != 'true' && steps.changes.outputs.changed == 'true'
uses: peter-evans/create-pull-request@v5
with:
# Use GITHUB_TOKEN (built-in) if repository allows PR creation, otherwise use PR_TOKEN (PAT)
# Use PR_TOKEN (a PAT, needed for repos with org restrictions); the token expression below has no explicit fallback to GITHUB_TOKEN
# To use PAT: create secret named PR_TOKEN with a Personal Access Token that has 'repo' scope
token: ${{ secrets.PR_TOKEN || secrets.GITHUB_TOKEN }}
token: ${{ secrets.PR_TOKEN }}
commit-message: "chore: sync extension files from sentience-chrome ${{ steps.release.outputs.tag }}"
title: "Sync Extension: ${{ steps.release.outputs.tag }}"
body: |
Expand Down
23 changes: 23 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,13 +73,36 @@ with SentienceBrowser(headless=False) as browser:
- `.to_have_text(text)`
- `.to_have_count(n)`

### Content Reading
- `read(browser, output_format="raw|text|markdown")` - Read page content
- **Default format: `"raw"`** - Returns HTML suitable for Turndown/markdownify
- `output_format="raw"` - Get cleaned HTML
- `output_format="markdown"` - Get high-quality markdown (uses markdownify internally)
- `output_format="text"` - Get plain text

**Examples:**
```python
from sentience import read

# Get raw HTML (default)
result = read(browser)
html = result["content"]

# Get high-quality markdown (uses markdownify automatically)
result = read(browser, output_format="markdown")
markdown = result["content"]
```

See `examples/read_markdown.py` for complete examples.

## Examples

See `examples/` directory:
- `hello.py` - Extension bridge verification
- `basic_agent.py` - Basic snapshot
- `query_demo.py` - Query engine
- `wait_and_click.py` - Wait and actions
- `read_markdown.py` - Reading page content and converting to markdown

### Content Reading Example

Expand Down
53 changes: 53 additions & 0 deletions examples/read_markdown.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
"""
Example: Reading page content and converting to markdown

This example shows how to use the read() function to get page content
and convert it to high-quality markdown using markdownify.
"""

from sentience import SentienceBrowser, read
from markdownify import markdownify


def main():
    """Demonstrate the three output formats of read() on https://example.com.

    Prints, for each method, the content length and the first 500 characters:
      1. raw HTML (the default) converted locally with markdownify,
      2. markdown requested directly from read(),
      3. plain text.
    """
    # Initialize browser
    with SentienceBrowser(headless=True) as browser:
        # Navigate to a page
        browser.page.goto("https://example.com")
        browser.page.wait_for_load_state("networkidle")

        # Method 1: Get raw HTML (default) and convert with markdownify
        print("=== Method 1: Raw HTML + markdownify (Recommended) ===")
        result = read(browser)  # output_format="raw" is default
        html_content = result["content"]

        # Convert to markdown using markdownify (better quality)
        markdown = markdownify(
            html_content,
            heading_style="ATX",  # Use # for headings
            bullets="-",  # Use - for lists
            strip=['script', 'style', 'nav', 'footer', 'header'],  # Strip unwanted tags
        )
        print(f"Markdown length: {len(markdown)} characters")
        print(markdown[:500])  # Print first 500 chars
        print("\n")

        # Method 2: Get high-quality markdown directly (uses markdownify internally)
        # The keyword is output_format, matching read()'s signature.
        print("=== Method 2: Direct markdown (High-quality via markdownify) ===")
        result = read(browser, output_format="markdown")
        high_quality_markdown = result["content"]
        print(f"Markdown length: {len(high_quality_markdown)} characters")
        print(high_quality_markdown[:500])  # Print first 500 chars
        print("\n")

        # Method 3: Get plain text
        print("=== Method 3: Plain text ===")
        result = read(browser, output_format="text")
        text_content = result["content"]
        print(f"Text length: {len(text_content)} characters")
        print(text_content[:500])  # Print first 500 chars


if __name__ == "__main__":
main()

90 changes: 55 additions & 35 deletions sentience/read.py
Original file line number Diff line number Diff line change
@@ -1,70 +1,90 @@
"""
Read page content - enhanced markdown conversion
Read page content - supports raw HTML, text, and markdown formats
"""

from typing import Optional, Literal
from typing import Literal
from .browser import SentienceBrowser


def read(
browser: SentienceBrowser,
format: Literal["text", "markdown"] = "text",
output_format: Literal["raw", "text", "markdown"] = "raw",
enhance_markdown: bool = True,
) -> dict:
"""
Read page content as text or markdown
Read page content as raw HTML, text, or markdown

Args:
browser: SentienceBrowser instance
format: Output format - "text" or "markdown"
enhance_markdown: If True and format="markdown", use markdownify for better conversion
output_format: Output format - "raw" (default, returns HTML for external processing),
"text" (plain text), or "markdown" (lightweight or enhanced markdown).
enhance_markdown: If True and output_format is "markdown", uses markdownify for better conversion.
If False, uses the extension's lightweight markdown converter.

Returns:
dict with:
- status: "success" or "error"
- url: Current page URL
- format: "text" or "markdown"
- format: "raw", "text", or "markdown"
- content: Page content as string
- length: Content length in characters
- error: Error message if status is "error"

Examples:
# Get raw HTML (default) - can be used with markdownify for better conversion
result = read(browser)
html_content = result["content"]

# Get high-quality markdown (uses markdownify internally)
result = read(browser, output_format="markdown")
markdown = result["content"]

# Get plain text
result = read(browser, output_format="text")
text = result["content"]
"""
if not browser.page:
raise RuntimeError("Browser not started. Call browser.start() first.")

# Get basic content from extension
if output_format == "markdown" and enhance_markdown:
# Get raw HTML from the extension first
raw_html_result = browser.page.evaluate(
"""
(options) => {
return window.sentience.read(options);
}
""",
{"format": "raw"},
)

if raw_html_result.get("status") == "success":
html_content = raw_html_result["content"]
try:
# Use markdownify for enhanced markdown conversion
from markdownify import markdownify, MarkdownifyError
markdown_content = markdownify(html_content, heading_style="ATX", wrap=True)
return {
"status": "success",
"url": raw_html_result["url"],
"format": "markdown",
"content": markdown_content,
"length": len(markdown_content),
}
except ImportError:
print("Warning: 'markdownify' not installed. Install with 'pip install markdownify' for enhanced markdown. Falling back to extension's markdown.")
except MarkdownifyError as e:
print(f"Warning: markdownify failed ({e}), falling back to extension's markdown.")
except Exception as e:
print(f"Warning: An unexpected error occurred with markdownify ({e}), falling back to extension's markdown.")

# If not enhanced markdown, or fallback, call extension with requested format
result = browser.page.evaluate(
"""
(options) => {
return window.sentience.read(options);
}
""",
{"format": format},
{"format": output_format},
)

# Enhance markdown if requested and format is markdown
if format == "markdown" and enhance_markdown and result.get("status") == "success":
try:
# Get full HTML from page
html_content = browser.page.evaluate("() => document.documentElement.outerHTML")

# Use markdownify for better conversion
from markdownify import markdownify as md
enhanced_markdown = md(
html_content,
heading_style="ATX", # Use # for headings
bullets="-", # Use - for lists
strip=['script', 'style', 'nav', 'footer', 'header', 'noscript'], # Strip unwanted tags
)
result["content"] = enhanced_markdown
result["length"] = len(enhanced_markdown)
except ImportError:
# Fallback to extension's lightweight conversion if markdownify not installed
# This shouldn't happen if dependencies are installed, but handle gracefully
pass
except Exception as e:
# If enhancement fails, use extension's result
# Don't overwrite result["error"] - keep extension's result
pass

return result

Loading