|
| 1 | +""" |
| 2 | +Read page content - supports raw HTML, text, and markdown formats |
| 3 | +""" |
| 4 | + |
| 5 | +from typing import Literal |
| 6 | +from .browser import SentienceBrowser |
| 7 | + |
| 8 | + |
# JavaScript shim evaluated inside the page: forwards the options object to the
# extension's `window.sentience.read` and hands its return value back to Python.
_READ_JS = """
    (options) => {
        return window.sentience.read(options);
    }
"""

# Tags handed to markdownify's `strip` option.
# NOTE(review): per the markdownify docs, `strip` drops the tag markup but keeps
# the inner text, so script/style text may still appear in the output -- confirm
# this is the intended behavior.
_MARKDOWN_STRIP_TAGS = ["script", "style", "nav", "footer", "header", "noscript"]


def _extension_read(browser, fmt):
    """Call the in-page extension reader with ``{"format": fmt}`` and return its result."""
    return browser.page.evaluate(_READ_JS, {"format": fmt})


def _markdown_error(url, message):
    """Build the error payload returned when markdown conversion cannot complete."""
    return {
        "status": "error",
        "url": url,
        "format": "markdown",
        "content": "",
        "length": 0,
        "error": f"Markdown conversion failed: {message}",
    }


def read(
    browser: SentienceBrowser,
    format: Literal["raw", "text", "markdown"] = "raw",  # noqa: A002
) -> dict:
    """
    Read page content as raw HTML, text, or markdown

    Args:
        browser: SentienceBrowser instance
        format: Output format - "raw" (default, returns HTML for Turndown/markdownify),
            "text" (plain text), or "markdown" (high-quality markdown via markdownify)

    Returns:
        dict with:
        - status: "success" or "error"
        - url: Current page URL
        - format: "raw", "text", or "markdown"
        - content: Page content as string
        - length: Content length in characters
        - error: Error message if status is "error"

        For "raw" and "text" the extension's result is returned unchanged.
        For "markdown" the raw HTML is fetched and converted with markdownify;
        if markdownify is not installed, the extension's own markdown result
        is returned instead. If the raw read does not report success, the
        extension's result for that read is returned unchanged.

    Raises:
        RuntimeError: If ``browser.page`` is not set (browser not started).

    Examples:
        # Get raw HTML (default) - can be used with markdownify for better conversion
        result = read(browser)
        html_content = result["content"]

        # Get high-quality markdown (uses markdownify internally)
        result = read(browser, format="markdown")
        markdown = result["content"]

        # Get plain text
        result = read(browser, format="text")
        text = result["content"]
    """
    if not browser.page:
        raise RuntimeError("Browser not started. Call browser.start() first.")

    # "raw" and "text" are served by the extension directly.
    if format != "markdown":
        return _extension_read(browser, format)

    # Resolve the optional dependency before touching the page, so a missing
    # markdownify costs one page call (the fallback) rather than two.
    try:
        from markdownify import markdownify as md
    except ImportError:
        return _extension_read(browser, "markdown")

    raw_result = _extension_read(browser, "raw")
    if not isinstance(raw_result, dict):
        # Without a dict there is no status/url to inspect; report instead of
        # failing with AttributeError on `.get`.
        return _markdown_error("", f"unexpected extension result: {raw_result!r}")
    if raw_result.get("status") != "success":
        return raw_result

    url = raw_result.get("url", "")
    try:
        markdown_content = md(
            raw_result["content"],
            heading_style="ATX",  # Use # for headings
            bullets="-",  # Use - for lists
            strip=_MARKDOWN_STRIP_TAGS,
        )
    except (KeyError, ValueError, TypeError, AttributeError) as e:
        # KeyError covers a "success" payload that lacks "content".
        return _markdown_error(url, e)

    return {
        "status": "success",
        "url": url,
        "format": "markdown",
        "content": markdown_content,
        "length": len(markdown_content),
    }
0 commit comments