handle markdown read

rcholic · rcholic · commit e86ee9e31d2d · 2025-12-21T18:16:22.000-08:00
diff --git a/README.md b/README.md
@@ -65,13 +65,36 @@ with SentienceBrowser(headless=False) as browser:
   - `.to_have_text(text)`
   - `.to_have_count(n)`
 
+### Content Reading
+- `read(browser, format="raw|text|markdown")` - Read page content
+  - **Default format: `"raw"`** - Returns HTML suitable for Turndown/markdownify
+  - `format="raw"` - Get cleaned HTML
+  - `format="markdown"` - Get high-quality markdown (uses markdownify; falls back to the extension's built-in lightweight converter if markdownify is not installed)
+  - `format="text"` - Get plain text
+  
+  **Examples:**
+  ```python
+  from sentience import read
+  
+  # Get raw HTML (default)
+  result = read(browser)
+  html = result["content"]
+  
+  # Get high-quality markdown (uses markdownify automatically)
+  result = read(browser, format="markdown")
+  markdown = result["content"]
+  ```
+  
+  See `examples/read_markdown.py` for complete examples.
+
 ## Examples
 
 See `examples/` directory:
 - `hello.py` - Extension bridge verification
 - `basic_agent.py` - Basic snapshot
 - `query_demo.py` - Query engine
 - `wait_and_click.py` - Wait and actions
+- `read_markdown.py` - Reading page content and converting to markdown
 
 ## Testing
 
diff --git a/examples/read_markdown.py b/examples/read_markdown.py
@@ -0,0 +1,53 @@
+"""
+Example: Reading page content and converting to markdown
+
+This example shows how to use the read() function to get page content
+and convert it to high-quality markdown using markdownify.
+"""
+
+from sentience import SentienceBrowser, read
+from markdownify import markdownify
+
+
+def main():
+    # Initialize browser
+    with SentienceBrowser(headless=True) as browser:
+        # Navigate to a page
+        browser.page.goto("https://example.com")
+        browser.page.wait_for_load_state("networkidle")
+        
+        # Method 1: Get raw HTML (default) and convert with markdownify
+        print("=== Method 1: Raw HTML + markdownify (Recommended) ===")
+        result = read(browser)  # format="raw" is default
+        html_content = result["content"]
+        
+        # Convert to markdown yourself when you need control over markdownify's options
+        markdown = markdownify(
+            html_content,
+            heading_style="ATX",  # Use # for headings
+            bullets="-",  # Use - for lists
+            strip=['script', 'style', 'nav', 'footer', 'header'],  # Strip unwanted tags
+        )
+        print(f"Markdown length: {len(markdown)} characters")
+        print(markdown[:500])  # Print first 500 chars
+        print("\n")
+        
+        # Method 2: Get high-quality markdown directly (uses markdownify internally)
+        print("=== Method 2: Direct markdown (High-quality via markdownify) ===")
+        result = read(browser, format="markdown")
+        high_quality_markdown = result["content"]
+        print(f"Markdown length: {len(high_quality_markdown)} characters")
+        print(high_quality_markdown[:500])  # Print first 500 chars
+        print("\n")
+        
+        # Method 3: Get plain text
+        print("=== Method 3: Plain text ===")
+        result = read(browser, format="text")
+        text_content = result["content"]
+        print(f"Text length: {len(text_content)} characters")
+        print(text_content[:500])  # Print first 500 chars
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/sentience/__init__.py b/sentience/__init__.py
@@ -12,6 +12,7 @@
 from .inspector import Inspector, inspect
 from .recorder import Recorder, Trace, TraceStep, record
 from .generator import ScriptGenerator, generate
+from .read import read
 
 __version__ = "0.1.0"
 
@@ -39,5 +40,6 @@
     "record",
     "ScriptGenerator",
     "generate",
+    "read",
 ]
 
diff --git a/sentience/read.py b/sentience/read.py
@@ -0,0 +1,111 @@
+"""
+Read page content - supports raw HTML, text, and markdown formats
+"""
+
+from typing import Literal
+from .browser import SentienceBrowser
+
+
+def read(
+    browser: SentienceBrowser,
+    format: Literal["raw", "text", "markdown"] = "raw",  # noqa: A002
+) -> dict:
+    """
+    Read page content as raw HTML, text, or markdown
+    
+    Args:
+        browser: SentienceBrowser instance
+        format: Output format - "raw" (default, returns HTML for Turndown/markdownify),
+                "text" (plain text), or "markdown" (via markdownify; falls back to the extension's lightweight converter if markdownify is not installed)
+    
+    Returns:
+        dict with:
+            - status: "success" or "error"
+            - url: Current page URL
+            - format: "raw", "text", or "markdown"
+            - content: Page content as string
+            - length: Content length in characters
+            - error: Error message if status is "error"
+    
+    Examples:
+        # Get raw HTML (default) - can be used with markdownify for better conversion
+        result = read(browser)
+        html_content = result["content"]
+        
+        # Get high-quality markdown (uses markdownify internally)
+        result = read(browser, format="markdown")
+        markdown = result["content"]
+        
+        # Get plain text
+        result = read(browser, format="text")
+        text = result["content"]
+    """
+    if not browser.page:
+        raise RuntimeError("Browser not started. Call browser.start() first.")
+    
+    # For markdown format, get raw HTML first, then convert with markdownify
+    if format == "markdown":
+        # Get raw HTML from extension
+        raw_result = browser.page.evaluate(
+            """
+            (options) => {
+                return window.sentience.read(options);
+            }
+            """,
+            {"format": "raw"},
+        )
+        
+        if raw_result.get("status") != "success":
+            return raw_result
+        
+        # Convert to markdown using markdownify
+        try:
+            from markdownify import markdownify as md
+            html_content = raw_result["content"]
+            markdown_content = md(
+                html_content,
+                heading_style="ATX",  # Use # for headings
+                bullets="-",  # Use - for lists
+                strip=['script', 'style', 'nav', 'footer', 'header', 'noscript'],  # Strip unwanted tags
+            )
+            
+            # Return result with markdown content
+            return {
+                "status": "success",
+                "url": raw_result["url"],
+                "format": "markdown",
+                "content": markdown_content,
+                "length": len(markdown_content),
+            }
+        except ImportError:
+            # Fallback to extension's lightweight markdown if markdownify not installed
+            result = browser.page.evaluate(
+                """
+                (options) => {
+                    return window.sentience.read(options);
+                }
+                """,
+                {"format": "markdown"},
+            )
+            return result
+        except (ValueError, TypeError, AttributeError) as e:
+            # If conversion fails, return error
+            return {
+                "status": "error",
+                "url": raw_result.get("url", ""),
+                "format": "markdown",
+                "content": "",
+                "length": 0,
+                "error": f"Markdown conversion failed: {e}",
+            }
+    else:
+        # For "raw" or "text", call extension directly
+        result = browser.page.evaluate(
+            """
+            (options) => {
+                return window.sentience.read(options);
+            }
+            """,
+            {"format": format},
+        )
+        return result