Skip to content

Commit e86ee9e

Browse files
committed
handle markdown read
1 parent e5c3e14 commit e86ee9e

File tree

4 files changed

+189
-0
lines changed

4 files changed

+189
-0
lines changed

README.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,13 +65,36 @@ with SentienceBrowser(headless=False) as browser:
6565
- `.to_have_text(text)`
6666
- `.to_have_count(n)`
6767

68+
### Content Reading
69+
- `read(browser, format="raw|text|markdown")` - Read page content
70+
- **Default format: `"raw"`** - Returns HTML suitable for Turndown/markdownify
71+
- `format="raw"` - Get cleaned HTML
72+
- `format="markdown"` - Get high-quality markdown (converted with markdownify when it is installed; otherwise falls back to the extension's built-in lightweight markdown)
73+
- `format="text"` - Get plain text
74+
75+
**Examples:**
76+
```python
77+
from sentience import read
78+
79+
# Get raw HTML (default)
80+
result = read(browser)
81+
html = result["content"]
82+
83+
# Get high-quality markdown (uses markdownify automatically)
84+
result = read(browser, format="markdown")
85+
markdown = result["content"]
86+
```
87+
88+
See `examples/read_markdown.py` for complete examples.
89+
6890
## Examples
6991

7092
See `examples/` directory:
7193
- `hello.py` - Extension bridge verification
7294
- `basic_agent.py` - Basic snapshot
7395
- `query_demo.py` - Query engine
7496
- `wait_and_click.py` - Wait and actions
97+
- `read_markdown.py` - Reading page content and converting to markdown
7598

7699
## Testing
77100

examples/read_markdown.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
"""
2+
Example: Reading page content and converting to markdown
3+
4+
This example shows how to use the read() function to get page content
5+
and convert it to high-quality markdown using markdownify.
6+
"""
7+
8+
from sentience import SentienceBrowser, read
9+
from markdownify import markdownify
10+
11+
12+
def main():
    """Show the three ways of reading page content with ``read()``.

    Opens https://example.com in a headless SentienceBrowser and prints a
    500-character preview of the page as (1) raw HTML converted locally with
    markdownify, (2) markdown requested directly from ``read()``, and
    (3) plain text.
    """
    # Conversion options used for the local markdownify call in Method 1
    markdownify_options = {
        "heading_style": "ATX",  # Use # for headings
        "bullets": "-",  # Use - for lists
        "strip": ['script', 'style', 'nav', 'footer', 'header'],  # Strip unwanted tags
    }

    with SentienceBrowser(headless=True) as browser:
        # Load the page and wait until the network has gone quiet
        browser.page.goto("https://example.com")
        browser.page.wait_for_load_state("networkidle")

        # Method 1: ask for raw HTML (the default format), convert it ourselves
        print("=== Method 1: Raw HTML + markdownify (Recommended) ===")
        raw_html = read(browser)["content"]
        converted = markdownify(raw_html, **markdownify_options)
        print(f"Markdown length: {len(converted)} characters")
        print(converted[:500])  # Preview only
        print("\n")

        # Method 2: let read() hand back markdown directly
        print("=== Method 2: Direct markdown (High-quality via markdownify) ===")
        direct_markdown = read(browser, format="markdown")["content"]
        print(f"Markdown length: {len(direct_markdown)} characters")
        print(direct_markdown[:500])  # Preview only
        print("\n")

        # Method 3: plain text
        print("=== Method 3: Plain text ===")
        plain_text = read(browser, format="text")["content"]
        print(f"Text length: {len(plain_text)} characters")
        print(plain_text[:500])  # Preview only
49+
50+
51+
if __name__ == "__main__":
52+
main()
53+

sentience/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from .inspector import Inspector, inspect
1313
from .recorder import Recorder, Trace, TraceStep, record
1414
from .generator import ScriptGenerator, generate
15+
from .read import read
1516

1617
__version__ = "0.1.0"
1718

@@ -39,5 +40,6 @@
3940
"record",
4041
"ScriptGenerator",
4142
"generate",
43+
"read",
4244
]
4345

sentience/read.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
"""
2+
Read page content - supports raw HTML, text, and markdown formats
3+
"""
4+
5+
from typing import Literal
6+
from .browser import SentienceBrowser
7+
8+
9+
def read(
    browser: SentienceBrowser,
    format: Literal["raw", "text", "markdown"] = "raw",  # noqa: A002
) -> dict:
    """
    Read page content as raw HTML, text, or markdown

    Args:
        browser: SentienceBrowser instance
        format: Output format - "raw" (default, returns HTML for Turndown/markdownify),
                "text" (plain text), or "markdown" (raw HTML converted with
                markdownify; if markdownify is not installed, the extension's
                own lightweight markdown output is returned instead)

    Returns:
        dict with:
        - status: "success" or "error"
        - url: Current page URL
        - format: "raw", "text", or "markdown"
        - content: Page content as string
        - length: Content length in characters
        - error: Error message if status is "error"

    Raises:
        RuntimeError: If the browser has no page (browser not started)

    Examples:
        # Get raw HTML (default) - can be used with markdownify for better conversion
        result = read(browser)
        html_content = result["content"]

        # Get high-quality markdown (uses markdownify when available)
        result = read(browser, format="markdown")
        markdown = result["content"]

        # Get plain text
        result = read(browser, format="text")
        text = result["content"]
    """
    if not browser.page:
        raise RuntimeError("Browser not started. Call browser.start() first.")

    # "raw" and "text" are produced entirely by the extension
    if format != "markdown":
        return _extension_read(browser, format)

    # Check for markdownify up front so the raw HTML is not fetched only to be
    # thrown away when we have to fall back to the extension's markdown.
    try:
        from markdownify import markdownify as md
    except ImportError:
        # Fallback to extension's lightweight markdown if markdownify not installed
        return _extension_read(browser, "markdown")

    # Get raw HTML from extension, then convert it locally
    raw_result = _extension_read(browser, "raw")
    if raw_result.get("status") != "success":
        return raw_result

    page_url = raw_result.get("url", "")
    try:
        # NOTE(review): markdownify's `strip` option appears to leave the listed
        # tags unconverted while keeping their inner text, rather than dropping
        # the elements - confirm this is the intended handling for nav/footer/etc.
        markdown_content = md(
            raw_result["content"],
            heading_style="ATX",  # Use # for headings
            bullets="-",  # Use - for lists
            strip=['script', 'style', 'nav', 'footer', 'header', 'noscript'],  # Strip unwanted tags
        )
    except (KeyError, ValueError, TypeError, AttributeError) as e:
        # A missing "content" key or a conversion failure is reported as an
        # error result instead of escaping as an exception
        return {
            "status": "error",
            "url": page_url,
            "format": "markdown",
            "content": "",
            "length": 0,
            "error": f"Markdown conversion failed: {e}",
        }

    return {
        "status": "success",
        "url": page_url,
        "format": "markdown",
        "content": markdown_content,
        "length": len(markdown_content),
    }


def _extension_read(browser: SentienceBrowser, fmt: str) -> dict:
    """Call ``window.sentience.read`` in the page with the given format and return its result."""
    return browser.page.evaluate(
        """
        (options) => {
            return window.sentience.read(options);
        }
        """,
        {"format": fmt},
    )

0 commit comments

Comments
 (0)