88
99def read (
1010 browser : SentienceBrowser ,
11- format : Literal ["raw" , "text" , "markdown" ] = "raw" , # noqa: A002
11+ output_format : Literal ["raw" , "text" , "markdown" ] = "raw" ,
12+ enhance_markdown : bool = True ,
1213) -> dict :
1314 """
1415 Read page content as raw HTML, text, or markdown
1516
1617 Args:
1718 browser: SentienceBrowser instance
18- format: Output format - "raw" (default, returns HTML for Turndown/markdownify),
19- "text" (plain text), or "markdown" (high-quality markdown via markdownify)
19+ output_format: Output format - "raw" (default, returns HTML for external processing),
20+ "text" (plain text), or "markdown" (lightweight or enhanced markdown).
21+ enhance_markdown: If True and output_format is "markdown", uses markdownify for better conversion.
22+ If False, uses the extension's lightweight markdown converter.
2023
2124 Returns:
2225 dict with:
@@ -33,20 +36,19 @@ def read(
3336 html_content = result["content"]
3437
3538 # Get high-quality markdown (uses markdownify internally)
36- result = read(browser, format ="markdown")
39+ result = read(browser, output_format ="markdown")
3740 markdown = result["content"]
3841
3942 # Get plain text
40- result = read(browser, format ="text")
43+ result = read(browser, output_format ="text")
4144 text = result["content"]
4245 """
4346 if not browser .page :
4447 raise RuntimeError ("Browser not started. Call browser.start() first." )
4548
46- # For markdown format, get raw HTML first, then convert with markdownify
47- if format == "markdown" :
48- # Get raw HTML from extension
49- raw_result = browser .page .evaluate (
49+ if output_format == "markdown" and enhance_markdown :
50+ # Get raw HTML from the extension first
51+ raw_html_result = browser .page .evaluate (
5052 """
5153 (options) => {
5254 return window.sentience.read(options);
@@ -55,57 +57,34 @@ def read(
5557 {"format" : "raw" },
5658 )
5759
58- if raw_result .get ("status" ) != "success" :
59- return raw_result
60-
61- # Convert to markdown using markdownify
62- try :
63- from markdownify import markdownify as md
64- html_content = raw_result ["content" ]
65- markdown_content = md (
66- html_content ,
67- heading_style = "ATX" , # Use # for headings
68- bullets = "-" , # Use - for lists
69- strip = ['script' , 'style' , 'nav' , 'footer' , 'header' , 'noscript' ], # Strip unwanted tags
70- )
71-
72- # Return result with markdown content
73- return {
74- "status" : "success" ,
75- "url" : raw_result ["url" ],
76- "format" : "markdown" ,
77- "content" : markdown_content ,
78- "length" : len (markdown_content ),
79- }
80- except ImportError :
81- # Fallback to extension's lightweight markdown if markdownify not installed
82- result = browser .page .evaluate (
83- """
84- (options) => {
85- return window.sentience.read(options);
60+ if raw_html_result .get ("status" ) == "success" :
61+ html_content = raw_html_result ["content" ]
62+ try :
63+ # Use markdownify for enhanced markdown conversion
64+ from markdownify import markdownify , MarkdownifyError
65+ markdown_content = markdownify (html_content , heading_style = "ATX" , wrap = True )
66+ return {
67+ "status" : "success" ,
68+ "url" : raw_html_result ["url" ],
69+ "format" : "markdown" ,
70+ "content" : markdown_content ,
71+ "length" : len (markdown_content ),
8672 }
87- """ ,
88- {"format" : "markdown" },
89- )
90- return result
91- except (ValueError , TypeError , AttributeError ) as e :
92- # If conversion fails, return error
93- return {
94- "status" : "error" ,
95- "url" : raw_result .get ("url" , "" ),
96- "format" : "markdown" ,
97- "content" : "" ,
98- "length" : 0 ,
99- "error" : f"Markdown conversion failed: { e } " ,
100- }
101- else :
102- # For "raw" or "text", call extension directly
103- result = browser .page .evaluate (
104- """
105- (options) => {
106- return window.sentience.read(options);
107- }
108- """ ,
109- {"format" : format },
110- )
111- return result
73+ except ImportError :
74+ print ("Warning: 'markdownify' not installed. Install with 'pip install markdownify' for enhanced markdown. Falling back to extension's markdown." )
75+ except MarkdownifyError as e :
76+ print (f"Warning: markdownify failed ({ e } ), falling back to extension's markdown." )
77+ except Exception as e :
78+ print (f"Warning: An unexpected error occurred with markdownify ({ e } ), falling back to extension's markdown." )
79+
80+ # If not enhanced markdown, or fallback, call extension with requested format
81+ result = browser .page .evaluate (
82+ """
83+ (options) => {
84+ return window.sentience.read(options);
85+ }
86+ """ ,
87+ {"format" : output_format },
88+ )
89+
90+ return result
0 commit comments