diff --git a/README.md b/README.md
index 771798c0b..7535e6bd5 100644
--- a/README.md
+++ b/README.md
@@ -1093,6 +1093,7 @@ Our enterprise sponsors and technology partners help scale Crawl4AI to power pro
 
 | Company | About | Sponsorship Tier |
 |------|------|----------------------------|
+| Thor Data | Leveraging Thordata ensures seamless compatibility with any AI/ML workflows and data infrastructure, massively accessing web data with 99.9% uptime, backed by one-on-one customer support. | 🥈 Silver |
 | nstproxy | NstProxy is a trusted proxy provider with over 110M+ real residential IPs, city-level targeting, 99.99% uptime, and low pricing at $0.1/GB, it delivers unmatched stability, scale, and cost-efficiency. | 🥈 Silver |
 | Scrapeless | Scrapeless provides production-grade infrastructure for Crawling, Automation, and AI Agents, offering Scraping Browser, 4 Proxy Types and Universal Scraping API. | 🥈 Silver |
 | Capsolver | AI-powered Captcha solving service. Supports all major Captcha types, including reCAPTCHA, Cloudflare, and more | 🥉 Bronze |
diff --git a/crawl4ai/html2text/__init__.py b/crawl4ai/html2text/__init__.py
index ca15b4534..838b0dd22 100644
--- a/crawl4ai/html2text/__init__.py
+++ b/crawl4ai/html2text/__init__.py
@@ -312,6 +312,11 @@ def handle_tag(
     ) -> None:
         self.current_tag = tag
 
+        if tag == "base" and start:
+            href = attrs.get("href")
+            if href:
+                self.baseurl = urlparse.urljoin(self.baseurl, href)
+
         if self.tag_callback is not None:
             if self.tag_callback(self, tag, attrs, start) is True:
                 return
diff --git a/tests/test_base_tag_local.py b/tests/test_base_tag_local.py
new file mode 100644
index 000000000..159dbca15
--- /dev/null
+++ b/tests/test_base_tag_local.py
@@ -0,0 +1,39 @@
+import unittest
+from crawl4ai.html2text import HTML2Text
+
+class TestBaseTag(unittest.TestCase):
+    def test_base_tag_handling(self):
+        html_content = """
+        <html>
+            <head>
+                <base href="https://example.com/subdir/">
+            </head>
+            <body>
+                <a href="page.html">Link</a>
+            </body>
+        </html>
+        """
+
+        # Initialize parser with a different base (or empty)
+        parser = HTML2Text(baseurl="https://override.com/")
+
+        # Feed content
+        markdown = parser.handle(html_content)
+
+        print(f"Markdown Output: {markdown}")
+
+        # Expected: The link should be resolved against the <base> tag
+        expected_url = "https://example.com/subdir/page.html"
+
+        # Current behavior (bug): It resolves against init baseurl ("https://override.com/page.html")
+        # OR if baseurl is empty, it stays relative "page.html"
+
+        if expected_url in markdown:
+            print("SUCCESS: Base tag respected.")
+        else:
+            print(f"FAILURE: Base tag ignored. Expected {expected_url} in output.")
+
+        self.assertIn(expected_url, markdown)
+
+if __name__ == "__main__":
+    unittest.main()