conda-forge · msarahan · Sep 10, 2025
@@ -4,7 +4,7 @@
 {% set ver2 = '.'.join(version.split('.')[0:2]) %}
 {% set ver2nd = ''.join(version.split('.')[0:2]) %}
 {% set ver3nd = ''.join(version.split('.')[0:3]) %}
-{% set build_number = 0 %}
+{% set build_number = 1 %}
 
 # this makes the linter happy
 {% set channel_targets = channel_targets or 'conda-forge main' %}

@@ -0,0 +1,238 @@
+From 2fe9ded9d6a7774425976c979b9a32a8936feb12 Mon Sep 17 00:00:00 2001
+From: Serhiy Storchaka <storchaka@gmail.com>
+Date: Fri, 13 Jun 2025 19:57:48 +0300
+Subject: [PATCH 24/24] gh-135462: Fix quadratic complexity in processing
+ special input in HTMLParser (GH-135464)
+
+End-of-file errors are now handled according to the HTML5 specs:
+unclosed comments and declarations are closed, incomplete tags are ignored.
+(cherry picked from commit 6eb6c5dbfb528bd07d77b60fd71fd05d81d45c41)
+
+Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
+---
+ Lib/html/parser.py                            | 41 +++++---
+ Lib/test/test_htmlparser.py                   | 94 ++++++++++++++++---
+ ...-06-13-15-55-22.gh-issue-135462.KBeJpc.rst |  4 +
+ 3 files changed, 116 insertions(+), 23 deletions(-)
+ create mode 100644 Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst
+
+diff --git a/Lib/html/parser.py b/Lib/html/parser.py
+index 13c95c34e50..ecd5e0f019a 100644
+--- a/Lib/html/parser.py
++++ b/Lib/html/parser.py
+@@ -25,6 +25,7 @@
+ charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
+
+ starttagopen = re.compile('<[a-zA-Z]')
++endtagopen = re.compile('</[a-zA-Z]')
+ piclose = re.compile('>')
+ commentclose = re.compile(r'--\s*>')
+ # Note:
+@@ -177,7 +178,7 @@ def goahead(self, end):
+                     k = self.parse_pi(i)
+                 elif startswith("<!", i):
+                     k = self.parse_html_declaration(i)
+-                elif (i + 1) < n:
++                elif (i + 1) < n or end:
+                     self.handle_data("<")
+                     k = i + 1
+                 else:
+@@ -185,17 +186,35 @@ def goahead(self, end):
+                 if k < 0:
+                     if not end:
+                         break
+-                    k = rawdata.find('>', i + 1)
+-                    if k < 0:
+-                        k = rawdata.find('<', i + 1)
+-                        if k < 0:
+-                            k = i + 1
+-                    else:
+-                        k += 1
+-                    if self.convert_charrefs and not self.cdata_elem:
+-                        self.handle_data(unescape(rawdata[i:k]))
++                    if starttagopen.match(rawdata, i):  # < + letter
++                        pass
++                    elif startswith("</", i):
++                        if i + 2 == n:
++                            self.handle_data("</")
++                        elif endtagopen.match(rawdata, i):  # </ + letter
++                            pass
++                        else:
++                            # bogus comment
++                            self.handle_comment(rawdata[i+2:])
++                    elif startswith("<!--", i):
++                        j = n
++                        for suffix in ("--!", "--", "-"):
++                            if rawdata.endswith(suffix, i+4):
++                                j -= len(suffix)
++                                break
++                        self.handle_comment(rawdata[i+4:j])
++                    elif startswith("<![CDATA[", i):
++                        self.unknown_decl(rawdata[i+3:])
++                    elif rawdata[i:i+9].lower() == '<!doctype':
++                        self.handle_decl(rawdata[i+2:])
++                    elif startswith("<!", i):
++                        # bogus comment
++                        self.handle_comment(rawdata[i+2:])
++                    elif startswith("<?", i):
++                        self.handle_pi(rawdata[i+2:])
+                     else:
+-                        self.handle_data(rawdata[i:k])
++                        raise AssertionError("we should not get here!")
++                    k = n
+                 i = self.updatepos(i, k)
+             elif startswith("&#", i):
+                 match = charref.match(rawdata, i)
+diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
+index b42a611c62c..729dfe08fa6 100644
+--- a/Lib/test/test_htmlparser.py
++++ b/Lib/test/test_htmlparser.py
+@@ -5,6 +5,7 @@
+ import unittest
+
+ from unittest.mock import patch
++from test import support
+
+
+ class EventCollector(html.parser.HTMLParser):
+@@ -393,28 +394,34 @@ def test_tolerant_parsing(self):
+                             ('data', '<'),
+                             ('starttag', 'bc<', [('a', None)]),
+                             ('endtag', 'html'),
+-                            ('data', '\n<img src="URL>'),
+-                            ('comment', '/img'),
+-                            ('endtag', 'html<')])
++                            ('data', '\n')])
+
+     def test_starttag_junk_chars(self):
++        self._run_check("<", [('data', '<')])
++        self._run_check("<>", [('data', '<>')])
++        self._run_check("< >", [('data', '< >')])
++        self._run_check("< ", [('data', '< ')])
+         self._run_check("</>", [])
++        self._run_check("<$>", [('data', '<$>')])
+         self._run_check("</$>", [('comment', '$')])
+         self._run_check("</", [('data', '</')])
+-        self._run_check("</a", [('data', '</a')])
++        self._run_check("</a", [])
++        self._run_check("</ a>", [('endtag', 'a')])
++        self._run_check("</ a", [('comment', ' a')])
+         self._run_check("<a<a>", [('starttag', 'a<a', [])])
+         self._run_check("</a<a>", [('endtag', 'a<a')])
+-        self._run_check("<!", [('data', '<!')])
+-        self._run_check("<a", [('data', '<a')])
+-        self._run_check("<a foo='bar'", [('data', "<a foo='bar'")])
+-        self._run_check("<a foo='bar", [('data', "<a foo='bar")])
+-        self._run_check("<a foo='>'", [('data', "<a foo='>'")])
+-        self._run_check("<a foo='>", [('data', "<a foo='>")])
++        self._run_check("<!", [('comment', '')])
++        self._run_check("<a", [])
++        self._run_check("<a foo='bar'", [])
++        self._run_check("<a foo='bar", [])
++        self._run_check("<a foo='>'", [])
++        self._run_check("<a foo='>", [])
+         self._run_check("<a$>", [('starttag', 'a$', [])])
+         self._run_check("<a$b>", [('starttag', 'a$b', [])])
+         self._run_check("<a$b/>", [('startendtag', 'a$b', [])])
+         self._run_check("<a$b  >", [('starttag', 'a$b', [])])
+         self._run_check("<a$b  />", [('startendtag', 'a$b', [])])
++        self._run_check("</a$b>", [('endtag', 'a$b')])
+
+     def test_slashes_in_starttag(self):
+         self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
+@@ -539,13 +546,56 @@ def test_EOF_in_charref(self):
+         for html, expected in data:
+             self._run_check(html, expected)
+
+-    def test_broken_comments(self):
+-        html = ('<! not really a comment >'
++    def test_eof_in_comments(self):
++        data = [
++            ('<!--', [('comment', '')]),
++            ('<!---', [('comment', '')]),
++            ('<!----', [('comment', '')]),
++            ('<!-----', [('comment', '-')]),
++            ('<!------', [('comment', '--')]),
++            ('<!----!', [('comment', '')]),
++            ('<!---!', [('comment', '-!')]),
++            ('<!---!>', [('comment', '-!>')]),
++            ('<!--foo', [('comment', 'foo')]),
++            ('<!--foo-', [('comment', 'foo')]),
++            ('<!--foo--', [('comment', 'foo')]),
++            ('<!--foo--!', [('comment', 'foo')]),
++            ('<!--<!--', [('comment', '<!')]),
++            ('<!--<!--!', [('comment', '<!')]),
++        ]
++        for html, expected in data:
++            self._run_check(html, expected)
++
++    def test_eof_in_declarations(self):
++        data = [
++            ('<!', [('comment', '')]),
++            ('<!-', [('comment', '-')]),
++            ('<![', [('comment', '[')]),
++            ('<![CDATA[', [('unknown decl', 'CDATA[')]),
++            ('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
++            ('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
++            ('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
++            ('<!DOCTYPE', [('decl', 'DOCTYPE')]),
++            ('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
++            ('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
++            ('<!DOCTYPE html ', [('decl', 'DOCTYPE html ')]),
++            ('<!DOCTYPE html PUBLIC', [('decl', 'DOCTYPE html PUBLIC')]),
++            ('<!DOCTYPE html PUBLIC "foo', [('decl', 'DOCTYPE html PUBLIC "foo')]),
++            ('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo',
++             [('decl', 'DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo')]),
++        ]
++        for html, expected in data:
++            self._run_check(html, expected)
++
++    def test_bogus_comments(self):
++        html = ('<!ELEMENT br EMPTY>'
++                '<! not really a comment >'
+                 '<! not a comment either -->'
+                 '<! -- close enough -->'
+                 '<!><!<-- this was an empty comment>'
+                 '<!!! another bogus comment !!!>')
+         expected = [
++            ('comment', 'ELEMENT br EMPTY'),
+             ('comment', ' not really a comment '),
+             ('comment', ' not a comment either --'),
+             ('comment', ' -- close enough --'),
+@@ -600,6 +650,26 @@ def test_convert_charrefs_dropped_text(self):
+              ('endtag', 'a'), ('data', ' bar & baz')]
+         )
+
++    @support.requires_resource('cpu')
++    def test_eof_no_quadratic_complexity(self):
++        # Each of these examples used to take about an hour.
++        # Now they take a fraction of a second.
++        def check(source):
++            parser = html.parser.HTMLParser()
++            parser.feed(source)
++            parser.close()
++        n = 120_000
++        check("<a " * n)
++        check("<a a=" * n)
++        check("</a " * 14 * n)
++        check("</a a=" * 11 * n)
++        check("<!--" * 4 * n)
++        check("<!" * 60 * n)
++        check("<?" * 19 * n)
++        check("</$" * 15 * n)
++        check("<![CDATA[" * 9 * n)
++        check("<!doctype" * 35 * n)
++
+
+ class AttributesTestCase(TestCaseBase):
+
+diff --git a/Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst b/Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst
+new file mode 100644
+index 00000000000..cf9aa8dbdf2
+--- /dev/null
++++ b/Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst
+@@ -0,0 +1,4 @@
++Fix quadratic complexity in processing specially crafted input in
++:class:`html.parser.HTMLParser`. End-of-file errors are now handled according
++to the HTML5 specs -- comments and declarations are automatically closed,
++tags are ignored.