From 2153a4cc126dfcfa1cabf19bd3a025624d646b65 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 15 Aug 2025 23:08:48 +0300 Subject: [PATCH 1/8] gh-137836: Support more RAWTEXT and PLAINTEXT elements in HTMLParser * the "plaintext" element * the RAWTEXT elements "xmp", "iframe", "noembed" and "noframes" * optionally RAWTEXT (if scripting=True) element "noscript" --- Doc/library/html.parser.rst | 8 +- Lib/html/parser.py | 17 +- Lib/test/test_htmlparser.py | 169 ++++++++++++++---- ...-08-15-23-08-44.gh-issue-137836.b55rhh.rst | 3 + 4 files changed, 160 insertions(+), 37 deletions(-) create mode 100644 Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst index dd67fc34e856f1..81b9239185aab1 100644 --- a/Doc/library/html.parser.rst +++ b/Doc/library/html.parser.rst @@ -15,7 +15,7 @@ This module defines a class :class:`HTMLParser` which serves as the basis for parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. -.. class:: HTMLParser(*, convert_charrefs=True) +.. class:: HTMLParser(*, convert_charrefs=True, scripting=False) Create a parser instance able to parse invalid markup. @@ -23,6 +23,9 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. references (except the ones in ``script``/``style`` elements) are automatically converted to the corresponding Unicode characters. + If *scripting* is true, the ``noscript`` element is parsed in the + RAWTEXT mode. + An :class:`.HTMLParser` instance is fed HTML data and calls handler methods when start tags, end tags, text, comments, and other markup elements are encountered. The user should subclass :class:`.HTMLParser` and override its @@ -37,6 +40,9 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. .. versionchanged:: 3.5 The default value for argument *convert_charrefs* is now ``True``. + .. 
versionchanged:: 3.13.8 + Added the *scripting* parameter. + Example HTML Parser Application ------------------------------- diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 75bf8adae6d70a..79850fa6981d55 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -127,17 +127,23 @@ class HTMLParser(_markupbase.ParserBase): argument. """ - CDATA_CONTENT_ELEMENTS = ("script", "style") + # See the HTML5 specs section "13.4 Parsing HTML fragments". + # https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments + CDATA_CONTENT_ELEMENTS = ("script", "style", "xmp", "iframe", "noembed", "noframes") RCDATA_CONTENT_ELEMENTS = ("textarea", "title") - def __init__(self, *, convert_charrefs=True): + def __init__(self, *, convert_charrefs=True, scripting=False): """Initialize and reset this instance. - If convert_charrefs is True (the default), all character references + If convert_charrefs is true (the default), all character references are automatically converted to the corresponding Unicode characters. + + If scripting is true, the noscript element is parsed in the + RAWTEXT mode. 
""" super().__init__() self.convert_charrefs = convert_charrefs + self.scripting = scripting self.reset() def reset(self): @@ -454,6 +460,11 @@ def parse_starttag(self, i): self.set_cdata_mode(tag) elif tag in self.RCDATA_CONTENT_ELEMENTS: self.set_cdata_mode(tag, escapable=True) + elif self.scripting and tag == "noscript": + self.set_cdata_mode(tag) + elif tag == "plaintext": + self.set_cdata_mode(tag) + self.interesting = re.compile(r'\z') return endpos # Internal -- check to see if we have a complete starttag; return end diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index fff41dab321acd..64cc6d8f1893f2 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -324,49 +324,138 @@ def test_style_content(self, content): ("data", content), ("endtag", "style")]) - @support.subTests('content', [ - '', - "", - '', - '', - '', - '\u2603', - '< /title>', - '', - '', - '', - '', - '', + @support.subTests('tag', ['title', 'textarea']) + def test_rcdata_content(self, tag): + content = ( + '' + "" + '' + '' + '' + '\u2603' + f'< /{tag}>' + f'' + f'' + f'' + f'' + ) + source = f"<{tag}>{content}" + self._run_check(source, [ + ("starttag", tag, []), + ("data", content), + ("endtag", tag), ]) - def test_title_content(self, content): - source = f"{content}" + source = f"<{tag}>&" self._run_check(source, [ - ("starttag", "title", []), + ("starttag", tag, []), + ('entityref', 'amp'), + ("endtag", tag), + ]) + + @support.subTests('tag', + ['style', 'xmp', 'iframe', 'noembed', 'noframes', 'script']) + def test_rawtext_content(self, tag): + content = ( + '' + '¬-an-entity-ref;' + "" + '' + '' + '' + '\u2603' + f'< /{tag}>' + f'' + f'' + f'' + f'' + ) + source = f"<{tag}>{content}" + self._run_check(source, [ + ("starttag", tag, []), ("data", content), - ("endtag", "title"), + ("endtag", tag), ]) - @support.subTests('content', [ - '', - "", - '', - '', - '', - '\u2603', - '< /textarea>', - '', - '', - '', - '', + def 
test_noscript_content(self): + content = ( + '' + '¬-an-entity-ref;' + "" + '' + '' + '' + '\u2603' + f'< /noscript>' + f'' + f'' + f'' + f'' + ) + source = f"" + self._run_check(source, [ + ('starttag', 'noscript', []), + ('comment', ' not a comment '), + ('entityref', 'not'), + ('data', '-an-entity-ref;'), + ('starttag', 'not', [('a', 'start tag')]), + ('unknown decl', 'CDATA[not a cdata'), + ('comment', 'not a bogus comment'), + ('endtag', 'not'), + ('data', '☃< /noscript>'), + ('comment', ' noscript'), + ('endtag', 'noscriptx'), + ('endtag', 'noscript\x0b'), + ('endtag', 'noscript\xa0'), + ('endtag', 'noscript') ]) - def test_textarea_content(self, content): - source = f"" self._run_check(source, [ - ("starttag", "textarea", []), + ("starttag", "noscript", []), + ("data", content), + ("endtag", "noscript"), + ], collector=EventCollector(convert_charrefs=False, scripting=True)) + + def test_plaintext_content(self): + content = ( + '' + '¬-an-entity-ref;' + "" + '' + '' + '' + '\u2603' + '' + ) + source = f"{content}" + self._run_check(source, [ + ("starttag", "plaintext", []), ("data", content), - ("endtag", "textarea"), ]) + @support.subTests('tag,endtag', [ + ('title', 'tıtle'), + ('style', 'ſtyle'), + ('style', 'ſtyle'), + ('style', 'style'), + ('iframe', 'ıframe'), + ('noframes', 'noframeſ'), + ('noscript', 'noſcript'), + ('noscript', 'noscrıpt'), + ('script', 'ſcript'), + ('script', 'scrıpt'), + ]) + def test_invalid_nonascii_closing_tag(self, tag, endtag): + source = f"<{tag}><a></{endtag}>" + self._run_check(source, [ + ("starttag", tag, []), + ("data", f"<a></{endtag}>"), + ], collector=EventCollector(convert_charrefs=False, scripting=True)) + source = f"<{tag}><a></{endtag}></{tag}>" + self._run_check(source, [ + ("starttag", tag, []), + ("data", f"<a></{endtag}>"), + ("endtag", tag), + ], collector=EventCollector(convert_charrefs=False, scripting=True)) + @support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n', 'script/', 'script 
foo=bar', 'script foo=">"']) def test_script_closing_tag(self, endtag): @@ -428,6 +517,20 @@ def test_textarea_closing_tag(self, endtag): ("endtag", "textarea")], collector=EventCollectorNoNormalize(convert_charrefs=False)) + @support.subTests('starttag', ['TitLe', 'TexTarEa', 'StyLE', 'XmP', + 'iFraMe', 'noEmBed', 'noFraMes', 'noScrIPt', + 'ScrIPt']) + def test_closing_tag(self, starttag): + tag = starttag.lower() + for endtag in [tag, tag.upper(), f'{tag} ', f'{tag}\n', + f'{tag}/', f'{tag} foo=bar', f'{tag} foo=">"']: + content = "<!-- not a comment --><i>Spam</i>" + s = f'<{starttag}>{content}</{endtag}>' + self._run_check(s, [("starttag", tag, []), + ('data', content), + ("endtag", tag)], + collector=EventCollectorNoNormalize(convert_charrefs=False, scripting=True)) + @support.subTests('tail,end', [ ('', False), ('<', False), diff --git a/Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst b/Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst new file mode 100644 index 00000000000000..c30c9439a76a19 --- /dev/null +++ b/Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst @@ -0,0 +1,3 @@ +Add support of the "plaintext" element, RAWTEXT elements "xmp", "iframe", +"noembed" and "noframes", and optionally RAWTEXT element "noscript" in +:class:`html.parser.HTMLParser`. From c8429be73477c1a669beb61f12a6756bee90fbf2 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka <storchaka@gmail.com> Date: Wed, 15 Oct 2025 11:06:34 +0300 Subject: [PATCH 2/8] Update Doc/library/html.parser.rst --- Doc/library/html.parser.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst index 81b9239185aab1..fdc24d3a93b741 100644 --- a/Doc/library/html.parser.rst +++ b/Doc/library/html.parser.rst @@ -40,7 +40,7 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. .. 
versionchanged:: 3.5 The default value for argument *convert_charrefs* is now ``True``. - .. versionchanged:: 3.13.8 + .. versionchanged:: 3.14.1 Added the *scripting* parameter. From 22191061489b51ad22c6087eb2faa93a80208bbd Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka <storchaka@gmail.com> Date: Fri, 24 Oct 2025 16:56:02 +0300 Subject: [PATCH 3/8] Apply suggestions from code review Co-authored-by: Ezio Melotti <ezio.melotti@gmail.com> --- Doc/library/html.parser.rst | 6 +++--- Lib/html/parser.py | 6 ++++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst index fdc24d3a93b741..c949978704afb3 100644 --- a/Doc/library/html.parser.rst +++ b/Doc/library/html.parser.rst @@ -20,11 +20,11 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. Create a parser instance able to parse invalid markup. If *convert_charrefs* is ``True`` (the default), all character - references (except the ones in ``script``/``style`` elements) are + references (except the ones in RAWTEXT tags) are automatically converted to the corresponding Unicode characters. - If *scripting* is true, the ``noscript`` element is parsed in the - RAWTEXT mode. + If *scripting* is false (the default), the content of the ``noscript`` element + is parsed normally; if it's true, it's parsed in RAWTEXT mode. An :class:`.HTMLParser` instance is fed HTML data and calls handler methods when start tags, end tags, text, comments, and other markup elements are diff --git a/Lib/html/parser.py b/Lib/html/parser.py index a879127e0a1750..7052a76874b7b6 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -129,6 +129,7 @@ class HTMLParser(_markupbase.ParserBase): # See the HTML5 specs section "13.4 Parsing HTML fragments". 
# https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments + # CDATA_CONTENT_ELEMENTS are parsed in RAWTEXT mode CDATA_CONTENT_ELEMENTS = ("script", "style", "xmp", "iframe", "noembed", "noframes") RCDATA_CONTENT_ELEMENTS = ("textarea", "title") @@ -138,8 +139,9 @@ def __init__(self, *, convert_charrefs=True, scripting=False): If convert_charrefs is true (the default), all character references are automatically converted to the corresponding Unicode characters. - If scripting is true, the noscript element is parsed in the - RAWTEXT mode. + If *scripting* is false (the default), the content of the + ``noscript`` element is parsed normally; if it's true, + it's parsed in RAWTEXT mode. """ super().__init__() self.convert_charrefs = convert_charrefs From 9971a24deab47e7b1e6dfee9cd05be3bf1188ea8 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka <storchaka@gmail.com> Date: Sat, 25 Oct 2025 12:46:52 +0300 Subject: [PATCH 4/8] Polish the documentation. --- Doc/library/html.parser.rst | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst index c949978704afb3..6ea2fe8644091c 100644 --- a/Doc/library/html.parser.rst +++ b/Doc/library/html.parser.rst @@ -19,12 +19,13 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. Create a parser instance able to parse invalid markup. - If *convert_charrefs* is ``True`` (the default), all character - references (except the ones in RAWTEXT tags) are + If *convert_charrefs* is true (the default), all character + references (except the ones in elements like ``script`` and ``style``) are automatically converted to the corresponding Unicode characters. - If *scripting* is false (the default), the content of the ``noscript`` element - is parsed normally; if it's true, it's parsed in RAWTEXT mode. 
+ If *scripting* is false (the default), the content of the ``noscript`` + element is parsed normally; if it's true, it's parsed in RAWTEXT mode, + like ``script``. An :class:`.HTMLParser` instance is fed HTML data and calls handler methods when start tags, end tags, text, comments, and other markup elements are @@ -167,15 +168,15 @@ implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`): .. method:: HTMLParser.handle_data(data) This method is called to process arbitrary data (e.g. text nodes and the - content of ``<script>...</script>`` and ``<style>...</style>``). + content of elements like ``script`` and ``style``). .. method:: HTMLParser.handle_entityref(name) This method is called to process a named character reference of the form ``&name;`` (e.g. ``&gt;``), where *name* is a general entity reference - (e.g. ``'gt'``). This method is never called if *convert_charrefs* is - ``True``. + (e.g. ``'gt'``). + This method is only called if *convert_charrefs* is false. .. method:: HTMLParser.handle_charref(name) @@ -183,8 +184,8 @@ implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`): This method is called to process decimal and hexadecimal numeric character references of the form :samp:`&#{NNN};` and :samp:`&#x{NNN};`. For example, the decimal equivalent for ``&gt;`` is ``&#62;``, whereas the hexadecimal is ``&#x3E;``; - in this case the method will receive ``'62'`` or ``'x3E'``. This method - is never called if *convert_charrefs* is ``True``. + in this case the method will receive ``'62'`` or ``'x3E'``. + This method is only called if *convert_charrefs* is false. .. method:: HTMLParser.handle_comment(data) @@ -298,8 +299,8 @@ Parsing an element with a few attributes and a title: Data : Python End tag : h1 -The content of ``script`` and ``style`` elements is returned as is, without -further parsing: +The content of elements like ``script`` and ``style`` is returned as is, +without further parsing: .. 
doctest:: @@ -310,10 +311,10 @@ further parsing: End tag : style >>> parser.feed('<script type="text/javascript">' - ... 'alert("<strong>hello!</strong>");</script>') + ... 'alert("<strong>hello! &#9786;</strong>");</script>') Start tag: script attr: ('type', 'text/javascript') - Data : alert("<strong>hello!</strong>"); + Data : alert("<strong>hello! &#9786;</strong>"); End tag : script Parsing comments: @@ -342,7 +343,7 @@ correct char (note: these 3 references are all equivalent to ``'>'``): Feeding incomplete chunks to :meth:`~HTMLParser.feed` works, but :meth:`~HTMLParser.handle_data` might be called more than once -(unless *convert_charrefs* is set to ``True``): +if *convert_charrefs* is false: .. doctest:: From 69a2b33547b2665238b62601c34029cac9d7f449 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka <storchaka@gmail.com> Date: Sat, 25 Oct 2025 14:17:32 +0300 Subject: [PATCH 5/8] Rewrite tests. --- Lib/test/test_htmlparser.py | 250 +++++++++++++----------------------- 1 file changed, 87 insertions(+), 163 deletions(-) diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 32467b9c64595a..19dde9362a43b6 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -8,6 +8,18 @@ from test import support +SAMPLE_RCDATA = ( + '<!-- not a comment -->' + "<not a='start tag'>" + '<![CDATA[not a cdata]]>' + '<!not a bogus comment>' + '</not a bogus comment>' + '\u2603' +) + +SAMPLE_RAWTEXT = SAMPLE_RCDATA + '&amp;&#9786;' + + class EventCollector(html.parser.HTMLParser): def __init__(self, *args, autocdata=False, **kw): @@ -293,30 +305,20 @@ def test_get_starttag_text(self): 'Date().getTime()+\'"><\\/s\'+\'cript>\');\n//]]>'), '\n<!-- //\nvar foo = 3.14;\n// -->\n', '<!-- \u2603 -->', - 'foo = "</ script>"', - 'foo = "</scripture>"', - 'foo = "</script\v>"', - 'foo = "</script\xa0>"', - 'foo = "</ſcript>"', - 'foo = "</scrıpt>"', ]) def test_script_content(self, content): s = f'<script>{content}</script>' - self._run_check(s, 
[("starttag", "script", []), - ("data", content), - ("endtag", "script")]) + self._run_check(s, [ + ("starttag", "script", []), + ("data", content), + ("endtag", "script"), + ]) @support.subTests('content', [ 'a::before { content: "<!-- not a comment -->"; }', 'a::before { content: "&not-an-entity-ref;"; }', 'a::before { content: "<not a=\'start tag\'>"; }', 'a::before { content: "\u2603"; }', - 'a::before { content: "< /style>"; }', - 'a::before { content: "</ style>"; }', - 'a::before { content: "</styled>"; }', - 'a::before { content: "</style\v>"; }', - 'a::before { content: "</style\xa0>"; }', - 'a::before { content: "</ſtyle>"; }', ]) def test_style_content(self, content): s = f'<style>{content}</style>' @@ -326,23 +328,10 @@ def test_style_content(self, content): @support.subTests('tag', ['title', 'textarea']) def test_rcdata_content(self, tag): - content = ( - '<!-- not a comment -->' - "<not a='start tag'>" - '<![CDATA[not a cdata]]>' - '<!not a bogus comment>' - '</not a bogus comment>' - '\u2603' - f'< /{tag}>' - f'</ {tag}>' - f'</{tag}x>' - f'</{tag}\v>' - f'</{tag}\xa0>' - ) - source = f"<{tag}>{content}</{tag}>" + source = f"<{tag}>{SAMPLE_RCDATA}</{tag}>" self._run_check(source, [ ("starttag", tag, []), - ("data", content), + ("data", SAMPLE_RCDATA), ("endtag", tag), ]) source = f"<{tag}>&amp;</{tag}>" @@ -355,107 +344,43 @@ def test_rcdata_content(self, tag): @support.subTests('tag', ['style', 'xmp', 'iframe', 'noembed', 'noframes', 'script']) def test_rawtext_content(self, tag): - content = ( - '<!-- not a comment -->' - '&not-an-entity-ref;' - "<not a='start tag'>" - '<![CDATA[not a cdata]]>' - '<!not a bogus comment>' - '</not a bogus comment>' - '\u2603' - f'< /{tag}>' - f'</ {tag}>' - f'</{tag}x>' - f'</{tag}\v>' - f'</{tag}\xa0>' - ) - source = f"<{tag}>{content}</{tag}>" + source = f"<{tag}>{SAMPLE_RAWTEXT}</{tag}>" self._run_check(source, [ ("starttag", tag, []), - ("data", content), + ("data", SAMPLE_RAWTEXT), ("endtag", tag), ]) def 
test_noscript_content(self): - content = ( - '<!-- not a comment -->' - '&not-an-entity-ref;' - "<not a='start tag'>" - '<![CDATA[not a cdata]]>' - '<!not a bogus comment>' - '</not a bogus comment>' - '\u2603' - f'< /noscript>' - f'</ noscript>' - f'</noscriptx>' - f'</noscript\v>' - f'</noscript\xa0>' - ) - source = f"<noscript>{content}</noscript>" + source = f"<noscript>{SAMPLE_RAWTEXT}</noscript>" + # scripting=False -- normal mode self._run_check(source, [ ('starttag', 'noscript', []), ('comment', ' not a comment '), - ('entityref', 'not'), - ('data', '-an-entity-ref;'), ('starttag', 'not', [('a', 'start tag')]), ('unknown decl', 'CDATA[not a cdata'), ('comment', 'not a bogus comment'), ('endtag', 'not'), - ('data', '☃< /noscript>'), - ('comment', ' noscript'), - ('endtag', 'noscriptx'), - ('endtag', 'noscript\x0b'), - ('endtag', 'noscript\xa0'), - ('endtag', 'noscript') + ('data', '☃'), + ('entityref', 'amp'), + ('charref', '9786'), + ('endtag', 'noscript'), ]) + # scripting=True -- RAWTEXT mode self._run_check(source, [ ("starttag", "noscript", []), - ("data", content), + ("data", SAMPLE_RAWTEXT), ("endtag", "noscript"), - ], collector=EventCollector(convert_charrefs=False, scripting=True)) + ], collector=EventCollector(scripting=True)) def test_plaintext_content(self): - content = ( - '<!-- not a comment -->' - '&not-an-entity-ref;' - "<not a='start tag'>" - '<![CDATA[not a cdata]]>' - '<!not a bogus comment>' - '</not a bogus comment>' - '\u2603' - '</plaintext>' - ) + content = SAMPLE_RAWTEXT + '</plaintext>' # not closing source = f"<plaintext>{content}" self._run_check(source, [ ("starttag", "plaintext", []), ("data", content), ]) - @support.subTests('tag,endtag', [ - ('title', 'tıtle'), - ('style', 'ſtyle'), - ('style', 'ſtyle'), - ('style', 'style'), - ('iframe', 'ıframe'), - ('noframes', 'noframeſ'), - ('noscript', 'noſcript'), - ('noscript', 'noscrıpt'), - ('script', 'ſcript'), - ('script', 'scrıpt'), - ]) - def 
test_invalid_nonascii_closing_tag(self, tag, endtag): - source = f"<{tag}><a></{endtag}>" - self._run_check(source, [ - ("starttag", tag, []), - ("data", f"<a></{endtag}>"), - ], collector=EventCollector(convert_charrefs=False, scripting=True)) - source = f"<{tag}><a></{endtag}></{tag}>" - self._run_check(source, [ - ("starttag", tag, []), - ("data", f"<a></{endtag}>"), - ("endtag", tag), - ], collector=EventCollector(convert_charrefs=False, scripting=True)) - @support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n', 'script/', 'script foo=bar', 'script foo=">"']) def test_script_closing_tag(self, endtag): @@ -470,66 +395,65 @@ def test_script_closing_tag(self, endtag): ("endtag", "script")], collector=EventCollectorNoNormalize(convert_charrefs=False)) - @support.subTests('endtag', ['style', 'STYLE', 'style ', 'style\n', - 'style/', 'style foo=bar', 'style foo=">"']) - def test_style_closing_tag(self, endtag): - content = """ - b::before { content: "<!-- not a comment -->"; } - p::before { content: "&not-an-entity-ref;"; } - a::before { content: "<i>"; } - a::after { content: "</i>"; } - """ - s = f'<StyLE>{content}</{endtag}>' - self._run_check(s, [("starttag", "style", []), - ("data", content), - ("endtag", "style")], - collector=EventCollectorNoNormalize(convert_charrefs=False)) - - @support.subTests('endtag', ['title', 'TITLE', 'title ', 'title\n', - 'title/', 'title foo=bar', 'title foo=">"']) - def test_title_closing_tag(self, endtag): - content = "<!-- not a comment --><i>Egg &amp; Spam</i>" - s = f'<TitLe>{content}</{endtag}>' - self._run_check(s, [("starttag", "title", []), - ('data', '<!-- not a comment --><i>Egg & Spam</i>'), - ("endtag", "title")], - collector=EventCollectorNoNormalize(convert_charrefs=True)) - self._run_check(s, [("starttag", "title", []), - ('data', '<!-- not a comment --><i>Egg '), - ('entityref', 'amp'), - ('data', ' Spam</i>'), - ("endtag", "title")], - collector=EventCollectorNoNormalize(convert_charrefs=False)) - - 
@support.subTests('endtag', ['textarea', 'TEXTAREA', 'textarea ', 'textarea\n', - 'textarea/', 'textarea foo=bar', 'textarea foo=">"']) - def test_textarea_closing_tag(self, endtag): - content = "<!-- not a comment --><i>Egg &amp; Spam</i>" - s = f'<TexTarEa>{content}</{endtag}>' - self._run_check(s, [("starttag", "textarea", []), - ('data', '<!-- not a comment --><i>Egg & Spam</i>'), - ("endtag", "textarea")], - collector=EventCollectorNoNormalize(convert_charrefs=True)) - self._run_check(s, [("starttag", "textarea", []), - ('data', '<!-- not a comment --><i>Egg '), - ('entityref', 'amp'), - ('data', ' Spam</i>'), - ("endtag", "textarea")], - collector=EventCollectorNoNormalize(convert_charrefs=False)) - - @support.subTests('starttag', ['TitLe', 'TexTarEa', 'StyLE', 'XmP', - 'iFraMe', 'noEmBed', 'noFraMes', 'noScrIPt', - 'ScrIPt']) - def test_closing_tag(self, starttag): - tag = starttag.lower() + @support.subTests('tag', [ + 'script', 'style', 'xmp', 'iframe', 'noembed', 'noframes', + 'textarea', 'title', 'noscript', + ]) + def test_closing_tag(self, tag): for endtag in [tag, tag.upper(), f'{tag} ', f'{tag}\n', f'{tag}/', f'{tag} foo=bar', f'{tag} foo=">"']: content = "<!-- not a comment --><i>Spam</i>" - s = f'<{starttag}>{content}</{endtag}>' - self._run_check(s, [("starttag", tag, []), - ('data', content), - ("endtag", tag)], - collector=EventCollectorNoNormalize(convert_charrefs=False, scripting=True)) + s = f'<{tag.upper()}>{content}</{endtag}>' + self._run_check(s, [ + ("starttag", tag, []), + ('data', content), + ("endtag", tag), + ], collector=EventCollectorNoNormalize(convert_charrefs=False, scripting=True)) + + @support.subTests('tag', [ + 'script', 'style', 'xmp', 'iframe', 'noembed', 'noframes', + 'textarea', 'title', 'noscript', + ]) + def test_invalid_closing_tag(self, tag): + content = ( + f'< /{tag}>' + f'</ {tag}>' + f'</{tag}x>' + f'</{tag}\v>' + f'</{tag}\xa0>' + ) + source = f"<{tag}>{content}</{tag}>" + self._run_check(source, [ + 
("starttag", tag, []), + ("data", content), + ("endtag", tag), + ], collector=EventCollector(convert_charrefs=False, scripting=True)) + + @support.subTests('tag,endtag', [ + ('title', 'tıtle'), + ('style', 'ſtyle'), + ('style', 'ﬅyle'), + ('style', 'ﬆyle'), + ('iframe', 'ıframe'), + ('noframes', 'noframeſ'), + ('noscript', 'noſcript'), + ('noscript', 'noscrıpt'), + ('script', 'ſcript'), + ('script', 'scrıpt'), + ]) + def test_invalid_nonascii_closing_tag(self, tag, endtag): + content = f"&lt;br&gt;&lt;/{endtag}&gt;" + source = f"&lt;{tag}&gt;{content}" + self._run_check(source, [ + ("starttag", tag, []), + ("data", content), + ], collector=EventCollector(convert_charrefs=False, scripting=True)) + source = f"&lt;{tag}&gt;{content}&lt;/{tag}&gt;" + self._run_check(source, [ + ("starttag", tag, []), + ("data", content), + ("endtag", tag), + ], collector=EventCollector(convert_charrefs=False, scripting=True)) @support.subTests('tail,end', [ ('', False), From 08f483593e114b0f6ce22691a491b8f036e21e9b Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka &lt;storchaka@gmail.com&gt; Date: Sat, 25 Oct 2025 14:22:38 +0300 Subject: [PATCH 6/8] Use set_cdata_mode(escapable=None) for PLAINTEXT. 
--- Lib/html/parser.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 7052a76874b7b6..82fe61270bfa86 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -180,7 +180,9 @@ def get_starttag_text(self): def set_cdata_mode(self, elem, *, escapable=False): self.cdata_elem = elem.lower() self._escapable = escapable - if escapable and not self.convert_charrefs: + if escapable is None: # PLAINTEXT mode + self.interesting = re.compile(r'\z') + elif escapable and not self.convert_charrefs: self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem, re.IGNORECASE|re.ASCII) else: @@ -459,8 +461,7 @@ def parse_starttag(self, i): elif self.scripting and tag == "noscript": self.set_cdata_mode(tag) elif tag == "plaintext": - self.set_cdata_mode(tag) - self.interesting = re.compile(r'\z') + self.set_cdata_mode(tag, escapable=None) return endpos # Internal -- check to see if we have a complete starttag; return end From 428abe1f0a62bef2b6159cbdde640e7dabbfa4be Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka <storchaka@gmail.com> Date: Fri, 31 Oct 2025 16:19:24 +0200 Subject: [PATCH 7/8] Apply suggestions from code review Co-authored-by: Ezio Melotti <ezio.melotti@gmail.com> --- Doc/library/html.parser.rst | 4 ++-- Lib/html/parser.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst index 6ea2fe8644091c..341a8337ba2ceb 100644 --- a/Doc/library/html.parser.rst +++ b/Doc/library/html.parser.rst @@ -24,8 +24,8 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. automatically converted to the corresponding Unicode characters. If *scripting* is false (the default), the content of the ``noscript`` - element is parsed normally; if it's true, it's parsed in RAWTEXT mode, - like ``script``. + element is parsed normally; if it's true, it's returned as is without + being parsed. 
An :class:`.HTMLParser` instance is fed HTML data and calls handler methods when start tags, end tags, text, comments, and other markup elements are diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 82fe61270bfa86..15badbd5eb84e6 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -141,7 +141,7 @@ def __init__(self, *, convert_charrefs=True, scripting=False): If *scripting* is false (the default), the content of the ``noscript`` element is parsed normally; if it's true, - it's parsed in RAWTEXT mode. + it's returned as is without being parsed. """ super().__init__() self.convert_charrefs = convert_charrefs From 350ce25e1be99da6640a20b407d56e0073e2220e Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka <storchaka@gmail.com> Date: Fri, 31 Oct 2025 17:06:39 +0200 Subject: [PATCH 8/8] Apply suggestions. --- Lib/html/parser.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 82fe61270bfa86..1274b25eaca579 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -180,7 +180,7 @@ def get_starttag_text(self): def set_cdata_mode(self, elem, *, escapable=False): self.cdata_elem = elem.lower() self._escapable = escapable - if escapable is None: # PLAINTEXT mode + if self.cdata_elem == 'plaintext': self.interesting = re.compile(r'\z') elif escapable and not self.convert_charrefs: self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem, @@ -454,14 +454,12 @@ def parse_starttag(self, i): self.handle_startendtag(tag, attrs) else: self.handle_starttag(tag, attrs) - if tag in self.CDATA_CONTENT_ELEMENTS: - self.set_cdata_mode(tag) + if (tag in self.CDATA_CONTENT_ELEMENTS or + (self.scripting and tag == "noscript") or + tag == "plaintext"): + self.set_cdata_mode(tag, escapable=False) elif tag in self.RCDATA_CONTENT_ELEMENTS: self.set_cdata_mode(tag, escapable=True) - elif self.scripting and tag == "noscript": - self.set_cdata_mode(tag) - elif tag == "plaintext": - 
self.set_cdata_mode(tag, escapable=None) return endpos # Internal -- check to see if we have a complete starttag; return end