From 2153a4cc126dfcfa1cabf19bd3a025624d646b65 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Fri, 15 Aug 2025 23:08:48 +0300
Subject: [PATCH 1/8] gh-137836: Support more RAWTEXT and PLAINTEXT elements in
 HTMLParser

* the "plaintext" element
* the RAWTEXT elements "xmp", "iframe", "noembed" and "noframes"
* the "noscript" element, parsed as RAWTEXT only if scripting=True
---
 Doc/library/html.parser.rst                   |   8 +-
 Lib/html/parser.py                            |  17 +-
 Lib/test/test_htmlparser.py                   | 169 ++++++++++++++----
 ...-08-15-23-08-44.gh-issue-137836.b55rhh.rst |   3 +
 4 files changed, 160 insertions(+), 37 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst

diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst
index dd67fc34e856f1..81b9239185aab1 100644
--- a/Doc/library/html.parser.rst
+++ b/Doc/library/html.parser.rst
@@ -15,7 +15,7 @@
 This module defines a class :class:`HTMLParser` which serves as the basis for
 parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
 
-.. class:: HTMLParser(*, convert_charrefs=True)
+.. class:: HTMLParser(*, convert_charrefs=True, scripting=False)
 
    Create a parser instance able to parse invalid markup.
 
@@ -23,6 +23,9 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
    references (except the ones in ``script``/``style`` elements) are
    automatically converted to the corresponding Unicode characters.
 
+   If *scripting* is true, the ``noscript`` element is parsed in the
+   RAWTEXT mode.
+
    An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
    when start tags, end tags, text, comments, and other markup elements are
    encountered.  The user should subclass :class:`.HTMLParser` and override its
@@ -37,6 +40,9 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
    .. versionchanged:: 3.5
       The default value for argument *convert_charrefs* is now ``True``.
 
+   .. versionchanged:: 3.13.8
+      Added the *scripting* parameter.
+
 
 Example HTML Parser Application
 -------------------------------
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 75bf8adae6d70a..79850fa6981d55 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -127,17 +127,23 @@ class HTMLParser(_markupbase.ParserBase):
     argument.
     """
 
-    CDATA_CONTENT_ELEMENTS = ("script", "style")
+    # See the HTML5 specs section "13.4 Parsing HTML fragments".
+    # https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
+    CDATA_CONTENT_ELEMENTS = ("script", "style", "xmp", "iframe", "noembed", "noframes")
     RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
 
-    def __init__(self, *, convert_charrefs=True):
+    def __init__(self, *, convert_charrefs=True, scripting=False):
         """Initialize and reset this instance.
 
-        If convert_charrefs is True (the default), all character references
+        If convert_charrefs is true (the default), all character references
         are automatically converted to the corresponding Unicode characters.
+
+        If scripting is true, the noscript element is parsed in the
+        RAWTEXT mode.
         """
         super().__init__()
         self.convert_charrefs = convert_charrefs
+        self.scripting = scripting
         self.reset()
 
     def reset(self):
@@ -454,6 +460,11 @@ def parse_starttag(self, i):
                 self.set_cdata_mode(tag)
             elif tag in self.RCDATA_CONTENT_ELEMENTS:
                 self.set_cdata_mode(tag, escapable=True)
+            elif self.scripting and tag == "noscript":
+                self.set_cdata_mode(tag)
+            elif tag == "plaintext":
+                self.set_cdata_mode(tag)
+                self.interesting = re.compile(r'\z')
         return endpos
 
     # Internal -- check to see if we have a complete starttag; return end
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index fff41dab321acd..64cc6d8f1893f2 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -324,49 +324,138 @@ def test_style_content(self, content):
                             ("data", content),
                             ("endtag", "style")])
 
-    @support.subTests('content', [
-            '<!-- not a comment -->',
-            "<not a='start tag'>",
-            '<![CDATA[not a cdata]]>',
-            '<!not a bogus comment>',
-            '</not a bogus comment>',
-            '\u2603',
-            '< /title>',
-            '</ title>',
-            '</titled>',
-            '</title\v>',
-            '</title\xa0>',
-            '</tıtle>',
+    @support.subTests('tag', ['title', 'textarea'])
+    def test_rcdata_content(self, tag):
+        content = (
+            '<!-- not a comment -->'
+            "<not a='start tag'>"
+            '<![CDATA[not a cdata]]>'
+            '<!not a bogus comment>'
+            '</not a bogus comment>'
+            '\u2603'
+            f'< /{tag}>'
+            f'</ {tag}>'
+            f'</{tag}x>'
+            f'</{tag}\v>'
+            f'</{tag}\xa0>'
+        )
+        source = f"<{tag}>{content}</{tag}>"
+        self._run_check(source, [
+            ("starttag", tag, []),
+            ("data", content),
+            ("endtag", tag),
         ])
-    def test_title_content(self, content):
-        source = f"<title>{content}</title>"
+        source = f"<{tag}>&amp;</{tag}>"
         self._run_check(source, [
-            ("starttag", "title", []),
+            ("starttag", tag, []),
+            ('entityref', 'amp'),
+            ("endtag", tag),
+        ])
+
+    @support.subTests('tag',
+            ['style', 'xmp', 'iframe', 'noembed', 'noframes', 'script'])
+    def test_rawtext_content(self, tag):
+        content = (
+            '<!-- not a comment -->'
+            '&not-an-entity-ref;'
+            "<not a='start tag'>"
+            '<![CDATA[not a cdata]]>'
+            '<!not a bogus comment>'
+            '</not a bogus comment>'
+            '\u2603'
+            f'< /{tag}>'
+            f'</ {tag}>'
+            f'</{tag}x>'
+            f'</{tag}\v>'
+            f'</{tag}\xa0>'
+        )
+        source = f"<{tag}>{content}</{tag}>"
+        self._run_check(source, [
+            ("starttag", tag, []),
             ("data", content),
-            ("endtag", "title"),
+            ("endtag", tag),
         ])
 
-    @support.subTests('content', [
-            '<!-- not a comment -->',
-            "<not a='start tag'>",
-            '<![CDATA[not a cdata]]>',
-            '<!not a bogus comment>',
-            '</not a bogus comment>',
-            '\u2603',
-            '< /textarea>',
-            '</ textarea>',
-            '</textareable>',
-            '</textarea\v>',
-            '</textarea\xa0>',
+    def test_noscript_content(self):
+        content = (
+            '<!-- not a comment -->'
+            '&not-an-entity-ref;'
+            "<not a='start tag'>"
+            '<![CDATA[not a cdata]]>'
+            '<!not a bogus comment>'
+            '</not a bogus comment>'
+            '\u2603'
+            f'< /noscript>'
+            f'</ noscript>'
+            f'</noscriptx>'
+            f'</noscript\v>'
+            f'</noscript\xa0>'
+        )
+        source = f"<noscript>{content}</noscript>"
+        self._run_check(source, [
+            ('starttag', 'noscript', []),
+            ('comment', ' not a comment '),
+            ('entityref', 'not'),
+            ('data', '-an-entity-ref;'),
+            ('starttag', 'not', [('a', 'start tag')]),
+            ('unknown decl', 'CDATA[not a cdata'),
+            ('comment', 'not a bogus comment'),
+            ('endtag', 'not'),
+            ('data', '☃< /noscript>'),
+            ('comment', ' noscript'),
+            ('endtag', 'noscriptx'),
+            ('endtag', 'noscript\x0b'),
+            ('endtag', 'noscript\xa0'),
+            ('endtag', 'noscript')
         ])
-    def test_textarea_content(self, content):
-        source = f"<textarea>{content}</textarea>"
         self._run_check(source, [
-            ("starttag", "textarea", []),
+            ("starttag", "noscript", []),
+            ("data", content),
+            ("endtag", "noscript"),
+        ], collector=EventCollector(convert_charrefs=False, scripting=True))
+
+    def test_plaintext_content(self):
+        content = (
+            '<!-- not a comment -->'
+            '&not-an-entity-ref;'
+            "<not a='start tag'>"
+            '<![CDATA[not a cdata]]>'
+            '<!not a bogus comment>'
+            '</not a bogus comment>'
+            '\u2603'
+            '</plaintext>'
+        )
+        source = f"<plaintext>{content}"
+        self._run_check(source, [
+            ("starttag", "plaintext", []),
             ("data", content),
-            ("endtag", "textarea"),
         ])
 
+    @support.subTests('tag,endtag', [
+            ('title', 'tıtle'),
+            ('style', 'ſtyle'),
+            ('style', 'ﬅyle'),
+            ('style', 'ﬆyle'),
+            ('iframe', 'ıframe'),
+            ('noframes', 'noframeſ'),
+            ('noscript', 'noſcript'),
+            ('noscript', 'noscrıpt'),
+            ('script', 'ſcript'),
+            ('script', 'scrıpt'),
+        ])
+    def test_invalid_nonascii_closing_tag(self, tag, endtag):
+        source = f"<{tag}><a></{endtag}>"
+        self._run_check(source, [
+            ("starttag", tag, []),
+            ("data", f"<a></{endtag}>"),
+        ], collector=EventCollector(convert_charrefs=False, scripting=True))
+        source = f"<{tag}><a></{endtag}></{tag}>"
+        self._run_check(source, [
+            ("starttag", tag, []),
+            ("data", f"<a></{endtag}>"),
+            ("endtag", tag),
+        ], collector=EventCollector(convert_charrefs=False, scripting=True))
+
     @support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n',
                                  'script/', 'script foo=bar', 'script foo=">"'])
     def test_script_closing_tag(self, endtag):
@@ -428,6 +517,20 @@ def test_textarea_closing_tag(self, endtag):
                             ("endtag", "textarea")],
                         collector=EventCollectorNoNormalize(convert_charrefs=False))
 
+    @support.subTests('starttag', ['TitLe', 'TexTarEa', 'StyLE', 'XmP',
+                                   'iFraMe', 'noEmBed', 'noFraMes', 'noScrIPt',
+                                   'ScrIPt'])
+    def test_closing_tag(self, starttag):
+        tag = starttag.lower()
+        for endtag in [tag, tag.upper(), f'{tag} ', f'{tag}\n',
+                       f'{tag}/', f'{tag} foo=bar', f'{tag} foo=">"']:
+            content = "<!-- not a comment --><i>Spam</i>"
+            s = f'<{starttag}>{content}</{endtag}>'
+            self._run_check(s, [("starttag", tag, []),
+                                ('data', content),
+                                ("endtag", tag)],
+                            collector=EventCollectorNoNormalize(convert_charrefs=False, scripting=True))
+
     @support.subTests('tail,end', [
         ('', False),
         ('<', False),
diff --git a/Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst b/Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst
new file mode 100644
index 00000000000000..c30c9439a76a19
--- /dev/null
+++ b/Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst
@@ -0,0 +1,3 @@
+Add support for the "plaintext" element, the RAWTEXT elements "xmp",
+"iframe", "noembed" and "noframes", and, optionally, treating "noscript" as
+a RAWTEXT element in :class:`html.parser.HTMLParser`.

From c8429be73477c1a669beb61f12a6756bee90fbf2 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Wed, 15 Oct 2025 11:06:34 +0300
Subject: [PATCH 2/8] Update Doc/library/html.parser.rst

---
 Doc/library/html.parser.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst
index 81b9239185aab1..fdc24d3a93b741 100644
--- a/Doc/library/html.parser.rst
+++ b/Doc/library/html.parser.rst
@@ -40,7 +40,7 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
    .. versionchanged:: 3.5
       The default value for argument *convert_charrefs* is now ``True``.
 
-   .. versionchanged:: 3.13.8
+   .. versionchanged:: 3.14.1
       Added the *scripting* parameter.
 
 

From 22191061489b51ad22c6087eb2faa93a80208bbd Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Fri, 24 Oct 2025 16:56:02 +0300
Subject: [PATCH 3/8] Apply suggestions from code review

Co-authored-by: Ezio Melotti <ezio.melotti@gmail.com>
---
 Doc/library/html.parser.rst | 6 +++---
 Lib/html/parser.py          | 6 ++++--
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst
index fdc24d3a93b741..c949978704afb3 100644
--- a/Doc/library/html.parser.rst
+++ b/Doc/library/html.parser.rst
@@ -20,11 +20,11 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
    Create a parser instance able to parse invalid markup.
 
    If *convert_charrefs* is ``True`` (the default), all character
-   references (except the ones in ``script``/``style`` elements) are
+   references (except the ones in RAWTEXT tags) are
    automatically converted to the corresponding Unicode characters.
 
-   If *scripting* is true, the ``noscript`` element is parsed in the
-   RAWTEXT mode.
+   If *scripting* is false (the default), the content of the ``noscript`` element
+   is parsed normally; if it's true, it's parsed in RAWTEXT mode.
 
    An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
    when start tags, end tags, text, comments, and other markup elements are
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index a879127e0a1750..7052a76874b7b6 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -129,6 +129,7 @@ class HTMLParser(_markupbase.ParserBase):
 
     # See the HTML5 specs section "13.4 Parsing HTML fragments".
     # https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
+    # CDATA_CONTENT_ELEMENTS are parsed in RAWTEXT mode
     CDATA_CONTENT_ELEMENTS = ("script", "style", "xmp", "iframe", "noembed", "noframes")
     RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
 
@@ -138,8 +139,9 @@ def __init__(self, *, convert_charrefs=True, scripting=False):
         If convert_charrefs is true (the default), all character references
         are automatically converted to the corresponding Unicode characters.
 
-        If scripting is true, the noscript element is parsed in the
-        RAWTEXT mode.
+        If *scripting* is false (the default), the content of the
+        ``noscript`` element is parsed normally; if it's true,
+        it's parsed in RAWTEXT mode.
         """
         super().__init__()
         self.convert_charrefs = convert_charrefs

From 9971a24deab47e7b1e6dfee9cd05be3bf1188ea8 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Sat, 25 Oct 2025 12:46:52 +0300
Subject: [PATCH 4/8] Polish the documentation.

---
 Doc/library/html.parser.rst | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst
index c949978704afb3..6ea2fe8644091c 100644
--- a/Doc/library/html.parser.rst
+++ b/Doc/library/html.parser.rst
@@ -19,12 +19,13 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
 
    Create a parser instance able to parse invalid markup.
 
-   If *convert_charrefs* is ``True`` (the default), all character
-   references (except the ones in RAWTEXT tags) are
+   If *convert_charrefs* is true (the default), all character
+   references (except the ones in elements like ``script`` and ``style``) are
    automatically converted to the corresponding Unicode characters.
 
-   If *scripting* is false (the default), the content of the ``noscript`` element
-   is parsed normally; if it's true, it's parsed in RAWTEXT mode.
+   If *scripting* is false (the default), the content of the ``noscript``
+   element is parsed normally; if it's true, it's parsed in RAWTEXT mode,
+   like ``script``.
 
    An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
    when start tags, end tags, text, comments, and other markup elements are
@@ -167,15 +168,15 @@ implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`):
 .. method:: HTMLParser.handle_data(data)
 
    This method is called to process arbitrary data (e.g. text nodes and the
-   content of ``<script>...</script>`` and ``<style>...</style>``).
+   content of elements like ``script`` and ``style``).
 
 
 .. method:: HTMLParser.handle_entityref(name)
 
    This method is called to process a named character reference of the form
    ``&name;`` (e.g. ``&gt;``), where *name* is a general entity reference
-   (e.g. ``'gt'``).  This method is never called if *convert_charrefs* is
-   ``True``.
+   (e.g. ``'gt'``).
+   This method is only called if *convert_charrefs* is false.
 
 
 .. method:: HTMLParser.handle_charref(name)
@@ -183,8 +184,8 @@ implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`):
    This method is called to process decimal and hexadecimal numeric character
    references of the form :samp:`&#{NNN};` and :samp:`&#x{NNN};`.  For example, the decimal
    equivalent for ``&gt;`` is ``&#62;``, whereas the hexadecimal is ``&#x3E;``;
-   in this case the method will receive ``'62'`` or ``'x3E'``.  This method
-   is never called if *convert_charrefs* is ``True``.
+   in this case the method will receive ``'62'`` or ``'x3E'``.
+   This method is only called if *convert_charrefs* is false.
 
 
 .. method:: HTMLParser.handle_comment(data)
@@ -298,8 +299,8 @@ Parsing an element with a few attributes and a title:
    Data     : Python
    End tag  : h1
 
-The content of ``script`` and ``style`` elements is returned as is, without
-further parsing:
+The content of elements like ``script`` and ``style`` is returned as is,
+without further parsing:
 
 .. doctest::
 
@@ -310,10 +311,10 @@ further parsing:
    End tag  : style
 
    >>> parser.feed('<script type="text/javascript">'
-   ...             'alert("<strong>hello!</strong>");</script>')
+   ...             'alert("<strong>hello! &#9786;</strong>");</script>')
    Start tag: script
         attr: ('type', 'text/javascript')
-   Data     : alert("<strong>hello!</strong>");
+   Data     : alert("<strong>hello! &#9786;</strong>");
    End tag  : script
 
 Parsing comments:
@@ -342,7 +343,7 @@ correct char (note: these 3 references are all equivalent to ``'>'``):
 
 Feeding incomplete chunks to :meth:`~HTMLParser.feed` works, but
 :meth:`~HTMLParser.handle_data` might be called more than once
-(unless *convert_charrefs* is set to ``True``):
+if *convert_charrefs* is false:
 
 .. doctest::
 

From 69a2b33547b2665238b62601c34029cac9d7f449 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Sat, 25 Oct 2025 14:17:32 +0300
Subject: [PATCH 5/8] Rewrite tests.

---
 Lib/test/test_htmlparser.py | 250 +++++++++++++-----------------------
 1 file changed, 87 insertions(+), 163 deletions(-)

diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 32467b9c64595a..19dde9362a43b6 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -8,6 +8,18 @@
 from test import support
 
 
+SAMPLE_RCDATA = (
+    '<!-- not a comment -->'
+    "<not a='start tag'>"
+    '<![CDATA[not a cdata]]>'
+    '<!not a bogus comment>'
+    '</not a bogus comment>'
+    '\u2603'
+)
+
+SAMPLE_RAWTEXT = SAMPLE_RCDATA + '&amp;&#9786;'
+
+
 class EventCollector(html.parser.HTMLParser):
 
     def __init__(self, *args, autocdata=False, **kw):
@@ -293,30 +305,20 @@ def test_get_starttag_text(self):
              'Date().getTime()+\'"><\\/s\'+\'cript>\');\n//]]>'),
             '\n<!-- //\nvar foo = 3.14;\n// -->\n',
             '<!-- \u2603 -->',
-            'foo = "</ script>"',
-            'foo = "</scripture>"',
-            'foo = "</script\v>"',
-            'foo = "</script\xa0>"',
-            'foo = "</ſcript>"',
-            'foo = "</scrıpt>"',
         ])
     def test_script_content(self, content):
         s = f'<script>{content}</script>'
-        self._run_check(s, [("starttag", "script", []),
-                            ("data", content),
-                            ("endtag", "script")])
+        self._run_check(s, [
+            ("starttag", "script", []),
+            ("data", content),
+            ("endtag", "script"),
+        ])
 
     @support.subTests('content', [
             'a::before { content: "<!-- not a comment -->"; }',
             'a::before { content: "&not-an-entity-ref;"; }',
             'a::before { content: "<not a=\'start tag\'>"; }',
             'a::before { content: "\u2603"; }',
-            'a::before { content: "< /style>"; }',
-            'a::before { content: "</ style>"; }',
-            'a::before { content: "</styled>"; }',
-            'a::before { content: "</style\v>"; }',
-            'a::before { content: "</style\xa0>"; }',
-            'a::before { content: "</ſtyle>"; }',
         ])
     def test_style_content(self, content):
         s = f'<style>{content}</style>'
@@ -326,23 +328,10 @@ def test_style_content(self, content):
 
     @support.subTests('tag', ['title', 'textarea'])
     def test_rcdata_content(self, tag):
-        content = (
-            '<!-- not a comment -->'
-            "<not a='start tag'>"
-            '<![CDATA[not a cdata]]>'
-            '<!not a bogus comment>'
-            '</not a bogus comment>'
-            '\u2603'
-            f'< /{tag}>'
-            f'</ {tag}>'
-            f'</{tag}x>'
-            f'</{tag}\v>'
-            f'</{tag}\xa0>'
-        )
-        source = f"<{tag}>{content}</{tag}>"
+        source = f"<{tag}>{SAMPLE_RCDATA}</{tag}>"
         self._run_check(source, [
             ("starttag", tag, []),
-            ("data", content),
+            ("data", SAMPLE_RCDATA),
             ("endtag", tag),
         ])
         source = f"<{tag}>&amp;</{tag}>"
@@ -355,107 +344,43 @@ def test_rcdata_content(self, tag):
     @support.subTests('tag',
             ['style', 'xmp', 'iframe', 'noembed', 'noframes', 'script'])
     def test_rawtext_content(self, tag):
-        content = (
-            '<!-- not a comment -->'
-            '&not-an-entity-ref;'
-            "<not a='start tag'>"
-            '<![CDATA[not a cdata]]>'
-            '<!not a bogus comment>'
-            '</not a bogus comment>'
-            '\u2603'
-            f'< /{tag}>'
-            f'</ {tag}>'
-            f'</{tag}x>'
-            f'</{tag}\v>'
-            f'</{tag}\xa0>'
-        )
-        source = f"<{tag}>{content}</{tag}>"
+        source = f"<{tag}>{SAMPLE_RAWTEXT}</{tag}>"
         self._run_check(source, [
             ("starttag", tag, []),
-            ("data", content),
+            ("data", SAMPLE_RAWTEXT),
             ("endtag", tag),
         ])
 
     def test_noscript_content(self):
-        content = (
-            '<!-- not a comment -->'
-            '&not-an-entity-ref;'
-            "<not a='start tag'>"
-            '<![CDATA[not a cdata]]>'
-            '<!not a bogus comment>'
-            '</not a bogus comment>'
-            '\u2603'
-            f'< /noscript>'
-            f'</ noscript>'
-            f'</noscriptx>'
-            f'</noscript\v>'
-            f'</noscript\xa0>'
-        )
-        source = f"<noscript>{content}</noscript>"
+        source = f"<noscript>{SAMPLE_RAWTEXT}</noscript>"
+        # scripting=False -- normal mode
         self._run_check(source, [
             ('starttag', 'noscript', []),
             ('comment', ' not a comment '),
-            ('entityref', 'not'),
-            ('data', '-an-entity-ref;'),
             ('starttag', 'not', [('a', 'start tag')]),
             ('unknown decl', 'CDATA[not a cdata'),
             ('comment', 'not a bogus comment'),
             ('endtag', 'not'),
-            ('data', '☃< /noscript>'),
-            ('comment', ' noscript'),
-            ('endtag', 'noscriptx'),
-            ('endtag', 'noscript\x0b'),
-            ('endtag', 'noscript\xa0'),
-            ('endtag', 'noscript')
+            ('data', '☃'),
+            ('entityref', 'amp'),
+            ('charref', '9786'),
+            ('endtag', 'noscript'),
         ])
+        # scripting=True -- RAWTEXT mode
         self._run_check(source, [
             ("starttag", "noscript", []),
-            ("data", content),
+            ("data", SAMPLE_RAWTEXT),
             ("endtag", "noscript"),
-        ], collector=EventCollector(convert_charrefs=False, scripting=True))
+        ], collector=EventCollector(scripting=True))
 
     def test_plaintext_content(self):
-        content = (
-            '<!-- not a comment -->'
-            '&not-an-entity-ref;'
-            "<not a='start tag'>"
-            '<![CDATA[not a cdata]]>'
-            '<!not a bogus comment>'
-            '</not a bogus comment>'
-            '\u2603'
-            '</plaintext>'
-        )
+        content = SAMPLE_RAWTEXT + '</plaintext>'  # not closing
         source = f"<plaintext>{content}"
         self._run_check(source, [
             ("starttag", "plaintext", []),
             ("data", content),
         ])
 
-    @support.subTests('tag,endtag', [
-            ('title', 'tıtle'),
-            ('style', 'ſtyle'),
-            ('style', 'ﬅyle'),
-            ('style', 'ﬆyle'),
-            ('iframe', 'ıframe'),
-            ('noframes', 'noframeſ'),
-            ('noscript', 'noſcript'),
-            ('noscript', 'noscrıpt'),
-            ('script', 'ſcript'),
-            ('script', 'scrıpt'),
-        ])
-    def test_invalid_nonascii_closing_tag(self, tag, endtag):
-        source = f"<{tag}><a></{endtag}>"
-        self._run_check(source, [
-            ("starttag", tag, []),
-            ("data", f"<a></{endtag}>"),
-        ], collector=EventCollector(convert_charrefs=False, scripting=True))
-        source = f"<{tag}><a></{endtag}></{tag}>"
-        self._run_check(source, [
-            ("starttag", tag, []),
-            ("data", f"<a></{endtag}>"),
-            ("endtag", tag),
-        ], collector=EventCollector(convert_charrefs=False, scripting=True))
-
     @support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n',
                                  'script/', 'script foo=bar', 'script foo=">"'])
     def test_script_closing_tag(self, endtag):
@@ -470,66 +395,65 @@ def test_script_closing_tag(self, endtag):
                             ("endtag", "script")],
                         collector=EventCollectorNoNormalize(convert_charrefs=False))
 
-    @support.subTests('endtag', ['style', 'STYLE', 'style ', 'style\n',
-                                 'style/', 'style foo=bar', 'style foo=">"'])
-    def test_style_closing_tag(self, endtag):
-        content = """
-            b::before { content: "<!-- not a comment -->"; }
-            p::before { content: "&not-an-entity-ref;"; }
-            a::before { content: "<i>"; }
-            a::after { content: "</i>"; }
-            """
-        s = f'<StyLE>{content}</{endtag}>'
-        self._run_check(s, [("starttag", "style", []),
-                            ("data", content),
-                            ("endtag", "style")],
-                        collector=EventCollectorNoNormalize(convert_charrefs=False))
-
-    @support.subTests('endtag', ['title', 'TITLE', 'title ', 'title\n',
-                                 'title/', 'title foo=bar', 'title foo=">"'])
-    def test_title_closing_tag(self, endtag):
-        content = "<!-- not a comment --><i>Egg &amp; Spam</i>"
-        s = f'<TitLe>{content}</{endtag}>'
-        self._run_check(s, [("starttag", "title", []),
-                            ('data', '<!-- not a comment --><i>Egg & Spam</i>'),
-                            ("endtag", "title")],
-                        collector=EventCollectorNoNormalize(convert_charrefs=True))
-        self._run_check(s, [("starttag", "title", []),
-                            ('data', '<!-- not a comment --><i>Egg '),
-                            ('entityref', 'amp'),
-                            ('data', ' Spam</i>'),
-                            ("endtag", "title")],
-                        collector=EventCollectorNoNormalize(convert_charrefs=False))
-
-    @support.subTests('endtag', ['textarea', 'TEXTAREA', 'textarea ', 'textarea\n',
-                                 'textarea/', 'textarea foo=bar', 'textarea foo=">"'])
-    def test_textarea_closing_tag(self, endtag):
-        content = "<!-- not a comment --><i>Egg &amp; Spam</i>"
-        s = f'<TexTarEa>{content}</{endtag}>'
-        self._run_check(s, [("starttag", "textarea", []),
-                            ('data', '<!-- not a comment --><i>Egg & Spam</i>'),
-                            ("endtag", "textarea")],
-                        collector=EventCollectorNoNormalize(convert_charrefs=True))
-        self._run_check(s, [("starttag", "textarea", []),
-                            ('data', '<!-- not a comment --><i>Egg '),
-                            ('entityref', 'amp'),
-                            ('data', ' Spam</i>'),
-                            ("endtag", "textarea")],
-                        collector=EventCollectorNoNormalize(convert_charrefs=False))
-
-    @support.subTests('starttag', ['TitLe', 'TexTarEa', 'StyLE', 'XmP',
-                                   'iFraMe', 'noEmBed', 'noFraMes', 'noScrIPt',
-                                   'ScrIPt'])
-    def test_closing_tag(self, starttag):
-        tag = starttag.lower()
+    @support.subTests('tag', [
+        'script', 'style', 'xmp', 'iframe', 'noembed', 'noframes',
+        'textarea', 'title', 'noscript',
+    ])
+    def test_closing_tag(self, tag):
         for endtag in [tag, tag.upper(), f'{tag} ', f'{tag}\n',
                        f'{tag}/', f'{tag} foo=bar', f'{tag} foo=">"']:
             content = "<!-- not a comment --><i>Spam</i>"
-            s = f'<{starttag}>{content}</{endtag}>'
-            self._run_check(s, [("starttag", tag, []),
-                                ('data', content),
-                                ("endtag", tag)],
-                            collector=EventCollectorNoNormalize(convert_charrefs=False, scripting=True))
+            s = f'<{tag.upper()}>{content}</{endtag}>'
+            self._run_check(s, [
+                ("starttag", tag, []),
+                ('data', content),
+                ("endtag", tag),
+            ], collector=EventCollectorNoNormalize(convert_charrefs=False, scripting=True))
+
+    @support.subTests('tag', [
+        'script', 'style', 'xmp', 'iframe', 'noembed', 'noframes',
+        'textarea', 'title', 'noscript',
+    ])
+    def test_invalid_closing_tag(self, tag):
+        content = (
+            f'< /{tag}>'
+            f'</ {tag}>'
+            f'</{tag}x>'
+            f'</{tag}\v>'
+            f'</{tag}\xa0>'
+        )
+        source = f"<{tag}>{content}</{tag}>"
+        self._run_check(source, [
+            ("starttag", tag, []),
+            ("data", content),
+            ("endtag", tag),
+        ], collector=EventCollector(convert_charrefs=False, scripting=True))
+
+    @support.subTests('tag,endtag', [
+        ('title', 'tıtle'),
+        ('style', 'ſtyle'),
+        ('style', 'ﬅyle'),
+        ('style', 'ﬆyle'),
+        ('iframe', 'ıframe'),
+        ('noframes', 'noframeſ'),
+        ('noscript', 'noſcript'),
+        ('noscript', 'noscrıpt'),
+        ('script', 'ſcript'),
+        ('script', 'scrıpt'),
+    ])
+    def test_invalid_nonascii_closing_tag(self, tag, endtag):
+        content = f"<br></{endtag}>"
+        source = f"<{tag}>{content}"
+        self._run_check(source, [
+            ("starttag", tag, []),
+            ("data", content),
+        ], collector=EventCollector(convert_charrefs=False, scripting=True))
+        source = f"<{tag}>{content}</{tag}>"
+        self._run_check(source, [
+            ("starttag", tag, []),
+            ("data", content),
+            ("endtag", tag),
+        ], collector=EventCollector(convert_charrefs=False, scripting=True))
 
     @support.subTests('tail,end', [
         ('', False),

From 08f483593e114b0f6ce22691a491b8f036e21e9b Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Sat, 25 Oct 2025 14:22:38 +0300
Subject: [PATCH 6/8] Use set_cdata_mode(escapable=None) for PLAINTEXT.

---
 Lib/html/parser.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 7052a76874b7b6..82fe61270bfa86 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -180,7 +180,9 @@ def get_starttag_text(self):
     def set_cdata_mode(self, elem, *, escapable=False):
         self.cdata_elem = elem.lower()
         self._escapable = escapable
-        if escapable and not self.convert_charrefs:
+        if escapable is None:  # PLAINTEXT mode
+            self.interesting = re.compile(r'\z')
+        elif escapable and not self.convert_charrefs:
             self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem,
                                           re.IGNORECASE|re.ASCII)
         else:
@@ -459,8 +461,7 @@ def parse_starttag(self, i):
             elif self.scripting and tag == "noscript":
                 self.set_cdata_mode(tag)
             elif tag == "plaintext":
-                self.set_cdata_mode(tag)
-                self.interesting = re.compile(r'\z')
+                self.set_cdata_mode(tag, escapable=None)
         return endpos
 
     # Internal -- check to see if we have a complete starttag; return end

From 428abe1f0a62bef2b6159cbdde640e7dabbfa4be Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Fri, 31 Oct 2025 16:19:24 +0200
Subject: [PATCH 7/8] Apply review suggestions: describe noscript handling without the term RAWTEXT

Co-authored-by: Ezio Melotti <ezio.melotti@gmail.com>
---
 Doc/library/html.parser.rst | 4 ++--
 Lib/html/parser.py          | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst
index 6ea2fe8644091c..341a8337ba2ceb 100644
--- a/Doc/library/html.parser.rst
+++ b/Doc/library/html.parser.rst
@@ -24,8 +24,8 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
    automatically converted to the corresponding Unicode characters.
 
    If *scripting* is false (the default), the content of the ``noscript``
-   element is parsed normally; if it's true, it's parsed in RAWTEXT mode,
-   like ``script``.
+   element is parsed normally; if it's true, it's returned as is without
+   being parsed.
 
    An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
    when start tags, end tags, text, comments, and other markup elements are
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 82fe61270bfa86..15badbd5eb84e6 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -141,7 +141,7 @@ def __init__(self, *, convert_charrefs=True, scripting=False):
 
         If *scripting* is false (the default), the content of the
         ``noscript`` element is parsed normally; if it's true,
-        it's parsed in RAWTEXT mode.
+        it's returned as is without being parsed.
         """
         super().__init__()
         self.convert_charrefs = convert_charrefs

From 350ce25e1be99da6640a20b407d56e0073e2220e Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Fri, 31 Oct 2025 17:06:39 +0200
Subject: [PATCH 8/8] Apply suggestions: detect PLAINTEXT by element name, merge RAWTEXT branches.

---
 Lib/html/parser.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 82fe61270bfa86..1274b25eaca579 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -180,7 +180,7 @@ def get_starttag_text(self):
     def set_cdata_mode(self, elem, *, escapable=False):
         self.cdata_elem = elem.lower()
         self._escapable = escapable
-        if escapable is None:  # PLAINTEXT mode
+        if self.cdata_elem == 'plaintext':
             self.interesting = re.compile(r'\z')
         elif escapable and not self.convert_charrefs:
             self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem,
@@ -454,14 +454,12 @@ def parse_starttag(self, i):
             self.handle_startendtag(tag, attrs)
         else:
             self.handle_starttag(tag, attrs)
-            if tag in self.CDATA_CONTENT_ELEMENTS:
-                self.set_cdata_mode(tag)
+            if (tag in self.CDATA_CONTENT_ELEMENTS or
+                (self.scripting and tag == "noscript") or
+                tag == "plaintext"):
+                self.set_cdata_mode(tag, escapable=False)
             elif tag in self.RCDATA_CONTENT_ELEMENTS:
                 self.set_cdata_mode(tag, escapable=True)
-            elif self.scripting and tag == "noscript":
-                self.set_cdata_mode(tag)
-            elif tag == "plaintext":
-                self.set_cdata_mode(tag, escapable=None)
         return endpos
 
     # Internal -- check to see if we have a complete starttag; return end