From cde9150c5831a720930688bb0addf98139b3ec02 Mon Sep 17 00:00:00 2001 From: facelessuser Date: Mon, 2 Feb 2026 19:46:59 -0700 Subject: [PATCH 1/5] More reliable fix for `') -commentabruptclose = re.compile(r'-?>') # Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it. # Users can still do `from html import parser` and get the default behavior. @@ -48,6 +47,8 @@ # throwing it away. When we see it, we will process it as data. htmlparser.starttagopen = re.compile('<[a-zA-Z]|') +htmlparser.endtagopen = re.compile('` to close Processing Instructions. htmlparser.piclose = re.compile(r'\?>') # Monkeypatch `HTMLParser` to only recognize entity references with a closing semicolon. @@ -110,9 +111,6 @@ def __init__(self, md: Markdown, *args, **kwargs): self.lineno_start_cache = [0] - self.override_comment_update = False - self.override_comment_start = 0 - # This calls self.reset super().__init__(*args, **kwargs) self.md = md @@ -125,8 +123,6 @@ def reset(self): self._cache: list[str] = [] self.cleandoc: list[str] = [] self.lineno_start_cache = [0] - self.override_comment_start = 0 - self.override_comment_update = False super().reset() @@ -207,6 +203,14 @@ def handle_starttag(self, tag: str, attrs: Sequence[tuple[str, str]]): # This is presumably a standalone tag in a code span (see #1036). self.clear_cdata_mode() + def parse_endtag(self, i): + start = self.rawdata[i:i+3] + c = ord(start[-1]) + if len(start) < 3 or not (65 <= c <= 90 or 97 <= c <= 122): + self.handle_data(self.rawdata[i:i + 2]) + return i + 2 + return super().parse_endtag(i) + def handle_endtag(self, tag: str): text = self.get_endtag_text(tag) @@ -276,22 +280,8 @@ def handle_entityref(self, name: str): def handle_comment(self, data: str): # Check if the comment is unclosed, if so, we need to override position - j = self.rawdata.find(data) - i = j - 2 - if self.rawdata[i:j] == ''.format(data), is_block=True) - def updatepos(self, i: int, j: int) -> int: - if self.override_comment_update: - self.override_comment_update = False - i = self.override_comment_start - j = self.override_comment_start + 1 - return super().updatepos(i, j) - def handle_decl(self, data: str): self.handle_empty_tag(''.format(data), is_block=True) diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index 87d4dc0b..80688a0f 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -1732,3 +1732,58 @@ def test_issue_1590(self): ''' ) ) + + def test_stress_comment_handling(self): + """Stress test the comment handling.""" + + self.assertMarkdownRenders( + self.dedent( + ''' + `` and + + ` +

<!--[if mso]> </ <!--[if mso]> and </> <!-- and <!--

+

</> <!--[if mso]> </ <!-- and <!-- and <!--[if mso]>

+ ''' + ) + ) + + def test_unclosed_endtag(self): + """Ensure unclosed end tag does not have side effects.""" + + self.assertMarkdownRenders( + self.dedent( + ''' + ` + +

foo

+ + + ''' + ), + self.dedent( + ''' +

</

+
+ +

foo

+ +
+ ''' + ) + ) From cc9865c0fc2f7221dc4f6631da28934a472f5daa Mon Sep 17 00:00:00 2001 From: facelessuser Date: Mon, 2 Feb 2026 19:52:33 -0700 Subject: [PATCH 2/5] Fix lint --- tests/test_syntax/blocks/test_html_blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index 80688a0f..ecc10f45 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -1757,7 +1757,7 @@ def test_stress_comment_handling(self):

<!--[if mso]> </ <!--[if mso]> and </> <!-- and <!--

</> <!--[if mso]> </ <!-- and <!-- and <!--[if mso]>

- ''' + ''' # noqa: E501 ) ) From ca73a6ddfea4b63aab2ddc5764e7d99f2e8420b2 Mon Sep 17 00:00:00 2001 From: facelessuser Date: Mon, 2 Feb 2026 20:14:19 -0700 Subject: [PATCH 3/5] Add a hack so we don't break MkDocs --- markdown/htmlparser.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index 9d39285c..ad5264ae 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -93,6 +93,30 @@ blank_line_re = re.compile(r'^([ ]*\n){2}') +class _HTMLParser(htmlparser.HTMLParser): + """Handle special start and end tags.""" + + def parse_endtag(self, i): + start = self.rawdata[i:i+3] + c = ord(start[-1]) + if len(start) < 3 or not (65 <= c <= 90 or 97 <= c <= 122): + self.handle_data(self.rawdata[i:i + 2]) + return i + 2 + return super().parse_endtag(i) + + def parse_starttag(self, i: int) -> int: # pragma: no cover + # Treat `` as normal data as it is not a real tag. + if self.rawdata[i:i + 3] == '': + self.handle_data(self.rawdata[i:i + 3]) + return i + 3 + + return super().parse_starttag(i) + + +# Overwrite our custom one for people like MkDocs that pull it in +htmlparser.HTMLParser = _HTMLParser + + class HTMLExtractor(htmlparser.HTMLParser): """ Extract raw HTML from text. From 7e4a8f72a60d58580dc4995fbb63c6724156f302 Mon Sep 17 00:00:00 2001 From: facelessuser Date: Tue, 3 Feb 2026 06:42:59 -0700 Subject: [PATCH 4/5] Simplify regex pattern --- markdown/htmlparser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index ad5264ae..eb548bd4 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -47,7 +47,7 @@ # throwing it away. When we see it, we will process it as data. htmlparser.starttagopen = re.compile('<[a-zA-Z]|') -htmlparser.endtagopen = re.compile('` to close Processing Instructions. htmlparser.piclose = re.compile(r'\?>') From d74efde36b59208e25a1c76014a42d5000be39e9 Mon Sep 17 00:00:00 2001 From: facelessuser Date: Tue, 3 Feb 2026 06:54:44 -0700 Subject: [PATCH 5/5] Remove copy of function that we already inherit --- markdown/htmlparser.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index eb548bd4..488da033 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -227,14 +227,6 @@ def handle_starttag(self, tag: str, attrs: Sequence[tuple[str, str]]): # This is presumably a standalone tag in a code span (see #1036). self.clear_cdata_mode() - def parse_endtag(self, i): - start = self.rawdata[i:i+3] - c = ord(start[-1]) - if len(start) < 3 or not (65 <= c <= 90 or 97 <= c <= 122): - self.handle_data(self.rawdata[i:i + 2]) - return i + 2 - return super().parse_endtag(i) - def handle_endtag(self, tag: str): text = self.get_endtag_text(tag)