diff --git a/docs/changelog.md b/docs/changelog.md index a11b6c9f..0b43c877 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -15,6 +15,7 @@ See the [Contributing Guide](contributing.md) for details. ### Fixed * Fix a regression related to comment handling (#1590). +* More reliable fix for `') -commentabruptclose = re.compile(r'-?>') # Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it. # Users can still do `from html import parser` and get the default behavior. @@ -48,6 +47,8 @@ # throwing it away. When we see it, we will process it as data. htmlparser.starttagopen = re.compile('<[a-zA-Z]|') +htmlparser.endtagopen = re.compile('` to close Processing Instructions. htmlparser.piclose = re.compile(r'\?>') # Monkeypatch `HTMLParser` to only recognize entity references with a closing semicolon. @@ -92,6 +93,30 @@ blank_line_re = re.compile(r'^([ ]*\n){2}') +class _HTMLParser(htmlparser.HTMLParser): + """Handle special start and end tags.""" + + def parse_endtag(self, i): + start = self.rawdata[i:i+3] + c = ord(start[-1]) + if len(start) < 3 or not (65 <= c <= 90 or 97 <= c <= 122): + self.handle_data(self.rawdata[i:i + 2]) + return i + 2 + return super().parse_endtag(i) + + def parse_starttag(self, i: int) -> int: # pragma: no cover + # Treat `` as normal data as it is not a real tag. + if self.rawdata[i:i + 3] == '': + self.handle_data(self.rawdata[i:i + 3]) + return i + 3 + + return super().parse_starttag(i) + + +# Overwrite our custom one for people like MkDocs that pull it in +htmlparser.HTMLParser = _HTMLParser + + class HTMLExtractor(htmlparser.HTMLParser): """ Extract raw HTML from text. @@ -110,9 +135,6 @@ def __init__(self, md: Markdown, *args, **kwargs): self.lineno_start_cache = [0] - self.override_comment_update = False - self.override_comment_start = 0 - # This calls self.reset super().__init__(*args, **kwargs) self.md = md @@ -125,8 +147,6 @@ def reset(self): self._cache: list[str] = [] self.cleandoc: list[str] = [] self.lineno_start_cache = [0] - self.override_comment_start = 0 - self.override_comment_update = False super().reset() @@ -276,22 +296,8 @@ def handle_entityref(self, name: str): def handle_comment(self, data: str): # Check if the comment is unclosed, if so, we need to override position - j = self.rawdata.find(data) - i = j - 2 - if self.rawdata[i:j] == ''.format(data), is_block=True) - def updatepos(self, i: int, j: int) -> int: - if self.override_comment_update: - self.override_comment_update = False - i = self.override_comment_start - j = self.override_comment_start + 1 - return super().updatepos(i, j) - def handle_decl(self, data: str): self.handle_empty_tag(''.format(data), is_block=True) diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index 87d4dc0b..ecc10f45 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -1732,3 +1732,58 @@ def test_issue_1590(self): ''' ) ) + + def test_stress_comment_handling(self): + """Stress test the comment handling.""" + + self.assertMarkdownRenders( + self.dedent( + ''' + `` and + + ` +

<!--[if mso]> </ <!--[if mso]> and </> <!-- and <!--

+

</> <!--[if mso]> </ <!-- and <!-- and <!--[if mso]>

+ ''' # noqa: E501 + ) + ) + + def test_unclosed_endtag(self): + """Ensure unclosed end tag does not have side effects.""" + + self.assertMarkdownRenders( + self.dedent( + ''' + ` + +

foo

+ + + ''' + ), + self.dedent( + ''' +

</

+
+ +

foo

+ +
+ ''' + ) + )