Skip to content

Commit a36070a

Browse files
committed
update to latest main
1 parent d8cc255 commit a36070a

File tree

2 files changed

+7
-64
lines changed

2 files changed

+7
-64
lines changed

Lib/html/parser.py

Lines changed: 3 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828

2929
starttagopen = re.compile('<[a-zA-Z]')
3030
piclose = re.compile('>')
31-
escapable_raw_text_close = re.compile('</(title|textarea)>', re.I)
3231
commentclose = re.compile(r'--\s*>')
3332
# Note:
3433
# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
@@ -101,7 +100,6 @@ class HTMLParser(_markupbase.ParserBase):
101100
"""
102101

103102
CDATA_CONTENT_ELEMENTS = ("script", "style")
104-
ESCAPABLE_RAW_TEXT_ELEMENTS = ("title", "textarea")
105103

106104
def __init__(self, *, convert_charrefs=True):
107105
"""Initialize and reset this instance.
@@ -119,7 +117,6 @@ def reset(self):
119117
self.lasttag = '???'
120118
self.interesting = interesting_normal
121119
self.cdata_elem = None
122-
self.escapable_raw_text_elem = None
123120
super().reset()
124121

125122
def feed(self, data):
@@ -141,14 +138,6 @@ def get_starttag_text(self):
141138
"""Return full source of start tag: '<...>'."""
142139
return self.__starttag_text
143140

144-
def set_escapable_raw_text_mode(self, elem):
145-
self.escapable_raw_text_elem = elem.lower()
146-
self.interesting = re.compile(r'</\s*%s\s*>' % self.escapable_raw_text_elem, re.I)
147-
148-
def clear_escapable_raw_text_mode(self):
149-
self.interesting = interesting_normal
150-
self.escapable_raw_text_elem = None
151-
152141
def set_cdata_mode(self, elem):
153142
self.cdata_elem = elem.lower()
154143
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
@@ -165,7 +154,7 @@ def goahead(self, end):
165154
i = 0
166155
n = len(rawdata)
167156
while i < n:
168-
if self.convert_charrefs and not self.cdata_elem and not self.escapable_raw_text_elem:
157+
if self.convert_charrefs and not self.cdata_elem:
169158
j = rawdata.find('<', i)
170159
if j < 0:
171160
# if we can't find the next <, either we are at the end
@@ -184,13 +173,11 @@ def goahead(self, end):
184173
if match:
185174
j = match.start()
186175
else:
187-
if self.escapable_raw_text_elem:
188-
break
189176
if self.cdata_elem:
190177
break
191178
j = n
192179
if i < j:
193-
if self.convert_charrefs and not self.cdata_elem and not self.escapable_raw_text_elem:
180+
if self.convert_charrefs and not self.cdata_elem:
194181
self.handle_data(unescape(rawdata[i:j]))
195182
else:
196183
self.handle_data(rawdata[i:j])
@@ -367,8 +354,6 @@ def parse_starttag(self, i):
367354
self.handle_startendtag(tag, attrs)
368355
else:
369356
self.handle_starttag(tag, attrs)
370-
if tag in self.ESCAPABLE_RAW_TEXT_ELEMENTS:
371-
self.set_escapable_raw_text_mode(tag)
372357
if tag in self.CDATA_CONTENT_ELEMENTS:
373358
self.set_cdata_mode(tag)
374359
return endpos
@@ -444,14 +429,8 @@ def parse_endtag(self, i):
444429
self.handle_data(rawdata[i:gtpos])
445430
return gtpos
446431

447-
if self.escapable_raw_text_elem is not None: # title or textarea
448-
if elem != self.escapable_raw_text_elem:
449-
self.handle_data(rawdata[i:gtpos])
450-
return gtpos
451-
452432
self.handle_endtag(elem)
453433
self.clear_cdata_mode()
454-
self.clear_escapable_raw_text_mode()
455434
return gtpos
456435

457436
# Overridable -- finish processing of start+end tag: <tag.../>
@@ -492,4 +471,4 @@ def handle_pi(self, data):
492471
pass
493472

494473
def unknown_decl(self, data):
495-
pass
474+
pass

Lib/test/test_htmlparser.py

Lines changed: 4 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -285,9 +285,7 @@ def test_cdata_content(self):
285285
#'foo = </\nscript>',
286286
#'foo = </ script>',
287287
]
288-
tags = ['script', 'style', 'textarea', 'title']
289-
# test the following 'casing' for each tag: script, SCRIPT, Script etc.
290-
elements = [f(tag) for tag in tags for f in (str.lower, str.upper, str.capitalize)]
288+
elements = ['script', 'style', 'SCRIPT', 'STYLE', 'Script', 'Style']
291289
for content in contents:
292290
for element in elements:
293291
element_lower = element.lower()
@@ -319,34 +317,6 @@ def get_events(self):
319317
("endtag", element_lower)],
320318
collector=Collector(convert_charrefs=False))
321319

322-
def test_escapable_raw_text_content(self):
323-
contents = [
324-
'foo = "</TITLE" + ">";',
325-
'foo = <\n/title> ',
326-
'<!-- document.write("</scr" + "ipt>"); -->',
327-
'\n//<![CDATA[\n'
328-
'\n<!-- //\nvar foo = 3.14;\n// -->\n',
329-
# valid character reference
330-
'&#65;',
331-
# ambiguous ampersand example
332-
'&notaref',
333-
'foo = "</sty" + "le>";',
334-
'<!-- \u2603 -->',
335-
# these two should be invalid according to the HTML 5 spec,
336-
# section 8.1.2.2
337-
#'foo = </\nscript>',
338-
#'foo = </ script>',
339-
]
340-
elements = ['title', 'textarea', 'TITLE', 'TEXTAREA', 'Title', 'Textarea']
341-
for content in contents:
342-
for element in elements:
343-
element_lower = element.lower()
344-
s = '<{element}>{content}</{element}>'.format(element=element,
345-
content=content)
346-
self._run_check(s, [("starttag", element_lower, []),
347-
("data", content),
348-
("endtag", element_lower)])
349-
350320
def test_EOF_in_cdata(self):
351321
content = """<!-- not a comment --> &not-an-entity-ref;
352322
<a href="" /> </p><p> <span></span></style>
@@ -407,15 +377,9 @@ def test_convert_charrefs(self):
407377
('starttag', 'script', []), ('data', text),
408378
('endtag', 'script'), ('data', '"'),
409379
('starttag', 'style', []), ('data', text),
410-
('endtag', 'style'), ('data', '"'),
411-
('starttag', 'title', []), ('data', text),
412-
('endtag', 'title'), ('data', '"'),
413-
('starttag', 'textarea', []), ('data', text),
414-
('endtag', 'textarea'), ('data', '"')]
380+
('endtag', 'style'), ('data', '"')]
415381
self._run_check('{1}<script>{0}</script>{1}'
416-
'<style>{0}</style>{1}'
417-
'<title>{0}</title>{1}'
418-
'<textarea>{0}</textarea>{1}'.format(text, charref),
382+
'<style>{0}</style>{1}'.format(text, charref),
419383
expected, collector=collector())
420384
# check truncated charrefs at the end of the file
421385
html = '&quo &# &#x'
@@ -922,4 +886,4 @@ def test_base_class_methods_called(self, super_reset_method, super_init_method):
922886

923887

924888
if __name__ == "__main__":
925-
unittest.main()
889+
unittest.main()

0 commit comments

Comments
 (0)