@@ -109,16 +109,24 @@ class HTMLParser(_markupbase.ParserBase):
109109 argument.
110110 """
111111
112- CDATA_CONTENT_ELEMENTS = ("script" , "style" )
112+ # See the HTML5 specs section "13.4 Parsing HTML fragments".
113+ # https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
114+ # CDATA_CONTENT_ELEMENTS are parsed in RAWTEXT mode
115+ CDATA_CONTENT_ELEMENTS = ("script" , "style" , "xmp" , "iframe" , "noembed" , "noframes" )
113116 RCDATA_CONTENT_ELEMENTS = ("textarea" , "title" )
114117
115- def __init__ (self , * , convert_charrefs = True ):
118+ def __init__ (self , * , convert_charrefs = True , scripting = False ):
116119 """Initialize and reset this instance.
117120
118- If convert_charrefs is True (the default), all character references
121+ If convert_charrefs is true (the default), all character references
119122 are automatically converted to the corresponding Unicode characters.
123+
124+ If *scripting* is false (the default), the content of the
125+ ``noscript`` element is parsed normally; if it's true,
126+ it's returned as is without being parsed.
120127 """
121128 self .convert_charrefs = convert_charrefs
129+ self .scripting = scripting
122130 self .reset ()
123131
124132 def reset (self ):
@@ -127,6 +135,7 @@ def reset(self):
127135 self .lasttag = '???'
128136 self .interesting = interesting_normal
129137 self .cdata_elem = None
138+ self ._support_cdata = True
130139 self ._escapable = True
131140 _markupbase .ParserBase .reset (self )
132141
@@ -152,7 +161,9 @@ def get_starttag_text(self):
152161 def set_cdata_mode (self , elem , * , escapable = False ):
153162 self .cdata_elem = elem .lower ()
154163 self ._escapable = escapable
155- if escapable and not self .convert_charrefs :
164+ if self .cdata_elem == 'plaintext' :
165+ self .interesting = re .compile (r'\Z' )
166+ elif escapable and not self .convert_charrefs :
156167 self .interesting = re .compile (r'&|</%s(?=[\t\n\r\f />])' % self .cdata_elem ,
157168 re .IGNORECASE | re .ASCII )
158169 else :
@@ -164,6 +175,19 @@ def clear_cdata_mode(self):
164175 self .cdata_elem = None
165176 self ._escapable = True
166177
178+ def _set_support_cdata (self , flag = True ):
179+ """Enable or disable support of the CDATA sections.
180+ If enabled, "<![CDATA[" starts a CDATA section which ends with "]]>".
181+ If disabled, "<![CDATA[" starts a bogus comment which ends with ">".
182+
183+ This method is not called by default. Its purpose is to be called
184+ in custom handle_starttag() and handle_endtag() methods, with
185+ a value that depends on the adjusted current node.
186+ See https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
187+ for details.
188+ """
189+ self ._support_cdata = flag
190+
167191 # Internal -- handle data as far as reasonable. May leave state
168192 # and data to be processed by a subsequent call. If 'end' is
169193 # true, force handling all data as if followed by EOF marker.
@@ -238,7 +262,7 @@ def goahead(self, end):
238262 j -= len (suffix )
239263 break
240264 self .handle_comment (rawdata [i + 4 :j ])
241- elif startswith ("<![CDATA[" , i ):
265+ elif startswith ("<![CDATA[" , i ) and self . _support_cdata :
242266 self .unknown_decl (rawdata [i + 3 :])
243267 elif rawdata [i :i + 9 ].lower () == '<!doctype' :
244268 self .handle_decl (rawdata [i + 2 :])
@@ -314,15 +338,28 @@ def parse_html_declaration(self, i):
314338 if rawdata [i :i + 4 ] == '<!--' :
315339 # this case is actually already handled in goahead()
316340 return self .parse_comment (i )
317- elif rawdata [i :i + 3 ] == '<![' :
318- return self .parse_marked_section (i )
341+ elif rawdata [i :i + 9 ] == '<![CDATA[' and self ._support_cdata :
342+ j = rawdata .find (']]>' , i + 9 )
343+ if j < 0 :
344+ return - 1
345+ self .unknown_decl (rawdata [i + 3 : j ])
346+ return j + 3
319347 elif rawdata [i :i + 9 ].lower () == '<!doctype' :
320348 # find the closing >
321349 gtpos = rawdata .find ('>' , i + 9 )
322350 if gtpos == - 1 :
323351 return - 1
324352 self .handle_decl (rawdata [i + 2 :gtpos ])
325353 return gtpos + 1
354+ elif rawdata [i :i + 3 ] == '<![' :
355+ j = rawdata .find ('>' , i + 3 )
356+ if j < 0 :
357+ return - 1
358+ if rawdata [j - 1 ] == ']' :
359+ self .unknown_decl (rawdata [i + 3 : j - 1 ])
360+ else :
361+ self .handle_comment (rawdata [i + 2 : j ])
362+ return j + 1
326363 else :
327364 return self .parse_bogus_comment (i )
328365
@@ -407,8 +444,10 @@ def parse_starttag(self, i):
407444 self .handle_startendtag (tag , attrs )
408445 else :
409446 self .handle_starttag (tag , attrs )
410- if tag in self .CDATA_CONTENT_ELEMENTS :
411- self .set_cdata_mode (tag )
447+ if (tag in self .CDATA_CONTENT_ELEMENTS or
448+ (self .scripting and tag == "noscript" ) or
449+ tag == "plaintext" ):
450+ self .set_cdata_mode (tag , escapable = False )
412451 elif tag in self .RCDATA_CONTENT_ELEMENTS :
413452 self .set_cdata_mode (tag , escapable = True )
414453 return endpos
0 commit comments