Skip to content

Commit 7499b90

Browse files
committed
Merge remote-tracking branch 'upstream/3.10' into backport-f04bea4-3.10
2 parents 8027b28 + 9524203 commit 7499b90

34 files changed

+732
-327
lines changed

Doc/Makefile

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -155,10 +155,20 @@ venv:
155155
echo "The venv has been created in the $(VENVDIR) directory"; \
156156
fi
157157

158+
.PHONY: dist-no-html
159+
dist-no-html: dist-text dist-epub dist-texinfo
160+
158161
dist:
159162
rm -rf dist
160163
mkdir -p dist
161-
164+
$(MAKE) dist-html
165+
$(MAKE) dist-text
166+
$(MAKE) dist-pdf
167+
$(MAKE) dist-epub
168+
$(MAKE) dist-texinfo
169+
170+
.PHONY: dist-html
171+
dist-html:
162172
# archive the HTML
163173
make html
164174
cp -pPR build/html dist/python-$(DISTVERSION)-docs-html
@@ -168,6 +178,8 @@ dist:
168178
rm -r dist/python-$(DISTVERSION)-docs-html
169179
rm dist/python-$(DISTVERSION)-docs-html.tar
170180

181+
.PHONY: dist-text
182+
dist-text:
171183
# archive the text build
172184
make text
173185
cp -pPR build/text dist/python-$(DISTVERSION)-docs-text
@@ -177,6 +189,8 @@ dist:
177189
rm -r dist/python-$(DISTVERSION)-docs-text
178190
rm dist/python-$(DISTVERSION)-docs-text.tar
179191

192+
.PHONY: dist-pdf
193+
dist-pdf:
180194
# archive the A4 latex
181195
rm -rf build/latex
182196
make latex PAPER=a4
@@ -193,11 +207,15 @@ dist:
193207
cp build/latex/docs-pdf.zip dist/python-$(DISTVERSION)-docs-pdf-letter.zip
194208
cp build/latex/docs-pdf.tar.bz2 dist/python-$(DISTVERSION)-docs-pdf-letter.tar.bz2
195209

210+
.PHONY: dist-epub
211+
dist-epub:
196212
# copy the epub build
197213
rm -rf build/epub
198214
make epub
199215
cp -pPR build/epub/Python.epub dist/python-$(DISTVERSION)-docs.epub
200216

217+
.PHONY: dist-texinfo
218+
dist-texinfo:
201219
# archive the texinfo build
202220
rm -rf build/texinfo
203221
make texinfo
@@ -225,11 +243,11 @@ serve:
225243

226244
# for development releases: always build
227245
autobuild-dev:
228-
make dist SPHINXOPTS='$(SPHINXOPTS) -Ea -A daily=1'
246+
make dist-no-html SPHINXOPTS='$(SPHINXOPTS) -Ea -A daily=1'
229247

230248
# for quick rebuilds (HTML only)
231249
autobuild-dev-html:
232-
make html SPHINXOPTS='$(SPHINXOPTS) -Ea -A daily=1'
250+
make dist-html SPHINXOPTS='$(SPHINXOPTS) -Ea -A daily=1'
233251

234252
# for stable releases: only build if not in pre-release stage (alpha, beta)
235253
# release candidate downloads are okay, since the stable tree can be in that stage

Doc/library/html.parser.rst

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,18 @@
1515
This module defines a class :class:`HTMLParser` which serves as the basis for
1616
parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
1717

18-
.. class:: HTMLParser(*, convert_charrefs=True)
18+
.. class:: HTMLParser(*, convert_charrefs=True, scripting=False)
1919

2020
Create a parser instance able to parse invalid markup.
2121

22-
If *convert_charrefs* is ``True`` (the default), all character
23-
references (except the ones in ``script``/``style`` elements) are
22+
If *convert_charrefs* is true (the default), all character
23+
references (except the ones in elements like ``script`` and ``style``) are
2424
automatically converted to the corresponding Unicode characters.
2525

26+
If *scripting* is false (the default), the content of the ``noscript``
27+
element is parsed normally; if it's true, it's returned as is without
28+
being parsed.
29+
2630
An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
2731
when start tags, end tags, text, comments, and other markup elements are
2832
encountered. The user should subclass :class:`.HTMLParser` and override its
@@ -37,6 +41,9 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
3741
.. versionchanged:: 3.5
3842
The default value for argument *convert_charrefs* is now ``True``.
3943

44+
.. versionchanged:: 3.10.20
45+
Added the *scripting* parameter.
46+
4047

4148
Example HTML Parser Application
4249
-------------------------------
@@ -159,24 +166,24 @@ implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`):
159166
.. method:: HTMLParser.handle_data(data)
160167

161168
This method is called to process arbitrary data (e.g. text nodes and the
162-
content of ``<script>...</script>`` and ``<style>...</style>``).
169+
content of elements like ``script`` and ``style``).
163170

164171

165172
.. method:: HTMLParser.handle_entityref(name)
166173

167174
This method is called to process a named character reference of the form
168175
``&name;`` (e.g. ``&gt;``), where *name* is a general entity reference
169-
(e.g. ``'gt'``). This method is never called if *convert_charrefs* is
170-
``True``.
176+
(e.g. ``'gt'``).
177+
This method is only called if *convert_charrefs* is false.
171178

172179

173180
.. method:: HTMLParser.handle_charref(name)
174181

175182
This method is called to process decimal and hexadecimal numeric character
176183
references of the form ``&#NNN;`` and ``&#xNNN;``. For example, the decimal
177184
equivalent for ``&gt;`` is ``&#62;``, whereas the hexadecimal is ``&#x3E;``;
178-
in this case the method will receive ``'62'`` or ``'x3E'``. This method
179-
is never called if *convert_charrefs* is ``True``.
185+
in this case the method will receive ``'62'`` or ``'x3E'``.
186+
This method is only called if *convert_charrefs* is false.
180187

181188

182189
.. method:: HTMLParser.handle_comment(data)
@@ -284,8 +291,8 @@ Parsing an element with a few attributes and a title::
284291
Data : Python
285292
End tag : h1
286293

287-
The content of ``script`` and ``style`` elements is returned as is, without
288-
further parsing::
294+
The content of elements like ``script`` and ``style`` is returned as is,
295+
without further parsing::
289296

290297
>>> parser.feed('<style type="text/css">#python { color: green }</style>')
291298
Start tag: style
@@ -294,10 +301,10 @@ further parsing::
294301
End tag : style
295302

296303
>>> parser.feed('<script type="text/javascript">'
297-
... 'alert("<strong>hello!</strong>");</script>')
304+
... 'alert("<strong>hello! &#9786;</strong>");</script>')
298305
Start tag: script
299306
attr: ('type', 'text/javascript')
300-
Data : alert("<strong>hello!</strong>");
307+
Data : alert("<strong>hello! &#9786;</strong>");
301308
End tag : script
302309

303310
Parsing comments::
@@ -317,7 +324,7 @@ correct char (note: these 3 references are all equivalent to ``'>'``)::
317324

318325
Feeding incomplete chunks to :meth:`~HTMLParser.feed` works, but
319326
:meth:`~HTMLParser.handle_data` might be called more than once
320-
(unless *convert_charrefs* is set to ``True``)::
327+
if *convert_charrefs* is false::
321328

322329
>>> for chunk in ['<sp', 'an>buff', 'ered ', 'text</s', 'pan>']:
323330
... parser.feed(chunk)

Include/patchlevel.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,12 @@
1818
/*--start constants--*/
1919
#define PY_MAJOR_VERSION 3
2020
#define PY_MINOR_VERSION 10
21-
#define PY_MICRO_VERSION 18
21+
#define PY_MICRO_VERSION 19
2222
#define PY_RELEASE_LEVEL PY_RELEASE_LEVEL_FINAL
2323
#define PY_RELEASE_SERIAL 0
2424

2525
/* Version as a string */
26-
#define PY_VERSION "3.10.18+"
26+
#define PY_VERSION "3.10.19+"
2727
/*--end constants--*/
2828

2929
/* Version as a single 4-byte hex number, e.g. 0x010502B2 == 1.5.2b2.

Lib/html/parser.py

Lines changed: 48 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -109,16 +109,24 @@ class HTMLParser(_markupbase.ParserBase):
109109
argument.
110110
"""
111111

112-
CDATA_CONTENT_ELEMENTS = ("script", "style")
112+
# See the HTML5 specs section "13.4 Parsing HTML fragments".
113+
# https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
114+
# CDATA_CONTENT_ELEMENTS are parsed in RAWTEXT mode
115+
CDATA_CONTENT_ELEMENTS = ("script", "style", "xmp", "iframe", "noembed", "noframes")
113116
RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
114117

115-
def __init__(self, *, convert_charrefs=True):
118+
def __init__(self, *, convert_charrefs=True, scripting=False):
116119
"""Initialize and reset this instance.
117120
118-
If convert_charrefs is True (the default), all character references
121+
If convert_charrefs is true (the default), all character references
119122
are automatically converted to the corresponding Unicode characters.
123+
124+
If *scripting* is false (the default), the content of the
125+
``noscript`` element is parsed normally; if it's true,
126+
it's returned as is without being parsed.
120127
"""
121128
self.convert_charrefs = convert_charrefs
129+
self.scripting = scripting
122130
self.reset()
123131

124132
def reset(self):
@@ -127,6 +135,7 @@ def reset(self):
127135
self.lasttag = '???'
128136
self.interesting = interesting_normal
129137
self.cdata_elem = None
138+
self._support_cdata = True
130139
self._escapable = True
131140
_markupbase.ParserBase.reset(self)
132141

@@ -152,7 +161,9 @@ def get_starttag_text(self):
152161
def set_cdata_mode(self, elem, *, escapable=False):
153162
self.cdata_elem = elem.lower()
154163
self._escapable = escapable
155-
if escapable and not self.convert_charrefs:
164+
if self.cdata_elem == 'plaintext':
165+
self.interesting = re.compile(r'\Z')
166+
elif escapable and not self.convert_charrefs:
156167
self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem,
157168
re.IGNORECASE|re.ASCII)
158169
else:
@@ -164,6 +175,19 @@ def clear_cdata_mode(self):
164175
self.cdata_elem = None
165176
self._escapable = True
166177

178+
def _set_support_cdata(self, flag=True):
179+
"""Enable or disable support of the CDATA sections.
180+
If enabled, "<![CDATA[" starts a CDATA section which ends with "]]>".
181+
If disabled, "<![CDATA[" starts a bogus comment which ends with ">".
182+
183+
This method is not called by default. Its purpose is to be called
184+
in custom handle_starttag() and handle_endtag() methods, with
185+
value that depends on the adjusted current node.
186+
See https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
187+
for details.
188+
"""
189+
self._support_cdata = flag
190+
167191
# Internal -- handle data as far as reasonable. May leave state
168192
# and data to be processed by a subsequent call. If 'end' is
169193
# true, force handling all data as if followed by EOF marker.
@@ -238,7 +262,7 @@ def goahead(self, end):
238262
j -= len(suffix)
239263
break
240264
self.handle_comment(rawdata[i+4:j])
241-
elif startswith("<![CDATA[", i):
265+
elif startswith("<![CDATA[", i) and self._support_cdata:
242266
self.unknown_decl(rawdata[i+3:])
243267
elif rawdata[i:i+9].lower() == '<!doctype':
244268
self.handle_decl(rawdata[i+2:])
@@ -314,15 +338,28 @@ def parse_html_declaration(self, i):
314338
if rawdata[i:i+4] == '<!--':
315339
# this case is actually already handled in goahead()
316340
return self.parse_comment(i)
317-
elif rawdata[i:i+3] == '<![':
318-
return self.parse_marked_section(i)
341+
elif rawdata[i:i+9] == '<![CDATA[' and self._support_cdata:
342+
j = rawdata.find(']]>', i+9)
343+
if j < 0:
344+
return -1
345+
self.unknown_decl(rawdata[i+3: j])
346+
return j + 3
319347
elif rawdata[i:i+9].lower() == '<!doctype':
320348
# find the closing >
321349
gtpos = rawdata.find('>', i+9)
322350
if gtpos == -1:
323351
return -1
324352
self.handle_decl(rawdata[i+2:gtpos])
325353
return gtpos+1
354+
elif rawdata[i:i+3] == '<![':
355+
j = rawdata.find('>', i+3)
356+
if j < 0:
357+
return -1
358+
if rawdata[j-1] == ']':
359+
self.unknown_decl(rawdata[i+3: j-1])
360+
else:
361+
self.handle_comment(rawdata[i+2: j])
362+
return j + 1
326363
else:
327364
return self.parse_bogus_comment(i)
328365

@@ -414,8 +451,10 @@ def parse_starttag(self, i):
414451
self.handle_startendtag(tag, attrs)
415452
else:
416453
self.handle_starttag(tag, attrs)
417-
if tag in self.CDATA_CONTENT_ELEMENTS:
418-
self.set_cdata_mode(tag)
454+
if (tag in self.CDATA_CONTENT_ELEMENTS or
455+
(self.scripting and tag == "noscript") or
456+
tag == "plaintext"):
457+
self.set_cdata_mode(tag, escapable=False)
419458
elif tag in self.RCDATA_CONTENT_ELEMENTS:
420459
self.set_cdata_mode(tag, escapable=True)
421460
return endpos

0 commit comments

Comments
 (0)