Skip to content

Commit 8b7a3ea

Browse files
authored
Merge pull request RustPython#4009 from youknowone/htmlurllib
Update html/urllib and their tests
2 parents cd75df5 + ad57357 commit 8b7a3ea

File tree

10 files changed

+121
-118
lines changed

10 files changed

+121
-118
lines changed

Lib/html/entities.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55

66
# maps the HTML entity name to the Unicode code point
7+
# from https://html.spec.whatwg.org/multipage/named-characters.html
78
name2codepoint = {
89
'AElig': 0x00c6, # latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1
910
'Aacute': 0x00c1, # latin capital letter A with acute, U+00C1 ISOlat1

Lib/html/parser.py

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99

1010

1111
import re
12-
import warnings
1312
import _markupbase
1413

1514
from html import unescape
@@ -47,7 +46,7 @@
4746
|"[^"]*" # LIT-enclosed value
4847
|(?!['"])[^>\s]* # bare value
4948
)
50-
(?:\s*,)* # possibly followed by a comma
49+
\s* # possibly followed by a space
5150
)?(?:\s|/(?!>))*
5251
)*
5352
)?
@@ -406,7 +405,7 @@ def parse_endtag(self, i):
406405
tagname = namematch.group(1).lower()
407406
# consume and ignore other stuff between the name and the >
408407
# Note: this is not 100% correct, since we might have things like
409-
# </tag attr=">">, but looking for > after tha name should cover
408+
# </tag attr=">">, but looking for > after the name should cover
410409
# most of the cases and is much simpler
411410
gtpos = rawdata.find('>', namematch.end())
412411
self.handle_endtag(tagname)
@@ -418,7 +417,7 @@ def parse_endtag(self, i):
418417
self.handle_data(rawdata[i:gtpos])
419418
return gtpos
420419

421-
self.handle_endtag(elem.lower())
420+
self.handle_endtag(elem)
422421
self.clear_cdata_mode()
423422
return gtpos
424423

@@ -461,10 +460,3 @@ def handle_pi(self, data):
461460

462461
def unknown_decl(self, data):
463462
pass
464-
465-
# Internal -- helper to remove special character quoting
466-
def unescape(self, s):
467-
warnings.warn('The unescape method is deprecated and will be removed '
468-
'in 3.5, use html.unescape() instead.',
469-
DeprecationWarning, stacklevel=2)
470-
return unescape(s)

Lib/test/test_htmlparser.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -537,13 +537,6 @@ def test_EOF_in_charref(self):
537537
for html, expected in data:
538538
self._run_check(html, expected)
539539

540-
def test_unescape_method(self):
541-
from html import unescape
542-
p = self.get_collector()
543-
with self.assertWarns(DeprecationWarning):
544-
s = '&quot;&#34;&#x22;&quot&#34&#x22&#bad;'
545-
self.assertEqual(p.unescape(s), unescape(s))
546-
547540
def test_broken_comments(self):
548541
html = ('<! not really a comment >'
549542
'<! not a comment either -->'
@@ -761,8 +754,6 @@ def test_with_unquoted_attributes(self):
761754
]
762755
self._run_check(html, expected)
763756

764-
# TODO: RUSTPYTHON
765-
@unittest.expectedFailure
766757
def test_comma_between_attributes(self):
767758
# see bpo 41478
768759
# HTMLParser preserves duplicate attributes, leaving the task of

Lib/test/test_urllib.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@
99
import unittest
1010
from unittest.mock import patch
1111
from test import support
12-
from test.support import os_helper, warnings_helper
12+
from test.support import os_helper
13+
from test.support import warnings_helper
1314
import os
1415
try:
1516
import ssl

Lib/test/test_urllib2.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -141,8 +141,6 @@ def test_request_headers_methods(self):
141141
req.remove_header("Unredirected-spam")
142142
self.assertFalse(req.has_header("Unredirected-spam"))
143143

144-
# TODO: RUSTPYTHON, AssertionError: Tuples differ: ('foo', 'ni') != (None, None)
145-
@unittest.expectedFailure
146144
def test_password_manager(self):
147145
mgr = urllib.request.HTTPPasswordMgr()
148146
add = mgr.add_password

Lib/test/test_urlparse.py

Lines changed: 15 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -613,8 +613,8 @@ def test_urlsplit_attributes(self):
613613
p.port
614614

615615
def test_urlsplit_remove_unsafe_bytes(self):
616-
# Remove ASCII tabs and newlines from input, for http common case scenario.
617-
url = "h\nttp://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
616+
# Remove ASCII tabs and newlines from input
617+
url = "http\t://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
618618
p = urllib.parse.urlsplit(url)
619619
self.assertEqual(p.scheme, "http")
620620
self.assertEqual(p.netloc, "www.python.org")
@@ -627,8 +627,8 @@ def test_urlsplit_remove_unsafe_bytes(self):
627627
self.assertEqual(p.port, None)
628628
self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/?query=something#fragment")
629629

630-
# Remove ASCII tabs and newlines from input as bytes, for http common case scenario.
631-
url = b"h\nttp://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
630+
# Remove ASCII tabs and newlines from input as bytes.
631+
url = b"http\t://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
632632
p = urllib.parse.urlsplit(url)
633633
self.assertEqual(p.scheme, b"http")
634634
self.assertEqual(p.netloc, b"www.python.org")
@@ -641,24 +641,13 @@ def test_urlsplit_remove_unsafe_bytes(self):
641641
self.assertEqual(p.port, None)
642642
self.assertEqual(p.geturl(), b"http://www.python.org/javascript:alert('msg')/?query=something#fragment")
643643

644-
# any scheme
645-
url = "x-new-scheme\t://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
646-
p = urllib.parse.urlsplit(url)
647-
self.assertEqual(p.geturl(), "x-new-scheme://www.python.org/javascript:alert('msg')/?query=something#fragment")
648-
649-
# Remove ASCII tabs and newlines from input as bytes, any scheme.
650-
url = b"x-new-scheme\t://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
651-
p = urllib.parse.urlsplit(url)
652-
self.assertEqual(p.geturl(), b"x-new-scheme://www.python.org/javascript:alert('msg')/?query=something#fragment")
653-
654-
# Unsafe bytes is not returned from urlparse cache.
655-
# scheme is stored after parsing, sending an scheme with unsafe bytes *will not* return an unsafe scheme
656-
url = "https://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
657-
scheme = "htt\nps"
644+
# with scheme as cache-key
645+
url = "http://www.python.org/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
646+
scheme = "ht\ntp"
658647
for _ in range(2):
659648
p = urllib.parse.urlsplit(url, scheme=scheme)
660-
self.assertEqual(p.scheme, "https")
661-
self.assertEqual(p.geturl(), "https://www.python.org/javascript:alert('msg')/?query=something#fragment")
649+
self.assertEqual(p.scheme, "http")
650+
self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/?query=something#fragment")
662651

663652
def test_attributes_bad_port(self):
664653
"""Check handling of invalid ports."""
@@ -745,15 +734,17 @@ def test_withoutscheme(self):
745734

746735
def test_portseparator(self):
747736
# Issue 754016 makes changes for port separator ':' from scheme separator
748-
self.assertEqual(urllib.parse.urlparse("path:80"),
749-
('','','path:80','','',''))
737+
self.assertEqual(urllib.parse.urlparse("http:80"), ('http','','80','','',''))
738+
self.assertEqual(urllib.parse.urlparse("https:80"), ('https','','80','','',''))
739+
self.assertEqual(urllib.parse.urlparse("path:80"), ('path','','80','','',''))
750740
self.assertEqual(urllib.parse.urlparse("http:"),('http','','','','',''))
751741
self.assertEqual(urllib.parse.urlparse("https:"),('https','','','','',''))
752742
self.assertEqual(urllib.parse.urlparse("http://www.python.org:80"),
753743
('http','www.python.org:80','','','',''))
754744
# As usual, need to check bytes input as well
755-
self.assertEqual(urllib.parse.urlparse(b"path:80"),
756-
(b'',b'',b'path:80',b'',b'',b''))
745+
self.assertEqual(urllib.parse.urlparse(b"http:80"), (b'http',b'',b'80',b'',b'',b''))
746+
self.assertEqual(urllib.parse.urlparse(b"https:80"), (b'https',b'',b'80',b'',b'',b''))
747+
self.assertEqual(urllib.parse.urlparse(b"path:80"), (b'path',b'',b'80',b'',b'',b''))
757748
self.assertEqual(urllib.parse.urlparse(b"http:"),(b'http',b'',b'',b'',b'',b''))
758749
self.assertEqual(urllib.parse.urlparse(b"https:"),(b'https',b'',b'',b'',b'',b''))
759750
self.assertEqual(urllib.parse.urlparse(b"http://www.python.org:80"),

Lib/urllib/parse.py

Lines changed: 46 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929

3030
import re
3131
import sys
32+
import types
3233
import collections
3334
import warnings
3435

@@ -179,6 +180,8 @@ def port(self):
179180
raise ValueError("Port out of range 0-65535")
180181
return port
181182

183+
__class_getitem__ = classmethod(types.GenericAlias)
184+
182185

183186
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
184187
__slots__ = ()
@@ -369,9 +372,23 @@ def _fix_result_transcoding():
369372
def urlparse(url, scheme='', allow_fragments=True):
370373
"""Parse a URL into 6 components:
371374
<scheme>://<netloc>/<path>;<params>?<query>#<fragment>
372-
Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
373-
Note that we don't break the components up in smaller bits
374-
(e.g. netloc is a single string) and we don't expand % escapes."""
375+
376+
The result is a named 6-tuple with fields corresponding to the
377+
above. It is either a ParseResult or ParseResultBytes object,
378+
depending on the type of the url parameter.
379+
380+
The username, password, hostname, and port sub-components of netloc
381+
can also be accessed as attributes of the returned object.
382+
383+
The scheme argument provides the default value of the scheme
384+
component when no scheme is found in url.
385+
386+
If allow_fragments is False, no attempt is made to separate the
387+
fragment component from the previous component, which can be either
388+
path or query.
389+
390+
Note that % escapes are not expanded.
391+
"""
375392
url, scheme, _coerce_result = _coerce_args(url, scheme)
376393
splitresult = urlsplit(url, scheme, allow_fragments)
377394
scheme, netloc, url, query, fragment = splitresult
@@ -417,20 +434,33 @@ def _checknetloc(netloc):
417434
raise ValueError("netloc '" + netloc + "' contains invalid " +
418435
"characters under NFKC normalization")
419436

420-
def _remove_unsafe_bytes_from_url(url):
421-
for b in _UNSAFE_URL_BYTES_TO_REMOVE:
422-
url = url.replace(b, "")
423-
return url
424-
425437
def urlsplit(url, scheme='', allow_fragments=True):
426438
"""Parse a URL into 5 components:
427439
<scheme>://<netloc>/<path>?<query>#<fragment>
428-
Return a 5-tuple: (scheme, netloc, path, query, fragment).
429-
Note that we don't break the components up in smaller bits
430-
(e.g. netloc is a single string) and we don't expand % escapes."""
440+
441+
The result is a named 5-tuple with fields corresponding to the
442+
above. It is either a SplitResult or SplitResultBytes object,
443+
depending on the type of the url parameter.
444+
445+
The username, password, hostname, and port sub-components of netloc
446+
can also be accessed as attributes of the returned object.
447+
448+
The scheme argument provides the default value of the scheme
449+
component when no scheme is found in url.
450+
451+
If allow_fragments is False, no attempt is made to separate the
452+
fragment component from the previous component, which can be either
453+
path or query.
454+
455+
Note that % escapes are not expanded.
456+
"""
457+
431458
url, scheme, _coerce_result = _coerce_args(url, scheme)
432-
url = _remove_unsafe_bytes_from_url(url)
433-
scheme = _remove_unsafe_bytes_from_url(scheme)
459+
460+
for b in _UNSAFE_URL_BYTES_TO_REMOVE:
461+
url = url.replace(b, "")
462+
scheme = scheme.replace(b, "")
463+
434464
allow_fragments = bool(allow_fragments)
435465
key = url, scheme, allow_fragments, type(url), type(scheme)
436466
cached = _parse_cache.get(key, None)
@@ -441,31 +471,11 @@ def urlsplit(url, scheme='', allow_fragments=True):
441471
netloc = query = fragment = ''
442472
i = url.find(':')
443473
if i > 0:
444-
if url[:i] == 'http': # optimize the common case
445-
url = url[i+1:]
446-
if url[:2] == '//':
447-
netloc, url = _splitnetloc(url, 2)
448-
if (('[' in netloc and ']' not in netloc) or
449-
(']' in netloc and '[' not in netloc)):
450-
raise ValueError("Invalid IPv6 URL")
451-
if allow_fragments and '#' in url:
452-
url, fragment = url.split('#', 1)
453-
if '?' in url:
454-
url, query = url.split('?', 1)
455-
_checknetloc(netloc)
456-
v = SplitResult('http', netloc, url, query, fragment)
457-
_parse_cache[key] = v
458-
return _coerce_result(v)
459474
for c in url[:i]:
460475
if c not in scheme_chars:
461476
break
462477
else:
463-
# make sure "url" is not actually a port number (in which case
464-
# "scheme" is really part of the path)
465-
rest = url[i+1:]
466-
if not rest or any(c not in '0123456789' for c in rest):
467-
# not a port number
468-
scheme, url = url[:i].lower(), rest
478+
scheme, url = url[:i].lower(), url[i+1:]
469479

470480
if url[:2] == '//':
471481
netloc, url = _splitnetloc(url, 2)
@@ -642,7 +652,7 @@ def unquote(string, encoding='utf-8', errors='replace'):
642652
unquote('abc%20def') -> 'abc def'.
643653
"""
644654
if isinstance(string, bytes):
645-
raise TypeError('Expected str, got bytes')
655+
return unquote_to_bytes(string).decode(encoding, errors)
646656
if '%' not in string:
647657
string.split
648658
return string
@@ -744,9 +754,8 @@ def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
744754
if max_num_fields < num_fields:
745755
raise ValueError('Max number of fields exceeded')
746756

747-
pairs = [s1 for s1 in qs.split(separator)]
748757
r = []
749-
for name_value in pairs:
758+
for name_value in qs.split(separator):
750759
if not name_value and not strict_parsing:
751760
continue
752761
nv = name_value.split('=', 1)

0 commit comments

Comments
 (0)