Merge pull request RustPython#4009 from youknowone/htmlurllib

youknowone · web-flow · commit 8b7a3ea8ea1a · 2022-08-07T17:23:14.000+09:00
Update html/urllib and their tests
diff --git a/Lib/html/entities.py b/Lib/html/entities.py
@@ -4,6 +4,7 @@
 
 
 # maps the HTML entity name to the Unicode code point
+# from https://html.spec.whatwg.org/multipage/named-characters.html
 name2codepoint = {
     'AElig':    0x00c6, # latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1
     'Aacute':   0x00c1, # latin capital letter A with acute, U+00C1 ISOlat1
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
@@ -9,7 +9,6 @@
 
 
 import re
-import warnings
 import _markupbase
 
 from html import unescape
@@ -47,7 +46,7 @@
           |"[^"]*"                   # LIT-enclosed value
           |(?!['"])[^>\s]*           # bare value
          )
-         (?:\s*,)*                   # possibly followed by a comma
+        \s*                          # possibly followed by a space
        )?(?:\s|/(?!>))*
      )*
    )?
@@ -406,7 +405,7 @@ def parse_endtag(self, i):
             tagname = namematch.group(1).lower()
             # consume and ignore other stuff between the name and the >
             # Note: this is not 100% correct, since we might have things like
-            # </tag attr=">">, but looking for > after tha name should cover
+            # </tag attr=">">, but looking for > after the name should cover
             # most of the cases and is much simpler
             gtpos = rawdata.find('>', namematch.end())
             self.handle_endtag(tagname)
@@ -418,7 +417,7 @@ def parse_endtag(self, i):
                 self.handle_data(rawdata[i:gtpos])
                 return gtpos
 
-        self.handle_endtag(elem.lower())
+        self.handle_endtag(elem)
         self.clear_cdata_mode()
         return gtpos
 
@@ -461,10 +460,3 @@ def handle_pi(self, data):
 
     def unknown_decl(self, data):
         pass
-
-    # Internal -- helper to remove special character quoting
-    def unescape(self, s):
-        warnings.warn('The unescape method is deprecated and will be removed '
-                      'in 3.5, use html.unescape() instead.',
-                      DeprecationWarning, stacklevel=2)
-        return unescape(s)
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
@@ -537,13 +537,6 @@ def test_EOF_in_charref(self):
         for html, expected in data:
             self._run_check(html, expected)
 
-    def test_unescape_method(self):
-        from html import unescape
-        p = self.get_collector()
-        with self.assertWarns(DeprecationWarning):
-            s = '&quot;&#34;&#x22;&quot&#34&#x22&#bad;'
-            self.assertEqual(p.unescape(s), unescape(s))
-
     def test_broken_comments(self):
         html = ('<! not really a comment >'
                 '<! not a comment either -->'
@@ -761,8 +754,6 @@ def test_with_unquoted_attributes(self):
         ]
         self._run_check(html, expected)
 
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
     def test_comma_between_attributes(self):
         # see bpo 41478
         # HTMLParser preserves duplicate attributes, leaving the task of
diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py
@@ -9,7 +9,8 @@
 import unittest
 from unittest.mock import patch
 from test import support
-from test.support import os_helper, warnings_helper
+from test.support import os_helper
+from test.support import warnings_helper
 import os
 try:
     import ssl
diff --git a/Lib/test/test_urllib2.py b/Lib/test/test_urllib2.py
@@ -141,8 +141,6 @@ def test_request_headers_methods(self):
         req.remove_header("Unredirected-spam")
         self.assertFalse(req.has_header("Unredirected-spam"))
 
-    # TODO: RUSTPYTHON, AssertionError: Tuples differ: ('foo', 'ni') != (None, None)
-    @unittest.expectedFailure
     def test_password_manager(self):
         mgr = urllib.request.HTTPPasswordMgr()
         add = mgr.add_password
diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py
@@ -613,8 +613,8 @@ def test_urlsplit_attributes(self):
             p.port
 
     def test_urlsplit_remove_unsafe_bytes(self):
-        # Remove ASCII tabs and newlines from input, for http common case scenario.
-        url = "h\nttp://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
+        # Remove ASCII tabs and newlines from input
+        url = "http\t://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
         p = urllib.parse.urlsplit(url)
         self.assertEqual(p.scheme, "http")
         self.assertEqual(p.netloc, "www.python.org")
@@ -627,8 +627,8 @@ def test_urlsplit_remove_unsafe_bytes(self):
         self.assertEqual(p.port, None)
         self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/?query=something#fragment")
 
-        # Remove ASCII tabs and newlines from input as bytes, for http common case scenario.
-        url = b"h\nttp://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
+        # Remove ASCII tabs and newlines from input as bytes.
+        url = b"http\t://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
         p = urllib.parse.urlsplit(url)
         self.assertEqual(p.scheme, b"http")
         self.assertEqual(p.netloc, b"www.python.org")
@@ -641,24 +641,13 @@ def test_urlsplit_remove_unsafe_bytes(self):
         self.assertEqual(p.port, None)
         self.assertEqual(p.geturl(), b"http://www.python.org/javascript:alert('msg')/?query=something#fragment")
 
-        # any scheme
-        url = "x-new-scheme\t://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
-        p = urllib.parse.urlsplit(url)
-        self.assertEqual(p.geturl(), "x-new-scheme://www.python.org/javascript:alert('msg')/?query=something#fragment")
-
-        # Remove ASCII tabs and newlines from input as bytes, any scheme.
-        url = b"x-new-scheme\t://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
-        p = urllib.parse.urlsplit(url)
-        self.assertEqual(p.geturl(), b"x-new-scheme://www.python.org/javascript:alert('msg')/?query=something#fragment")
-
-        # Unsafe bytes is not returned from urlparse cache.
-        # scheme is stored after parsing, sending an scheme with unsafe bytes *will not* return an unsafe scheme
-        url = "https://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
-        scheme = "htt\nps"
+        # with scheme as cache-key
+        url = "http://www.python.org/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment"
+        scheme = "ht\ntp"
         for _ in range(2):
             p = urllib.parse.urlsplit(url, scheme=scheme)
-            self.assertEqual(p.scheme, "https")
-            self.assertEqual(p.geturl(), "https://www.python.org/javascript:alert('msg')/?query=something#fragment")
+            self.assertEqual(p.scheme, "http")
+            self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/?query=something#fragment")
 
     def test_attributes_bad_port(self):
         """Check handling of invalid ports."""
@@ -745,15 +734,17 @@ def test_withoutscheme(self):
 
     def test_portseparator(self):
         # Issue 754016 makes changes for port separator ':' from scheme separator
-        self.assertEqual(urllib.parse.urlparse("path:80"),
-                ('','','path:80','','',''))
+        self.assertEqual(urllib.parse.urlparse("http:80"), ('http','','80','','',''))
+        self.assertEqual(urllib.parse.urlparse("https:80"), ('https','','80','','',''))
+        self.assertEqual(urllib.parse.urlparse("path:80"), ('path','','80','','',''))
         self.assertEqual(urllib.parse.urlparse("http:"),('http','','','','',''))
         self.assertEqual(urllib.parse.urlparse("https:"),('https','','','','',''))
         self.assertEqual(urllib.parse.urlparse("http://www.python.org:80"),
                 ('http','www.python.org:80','','','',''))
         # As usual, need to check bytes input as well
-        self.assertEqual(urllib.parse.urlparse(b"path:80"),
-                (b'',b'',b'path:80',b'',b'',b''))
+        self.assertEqual(urllib.parse.urlparse(b"http:80"), (b'http',b'',b'80',b'',b'',b''))
+        self.assertEqual(urllib.parse.urlparse(b"https:80"), (b'https',b'',b'80',b'',b'',b''))
+        self.assertEqual(urllib.parse.urlparse(b"path:80"), (b'path',b'',b'80',b'',b'',b''))
         self.assertEqual(urllib.parse.urlparse(b"http:"),(b'http',b'',b'',b'',b'',b''))
         self.assertEqual(urllib.parse.urlparse(b"https:"),(b'https',b'',b'',b'',b'',b''))
         self.assertEqual(urllib.parse.urlparse(b"http://www.python.org:80"),
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
@@ -29,6 +29,7 @@
 
 import re
 import sys
+import types
 import collections
 import warnings
 
@@ -179,6 +180,8 @@ def port(self):
                 raise ValueError("Port out of range 0-65535")
         return port
 
+    __class_getitem__ = classmethod(types.GenericAlias)
+
 
 class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
     __slots__ = ()
@@ -369,9 +372,23 @@ def _fix_result_transcoding():
 def urlparse(url, scheme='', allow_fragments=True):
     """Parse a URL into 6 components:
     <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
-    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
-    Note that we don't break the components up in smaller bits
-    (e.g. netloc is a single string) and we don't expand % escapes."""
+
+    The result is a named 6-tuple with fields corresponding to the
+    above. It is either a ParseResult or ParseResultBytes object,
+    depending on the type of the url parameter.
+
+    The username, password, hostname, and port sub-components of netloc
+    can also be accessed as attributes of the returned object.
+
+    The scheme argument provides the default value of the scheme
+    component when no scheme is found in url.
+
+    If allow_fragments is False, no attempt is made to separate the
+    fragment component from the previous component, which can be either
+    path or query.
+
+    Note that % escapes are not expanded.
+    """
     url, scheme, _coerce_result = _coerce_args(url, scheme)
     splitresult = urlsplit(url, scheme, allow_fragments)
     scheme, netloc, url, query, fragment = splitresult
@@ -417,20 +434,33 @@ def _checknetloc(netloc):
             raise ValueError("netloc '" + netloc + "' contains invalid " +
                              "characters under NFKC normalization")
 
-def _remove_unsafe_bytes_from_url(url):
-    for b in _UNSAFE_URL_BYTES_TO_REMOVE:
-        url = url.replace(b, "")
-    return url
-
 def urlsplit(url, scheme='', allow_fragments=True):
     """Parse a URL into 5 components:
     <scheme>://<netloc>/<path>?<query>#<fragment>
-    Return a 5-tuple: (scheme, netloc, path, query, fragment).
-    Note that we don't break the components up in smaller bits
-    (e.g. netloc is a single string) and we don't expand % escapes."""
+
+    The result is a named 5-tuple with fields corresponding to the
+    above. It is either a SplitResult or SplitResultBytes object,
+    depending on the type of the url parameter.
+
+    The username, password, hostname, and port sub-components of netloc
+    can also be accessed as attributes of the returned object.
+
+    The scheme argument provides the default value of the scheme
+    component when no scheme is found in url.
+
+    If allow_fragments is False, no attempt is made to separate the
+    fragment component from the previous component, which can be either
+    path or query.
+
+    Note that % escapes are not expanded.
+    """
+
     url, scheme, _coerce_result = _coerce_args(url, scheme)
-    url = _remove_unsafe_bytes_from_url(url)
-    scheme = _remove_unsafe_bytes_from_url(scheme)
+
+    for b in _UNSAFE_URL_BYTES_TO_REMOVE:
+        url = url.replace(b, "")
+        scheme = scheme.replace(b, "")
+
     allow_fragments = bool(allow_fragments)
     key = url, scheme, allow_fragments, type(url), type(scheme)
     cached = _parse_cache.get(key, None)
@@ -441,31 +471,11 @@ def urlsplit(url, scheme='', allow_fragments=True):
     netloc = query = fragment = ''
     i = url.find(':')
     if i > 0:
-        if url[:i] == 'http': # optimize the common case
-            url = url[i+1:]
-            if url[:2] == '//':
-                netloc, url = _splitnetloc(url, 2)
-                if (('[' in netloc and ']' not in netloc) or
-                        (']' in netloc and '[' not in netloc)):
-                    raise ValueError("Invalid IPv6 URL")
-            if allow_fragments and '#' in url:
-                url, fragment = url.split('#', 1)
-            if '?' in url:
-                url, query = url.split('?', 1)
-            _checknetloc(netloc)
-            v = SplitResult('http', netloc, url, query, fragment)
-            _parse_cache[key] = v
-            return _coerce_result(v)
         for c in url[:i]:
             if c not in scheme_chars:
                 break
         else:
-            # make sure "url" is not actually a port number (in which case
-            # "scheme" is really part of the path)
-            rest = url[i+1:]
-            if not rest or any(c not in '0123456789' for c in rest):
-                # not a port number
-                scheme, url = url[:i].lower(), rest
+            scheme, url = url[:i].lower(), url[i+1:]
 
     if url[:2] == '//':
         netloc, url = _splitnetloc(url, 2)
@@ -642,7 +652,7 @@ def unquote(string, encoding='utf-8', errors='replace'):
     unquote('abc%20def') -> 'abc def'.
     """
     if isinstance(string, bytes):
-        raise TypeError('Expected str, got bytes')
+        return unquote_to_bytes(string).decode(encoding, errors)
     if '%' not in string:
         string.split
         return string
@@ -744,9 +754,8 @@ def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
         if max_num_fields < num_fields:
             raise ValueError('Max number of fields exceeded')
 
-    pairs = [s1 for s1 in qs.split(separator)]
     r = []
-    for name_value in pairs:
+    for name_value in qs.split(separator):
         if not name_value and not strict_parsing:
             continue
         nv = name_value.split('=', 1)
diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py
diff --git a/Lib/urllib/response.py b/Lib/urllib/response.py
diff --git a/stdlib/src/ssl.rs b/stdlib/src/ssl.rs