Update encodings/test_codecs

CPython developers · mumallaeng · commit ce0ac2b11b06 · 2022-07-24T16:40:51.000+09:00
diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py
@@ -61,7 +61,8 @@ def normalize_encoding(encoding):
         if c.isalnum() or c == '.':
             if punct and chars:
                 chars.append('_')
-            chars.append(c)
+            if c.isascii():
+                chars.append(c)
             punct = False
         else:
             punct = True
diff --git a/Lib/encodings/aliases.py b/Lib/encodings/aliases.py
@@ -450,6 +450,7 @@
 
     # mac_latin2 codec
     'maccentraleurope'   : 'mac_latin2',
+    'mac_centeuro'       : 'mac_latin2',
     'maclatin2'          : 'mac_latin2',
 
     # mac_roman codec
@@ -493,9 +494,6 @@
     'sjisx0213'          : 'shift_jisx0213',
     's_jisx0213'         : 'shift_jisx0213',
 
-    # tactis codec
-    'tis260'             : 'tactis',
-
     # tis_620 codec
     'tis620'             : 'tis_620',
     'tis_620_0'          : 'tis_620',
diff --git a/Lib/encodings/raw_unicode_escape.py b/Lib/encodings/raw_unicode_escape.py
@@ -21,15 +21,16 @@ class IncrementalEncoder(codecs.IncrementalEncoder):
     def encode(self, input, final=False):
         return codecs.raw_unicode_escape_encode(input, self.errors)[0]
 
-class IncrementalDecoder(codecs.IncrementalDecoder):
-    def decode(self, input, final=False):
-        return codecs.raw_unicode_escape_decode(input, self.errors)[0]
+class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
+    def _buffer_decode(self, input, errors, final):
+        return codecs.raw_unicode_escape_decode(input, errors, final)
 
 class StreamWriter(Codec,codecs.StreamWriter):
     pass
 
 class StreamReader(Codec,codecs.StreamReader):
-    pass
+    def decode(self, input, errors='strict'):
+        return codecs.raw_unicode_escape_decode(input, errors, False)
 
 ### encodings module API
 
diff --git a/Lib/encodings/unicode_escape.py b/Lib/encodings/unicode_escape.py
@@ -21,15 +21,16 @@ class IncrementalEncoder(codecs.IncrementalEncoder):
     def encode(self, input, final=False):
         return codecs.unicode_escape_encode(input, self.errors)[0]
 
-class IncrementalDecoder(codecs.IncrementalDecoder):
-    def decode(self, input, final=False):
-        return codecs.unicode_escape_decode(input, self.errors)[0]
+class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
+    def _buffer_decode(self, input, errors, final):
+        return codecs.unicode_escape_decode(input, errors, final)
 
 class StreamWriter(Codec,codecs.StreamWriter):
     pass
 
 class StreamReader(Codec,codecs.StreamReader):
-    pass
+    def decode(self, input, errors='strict'):
+        return codecs.unicode_escape_decode(input, errors, False)
 
 ### encodings module API
 
diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
@@ -1,5 +1,6 @@
 import codecs
 import html.entities
+import itertools
 import sys
 import unicodedata
 import unittest
@@ -22,6 +23,18 @@ def handle(self, exc):
             self.pos = len(exc.object)
         return ("<?>", oldpos)
 
+class RepeatedPosReturn:
+    def __init__(self, repl="<?>"):
+        self.repl = repl
+        self.pos = 0
+        self.count = 0
+
+    def handle(self, exc):
+        if self.count > 0:
+            self.count -= 1
+            return (self.repl, self.pos)
+        return (self.repl, exc.end)
+
 # A UnicodeEncodeError object with a bad start attribute
 class BadStartUnicodeEncodeError(UnicodeEncodeError):
     def __init__(self):
@@ -807,20 +820,104 @@ def test_lookup(self):
             codecs.lookup_error("namereplace")
         )
 
-    def test_unencodablereplacement(self):
+    def test_encode_nonascii_replacement(self):
+        def handle(exc):
+            if isinstance(exc, UnicodeEncodeError):
+                return (repl, exc.end)
+            raise TypeError("don't know how to handle %r" % exc)
+        codecs.register_error("test.replacing", handle)
+
+        for enc, input, repl in (
+                ("ascii", "[¤]", "abc"),
+                ("iso-8859-1", "[€]", "½¾"),
+                ("iso-8859-15", "[¤]", "œŸ"),
+        ):
+            res = input.encode(enc, "test.replacing")
+            self.assertEqual(res, ("[" + repl + "]").encode(enc))
+
+        for enc, input, repl in (
+                ("utf-8", "[\udc80]", "\U0001f40d"),
+                ("utf-16", "[\udc80]", "\U0001f40d"),
+                ("utf-32", "[\udc80]", "\U0001f40d"),
+        ):
+            with self.subTest(encoding=enc):
+                with self.assertRaises(UnicodeEncodeError) as cm:
+                    input.encode(enc, "test.replacing")
+                exc = cm.exception
+                self.assertEqual(exc.start, 1)
+                self.assertEqual(exc.end, 2)
+                self.assertEqual(exc.object, input)
+
+    def test_encode_unencodable_replacement(self):
         def unencrepl(exc):
             if isinstance(exc, UnicodeEncodeError):
-                return ("\u4242", exc.end)
+                return (repl, exc.end)
             else:
                 raise TypeError("don't know how to handle %r" % exc)
         codecs.register_error("test.unencreplhandler", unencrepl)
-        for enc in ("ascii", "iso-8859-1", "iso-8859-15"):
-            self.assertRaises(
-                UnicodeEncodeError,
-                "\u4242".encode,
-                enc,
-                "test.unencreplhandler"
-            )
+
+        for enc, input, repl in (
+                ("ascii", "[¤]", "½"),
+                ("iso-8859-1", "[€]", "œ"),
+                ("iso-8859-15", "[¤]", "½"),
+                ("utf-8", "[\udc80]", "\udcff"),
+                ("utf-16", "[\udc80]", "\udcff"),
+                ("utf-32", "[\udc80]", "\udcff"),
+        ):
+            with self.subTest(encoding=enc):
+                with self.assertRaises(UnicodeEncodeError) as cm:
+                    input.encode(enc, "test.unencreplhandler")
+                exc = cm.exception
+                self.assertEqual(exc.start, 1)
+                self.assertEqual(exc.end, 2)
+                self.assertEqual(exc.object, input)
+
+    def test_encode_bytes_replacement(self):
+        def handle(exc):
+            if isinstance(exc, UnicodeEncodeError):
+                return (repl, exc.end)
+            raise TypeError("don't know how to handle %r" % exc)
+        codecs.register_error("test.replacing", handle)
+
+        # It works even if the bytes sequence is not decodable.
+        for enc, input, repl in (
+                ("ascii", "[¤]", b"\xbd\xbe"),
+                ("iso-8859-1", "[€]", b"\xbd\xbe"),
+                ("iso-8859-15", "[¤]", b"\xbd\xbe"),
+                ("utf-8", "[\udc80]", b"\xbd\xbe"),
+                ("utf-16le", "[\udc80]", b"\xbd\xbe"),
+                ("utf-16be", "[\udc80]", b"\xbd\xbe"),
+                ("utf-32le", "[\udc80]", b"\xbc\xbd\xbe\xbf"),
+                ("utf-32be", "[\udc80]", b"\xbc\xbd\xbe\xbf"),
+        ):
+            with self.subTest(encoding=enc):
+                res = input.encode(enc, "test.replacing")
+                self.assertEqual(res, "[".encode(enc) + repl + "]".encode(enc))
+
+    def test_encode_odd_bytes_replacement(self):
+        def handle(exc):
+            if isinstance(exc, UnicodeEncodeError):
+                return (repl, exc.end)
+            raise TypeError("don't know how to handle %r" % exc)
+        codecs.register_error("test.replacing", handle)
+
+        input = "[\udc80]"
+        # Tests in which the replacement bytestring contains not whole number
+        # of code units.
+        for enc, repl in (
+            *itertools.product(("utf-16le", "utf-16be"),
+                               [b"a", b"abc"]),
+            *itertools.product(("utf-32le", "utf-32be"),
+                               [b"a", b"ab", b"abc", b"abcde"]),
+        ):
+            with self.subTest(encoding=enc, repl=repl):
+                with self.assertRaises(UnicodeEncodeError) as cm:
+                    input.encode(enc, "test.replacing")
+                exc = cm.exception
+                self.assertEqual(exc.start, 1)
+                self.assertEqual(exc.end, 2)
+                self.assertEqual(exc.object, input)
+                self.assertEqual(exc.reason, "surrogates not allowed")
 
     # TODO: RUSTPYTHON
     @unittest.expectedFailure
@@ -968,6 +1065,68 @@ def __getitem__(self, key):
             self.assertRaises(ValueError, codecs.charmap_encode, "\xff", err, D())
             self.assertRaises(TypeError, codecs.charmap_encode, "\xff", err, {0xff: 300})
 
+    def test_decodehelper_bug36819(self):
+        handler = RepeatedPosReturn("x")
+        codecs.register_error("test.bug36819", handler.handle)
+
+        testcases = [
+            ("ascii", b"\xff"),
+            ("utf-8", b"\xff"),
+            ("utf-16be", b'\xdc\x80'),
+            ("utf-32be", b'\x00\x00\xdc\x80'),
+            ("iso-8859-6", b"\xff"),
+        ]
+        for enc, bad in testcases:
+            input = "abcd".encode(enc) + bad
+            with self.subTest(encoding=enc):
+                handler.count = 50
+                decoded = input.decode(enc, "test.bug36819")
+                self.assertEqual(decoded, 'abcdx' * 51)
+
+    def test_encodehelper_bug36819(self):
+        handler = RepeatedPosReturn()
+        codecs.register_error("test.bug36819", handler.handle)
+
+        input = "abcd\udc80"
+        encodings = ["ascii", "latin1", "utf-8", "utf-16", "utf-32"]  # built-in
+        encodings += ["iso-8859-15"]  # charmap codec
+        if sys.platform == 'win32':
+            encodings = ["mbcs", "oem"]  # code page codecs
+
+        handler.repl = "\udcff"
+        for enc in encodings:
+            with self.subTest(encoding=enc):
+                handler.count = 50
+                with self.assertRaises(UnicodeEncodeError) as cm:
+                    input.encode(enc, "test.bug36819")
+                exc = cm.exception
+                self.assertEqual(exc.start, 4)
+                self.assertEqual(exc.end, 5)
+                self.assertEqual(exc.object, input)
+        if sys.platform == "win32":
+            handler.count = 50
+            with self.assertRaises(UnicodeEncodeError) as cm:
+                codecs.code_page_encode(437, input, "test.bug36819")
+            exc = cm.exception
+            self.assertEqual(exc.start, 4)
+            self.assertEqual(exc.end, 5)
+            self.assertEqual(exc.object, input)
+
+        handler.repl = "x"
+        for enc in encodings:
+            with self.subTest(encoding=enc):
+                # The interpreter should segfault after a handful of attempts.
+                # 50 was chosen to try to ensure a segfault without a fix,
+                # but not OOM a machine with one.
+                handler.count = 50
+                encoded = input.encode(enc, "test.bug36819")
+                self.assertEqual(encoded.decode(enc), "abcdx" * 51)
+        if sys.platform == "win32":
+            handler.count = 50
+            encoded = codecs.code_page_encode(437, input, "test.bug36819")
+            self.assertEqual(encoded[0].decode(), "abcdx" * 51)
+            self.assertEqual(encoded[1], len(input))
+
     def test_translatehelper(self):
         # enhance coverage of:
         # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
@@ -32,7 +32,7 @@ def check(input, expect):
 # On small versions of Windows like Windows IoT or Windows Nano Server not all codepages are present
 def is_code_page_present(cp):
     from ctypes import POINTER, WINFUNCTYPE, WinDLL
-    from ctypes.wintypes import BOOL, UINT, BYTE, WCHAR, UINT, DWORD
+    from ctypes.wintypes import BOOL, BYTE, WCHAR, UINT, DWORD
 
     MAX_LEADBYTES = 12  # 5 ranges, 2 bytes ea., 0 term.
     MAX_DEFAULTCHAR = 2 # single or double byte