Skip to content

Commit ce0ac2b

Browse files
CPython developers authored and mumallaeng committed
Update encodings/test_codecs
1 parent 559442c commit ce0ac2b

File tree

6 files changed, with 182 additions and 22 deletions.

Lib/encodings/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -61,7 +61,8 @@ def normalize_encoding(encoding):
6161
if c.isalnum() or c == '.':
6262
if punct and chars:
6363
chars.append('_')
64-
chars.append(c)
64+
if c.isascii():
65+
chars.append(c)
6566
punct = False
6667
else:
6768
punct = True

Lib/encodings/aliases.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -450,6 +450,7 @@
450450

451451
# mac_latin2 codec
452452
'maccentraleurope' : 'mac_latin2',
453+
'mac_centeuro' : 'mac_latin2',
453454
'maclatin2' : 'mac_latin2',
454455

455456
# mac_roman codec
@@ -493,9 +494,6 @@
493494
'sjisx0213' : 'shift_jisx0213',
494495
's_jisx0213' : 'shift_jisx0213',
495496

496-
# tactis codec
497-
'tis260' : 'tactis',
498-
499497
# tis_620 codec
500498
'tis620' : 'tis_620',
501499
'tis_620_0' : 'tis_620',

Lib/encodings/raw_unicode_escape.py

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -21,15 +21,16 @@ class IncrementalEncoder(codecs.IncrementalEncoder):
2121
def encode(self, input, final=False):
2222
return codecs.raw_unicode_escape_encode(input, self.errors)[0]
2323

24-
class IncrementalDecoder(codecs.IncrementalDecoder):
25-
def decode(self, input, final=False):
26-
return codecs.raw_unicode_escape_decode(input, self.errors)[0]
24+
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
25+
def _buffer_decode(self, input, errors, final):
26+
return codecs.raw_unicode_escape_decode(input, errors, final)
2727

2828
class StreamWriter(Codec,codecs.StreamWriter):
2929
pass
3030

3131
class StreamReader(Codec,codecs.StreamReader):
32-
pass
32+
def decode(self, input, errors='strict'):
33+
return codecs.raw_unicode_escape_decode(input, errors, False)
3334

3435
### encodings module API
3536

Lib/encodings/unicode_escape.py

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -21,15 +21,16 @@ class IncrementalEncoder(codecs.IncrementalEncoder):
2121
def encode(self, input, final=False):
2222
return codecs.unicode_escape_encode(input, self.errors)[0]
2323

24-
class IncrementalDecoder(codecs.IncrementalDecoder):
25-
def decode(self, input, final=False):
26-
return codecs.unicode_escape_decode(input, self.errors)[0]
24+
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
25+
def _buffer_decode(self, input, errors, final):
26+
return codecs.unicode_escape_decode(input, errors, final)
2727

2828
class StreamWriter(Codec,codecs.StreamWriter):
2929
pass
3030

3131
class StreamReader(Codec,codecs.StreamReader):
32-
pass
32+
def decode(self, input, errors='strict'):
33+
return codecs.unicode_escape_decode(input, errors, False)
3334

3435
### encodings module API
3536

Lib/test/test_codeccallbacks.py

Lines changed: 168 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
import codecs
22
import html.entities
3+
import itertools
34
import sys
45
import unicodedata
56
import unittest
@@ -22,6 +23,18 @@ def handle(self, exc):
2223
self.pos = len(exc.object)
2324
return ("<?>", oldpos)
2425

26+
class RepeatedPosReturn:
27+
def __init__(self, repl="<?>"):
28+
self.repl = repl
29+
self.pos = 0
30+
self.count = 0
31+
32+
def handle(self, exc):
33+
if self.count > 0:
34+
self.count -= 1
35+
return (self.repl, self.pos)
36+
return (self.repl, exc.end)
37+
2538
# A UnicodeEncodeError object with a bad start attribute
2639
class BadStartUnicodeEncodeError(UnicodeEncodeError):
2740
def __init__(self):
@@ -807,20 +820,104 @@ def test_lookup(self):
807820
codecs.lookup_error("namereplace")
808821
)
809822

810-
def test_unencodablereplacement(self):
823+
def test_encode_nonascii_replacement(self):
824+
def handle(exc):
825+
if isinstance(exc, UnicodeEncodeError):
826+
return (repl, exc.end)
827+
raise TypeError("don't know how to handle %r" % exc)
828+
codecs.register_error("test.replacing", handle)
829+
830+
for enc, input, repl in (
831+
("ascii", "[¤]", "abc"),
832+
("iso-8859-1", "[€]", "½¾"),
833+
("iso-8859-15", "[¤]", "œŸ"),
834+
):
835+
res = input.encode(enc, "test.replacing")
836+
self.assertEqual(res, ("[" + repl + "]").encode(enc))
837+
838+
for enc, input, repl in (
839+
("utf-8", "[\udc80]", "\U0001f40d"),
840+
("utf-16", "[\udc80]", "\U0001f40d"),
841+
("utf-32", "[\udc80]", "\U0001f40d"),
842+
):
843+
with self.subTest(encoding=enc):
844+
with self.assertRaises(UnicodeEncodeError) as cm:
845+
input.encode(enc, "test.replacing")
846+
exc = cm.exception
847+
self.assertEqual(exc.start, 1)
848+
self.assertEqual(exc.end, 2)
849+
self.assertEqual(exc.object, input)
850+
851+
def test_encode_unencodable_replacement(self):
811852
def unencrepl(exc):
812853
if isinstance(exc, UnicodeEncodeError):
813-
return ("\u4242", exc.end)
854+
return (repl, exc.end)
814855
else:
815856
raise TypeError("don't know how to handle %r" % exc)
816857
codecs.register_error("test.unencreplhandler", unencrepl)
817-
for enc in ("ascii", "iso-8859-1", "iso-8859-15"):
818-
self.assertRaises(
819-
UnicodeEncodeError,
820-
"\u4242".encode,
821-
enc,
822-
"test.unencreplhandler"
823-
)
858+
859+
for enc, input, repl in (
860+
("ascii", "[¤]", "½"),
861+
("iso-8859-1", "[€]", "œ"),
862+
("iso-8859-15", "[¤]", "½"),
863+
("utf-8", "[\udc80]", "\udcff"),
864+
("utf-16", "[\udc80]", "\udcff"),
865+
("utf-32", "[\udc80]", "\udcff"),
866+
):
867+
with self.subTest(encoding=enc):
868+
with self.assertRaises(UnicodeEncodeError) as cm:
869+
input.encode(enc, "test.unencreplhandler")
870+
exc = cm.exception
871+
self.assertEqual(exc.start, 1)
872+
self.assertEqual(exc.end, 2)
873+
self.assertEqual(exc.object, input)
874+
875+
def test_encode_bytes_replacement(self):
876+
def handle(exc):
877+
if isinstance(exc, UnicodeEncodeError):
878+
return (repl, exc.end)
879+
raise TypeError("don't know how to handle %r" % exc)
880+
codecs.register_error("test.replacing", handle)
881+
882+
# It works even if the bytes sequence is not decodable.
883+
for enc, input, repl in (
884+
("ascii", "[¤]", b"\xbd\xbe"),
885+
("iso-8859-1", "[€]", b"\xbd\xbe"),
886+
("iso-8859-15", "[¤]", b"\xbd\xbe"),
887+
("utf-8", "[\udc80]", b"\xbd\xbe"),
888+
("utf-16le", "[\udc80]", b"\xbd\xbe"),
889+
("utf-16be", "[\udc80]", b"\xbd\xbe"),
890+
("utf-32le", "[\udc80]", b"\xbc\xbd\xbe\xbf"),
891+
("utf-32be", "[\udc80]", b"\xbc\xbd\xbe\xbf"),
892+
):
893+
with self.subTest(encoding=enc):
894+
res = input.encode(enc, "test.replacing")
895+
self.assertEqual(res, "[".encode(enc) + repl + "]".encode(enc))
896+
897+
def test_encode_odd_bytes_replacement(self):
898+
def handle(exc):
899+
if isinstance(exc, UnicodeEncodeError):
900+
return (repl, exc.end)
901+
raise TypeError("don't know how to handle %r" % exc)
902+
codecs.register_error("test.replacing", handle)
903+
904+
input = "[\udc80]"
905+
# Tests in which the replacement bytestring contains not whole number
906+
# of code units.
907+
for enc, repl in (
908+
*itertools.product(("utf-16le", "utf-16be"),
909+
[b"a", b"abc"]),
910+
*itertools.product(("utf-32le", "utf-32be"),
911+
[b"a", b"ab", b"abc", b"abcde"]),
912+
):
913+
with self.subTest(encoding=enc, repl=repl):
914+
with self.assertRaises(UnicodeEncodeError) as cm:
915+
input.encode(enc, "test.replacing")
916+
exc = cm.exception
917+
self.assertEqual(exc.start, 1)
918+
self.assertEqual(exc.end, 2)
919+
self.assertEqual(exc.object, input)
920+
self.assertEqual(exc.reason, "surrogates not allowed")
824921

825922
# TODO: RUSTPYTHON
826923
@unittest.expectedFailure
@@ -968,6 +1065,68 @@ def __getitem__(self, key):
9681065
self.assertRaises(ValueError, codecs.charmap_encode, "\xff", err, D())
9691066
self.assertRaises(TypeError, codecs.charmap_encode, "\xff", err, {0xff: 300})
9701067

1068+
def test_decodehelper_bug36819(self):
1069+
handler = RepeatedPosReturn("x")
1070+
codecs.register_error("test.bug36819", handler.handle)
1071+
1072+
testcases = [
1073+
("ascii", b"\xff"),
1074+
("utf-8", b"\xff"),
1075+
("utf-16be", b'\xdc\x80'),
1076+
("utf-32be", b'\x00\x00\xdc\x80'),
1077+
("iso-8859-6", b"\xff"),
1078+
]
1079+
for enc, bad in testcases:
1080+
input = "abcd".encode(enc) + bad
1081+
with self.subTest(encoding=enc):
1082+
handler.count = 50
1083+
decoded = input.decode(enc, "test.bug36819")
1084+
self.assertEqual(decoded, 'abcdx' * 51)
1085+
1086+
def test_encodehelper_bug36819(self):
1087+
handler = RepeatedPosReturn()
1088+
codecs.register_error("test.bug36819", handler.handle)
1089+
1090+
input = "abcd\udc80"
1091+
encodings = ["ascii", "latin1", "utf-8", "utf-16", "utf-32"] # built-in
1092+
encodings += ["iso-8859-15"] # charmap codec
1093+
if sys.platform == 'win32':
1094+
encodings = ["mbcs", "oem"] # code page codecs
1095+
1096+
handler.repl = "\udcff"
1097+
for enc in encodings:
1098+
with self.subTest(encoding=enc):
1099+
handler.count = 50
1100+
with self.assertRaises(UnicodeEncodeError) as cm:
1101+
input.encode(enc, "test.bug36819")
1102+
exc = cm.exception
1103+
self.assertEqual(exc.start, 4)
1104+
self.assertEqual(exc.end, 5)
1105+
self.assertEqual(exc.object, input)
1106+
if sys.platform == "win32":
1107+
handler.count = 50
1108+
with self.assertRaises(UnicodeEncodeError) as cm:
1109+
codecs.code_page_encode(437, input, "test.bug36819")
1110+
exc = cm.exception
1111+
self.assertEqual(exc.start, 4)
1112+
self.assertEqual(exc.end, 5)
1113+
self.assertEqual(exc.object, input)
1114+
1115+
handler.repl = "x"
1116+
for enc in encodings:
1117+
with self.subTest(encoding=enc):
1118+
# The interpreter should segfault after a handful of attempts.
1119+
# 50 was chosen to try to ensure a segfault without a fix,
1120+
# but not OOM a machine with one.
1121+
handler.count = 50
1122+
encoded = input.encode(enc, "test.bug36819")
1123+
self.assertEqual(encoded.decode(enc), "abcdx" * 51)
1124+
if sys.platform == "win32":
1125+
handler.count = 50
1126+
encoded = codecs.code_page_encode(437, input, "test.bug36819")
1127+
self.assertEqual(encoded[0].decode(), "abcdx" * 51)
1128+
self.assertEqual(encoded[1], len(input))
1129+
9711130
def test_translatehelper(self):
9721131
# enhance coverage of:
9731132
# Objects/unicodeobject.c::unicode_encode_call_errorhandler()

Lib/test/test_codecs.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@ def check(input, expect):
3232
# On small versions of Windows like Windows IoT or Windows Nano Server not all codepages are present
3333
def is_code_page_present(cp):
3434
from ctypes import POINTER, WINFUNCTYPE, WinDLL
35-
from ctypes.wintypes import BOOL, UINT, BYTE, WCHAR, UINT, DWORD
35+
from ctypes.wintypes import BOOL, BYTE, WCHAR, UINT, DWORD
3636

3737
MAX_LEADBYTES = 12 # 5 ranges, 2 bytes ea., 0 term.
3838
MAX_DEFAULTCHAR = 2 # single or double byte

0 commit comments

Comments
 (0)