11import codecs
22import html .entities
3+ import itertools
34import sys
45import unicodedata
56import unittest
@@ -22,6 +23,18 @@ def handle(self, exc):
2223 self .pos = len (exc .object )
2324 return ("<?>" , oldpos )
2425
class RepeatedPosReturn:
    """Codec error handler that can hand back a stored position repeatedly.

    While ``count`` is positive, every call to ``handle`` decrements it and
    returns ``(repl, pos)``.  Once ``count`` is no longer positive,
    ``handle`` returns ``(repl, exc.end)`` instead.
    """

    def __init__(self, repl="<?>"):
        # ``pos`` and ``count`` start at zero; callers set them as needed.
        self.repl, self.pos, self.count = repl, 0, 0

    def handle(self, exc):
        """Return the ``(replacement, resume position)`` pair for *exc*."""
        if self.count <= 0:
            # No repeats left: resume right after the reported error.
            return (self.repl, exc.end)
        self.count -= 1
        return (self.repl, self.pos)
2538# A UnicodeEncodeError object with a bad start attribute
2639class BadStartUnicodeEncodeError (UnicodeEncodeError ):
2740 def __init__ (self ):
@@ -807,20 +820,104 @@ def test_lookup(self):
807820 codecs .lookup_error ("namereplace" )
808821 )
809822
810- def test_unencodablereplacement (self ):
823+ def test_encode_nonascii_replacement (self ):
824+ def handle (exc ):
825+ if isinstance (exc , UnicodeEncodeError ):
826+ return (repl , exc .end )
827+ raise TypeError ("don't know how to handle %r" % exc )
828+ codecs .register_error ("test.replacing" , handle )
829+
830+ for enc , input , repl in (
831+ ("ascii" , "[¤]" , "abc" ),
832+ ("iso-8859-1" , "[€]" , "½¾" ),
833+ ("iso-8859-15" , "[¤]" , "œŸ" ),
834+ ):
835+ res = input .encode (enc , "test.replacing" )
836+ self .assertEqual (res , ("[" + repl + "]" ).encode (enc ))
837+
838+ for enc , input , repl in (
839+ ("utf-8" , "[\udc80 ]" , "\U0001f40d " ),
840+ ("utf-16" , "[\udc80 ]" , "\U0001f40d " ),
841+ ("utf-32" , "[\udc80 ]" , "\U0001f40d " ),
842+ ):
843+ with self .subTest (encoding = enc ):
844+ with self .assertRaises (UnicodeEncodeError ) as cm :
845+ input .encode (enc , "test.replacing" )
846+ exc = cm .exception
847+ self .assertEqual (exc .start , 1 )
848+ self .assertEqual (exc .end , 2 )
849+ self .assertEqual (exc .object , input )
850+
851+ def test_encode_unencodable_replacement (self ):
811852 def unencrepl (exc ):
812853 if isinstance (exc , UnicodeEncodeError ):
813- return (" \u4242 " , exc .end )
854+ return (repl , exc .end )
814855 else :
815856 raise TypeError ("don't know how to handle %r" % exc )
816857 codecs .register_error ("test.unencreplhandler" , unencrepl )
817- for enc in ("ascii" , "iso-8859-1" , "iso-8859-15" ):
818- self .assertRaises (
819- UnicodeEncodeError ,
820- "\u4242 " .encode ,
821- enc ,
822- "test.unencreplhandler"
823- )
858+
859+ for enc , input , repl in (
860+ ("ascii" , "[¤]" , "½" ),
861+ ("iso-8859-1" , "[€]" , "œ" ),
862+ ("iso-8859-15" , "[¤]" , "½" ),
863+ ("utf-8" , "[\udc80 ]" , "\udcff " ),
864+ ("utf-16" , "[\udc80 ]" , "\udcff " ),
865+ ("utf-32" , "[\udc80 ]" , "\udcff " ),
866+ ):
867+ with self .subTest (encoding = enc ):
868+ with self .assertRaises (UnicodeEncodeError ) as cm :
869+ input .encode (enc , "test.unencreplhandler" )
870+ exc = cm .exception
871+ self .assertEqual (exc .start , 1 )
872+ self .assertEqual (exc .end , 2 )
873+ self .assertEqual (exc .object , input )
874+
875+ def test_encode_bytes_replacement (self ):
876+ def handle (exc ):
877+ if isinstance (exc , UnicodeEncodeError ):
878+ return (repl , exc .end )
879+ raise TypeError ("don't know how to handle %r" % exc )
880+ codecs .register_error ("test.replacing" , handle )
881+
882+ # It works even if the bytes sequence is not decodable.
883+ for enc , input , repl in (
884+ ("ascii" , "[¤]" , b"\xbd \xbe " ),
885+ ("iso-8859-1" , "[€]" , b"\xbd \xbe " ),
886+ ("iso-8859-15" , "[¤]" , b"\xbd \xbe " ),
887+ ("utf-8" , "[\udc80 ]" , b"\xbd \xbe " ),
888+ ("utf-16le" , "[\udc80 ]" , b"\xbd \xbe " ),
889+ ("utf-16be" , "[\udc80 ]" , b"\xbd \xbe " ),
890+ ("utf-32le" , "[\udc80 ]" , b"\xbc \xbd \xbe \xbf " ),
891+ ("utf-32be" , "[\udc80 ]" , b"\xbc \xbd \xbe \xbf " ),
892+ ):
893+ with self .subTest (encoding = enc ):
894+ res = input .encode (enc , "test.replacing" )
895+ self .assertEqual (res , "[" .encode (enc ) + repl + "]" .encode (enc ))
896+
897+ def test_encode_odd_bytes_replacement (self ):
898+ def handle (exc ):
899+ if isinstance (exc , UnicodeEncodeError ):
900+ return (repl , exc .end )
901+ raise TypeError ("don't know how to handle %r" % exc )
902+ codecs .register_error ("test.replacing" , handle )
903+
904+ input = "[\udc80 ]"
905+ # Tests in which the replacement bytestring contains not whole number
906+ # of code units.
907+ for enc , repl in (
908+ * itertools .product (("utf-16le" , "utf-16be" ),
909+ [b"a" , b"abc" ]),
910+ * itertools .product (("utf-32le" , "utf-32be" ),
911+ [b"a" , b"ab" , b"abc" , b"abcde" ]),
912+ ):
913+ with self .subTest (encoding = enc , repl = repl ):
914+ with self .assertRaises (UnicodeEncodeError ) as cm :
915+ input .encode (enc , "test.replacing" )
916+ exc = cm .exception
917+ self .assertEqual (exc .start , 1 )
918+ self .assertEqual (exc .end , 2 )
919+ self .assertEqual (exc .object , input )
920+ self .assertEqual (exc .reason , "surrogates not allowed" )
824921
825922 # TODO: RUSTPYTHON
826923 @unittest .expectedFailure
@@ -968,6 +1065,68 @@ def __getitem__(self, key):
9681065 self .assertRaises (ValueError , codecs .charmap_encode , "\xff " , err , D ())
9691066 self .assertRaises (TypeError , codecs .charmap_encode , "\xff " , err , {0xff : 300 })
9701067
def test_decodehelper_bug36819(self):
    """Decode with an error handler that keeps returning position 0.

    The handler is armed with ``count = 50`` for each encoding, so it
    returns ("x", 0) fifty times before returning ("x", exc.end).  The
    test expects the decoded text to be "abcdx" repeated 51 times.
    """
    handler = RepeatedPosReturn("x")
    codecs.register_error("test.bug36819", handler.handle)

    # (encoding, byte sequence appended to the input to trigger the handler)
    testcases = [
        ("ascii", b"\xff"),
        ("utf-8", b"\xff"),
        ("utf-16be", b'\xdc\x80'),
        ("utf-32be", b'\x00\x00\xdc\x80'),
        ("iso-8859-6", b"\xff"),
    ]
    for enc, bad in testcases:
        data = "abcd".encode(enc) + bad
        with self.subTest(encoding=enc):
            # Re-arm the handler; each handled error consumes one repeat.
            handler.count = 50
            decoded = data.decode(enc, "test.bug36819")
            self.assertEqual(decoded, 'abcdx' * 51)
def test_encodehelper_bug36819(self):
    """Encode with an error handler that keeps returning position 0.

    First pass: the handler's replacement is a lone surrogate and the test
    expects UnicodeEncodeError describing the original error.  Second
    pass: the replacement is "x" and the test expects the output to decode
    to "abcdx" repeated 51 times.
    """
    handler = RepeatedPosReturn()
    codecs.register_error("test.bug36819", handler.handle)

    text = "abcd\udc80"
    encodings = ["ascii", "latin1", "utf-8", "utf-16", "utf-32"]  # built-in
    encodings += ["iso-8859-15"]  # charmap codec
    if sys.platform == 'win32':
        # NOTE(review): this replaces the list rather than extending it, so
        # only the code page codecs run on Windows -- confirm intentional.
        encodings = ["mbcs", "oem"]  # code page codecs

    handler.repl = "\udcff"
    for enc in encodings:
        with self.subTest(encoding=enc):
            handler.count = 50
            with self.assertRaises(UnicodeEncodeError) as cm:
                text.encode(enc, "test.bug36819")
            exc = cm.exception
            self.assertEqual(exc.start, 4)
            self.assertEqual(exc.end, 5)
            self.assertEqual(exc.object, text)
    if sys.platform == "win32":
        handler.count = 50
        with self.assertRaises(UnicodeEncodeError) as cm:
            codecs.code_page_encode(437, text, "test.bug36819")
        exc = cm.exception
        self.assertEqual(exc.start, 4)
        self.assertEqual(exc.end, 5)
        self.assertEqual(exc.object, text)

    handler.repl = "x"
    for enc in encodings:
        with self.subTest(encoding=enc):
            # The interpreter should segfault after a handful of attempts.
            # 50 was chosen to try to ensure a segfault without a fix,
            # but not OOM a machine with one.
            handler.count = 50
            encoded = text.encode(enc, "test.bug36819")
            self.assertEqual(encoded.decode(enc), "abcdx" * 51)
    if sys.platform == "win32":
        handler.count = 50
        encoded = codecs.code_page_encode(437, text, "test.bug36819")
        # encoded[0] is decoded and encoded[1] is compared with len(text).
        self.assertEqual(encoded[0].decode(), "abcdx" * 51)
        self.assertEqual(encoded[1], len(text))
9711130 def test_translatehelper (self ):
9721131 # enhance coverage of:
9731132 # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
0 commit comments