11import codecs
22import html .entities
3+ import itertools
34import sys
45import unicodedata
56import unittest
@@ -22,6 +23,18 @@ def handle(self, exc):
2223 self .pos = len (exc .object )
2324 return ("<?>" , oldpos )
2425
class RepeatedPosReturn:
    """Codec error handler that can repeatedly rewind to a fixed position.

    While ``count`` is positive, each call to :meth:`handle` decrements it
    and returns ``pos`` as the resume position; once ``count`` has dropped
    to zero the handler returns ``exc.end`` instead.
    """

    def __init__(self, repl="<?>"):
        self.repl = repl   # replacement returned on every call
        self.pos = 0       # resume position returned while count > 0
        self.count = 0     # number of rewinds still to perform

    def handle(self, exc):
        """Return the ``(replacement, resume_position)`` pair for *exc*."""
        if self.count > 0:
            self.count -= 1
            return (self.repl, self.pos)
        return (self.repl, exc.end)
37+
2538# A UnicodeEncodeError object with a bad start attribute
2639class BadStartUnicodeEncodeError (UnicodeEncodeError ):
2740 def __init__ (self ):
@@ -783,20 +796,104 @@ def test_lookup(self):
783796 codecs .lookup_error ("namereplace" )
784797 )
785798
786- def test_unencodablereplacement (self ):
799+ def test_encode_nonascii_replacement (self ):
800+ def handle (exc ):
801+ if isinstance (exc , UnicodeEncodeError ):
802+ return (repl , exc .end )
803+ raise TypeError ("don't know how to handle %r" % exc )
804+ codecs .register_error ("test.replacing" , handle )
805+
806+ for enc , input , repl in (
807+ ("ascii" , "[¤]" , "abc" ),
808+ ("iso-8859-1" , "[€]" , "½¾" ),
809+ ("iso-8859-15" , "[¤]" , "œŸ" ),
810+ ):
811+ res = input .encode (enc , "test.replacing" )
812+ self .assertEqual (res , ("[" + repl + "]" ).encode (enc ))
813+
814+ for enc , input , repl in (
815+ ("utf-8" , "[\udc80 ]" , "\U0001f40d " ),
816+ ("utf-16" , "[\udc80 ]" , "\U0001f40d " ),
817+ ("utf-32" , "[\udc80 ]" , "\U0001f40d " ),
818+ ):
819+ with self .subTest (encoding = enc ):
820+ with self .assertRaises (UnicodeEncodeError ) as cm :
821+ input .encode (enc , "test.replacing" )
822+ exc = cm .exception
823+ self .assertEqual (exc .start , 1 )
824+ self .assertEqual (exc .end , 2 )
825+ self .assertEqual (exc .object , input )
826+
827+ def test_encode_unencodable_replacement (self ):
787828 def unencrepl (exc ):
788829 if isinstance (exc , UnicodeEncodeError ):
789- return (" \u4242 " , exc .end )
830+ return (repl , exc .end )
790831 else :
791832 raise TypeError ("don't know how to handle %r" % exc )
792833 codecs .register_error ("test.unencreplhandler" , unencrepl )
793- for enc in ("ascii" , "iso-8859-1" , "iso-8859-15" ):
794- self .assertRaises (
795- UnicodeEncodeError ,
796- "\u4242 " .encode ,
797- enc ,
798- "test.unencreplhandler"
799- )
834+
835+ for enc , input , repl in (
836+ ("ascii" , "[¤]" , "½" ),
837+ ("iso-8859-1" , "[€]" , "œ" ),
838+ ("iso-8859-15" , "[¤]" , "½" ),
839+ ("utf-8" , "[\udc80 ]" , "\udcff " ),
840+ ("utf-16" , "[\udc80 ]" , "\udcff " ),
841+ ("utf-32" , "[\udc80 ]" , "\udcff " ),
842+ ):
843+ with self .subTest (encoding = enc ):
844+ with self .assertRaises (UnicodeEncodeError ) as cm :
845+ input .encode (enc , "test.unencreplhandler" )
846+ exc = cm .exception
847+ self .assertEqual (exc .start , 1 )
848+ self .assertEqual (exc .end , 2 )
849+ self .assertEqual (exc .object , input )
850+
851+ def test_encode_bytes_replacement (self ):
852+ def handle (exc ):
853+ if isinstance (exc , UnicodeEncodeError ):
854+ return (repl , exc .end )
855+ raise TypeError ("don't know how to handle %r" % exc )
856+ codecs .register_error ("test.replacing" , handle )
857+
858+ # It works even if the bytes sequence is not decodable.
859+ for enc , input , repl in (
860+ ("ascii" , "[¤]" , b"\xbd \xbe " ),
861+ ("iso-8859-1" , "[€]" , b"\xbd \xbe " ),
862+ ("iso-8859-15" , "[¤]" , b"\xbd \xbe " ),
863+ ("utf-8" , "[\udc80 ]" , b"\xbd \xbe " ),
864+ ("utf-16le" , "[\udc80 ]" , b"\xbd \xbe " ),
865+ ("utf-16be" , "[\udc80 ]" , b"\xbd \xbe " ),
866+ ("utf-32le" , "[\udc80 ]" , b"\xbc \xbd \xbe \xbf " ),
867+ ("utf-32be" , "[\udc80 ]" , b"\xbc \xbd \xbe \xbf " ),
868+ ):
869+ with self .subTest (encoding = enc ):
870+ res = input .encode (enc , "test.replacing" )
871+ self .assertEqual (res , "[" .encode (enc ) + repl + "]" .encode (enc ))
872+
873+ def test_encode_odd_bytes_replacement (self ):
874+ def handle (exc ):
875+ if isinstance (exc , UnicodeEncodeError ):
876+ return (repl , exc .end )
877+ raise TypeError ("don't know how to handle %r" % exc )
878+ codecs .register_error ("test.replacing" , handle )
879+
880+ input = "[\udc80 ]"
881+ # Tests in which the replacement bytestring contains not whole number
882+ # of code units.
883+ for enc , repl in (
884+ * itertools .product (("utf-16le" , "utf-16be" ),
885+ [b"a" , b"abc" ]),
886+ * itertools .product (("utf-32le" , "utf-32be" ),
887+ [b"a" , b"ab" , b"abc" , b"abcde" ]),
888+ ):
889+ with self .subTest (encoding = enc , repl = repl ):
890+ with self .assertRaises (UnicodeEncodeError ) as cm :
891+ input .encode (enc , "test.replacing" )
892+ exc = cm .exception
893+ self .assertEqual (exc .start , 1 )
894+ self .assertEqual (exc .end , 2 )
895+ self .assertEqual (exc .object , input )
896+ self .assertEqual (exc .reason , "surrogates not allowed" )
800897
801898 def test_badregistercall (self ):
802899 # enhance coverage of:
@@ -940,6 +1037,68 @@ def __getitem__(self, key):
9401037 self .assertRaises (ValueError , codecs .charmap_encode , "\xff " , err , D ())
9411038 self .assertRaises (TypeError , codecs .charmap_encode , "\xff " , err , {0xff : 300 })
9421039
def test_decodehelper_bug36819(self):
    """Decode with a handler that rewinds to position 0 fifty times.

    Each rewind makes the decoder emit "abcd" plus the replacement "x"
    again, so the expected result is "abcdx" repeated 51 times.
    """
    handler = RepeatedPosReturn("x")
    codecs.register_error("test.bug36819", handler.handle)

    testcases = [
        ("ascii", b"\xff"),
        ("utf-8", b"\xff"),
        ("utf-16be", b'\xdc\x80'),
        ("utf-32be", b'\x00\x00\xdc\x80'),
        ("iso-8859-6", b"\xff"),
    ]
    for enc, bad in testcases:
        data = "abcd".encode(enc) + bad
        with self.subTest(encoding=enc):
            # Re-arm the handler: 50 rewinds, then one normal resume.
            handler.count = 50
            decoded = data.decode(enc, "test.bug36819")
            self.assertEqual(decoded, 'abcdx' * 51)
1057+
def test_encodehelper_bug36819(self):
    """Encode with a handler that rewinds to position 0 fifty times.

    First the handler returns an unencodable replacement, which is
    expected to raise UnicodeEncodeError for the original lone surrogate;
    then it returns "x", which is expected to yield "abcdx" repeated 51
    times after decoding the result.
    """
    handler = RepeatedPosReturn()
    codecs.register_error("test.bug36819", handler.handle)

    text = "abcd\udc80"
    encodings = ["ascii", "latin1", "utf-8", "utf-16", "utf-32"]  # built-in
    encodings += ["iso-8859-15"]  # charmap codec
    if sys.platform == 'win32':
        # Extend rather than overwrite, so the codecs listed above are
        # still exercised on Windows.
        encodings += ["mbcs", "oem"]  # code page codecs

    handler.repl = "\udcff"
    for enc in encodings:
        with self.subTest(encoding=enc):
            handler.count = 50
            with self.assertRaises(UnicodeEncodeError) as cm:
                text.encode(enc, "test.bug36819")
            exc = cm.exception
            self.assertEqual(exc.start, 4)
            self.assertEqual(exc.end, 5)
            self.assertEqual(exc.object, text)
    if sys.platform == "win32":
        handler.count = 50
        with self.assertRaises(UnicodeEncodeError) as cm:
            codecs.code_page_encode(437, text, "test.bug36819")
        exc = cm.exception
        self.assertEqual(exc.start, 4)
        self.assertEqual(exc.end, 5)
        self.assertEqual(exc.object, text)

    handler.repl = "x"
    for enc in encodings:
        with self.subTest(encoding=enc):
            # The interpreter should segfault after a handful of attempts.
            # 50 was chosen to try to ensure a segfault without a fix,
            # but not OOM a machine with one.
            handler.count = 50
            encoded = text.encode(enc, "test.bug36819")
            self.assertEqual(encoded.decode(enc), "abcdx" * 51)
    if sys.platform == "win32":
        handler.count = 50
        encoded = codecs.code_page_encode(437, text, "test.bug36819")
        self.assertEqual(encoded[0].decode(), "abcdx" * 51)
        self.assertEqual(encoded[1], len(text))
1101+
9431102 def test_translatehelper (self ):
9441103 # enhance coverage of:
9451104 # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
0 commit comments