1111import operator
1212import struct
1313import sys
14+ import textwrap
1415import unicodedata
1516import unittest
1617import warnings
18+ from test .support import import_helper
19+ from test .support import warnings_helper
1720from test import support , string_tests
18- from test .support import import_helper , warnings_helper
21+ from test .support . script_helper import assert_python_failure
1922
2023# Error handling (bad decoder return)
2124def search_function (encoding ):
@@ -33,7 +36,6 @@ def decode2(input, errors="strict"):
3336 return (encode2 , decode2 , None , None )
3437 else :
3538 return None
36- codecs .register (search_function )
3739
3840def duplicate_string (text ):
3941 """
@@ -55,6 +57,10 @@ class UnicodeTest(string_tests.CommonTest,
5557
5658 type2test = str
5759
def setUp(self):
    """Register the module's codec search function for the current test.

    The registration is paired with a cleanup callback, so the search
    function is unregistered again once the test has finished.
    """
    hook = search_function
    codecs.register(hook)
    # Scheduled only after a successful register(), mirroring the
    # register -> unregister pairing.
    self.addCleanup(codecs.unregister, hook)
63+
5864 def checkequalnofix (self , result , object , methodname , * args ):
5965 method = getattr (object , methodname )
6066 realresult = method (* args )
@@ -505,6 +511,28 @@ def test_replace_id(self):
505511 text = 'abc def'
506512 self .assertIs (text .replace (pattern , pattern ), text )
507513
514+ def test_repeat_id_preserving (self ):
515+ a = '123abc1@'
516+ b = '456zyx-+'
517+ self .assertEqual (id (a ), id (a ))
518+ self .assertNotEqual (id (a ), id (b ))
519+ self .assertNotEqual (id (a ), id (a * - 4 ))
520+ self .assertNotEqual (id (a ), id (a * 0 ))
521+ self .assertEqual (id (a ), id (a * 1 ))
522+ self .assertEqual (id (a ), id (1 * a ))
523+ self .assertNotEqual (id (a ), id (a * 2 ))
524+
525+ class SubStr (str ):
526+ pass
527+
528+ s = SubStr ('qwerty()' )
529+ self .assertEqual (id (s ), id (s ))
530+ self .assertNotEqual (id (s ), id (s * - 4 ))
531+ self .assertNotEqual (id (s ), id (s * 0 ))
532+ self .assertNotEqual (id (s ), id (s * 1 ))
533+ self .assertNotEqual (id (s ), id (1 * s ))
534+ self .assertNotEqual (id (s ), id (s * 2 ))
535+
508536 def test_bytes_comparison (self ):
509537 with warnings_helper .check_warnings ():
510538 warnings .simplefilter ('ignore' , BytesWarning )
@@ -728,6 +756,16 @@ def test_isidentifier(self):
728756 self .assertFalse ("©" .isidentifier ())
729757 self .assertFalse ("0" .isidentifier ())
730758
@support.cpython_only
@support.requires_legacy_unicode_capi
def test_isidentifier_legacy(self):
    """``isidentifier()`` must hold for a non-BMP identifier in both forms.

    The check is made on the ordinary string and on the object returned by
    ``_testcapi.unicode_legacy_string`` for the same text.
    """
    import _testcapi
    ident = '𝖀𝖓𝖎𝖈𝖔𝖉𝖊'
    self.assertTrue(ident.isidentifier())
    with warnings_helper.check_warnings():
        # DeprecationWarning is silenced while the legacy helper is used.
        warnings.simplefilter('ignore', DeprecationWarning)
        legacy = _testcapi.unicode_legacy_string(ident)
        self.assertTrue(legacy.isidentifier())
768+
731769 # TODO: RUSTPYTHON
732770 @unittest .expectedFailure
733771 def test_isprintable (self ):
@@ -1103,6 +1141,12 @@ def __repr__(self):
11031141 self .assertEqual ('{0:^8s}' .format ('result' ), ' result ' )
11041142 self .assertEqual ('{0:^9s}' .format ('result' ), ' result ' )
11051143 self .assertEqual ('{0:^10s}' .format ('result' ), ' result ' )
1144+ self .assertEqual ('{0:8s}' .format ('result' ), 'result ' )
1145+ self .assertEqual ('{0:0s}' .format ('result' ), 'result' )
1146+ self .assertEqual ('{0:08s}' .format ('result' ), 'result00' )
1147+ self .assertEqual ('{0:<08s}' .format ('result' ), 'result00' )
1148+ self .assertEqual ('{0:>08s}' .format ('result' ), '00result' )
1149+ self .assertEqual ('{0:^08s}' .format ('result' ), '0result0' )
11061150 self .assertEqual ('{0:10000}' .format ('a' ), 'a' + ' ' * 9999 )
11071151 self .assertEqual ('{0:10000}' .format ('' ), ' ' * 10000 )
11081152 self .assertEqual ('{0:10000000}' .format ('' ), ' ' * 10000000 )
@@ -1230,8 +1274,11 @@ def __repr__(self):
12301274 0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 )
12311275
12321276 # string format spec errors
1233- self .assertRaises (ValueError , "{0:-s}" .format , '' )
1234- self .assertRaises (ValueError , format , "" , "-" )
1277+ sign_msg = "Sign not allowed in string format specifier"
1278+ self .assertRaisesRegex (ValueError , sign_msg , "{0:-s}" .format , '' )
1279+ self .assertRaisesRegex (ValueError , sign_msg , format , "" , "-" )
1280+ space_msg = "Space not allowed in string format specifier"
1281+ self .assertRaisesRegex (ValueError , space_msg , "{: }" .format , '' )
12351282 self .assertRaises (ValueError , "{0:=s}" .format , '' )
12361283
12371284 # Alternate formatting is not supported
@@ -1789,7 +1836,7 @@ def test_issue8271(self):
17891836 # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
17901837 # only the start byte and the continuation byte(s) are now considered
17911838 # invalid, instead of the number of bytes specified by the start byte.
1792- # See http ://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
1839+ # See https ://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
17931840 # table 3-8, Row 2) for more information about the algorithm used.
17941841 FFFD = '\ufffd '
17951842 sequences = [
@@ -2247,22 +2294,6 @@ def test_concatenation(self):
22472294 self .assertEqual (("abc" "def" "ghi" ), "abcdefghi" )
22482295 self .assertEqual (("abc" "def" "ghi" ), "abcdefghi" )
22492296
2250- def test_printing (self ):
2251- class BitBucket :
2252- def write (self , text ):
2253- pass
2254-
2255- out = BitBucket ()
2256- print ('abc' , file = out )
2257- print ('abc' , 'def' , file = out )
2258- print ('abc' , 'def' , file = out )
2259- print ('abc' , 'def' , file = out )
2260- print ('abc\n ' , file = out )
2261- print ('abc\n ' , end = ' ' , file = out )
2262- print ('abc\n ' , end = ' ' , file = out )
2263- print ('def\n ' , file = out )
2264- print ('def\n ' , file = out )
2265-
22662297 def test_ucs4 (self ):
22672298 x = '\U00100000 '
22682299 y = x .encode ("raw-unicode-escape" ).decode ("raw-unicode-escape" )
@@ -2400,19 +2431,22 @@ def test_getnewargs(self):
24002431 self .assertEqual (len (args ), 1 )
24012432
24022433 @support .cpython_only
2434+ @support .requires_legacy_unicode_capi
24032435 def test_resize (self ):
24042436 from _testcapi import getargs_u
24052437 for length in range (1 , 100 , 7 ):
24062438 # generate a fresh string (refcount=1)
24072439 text = 'a' * length + 'b'
24082440
24092441 # fill wstr internal field
2410- abc = getargs_u (text )
2442+ with self .assertWarns (DeprecationWarning ):
2443+ abc = getargs_u (text )
24112444 self .assertEqual (abc , text )
24122445
24132446 # resize text: wstr field must be cleared and then recomputed
24142447 text += 'c'
2415- abcdef = getargs_u (text )
2448+ with self .assertWarns (DeprecationWarning ):
2449+ abcdef = getargs_u (text )
24162450 self .assertNotEqual (abc , abcdef )
24172451 self .assertEqual (abcdef , text )
24182452
@@ -2496,18 +2530,80 @@ def test_free_after_iterating(self):
24962530 support .check_free_after_iterating (self , iter , str )
24972531 support .check_free_after_iterating (self , reversed , str )
24982532
def test_check_encoding_errors(self):
    """Check that dev mode validates ``encoding`` and ``errors`` names.

    A child interpreter is started with ``-X dev`` and runs a script that
    passes a bogus codec name or error-handler name to ``str(bytes, ...)``
    and to ``str.encode()``, for empty and non-empty data.  Every such call
    is expected to raise ``LookupError``.  The script exits with a distinct
    status (21-26) at the first call that does not raise, and with status
    10 when every check passed.
    """
    # bpo-37388: str(bytes, ...) and str.encode() must check encoding and
    # errors arguments in dev mode
    encodings = ('ascii', 'utf8', 'latin1')
    invalid = 'Boom, Shaka Laka, Boom!'
    code = textwrap.dedent(f'''
        import sys
        encodings = {encodings!r}

        for data in (b'', b'short string'):
            try:
                str(data, encoding={invalid!r})
            except LookupError:
                pass
            else:
                sys.exit(21)

            try:
                str(data, errors={invalid!r})
            except LookupError:
                pass
            else:
                sys.exit(22)

            for encoding in encodings:
                try:
                    str(data, encoding, errors={invalid!r})
                except LookupError:
                    pass
                else:
                    sys.exit(23)

        for data in ('', 'short string'):
            try:
                data.encode(encoding={invalid!r})
            except LookupError:
                pass
            else:
                sys.exit(24)

            try:
                data.encode(errors={invalid!r})
            except LookupError:
                pass
            else:
                sys.exit(25)

            for encoding in encodings:
                try:
                    data.encode(encoding, errors={invalid!r})
                except LookupError:
                    pass
                else:
                    sys.exit(26)

        sys.exit(10)
    ''')
    proc = assert_python_failure('-X', 'dev', '-c', code)
    # 10 is the script's "all checks passed" status; any of 21-26 names the
    # single check that failed to raise.
    self.assertEqual(proc.rc, 10, proc)
2592+
24992593
25002594class CAPITest (unittest .TestCase ):
25012595
25022596 # Test PyUnicode_FromFormat()
25032597 def test_from_format (self ):
25042598 import_helper .import_module ('ctypes' )
25052599 from ctypes import (
2600+ c_char_p ,
25062601 pythonapi , py_object , sizeof ,
25072602 c_int , c_long , c_longlong , c_ssize_t ,
25082603 c_uint , c_ulong , c_ulonglong , c_size_t , c_void_p )
25092604 name = "PyUnicode_FromFormat"
25102605 _PyUnicode_FromFormat = getattr (pythonapi , name )
2606+ _PyUnicode_FromFormat .argtypes = (c_char_p ,)
25112607 _PyUnicode_FromFormat .restype = py_object
25122608
25132609 def PyUnicode_FromFormat (format , * args ):
@@ -2807,15 +2903,43 @@ def test_asucs4(self):
28072903 for s in ['abc' , '\xa1 \xa2 ' , '\u4f60 \u597d ' , 'a\U0001f600 ' ,
28082904 'a\ud800 b\udfff c' , '\ud834 \udd1e ' ]:
28092905 l = len (s )
2810- self .assertEqual (unicode_asucs4 (s , l , 1 ), s + '\0 ' )
2811- self .assertEqual (unicode_asucs4 (s , l , 0 ), s + '\uffff ' )
2812- self .assertEqual (unicode_asucs4 (s , l + 1 , 1 ), s + '\0 \uffff ' )
2813- self .assertEqual (unicode_asucs4 (s , l + 1 , 0 ), s + '\0 \uffff ' )
2814- self .assertRaises (SystemError , unicode_asucs4 , s , l - 1 , 1 )
2815- self .assertRaises (SystemError , unicode_asucs4 , s , l - 2 , 0 )
2906+ self .assertEqual (unicode_asucs4 (s , l , True ), s + '\0 ' )
2907+ self .assertEqual (unicode_asucs4 (s , l , False ), s + '\uffff ' )
2908+ self .assertEqual (unicode_asucs4 (s , l + 1 , True ), s + '\0 \uffff ' )
2909+ self .assertEqual (unicode_asucs4 (s , l + 1 , False ), s + '\0 \uffff ' )
2910+ self .assertRaises (SystemError , unicode_asucs4 , s , l - 1 , True )
2911+ self .assertRaises (SystemError , unicode_asucs4 , s , l - 2 , False )
28162912 s = '\0 ' .join ([s , s ])
2817- self .assertEqual (unicode_asucs4 (s , len (s ), 1 ), s + '\0 ' )
2818- self .assertEqual (unicode_asucs4 (s , len (s ), 0 ), s + '\uffff ' )
2913+ self .assertEqual (unicode_asucs4 (s , len (s ), True ), s + '\0 ' )
2914+ self .assertEqual (unicode_asucs4 (s , len (s ), False ), s + '\uffff ' )
2915+
2916+ # Test PyUnicode_AsUTF8()
@support.cpython_only
def test_asutf8(self):
    """Check the bytes returned by ``_testcapi.unicode_asutf8``.

    One 2-byte, one 3-byte and one 4-byte UTF-8 sequence are compared with
    their expected encodings, and a string containing lone surrogates must
    raise ``UnicodeEncodeError``.
    """
    from _testcapi import unicode_asutf8

    bmp = '\u0100'
    bmp2 = '\uffff'
    nonbmp = chr(0x10ffff)

    self.assertEqual(unicode_asutf8(bmp), b'\xc4\x80')
    self.assertEqual(unicode_asutf8(bmp2), b'\xef\xbf\xbf')
    self.assertEqual(unicode_asutf8(nonbmp), b'\xf4\x8f\xbf\xbf')
    # Lone surrogates cannot be encoded to UTF-8.
    self.assertRaises(UnicodeEncodeError, unicode_asutf8, 'a\ud800b\udfffc')
2929+
2930+ # Test PyUnicode_AsUTF8AndSize()
@support.cpython_only
def test_asutf8andsize(self):
    """Check the ``(bytes, size)`` pair from ``_testcapi.unicode_asutf8andsize``.

    The reported size must equal the length of the UTF-8 encoding (2, 3 and
    4 bytes for the three samples), and a string containing lone surrogates
    must raise ``UnicodeEncodeError``.
    """
    from _testcapi import unicode_asutf8andsize

    bmp = '\u0100'
    bmp2 = '\uffff'
    nonbmp = chr(0x10ffff)

    self.assertEqual(unicode_asutf8andsize(bmp), (b'\xc4\x80', 2))
    self.assertEqual(unicode_asutf8andsize(bmp2), (b'\xef\xbf\xbf', 3))
    self.assertEqual(unicode_asutf8andsize(nonbmp), (b'\xf4\x8f\xbf\xbf', 4))
    # Lone surrogates cannot be encoded to UTF-8.
    self.assertRaises(UnicodeEncodeError, unicode_asutf8andsize, 'a\ud800b\udfffc')
28192943
28202944 # Test PyUnicode_FindChar()
28212945 @support .cpython_only
@@ -2884,32 +3008,38 @@ def test_copycharacters(self):
28843008 self .assertRaises (SystemError , unicode_copycharacters , s , 0 , b'' , 0 , 0 )
28853009
@support.cpython_only
@support.requires_legacy_unicode_capi
def test_encode_decimal(self):
    """Check ``_testcapi.unicode_encodedecimal`` on digits, spaces and errors.

    ASCII digits pass through, the non-ASCII decimal digits U+0663, U+0661
    and U+0664 become ``3``, ``1`` and ``4``, and EM/EN spaces become ASCII
    spaces.  A euro sign raises ``UnicodeEncodeError`` with ``"strict"`` and
    ``ValueError`` with ``"replace"``.  ``DeprecationWarning`` is ignored
    for the whole block.
    """
    from _testcapi import unicode_encodedecimal
    with warnings_helper.check_warnings():
        warnings.simplefilter('ignore', DeprecationWarning)
        self.assertEqual(unicode_encodedecimal('123'),
                         b'123')
        self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'),
                         b'3.14')
        self.assertEqual(unicode_encodedecimal(
                         "\N{EM SPACE}3.14\N{EN SPACE}"), b' 3.14 ')
        self.assertRaises(UnicodeEncodeError,
                          unicode_encodedecimal, "123\u20ac", "strict")
        self.assertRaisesRegex(
            ValueError,
            "^'decimal' codec can't encode character",
            unicode_encodedecimal, "123\u20ac", "replace")
29013028
@support.cpython_only
@support.requires_legacy_unicode_capi
def test_transform_decimal(self):
    """Check ``_testcapi.unicode_transformdecimaltoascii`` on sample strings.

    ASCII digits pass through and the non-ASCII decimal digits U+0663,
    U+0661 and U+0664 become ``3``, ``1`` and ``4``.  EM/EN spaces and the
    euro sign are returned unchanged.  ``DeprecationWarning`` is ignored for
    the whole block.
    """
    from _testcapi import unicode_transformdecimaltoascii as transform_decimal
    with warnings_helper.check_warnings():
        warnings.simplefilter('ignore', DeprecationWarning)
        self.assertEqual(transform_decimal('123'),
                         '123')
        self.assertEqual(transform_decimal('\u0663.\u0661\u0664'),
                         '3.14')
        # Non-digit characters are expected to come back untouched.
        self.assertEqual(transform_decimal("\N{EM SPACE}3.14\N{EN SPACE}"),
                         "\N{EM SPACE}3.14\N{EN SPACE}")
        self.assertEqual(transform_decimal('123\u20ac'),
                         '123\u20ac')
29133043
29143044 @support .cpython_only
29153045 def test_pep393_utf8_caching_bug (self ):
0 commit comments