Skip to content

Commit ff7b96f

Browse files
committed
Update test_unicode* from CPython 3.10.5
1 parent b4c0a76 commit ff7b96f

File tree

3 files changed: 192 additions & 68 deletions

Lib/test/test_unicode.py

Lines changed: 181 additions & 51 deletions
Original file line number | Diff line number | Diff line change
@@ -11,11 +11,14 @@
1111
import operator
1212
import struct
1313
import sys
14+
import textwrap
1415
import unicodedata
1516
import unittest
1617
import warnings
18+
from test.support import import_helper
19+
from test.support import warnings_helper
1720
from test import support, string_tests
18-
from test.support import import_helper, warnings_helper
21+
from test.support.script_helper import assert_python_failure
1922

2023
# Error handling (bad decoder return)
2124
def search_function(encoding):
@@ -33,7 +36,6 @@ def decode2(input, errors="strict"):
3336
return (encode2, decode2, None, None)
3437
else:
3538
return None
36-
codecs.register(search_function)
3739

3840
def duplicate_string(text):
3941
"""
@@ -55,6 +57,10 @@ class UnicodeTest(string_tests.CommonTest,
5557

5658
type2test = str
5759

60+
def setUp(self):
61+
codecs.register(search_function)
62+
self.addCleanup(codecs.unregister, search_function)
63+
5864
def checkequalnofix(self, result, object, methodname, *args):
5965
method = getattr(object, methodname)
6066
realresult = method(*args)
@@ -505,6 +511,28 @@ def test_replace_id(self):
505511
text = 'abc def'
506512
self.assertIs(text.replace(pattern, pattern), text)
507513

514+
def test_repeat_id_preserving(self):
515+
a = '123abc1@'
516+
b = '456zyx-+'
517+
self.assertEqual(id(a), id(a))
518+
self.assertNotEqual(id(a), id(b))
519+
self.assertNotEqual(id(a), id(a * -4))
520+
self.assertNotEqual(id(a), id(a * 0))
521+
self.assertEqual(id(a), id(a * 1))
522+
self.assertEqual(id(a), id(1 * a))
523+
self.assertNotEqual(id(a), id(a * 2))
524+
525+
class SubStr(str):
526+
pass
527+
528+
s = SubStr('qwerty()')
529+
self.assertEqual(id(s), id(s))
530+
self.assertNotEqual(id(s), id(s * -4))
531+
self.assertNotEqual(id(s), id(s * 0))
532+
self.assertNotEqual(id(s), id(s * 1))
533+
self.assertNotEqual(id(s), id(1 * s))
534+
self.assertNotEqual(id(s), id(s * 2))
535+
508536
def test_bytes_comparison(self):
509537
with warnings_helper.check_warnings():
510538
warnings.simplefilter('ignore', BytesWarning)
@@ -728,6 +756,16 @@ def test_isidentifier(self):
728756
self.assertFalse("©".isidentifier())
729757
self.assertFalse("0".isidentifier())
730758

759+
@support.cpython_only
760+
@support.requires_legacy_unicode_capi
761+
def test_isidentifier_legacy(self):
762+
import _testcapi
763+
u = '𝖀𝖓𝖎𝖈𝖔𝖉𝖊'
764+
self.assertTrue(u.isidentifier())
765+
with warnings_helper.check_warnings():
766+
warnings.simplefilter('ignore', DeprecationWarning)
767+
self.assertTrue(_testcapi.unicode_legacy_string(u).isidentifier())
768+
731769
# TODO: RUSTPYTHON
732770
@unittest.expectedFailure
733771
def test_isprintable(self):
@@ -1103,6 +1141,12 @@ def __repr__(self):
11031141
self.assertEqual('{0:^8s}'.format('result'), ' result ')
11041142
self.assertEqual('{0:^9s}'.format('result'), ' result ')
11051143
self.assertEqual('{0:^10s}'.format('result'), ' result ')
1144+
self.assertEqual('{0:8s}'.format('result'), 'result ')
1145+
self.assertEqual('{0:0s}'.format('result'), 'result')
1146+
self.assertEqual('{0:08s}'.format('result'), 'result00')
1147+
self.assertEqual('{0:<08s}'.format('result'), 'result00')
1148+
self.assertEqual('{0:>08s}'.format('result'), '00result')
1149+
self.assertEqual('{0:^08s}'.format('result'), '0result0')
11061150
self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999)
11071151
self.assertEqual('{0:10000}'.format(''), ' ' * 10000)
11081152
self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000)
@@ -1230,8 +1274,11 @@ def __repr__(self):
12301274
0, 1, 2, 3, 4, 5, 6, 7)
12311275

12321276
# string format spec errors
1233-
self.assertRaises(ValueError, "{0:-s}".format, '')
1234-
self.assertRaises(ValueError, format, "", "-")
1277+
sign_msg = "Sign not allowed in string format specifier"
1278+
self.assertRaisesRegex(ValueError, sign_msg, "{0:-s}".format, '')
1279+
self.assertRaisesRegex(ValueError, sign_msg, format, "", "-")
1280+
space_msg = "Space not allowed in string format specifier"
1281+
self.assertRaisesRegex(ValueError, space_msg, "{: }".format, '')
12351282
self.assertRaises(ValueError, "{0:=s}".format, '')
12361283

12371284
# Alternate formatting is not supported
@@ -1789,7 +1836,7 @@ def test_issue8271(self):
17891836
# Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
17901837
# only the start byte and the continuation byte(s) are now considered
17911838
# invalid, instead of the number of bytes specified by the start byte.
1792-
# See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
1839+
# See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
17931840
# table 3-8, Row 2) for more information about the algorithm used.
17941841
FFFD = '\ufffd'
17951842
sequences = [
@@ -2247,22 +2294,6 @@ def test_concatenation(self):
22472294
self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
22482295
self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
22492296

2250-
def test_printing(self):
2251-
class BitBucket:
2252-
def write(self, text):
2253-
pass
2254-
2255-
out = BitBucket()
2256-
print('abc', file=out)
2257-
print('abc', 'def', file=out)
2258-
print('abc', 'def', file=out)
2259-
print('abc', 'def', file=out)
2260-
print('abc\n', file=out)
2261-
print('abc\n', end=' ', file=out)
2262-
print('abc\n', end=' ', file=out)
2263-
print('def\n', file=out)
2264-
print('def\n', file=out)
2265-
22662297
def test_ucs4(self):
22672298
x = '\U00100000'
22682299
y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
@@ -2400,19 +2431,22 @@ def test_getnewargs(self):
24002431
self.assertEqual(len(args), 1)
24012432

24022433
@support.cpython_only
2434+
@support.requires_legacy_unicode_capi
24032435
def test_resize(self):
24042436
from _testcapi import getargs_u
24052437
for length in range(1, 100, 7):
24062438
# generate a fresh string (refcount=1)
24072439
text = 'a' * length + 'b'
24082440

24092441
# fill wstr internal field
2410-
abc = getargs_u(text)
2442+
with self.assertWarns(DeprecationWarning):
2443+
abc = getargs_u(text)
24112444
self.assertEqual(abc, text)
24122445

24132446
# resize text: wstr field must be cleared and then recomputed
24142447
text += 'c'
2415-
abcdef = getargs_u(text)
2448+
with self.assertWarns(DeprecationWarning):
2449+
abcdef = getargs_u(text)
24162450
self.assertNotEqual(abc, abcdef)
24172451
self.assertEqual(abcdef, text)
24182452

@@ -2496,18 +2530,80 @@ def test_free_after_iterating(self):
24962530
support.check_free_after_iterating(self, iter, str)
24972531
support.check_free_after_iterating(self, reversed, str)
24982532

2533+
def test_check_encoding_errors(self):
2534+
# bpo-37388: str(bytes) and str.encode() must check encoding and errors
2535+
# arguments in dev mode
2536+
encodings = ('ascii', 'utf8', 'latin1')
2537+
invalid = 'Boom, Shaka Laka, Boom!'
2538+
code = textwrap.dedent(f'''
2539+
import sys
2540+
encodings = {encodings!r}
2541+
2542+
for data in (b'', b'short string'):
2543+
try:
2544+
str(data, encoding={invalid!r})
2545+
except LookupError:
2546+
pass
2547+
else:
2548+
sys.exit(21)
2549+
2550+
try:
2551+
str(data, errors={invalid!r})
2552+
except LookupError:
2553+
pass
2554+
else:
2555+
sys.exit(22)
2556+
2557+
for encoding in encodings:
2558+
try:
2559+
str(data, encoding, errors={invalid!r})
2560+
except LookupError:
2561+
pass
2562+
else:
2563+
sys.exit(22)
2564+
2565+
for data in ('', 'short string'):
2566+
try:
2567+
data.encode(encoding={invalid!r})
2568+
except LookupError:
2569+
pass
2570+
else:
2571+
sys.exit(23)
2572+
2573+
try:
2574+
data.encode(errors={invalid!r})
2575+
except LookupError:
2576+
pass
2577+
else:
2578+
sys.exit(24)
2579+
2580+
for encoding in encodings:
2581+
try:
2582+
data.encode(encoding, errors={invalid!r})
2583+
except LookupError:
2584+
pass
2585+
else:
2586+
sys.exit(24)
2587+
2588+
sys.exit(10)
2589+
''')
2590+
proc = assert_python_failure('-X', 'dev', '-c', code)
2591+
self.assertEqual(proc.rc, 10, proc)
2592+
24992593

25002594
class CAPITest(unittest.TestCase):
25012595

25022596
# Test PyUnicode_FromFormat()
25032597
def test_from_format(self):
25042598
import_helper.import_module('ctypes')
25052599
from ctypes import (
2600+
c_char_p,
25062601
pythonapi, py_object, sizeof,
25072602
c_int, c_long, c_longlong, c_ssize_t,
25082603
c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
25092604
name = "PyUnicode_FromFormat"
25102605
_PyUnicode_FromFormat = getattr(pythonapi, name)
2606+
_PyUnicode_FromFormat.argtypes = (c_char_p,)
25112607
_PyUnicode_FromFormat.restype = py_object
25122608

25132609
def PyUnicode_FromFormat(format, *args):
@@ -2807,15 +2903,43 @@ def test_asucs4(self):
28072903
for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600',
28082904
'a\ud800b\udfffc', '\ud834\udd1e']:
28092905
l = len(s)
2810-
self.assertEqual(unicode_asucs4(s, l, 1), s+'\0')
2811-
self.assertEqual(unicode_asucs4(s, l, 0), s+'\uffff')
2812-
self.assertEqual(unicode_asucs4(s, l+1, 1), s+'\0\uffff')
2813-
self.assertEqual(unicode_asucs4(s, l+1, 0), s+'\0\uffff')
2814-
self.assertRaises(SystemError, unicode_asucs4, s, l-1, 1)
2815-
self.assertRaises(SystemError, unicode_asucs4, s, l-2, 0)
2906+
self.assertEqual(unicode_asucs4(s, l, True), s+'\0')
2907+
self.assertEqual(unicode_asucs4(s, l, False), s+'\uffff')
2908+
self.assertEqual(unicode_asucs4(s, l+1, True), s+'\0\uffff')
2909+
self.assertEqual(unicode_asucs4(s, l+1, False), s+'\0\uffff')
2910+
self.assertRaises(SystemError, unicode_asucs4, s, l-1, True)
2911+
self.assertRaises(SystemError, unicode_asucs4, s, l-2, False)
28162912
s = '\0'.join([s, s])
2817-
self.assertEqual(unicode_asucs4(s, len(s), 1), s+'\0')
2818-
self.assertEqual(unicode_asucs4(s, len(s), 0), s+'\uffff')
2913+
self.assertEqual(unicode_asucs4(s, len(s), True), s+'\0')
2914+
self.assertEqual(unicode_asucs4(s, len(s), False), s+'\uffff')
2915+
2916+
# Test PyUnicode_AsUTF8()
2917+
@support.cpython_only
2918+
def test_asutf8(self):
2919+
from _testcapi import unicode_asutf8
2920+
2921+
bmp = '\u0100'
2922+
bmp2 = '\uffff'
2923+
nonbmp = chr(0x10ffff)
2924+
2925+
self.assertEqual(unicode_asutf8(bmp), b'\xc4\x80')
2926+
self.assertEqual(unicode_asutf8(bmp2), b'\xef\xbf\xbf')
2927+
self.assertEqual(unicode_asutf8(nonbmp), b'\xf4\x8f\xbf\xbf')
2928+
self.assertRaises(UnicodeEncodeError, unicode_asutf8, 'a\ud800b\udfffc')
2929+
2930+
# Test PyUnicode_AsUTF8AndSize()
2931+
@support.cpython_only
2932+
def test_asutf8andsize(self):
2933+
from _testcapi import unicode_asutf8andsize
2934+
2935+
bmp = '\u0100'
2936+
bmp2 = '\uffff'
2937+
nonbmp = chr(0x10ffff)
2938+
2939+
self.assertEqual(unicode_asutf8andsize(bmp), (b'\xc4\x80', 2))
2940+
self.assertEqual(unicode_asutf8andsize(bmp2), (b'\xef\xbf\xbf', 3))
2941+
self.assertEqual(unicode_asutf8andsize(nonbmp), (b'\xf4\x8f\xbf\xbf', 4))
2942+
self.assertRaises(UnicodeEncodeError, unicode_asutf8andsize, 'a\ud800b\udfffc')
28192943

28202944
# Test PyUnicode_FindChar()
28212945
@support.cpython_only
@@ -2884,32 +3008,38 @@ def test_copycharacters(self):
28843008
self.assertRaises(SystemError, unicode_copycharacters, s, 0, b'', 0, 0)
28853009

28863010
@support.cpython_only
3011+
@support.requires_legacy_unicode_capi
28873012
def test_encode_decimal(self):
28883013
from _testcapi import unicode_encodedecimal
2889-
self.assertEqual(unicode_encodedecimal('123'),
2890-
b'123')
2891-
self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'),
2892-
b'3.14')
2893-
self.assertEqual(unicode_encodedecimal("\N{EM SPACE}3.14\N{EN SPACE}"),
2894-
b' 3.14 ')
2895-
self.assertRaises(UnicodeEncodeError,
2896-
unicode_encodedecimal, "123\u20ac", "strict")
2897-
self.assertRaisesRegex(
2898-
ValueError,
2899-
"^'decimal' codec can't encode character",
2900-
unicode_encodedecimal, "123\u20ac", "replace")
3014+
with warnings_helper.check_warnings():
3015+
warnings.simplefilter('ignore', DeprecationWarning)
3016+
self.assertEqual(unicode_encodedecimal('123'),
3017+
b'123')
3018+
self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'),
3019+
b'3.14')
3020+
self.assertEqual(unicode_encodedecimal(
3021+
"\N{EM SPACE}3.14\N{EN SPACE}"), b' 3.14 ')
3022+
self.assertRaises(UnicodeEncodeError,
3023+
unicode_encodedecimal, "123\u20ac", "strict")
3024+
self.assertRaisesRegex(
3025+
ValueError,
3026+
"^'decimal' codec can't encode character",
3027+
unicode_encodedecimal, "123\u20ac", "replace")
29013028

29023029
@support.cpython_only
3030+
@support.requires_legacy_unicode_capi
29033031
def test_transform_decimal(self):
29043032
from _testcapi import unicode_transformdecimaltoascii as transform_decimal
2905-
self.assertEqual(transform_decimal('123'),
2906-
'123')
2907-
self.assertEqual(transform_decimal('\u0663.\u0661\u0664'),
2908-
'3.14')
2909-
self.assertEqual(transform_decimal("\N{EM SPACE}3.14\N{EN SPACE}"),
2910-
"\N{EM SPACE}3.14\N{EN SPACE}")
2911-
self.assertEqual(transform_decimal('123\u20ac'),
2912-
'123\u20ac')
3033+
with warnings_helper.check_warnings():
3034+
warnings.simplefilter('ignore', DeprecationWarning)
3035+
self.assertEqual(transform_decimal('123'),
3036+
'123')
3037+
self.assertEqual(transform_decimal('\u0663.\u0661\u0664'),
3038+
'3.14')
3039+
self.assertEqual(transform_decimal("\N{EM SPACE}3.14\N{EN SPACE}"),
3040+
"\N{EM SPACE}3.14\N{EN SPACE}")
3041+
self.assertEqual(transform_decimal('123\u20ac'),
3042+
'123\u20ac')
29133043

29143044
@support.cpython_only
29153045
def test_pep393_utf8_caching_bug(self):

0 commit comments

Comments
 (0)