1- """ Test script for the unicodedata module.
1+ """ Tests for the unicodedata module.
22
33 Written by Marc-Andre Lemburg (mal@lemburg.com).
44
55 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
66
77"""
88
9+ import hashlib
10+ from http .client import HTTPException
911import sys
12+ import unicodedata
1013import unittest
11- import hashlib
12- from test .support import script_helper
13-
14- encoding = 'utf-8'
15- errors = 'surrogatepass'
14+ from test .support import (open_urlresource , requires_resource , script_helper ,
15+ cpython_only , check_disallow_instantiation ,
16+ ResourceDenied )
1617
1718
18- ### Run tests
19-
2019class UnicodeMethodsTest (unittest .TestCase ):
2120
2221 # update this, if the database changes
23- expectedchecksum = '9129d6f2bdf008a81c2476e5b5127014a62130c1 '
22+ expectedchecksum = '4739770dd4d0e5f1b1677accfc3552ed3c8ef326 '
2423
25- # TODO: RUSTPYTHON
26- @unittest .expectedFailure
24+ @requires_resource ('cpu' )
2725 def test_method_checksum (self ):
2826 h = hashlib .sha1 ()
29- for i in range (0x10000 ):
27+ for i in range (sys . maxunicode + 1 ):
3028 char = chr (i )
3129 data = [
3230 # Predicates (single char)
@@ -63,33 +61,26 @@ def test_method_checksum(self):
6361 (char + 'ABC' ).title (),
6462
6563 ]
66- h .update ('' .join (data ).encode (encoding , errors ))
64+ h .update ('' .join (data ).encode ('utf-8' , 'surrogatepass' ))
6765 result = h .hexdigest ()
6866 self .assertEqual (result , self .expectedchecksum )
6967
7068class UnicodeDatabaseTest (unittest .TestCase ):
71-
72- def setUp (self ):
73- # In case unicodedata is not available, this will raise an ImportError,
74- # but the other test cases will still be run
75- import unicodedata
76- self .db = unicodedata
77-
78- def tearDown (self ):
79- del self .db
69+ db = unicodedata
8070
8171class UnicodeFunctionsTest (UnicodeDatabaseTest ):
8272
8373 # Update this if the database changes. Make sure to do a full rebuild
8474 # (e.g. 'make distclean && make') to get the correct checksum.
85- expectedchecksum = 'c44a49ca7c5cb6441640fe174ede604b45028652 '
75+ expectedchecksum = '98d602e1f69d5c5bb8a5910c40bbbad4e18e8370 '
8676 # TODO: RUSTPYTHON
8777 @unittest .expectedFailure
78+ @requires_resource ('cpu' )
8879 def test_function_checksum (self ):
8980 data = []
9081 h = hashlib .sha1 ()
9182
92- for i in range (0x10000 ):
83+ for i in range (sys . maxunicode + 1 ):
9384 char = chr (i )
9485 data = [
9586 # Properties
@@ -106,6 +97,13 @@ def test_function_checksum(self):
10697 result = h .hexdigest ()
10798 self .assertEqual (result , self .expectedchecksum )
10899
@requires_resource('cpu')
def test_name_inverse_lookup(self):
    # Round-trip check: whenever name() yields a name for a code point,
    # passing that name back to lookup() must return the same character.
    # Gated on the 'cpu' resource because the loop visits every code
    # point from 0 through sys.maxunicode.
    name_of = self.db.name
    find_by_name = self.db.lookup
    for codepoint in range(sys.maxunicode + 1):
        char = chr(codepoint)
        # name(char, None) returns the default (None) for unnamed code
        # points; those have nothing to round-trip.
        looked_name = name_of(char, None)
        if not looked_name:
            continue
        self.assertEqual(find_by_name(looked_name), char)
106+
109107 # TODO: RUSTPYTHON
110108 @unittest .expectedFailure
111109 def test_digit (self ):
@@ -201,15 +199,8 @@ def test_combining(self):
201199 self .assertRaises (TypeError , self .db .combining )
202200 self .assertRaises (TypeError , self .db .combining , 'xx' )
203201
204- def test_normalize (self ):
205- self .assertRaises (TypeError , self .db .normalize )
206- self .assertRaises (ValueError , self .db .normalize , 'unknown' , 'xx' )
207- self .assertEqual (self .db .normalize ('NFKC' , '' ), '' )
208- # The rest can be found in test_normalization.py
209- # which requires an external file.
210-
211202 def test_pr29 (self ):
212- # http ://www.unicode.org/review/pr-29.html
203+ # https ://www.unicode.org/review/pr-29.html
213204 # See issues #1054943 and #10254.
214205 composed = ("\u0b47 \u0300 \u0b3e " , "\u1100 \u0300 \u1161 " ,
215206 'Li\u030d t-s\u1e73 \u0301 ' ,
@@ -240,9 +231,6 @@ def test_issue29456(self):
240231 self .assertEqual (self .db .normalize ('NFC' , u11a7_str_a ), u11a7_str_b )
241232 self .assertEqual (self .db .normalize ('NFC' , u11c3_str_a ), u11c3_str_b )
242233
243- # For tests of unicodedata.is_normalized / self.db.is_normalized ,
244- # see test_normalization.py .
245-
246234 def test_east_asian_width (self ):
247235 eaw = self .db .east_asian_width
248236 self .assertRaises (TypeError , eaw , b'a' )
@@ -265,6 +253,11 @@ def test_east_asian_width_9_0_changes(self):
265253
266254class UnicodeMiscTest (UnicodeDatabaseTest ):
267255
@cpython_only
def test_disallow_instantiation(self):
    # bpo-43916: the unicodedata.UCD type must refuse instantiation.
    # The actual check is delegated to the shared test.support helper.
    ucd_type = unicodedata.UCD
    check_disallow_instantiation(self, ucd_type)
260+
268261 # TODO: RUSTPYTHON
269262 @unittest .expectedFailure
270263 def test_failed_import_during_compiling (self ):
@@ -363,5 +356,103 @@ def test_linebreak_7643(self):
363356 self .assertEqual (len (lines ), 1 ,
364357 r"\u%.4x should not be a linebreak" % i )
365358
class NormalizationTest(unittest.TestCase):
    """Tests for unicodedata.normalize() and unicodedata.is_normalized().

    The data-driven part reads a NormalizationTest.txt file obtained with
    open_urlresource(); the remaining tests need no external data.
    """

    @staticmethod
    def check_version(testfile):
        """Return True if the first line of *testfile* mentions the Unicode
        version reported by unicodedata.unidata_version.

        Consumes exactly one line from *testfile*.
        """
        hdr = testfile.readline()
        return unicodedata.unidata_version in hdr

    @staticmethod
    def unistr(data):
        """Turn space-separated hexadecimal code points into a string.

        For example "0041 0042" becomes "AB".  Raises ValueError if a
        field is not a valid hexadecimal number.
        """
        return "".join([chr(int(x, 16)) for x in data.split(" ")])

    @requires_resource('network')
    def test_normalization(self):
        TESTDATAFILE = "NormalizationTest.txt"
        TESTDATAURL = f"http://www.pythontest.net/unicode/{unicodedata.unidata_version}/{TESTDATAFILE}"

        # Hit the exception early: a failed download skips the test
        # instead of failing it.
        try:
            testdata = open_urlresource(TESTDATAURL, encoding="utf-8",
                                        check=self.check_version)
        except PermissionError:
            self.skipTest(f"Permission error when downloading {TESTDATAURL} "
                          f"into the test data directory")
        except (OSError, HTTPException) as exc:
            self.skipTest(f"Failed to download {TESTDATAURL}: {exc}")

        with testdata:
            self.run_normalization_tests(testdata)

    def run_normalization_tests(self, testdata):
        """Check the normalization invariants for every line of *testdata*.

        *testdata* is an iterable of text lines.  Text after '#' is
        ignored, blank lines are skipped, and lines starting with "@Part"
        switch the current part.  Every other line must contain five
        ';'-separated fields of hex code points followed by a trailing
        ';'.  After all lines are processed, every code point that did not
        appear as the first field of a Part 1 line is checked to be
        unchanged by all four normalization forms.
        """
        part = None
        # First-column strings seen in @Part1; only membership is needed.
        part1_data = set()

        def NFC(s):
            return unicodedata.normalize("NFC", s)

        def NFKC(s):
            return unicodedata.normalize("NFKC", s)

        def NFD(s):
            return unicodedata.normalize("NFD", s)

        def NFKD(s):
            return unicodedata.normalize("NFKD", s)

        for line in testdata:
            # Drop the trailing comment (if any) and surrounding whitespace.
            line = line.partition('#')[0].strip()
            if not line:
                continue
            if line.startswith("@Part"):
                part = line.split()[0]
                continue
            # The line ends with ';', so the last split element is dropped.
            c1, c2, c3, c4, c5 = [self.unistr(x) for x in line.split(';')[:-1]]

            # Perform tests
            self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
            self.assertTrue(c4 == NFC(c4) == NFC(c5), line)
            self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
            self.assertTrue(c5 == NFD(c4) == NFD(c5), line)
            self.assertTrue(c4 == NFKC(c1) == NFKC(c2) ==
                            NFKC(c3) == NFKC(c4) == NFKC(c5),
                            line)
            self.assertTrue(c5 == NFKD(c1) == NFKD(c2) ==
                            NFKD(c3) == NFKD(c4) == NFKD(c5),
                            line)

            self.assertTrue(unicodedata.is_normalized("NFC", c2))
            self.assertTrue(unicodedata.is_normalized("NFC", c4))

            self.assertTrue(unicodedata.is_normalized("NFD", c3))
            self.assertTrue(unicodedata.is_normalized("NFD", c5))

            self.assertTrue(unicodedata.is_normalized("NFKC", c4))
            self.assertTrue(unicodedata.is_normalized("NFKD", c5))

            # Record part 1 data
            if part == "@Part1":
                part1_data.add(c1)

        # Perform tests for all other data
        for c in range(sys.maxunicode + 1):
            X = chr(c)
            if X in part1_data:
                continue
            self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)

    def test_edge_cases(self):
        # Missing arguments, an unknown form name, and the empty string.
        self.assertRaises(TypeError, unicodedata.normalize)
        self.assertRaises(ValueError, unicodedata.normalize, 'unknown', 'xx')
        self.assertEqual(unicodedata.normalize('NFKC', ''), '')

    def test_bug_834676(self):
        # Check for bug 834676: the call only has to complete without
        # raising; its result is not inspected.
        unicodedata.normalize('NFC', '\ud55c\uae00')
456+
366457if __name__ == "__main__" :
367458 unittest .main ()
0 commit comments