1- """ Test script for the unicodedata module.
1+ """ Tests for the unicodedata module.
22
33 Written by Marc-Andre Lemburg (mal@lemburg.com).
44
55 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
66
77"""
88
9+ import hashlib
10+ from http .client import HTTPException
911import sys
12+ import unicodedata
1013import unittest
11- import hashlib
12- from test .support import script_helper
13-
14- encoding = 'utf-8'
15- errors = 'surrogatepass'
14+ from test .support import (open_urlresource , requires_resource , script_helper ,
15+ cpython_only , check_disallow_instantiation ,
16+ ResourceDenied )
1617
1718
18- ### Run tests
19-
2019class UnicodeMethodsTest (unittest .TestCase ):
2120
2221 # update this, if the database changes
23- expectedchecksum = '9129d6f2bdf008a81c2476e5b5127014a62130c1 '
22+ expectedchecksum = '4739770dd4d0e5f1b1677accfc3552ed3c8ef326 '
2423
2524 # TODO: RUSTPYTHON
2625 @unittest .expectedFailure
26+ @requires_resource ('cpu' )
2727 def test_method_checksum (self ):
2828 h = hashlib .sha1 ()
29- for i in range (0x10000 ):
29+ for i in range (sys . maxunicode + 1 ):
3030 char = chr (i )
3131 data = [
3232 # Predicates (single char)
@@ -63,33 +63,26 @@ def test_method_checksum(self):
6363 (char + 'ABC' ).title (),
6464
6565 ]
66- h .update ('' .join (data ).encode (encoding , errors ))
66+ h .update ('' .join (data ).encode ('utf-8' , 'surrogatepass' ))
6767 result = h .hexdigest ()
6868 self .assertEqual (result , self .expectedchecksum )
6969
7070class UnicodeDatabaseTest (unittest .TestCase ):
71-
72- def setUp (self ):
73- # In case unicodedata is not available, this will raise an ImportError,
74- # but the other test cases will still be run
75- import unicodedata
76- self .db = unicodedata
77-
78- def tearDown (self ):
79- del self .db
71+ db = unicodedata
8072
8173class UnicodeFunctionsTest (UnicodeDatabaseTest ):
8274
8375 # Update this if the database changes. Make sure to do a full rebuild
8476 # (e.g. 'make distclean && make') to get the correct checksum.
85- expectedchecksum = 'c44a49ca7c5cb6441640fe174ede604b45028652 '
77+ expectedchecksum = '98d602e1f69d5c5bb8a5910c40bbbad4e18e8370 '
8678 # TODO: RUSTPYTHON
8779 @unittest .expectedFailure
80+ @requires_resource ('cpu' )
8881 def test_function_checksum (self ):
8982 data = []
9083 h = hashlib .sha1 ()
9184
92- for i in range (0x10000 ):
85+ for i in range (sys . maxunicode + 1 ):
9386 char = chr (i )
9487 data = [
9588 # Properties
@@ -106,6 +99,15 @@ def test_function_checksum(self):
10699 result = h .hexdigest ()
107100 self .assertEqual (result , self .expectedchecksum )
108101
102+ # TODO: RUSTPYTHON
103+ @unittest .expectedFailure
104+ @requires_resource ('cpu' )
105+ def test_name_inverse_lookup (self ):
106+ for i in range (sys .maxunicode + 1 ):
107+ char = chr (i )
108+ if looked_name := self .db .name (char , None ):
109+ self .assertEqual (self .db .lookup (looked_name ), char )
110+
109111 # TODO: RUSTPYTHON
110112 @unittest .expectedFailure
111113 def test_digit (self ):
@@ -201,15 +203,8 @@ def test_combining(self):
201203 self .assertRaises (TypeError , self .db .combining )
202204 self .assertRaises (TypeError , self .db .combining , 'xx' )
203205
204- def test_normalize (self ):
205- self .assertRaises (TypeError , self .db .normalize )
206- self .assertRaises (ValueError , self .db .normalize , 'unknown' , 'xx' )
207- self .assertEqual (self .db .normalize ('NFKC' , '' ), '' )
208- # The rest can be found in test_normalization.py
209- # which requires an external file.
210-
211206 def test_pr29 (self ):
212- # http ://www.unicode.org/review/pr-29.html
207+ # https ://www.unicode.org/review/pr-29.html
213208 # See issues #1054943 and #10254.
214209 composed = ("\u0b47 \u0300 \u0b3e " , "\u1100 \u0300 \u1161 " ,
215210 'Li\u030d t-s\u1e73 \u0301 ' ,
@@ -240,9 +235,6 @@ def test_issue29456(self):
240235 self .assertEqual (self .db .normalize ('NFC' , u11a7_str_a ), u11a7_str_b )
241236 self .assertEqual (self .db .normalize ('NFC' , u11c3_str_a ), u11c3_str_b )
242237
243- # For tests of unicodedata.is_normalized / self.db.is_normalized ,
244- # see test_normalization.py .
245-
246238 def test_east_asian_width (self ):
247239 eaw = self .db .east_asian_width
248240 self .assertRaises (TypeError , eaw , b'a' )
@@ -265,6 +257,11 @@ def test_east_asian_width_9_0_changes(self):
265257
266258class UnicodeMiscTest (UnicodeDatabaseTest ):
267259
260+ @cpython_only
261+ def test_disallow_instantiation (self ):
262+ # Ensure that the type disallows instantiation (bpo-43916)
263+ check_disallow_instantiation (self , unicodedata .UCD )
264+
268265 # TODO: RUSTPYTHON
269266 @unittest .expectedFailure
270267 def test_failed_import_during_compiling (self ):
@@ -363,5 +360,103 @@ def test_linebreak_7643(self):
363360 self .assertEqual (len (lines ), 1 ,
364361 r"\u%.4x should not be a linebreak" % i )
365362
class NormalizationTest(unittest.TestCase):
    """Tests for unicodedata.normalize() and unicodedata.is_normalized().

    The data-driven part reads a NormalizationTest.txt file fetched with
    open_urlresource(); the remaining tests need no external data.
    """

    @staticmethod
    def check_version(testfile):
        """Return True if the first line of *testfile* mentions our UCD version.

        Reads (and therefore consumes) one line from *testfile* and checks
        whether ``unicodedata.unidata_version`` occurs in it.
        """
        hdr = testfile.readline()
        return unicodedata.unidata_version in hdr

    @staticmethod
    def unistr(data):
        """Turn space-separated hexadecimal code points into a string.

        For example ``"0041 0042"`` becomes ``"AB"``.  Raises ValueError if a
        field is not valid hexadecimal or is outside the range of chr().
        """
        return "".join(chr(int(x, 16)) for x in data.split(" "))

    @requires_resource('network')
    def test_normalization(self):
        TESTDATAFILE = "NormalizationTest.txt"
        TESTDATAURL = f"http://www.pythontest.net/unicode/{unicodedata.unidata_version}/{TESTDATAFILE}"

        # Hit the exception early: a download problem turns into a skip
        # instead of a failure of the normalization checks themselves.
        try:
            testdata = open_urlresource(TESTDATAURL, encoding="utf-8",
                                        check=self.check_version)
        except PermissionError:
            self.skipTest(f"Permission error when downloading {TESTDATAURL} "
                          f"into the test data directory")
        except (OSError, HTTPException) as exc:
            self.skipTest(f"Failed to download {TESTDATAURL}: {exc}")

        with testdata:
            self.run_normalization_tests(testdata)

    def run_normalization_tests(self, testdata):
        """Check every record in *testdata* against all normalization forms.

        *testdata* is iterated line by line.  Text after '#' is ignored, as
        are blank lines.  Lines starting with "@Part" switch the current part.
        Every other line is split on ';'; the field after the last ';' is
        dropped and the remaining fields must be exactly five columns
        (c1..c5) of space-separated hex code points.

        After the file has been processed, every code point that did not
        appear as c1 of an "@Part1" record is required to be unchanged by
        all four normalization forms.
        """
        part = None
        # c1 strings seen while in @Part1; only membership is ever needed.
        part1_data = set()

        def NFC(text):
            return unicodedata.normalize("NFC", text)

        def NFKC(text):
            return unicodedata.normalize("NFKC", text)

        def NFD(text):
            return unicodedata.normalize("NFD", text)

        def NFKD(text):
            return unicodedata.normalize("NFKD", text)

        for line in testdata:
            if '#' in line:
                line = line.split('#')[0]
            line = line.strip()
            if not line:
                continue
            if line.startswith("@Part"):
                part = line.split()[0]
                continue
            c1, c2, c3, c4, c5 = [self.unistr(x) for x in line.split(';')[:-1]]

            # Perform tests
            self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
            self.assertTrue(c4 == NFC(c4) == NFC(c5), line)
            self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
            self.assertTrue(c5 == NFD(c4) == NFD(c5), line)
            self.assertTrue(c4 == NFKC(c1) == NFKC(c2) ==
                            NFKC(c3) == NFKC(c4) == NFKC(c5),
                            line)
            self.assertTrue(c5 == NFKD(c1) == NFKD(c2) ==
                            NFKD(c3) == NFKD(c4) == NFKD(c5),
                            line)

            self.assertTrue(unicodedata.is_normalized("NFC", c2))
            self.assertTrue(unicodedata.is_normalized("NFC", c4))

            self.assertTrue(unicodedata.is_normalized("NFD", c3))
            self.assertTrue(unicodedata.is_normalized("NFD", c5))

            self.assertTrue(unicodedata.is_normalized("NFKC", c4))
            self.assertTrue(unicodedata.is_normalized("NFKD", c5))

            # Record part 1 data
            if part == "@Part1":
                part1_data.add(c1)

        # Perform tests for all other data
        for c in range(sys.maxunicode + 1):
            X = chr(c)
            if X in part1_data:
                continue
            self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)

    def test_edge_cases(self):
        # Missing arguments, an unknown form name, and the empty string.
        self.assertRaises(TypeError, unicodedata.normalize)
        self.assertRaises(ValueError, unicodedata.normalize, 'unknown', 'xx')
        self.assertEqual(unicodedata.normalize('NFKC', ''), '')

    def test_bug_834676(self):
        # Check for bug 834676: the call only has to complete without error.
        unicodedata.normalize('NFC', '\ud55c\uae00')
459+
460+
366461if __name__ == "__main__" :
367462 unittest .main ()
0 commit comments