Merge pull request RustPython#4678 from dalinaum/test_unicodedata

youknowone · web-flow · commit 9b55505cdf51 · 2023-03-10T18:14:55.000+09:00
Update test_unicodedata from CPython 3.11.2
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py
@@ -1,32 +1,32 @@
-""" Test script for the unicodedata module.
+""" Tests for the unicodedata module.
 
     Written by Marc-Andre Lemburg (mal@lemburg.com).
 
     (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
 
 """
 
+import hashlib
+from http.client import HTTPException
 import sys
+import unicodedata
 import unittest
-import hashlib
-from test.support import script_helper
-
-encoding = 'utf-8'
-errors = 'surrogatepass'
+from test.support import (open_urlresource, requires_resource, script_helper,
+                          cpython_only, check_disallow_instantiation,
+                          ResourceDenied)
 
 
-### Run tests
-
 class UnicodeMethodsTest(unittest.TestCase):
 
     # update this, if the database changes
-    expectedchecksum = '9129d6f2bdf008a81c2476e5b5127014a62130c1'
+    expectedchecksum = '4739770dd4d0e5f1b1677accfc3552ed3c8ef326'
 
     # TODO: RUSTPYTHON
     @unittest.expectedFailure
+    @requires_resource('cpu')
     def test_method_checksum(self):
         h = hashlib.sha1()
-        for i in range(0x10000):
+        for i in range(sys.maxunicode + 1):
             char = chr(i)
             data = [
                 # Predicates (single char)
@@ -63,33 +63,26 @@ def test_method_checksum(self):
                 (char + 'ABC').title(),
 
                 ]
-            h.update(''.join(data).encode(encoding, errors))
+            h.update(''.join(data).encode('utf-8', 'surrogatepass'))
         result = h.hexdigest()
         self.assertEqual(result, self.expectedchecksum)
 
 class UnicodeDatabaseTest(unittest.TestCase):
-
-    def setUp(self):
-        # In case unicodedata is not available, this will raise an ImportError,
-        # but the other test cases will still be run
-        import unicodedata
-        self.db = unicodedata
-
-    def tearDown(self):
-        del self.db
+    db = unicodedata
 
 class UnicodeFunctionsTest(UnicodeDatabaseTest):
 
     # Update this if the database changes. Make sure to do a full rebuild
     # (e.g. 'make distclean && make') to get the correct checksum.
-    expectedchecksum = 'c44a49ca7c5cb6441640fe174ede604b45028652'
+    expectedchecksum = '98d602e1f69d5c5bb8a5910c40bbbad4e18e8370'
     # TODO: RUSTPYTHON
     @unittest.expectedFailure
+    @requires_resource('cpu')
     def test_function_checksum(self):
         data = []
         h = hashlib.sha1()
 
-        for i in range(0x10000):
+        for i in range(sys.maxunicode + 1):
             char = chr(i)
             data = [
                 # Properties
@@ -106,6 +99,15 @@ def test_function_checksum(self):
         result = h.hexdigest()
         self.assertEqual(result, self.expectedchecksum)
 
+    # TODO: RUSTPYTHON
+    @unittest.expectedFailure
+    @requires_resource('cpu')
+    def test_name_inverse_lookup(self):
+        for i in range(sys.maxunicode + 1):
+            char = chr(i)
+            if looked_name := self.db.name(char, None):
+                self.assertEqual(self.db.lookup(looked_name), char)
+
     # TODO: RUSTPYTHON
     @unittest.expectedFailure
     def test_digit(self):
@@ -201,15 +203,8 @@ def test_combining(self):
         self.assertRaises(TypeError, self.db.combining)
         self.assertRaises(TypeError, self.db.combining, 'xx')
 
-    def test_normalize(self):
-        self.assertRaises(TypeError, self.db.normalize)
-        self.assertRaises(ValueError, self.db.normalize, 'unknown', 'xx')
-        self.assertEqual(self.db.normalize('NFKC', ''), '')
-        # The rest can be found in test_normalization.py
-        # which requires an external file.
-
     def test_pr29(self):
-        # http://www.unicode.org/review/pr-29.html
+        # https://www.unicode.org/review/pr-29.html
         # See issues #1054943 and #10254.
         composed = ("\u0b47\u0300\u0b3e", "\u1100\u0300\u1161",
                     'Li\u030dt-s\u1e73\u0301',
@@ -240,9 +235,6 @@ def test_issue29456(self):
         self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
         self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)
 
-    # For tests of unicodedata.is_normalized / self.db.is_normalized ,
-    # see test_normalization.py .
-
     def test_east_asian_width(self):
         eaw = self.db.east_asian_width
         self.assertRaises(TypeError, eaw, b'a')
@@ -265,6 +257,11 @@ def test_east_asian_width_9_0_changes(self):
 
 class UnicodeMiscTest(UnicodeDatabaseTest):
 
+    @cpython_only
+    def test_disallow_instantiation(self):
+        # Ensure that the type disallows instantiation (bpo-43916)
+        check_disallow_instantiation(self, unicodedata.UCD)
+
     # TODO: RUSTPYTHON
     @unittest.expectedFailure
     def test_failed_import_during_compiling(self):
@@ -363,5 +360,103 @@ def test_linebreak_7643(self):
                 self.assertEqual(len(lines), 1,
                                  r"\u%.4x should not be a linebreak" % i)
 
+class NormalizationTest(unittest.TestCase):
+    @staticmethod
+    def check_version(testfile):
+        hdr = testfile.readline()
+        return unicodedata.unidata_version in hdr
+
+    @staticmethod
+    def unistr(data):
+        data = [int(x, 16) for x in data.split(" ")]
+        return "".join([chr(x) for x in data])
+
+    @requires_resource('network')
+    def test_normalization(self):
+        TESTDATAFILE = "NormalizationTest.txt"
+        TESTDATAURL = f"http://www.pythontest.net/unicode/{unicodedata.unidata_version}/{TESTDATAFILE}"
+
+        # Fetch the data first so a download failure skips the test early
+        try:
+            testdata = open_urlresource(TESTDATAURL, encoding="utf-8",
+                                        check=self.check_version)
+        except PermissionError:
+            self.skipTest(f"Permission error when downloading {TESTDATAURL} "
+                          f"into the test data directory")
+        except (OSError, HTTPException) as exc:
+            self.skipTest(f"Failed to download {TESTDATAURL}: {exc}")
+
+        with testdata:
+            self.run_normalization_tests(testdata)
+
+    def run_normalization_tests(self, testdata):
+        part = None
+        part1_data = {}
+
+        def NFC(str):
+            return unicodedata.normalize("NFC", str)
+
+        def NFKC(str):
+            return unicodedata.normalize("NFKC", str)
+
+        def NFD(str):
+            return unicodedata.normalize("NFD", str)
+
+        def NFKD(str):
+            return unicodedata.normalize("NFKD", str)
+
+        for line in testdata:
+            if '#' in line:
+                line = line.split('#')[0]
+            line = line.strip()
+            if not line:
+                continue
+            if line.startswith("@Part"):
+                part = line.split()[0]
+                continue
+            c1,c2,c3,c4,c5 = [self.unistr(x) for x in line.split(';')[:-1]]
+
+            # Check the invariants relating the five fields c1..c5
+            self.assertTrue(c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3), line)
+            self.assertTrue(c4 ==  NFC(c4) ==  NFC(c5), line)
+            self.assertTrue(c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3), line)
+            self.assertTrue(c5 ==  NFD(c4) ==  NFD(c5), line)
+            self.assertTrue(c4 == NFKC(c1) == NFKC(c2) == \
+                            NFKC(c3) == NFKC(c4) == NFKC(c5),
+                            line)
+            self.assertTrue(c5 == NFKD(c1) == NFKD(c2) == \
+                            NFKD(c3) == NFKD(c4) == NFKD(c5),
+                            line)
+
+            self.assertTrue(unicodedata.is_normalized("NFC", c2))
+            self.assertTrue(unicodedata.is_normalized("NFC", c4))
+
+            self.assertTrue(unicodedata.is_normalized("NFD", c3))
+            self.assertTrue(unicodedata.is_normalized("NFD", c5))
+
+            self.assertTrue(unicodedata.is_normalized("NFKC", c4))
+            self.assertTrue(unicodedata.is_normalized("NFKD", c5))
+
+            # Remember each @Part1 source string; the final loop skips them
+            if part == "@Part1":
+                part1_data[c1] = 1
+
+        # Characters that are not a @Part1 source must be unchanged by every form
+        for c in range(sys.maxunicode+1):
+            X = chr(c)
+            if X in part1_data:
+                continue
+            self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)
+
+    def test_edge_cases(self):
+        self.assertRaises(TypeError, unicodedata.normalize)
+        self.assertRaises(ValueError, unicodedata.normalize, 'unknown', 'xx')
+        self.assertEqual(unicodedata.normalize('NFKC', ''), '')
+
+    def test_bug_834676(self):
+        # Check for bug 834676
+        unicodedata.normalize('NFC', '\ud55c\uae00')
+
+
 if __name__ == "__main__":
     unittest.main()