@@ -616,6 +616,216 @@ def test_isxidcontinue(self):
616616 self .assertRaises (TypeError , self .db .isxidcontinue )
617617 self .assertRaises (TypeError , self .db .isxidcontinue , 'xx' )
618618
def test_grapheme_cluster_break(self):
    """Check the values returned by ``self.db.grapheme_cluster_break``.

    The samples are grouped by the Unicode version named in each
    comment.  The final loop checks that a missing argument, a
    non-str argument, and strings whose length is not exactly one
    all raise TypeError.
    """
    lookup = self.db.grapheme_cluster_break

    def check(char, expected):
        self.assertEqual(lookup(char), expected)

    check(' ', 'Other')
    check('x', 'Other')
    check('\U0010FFFF', 'Other')
    check('\r', 'CR')
    check('\n', 'LF')
    check('\0', 'Control')
    check('\t', 'Control')
    check('\x1F', 'Control')
    check('\x7F', 'Control')
    check('\x9F', 'Control')
    check('\U000E0001', 'Control')
    check('\u0300', 'Extend')
    check('\u200C', 'Extend')
    check('\U000E01EF', 'Extend')
    check('\u1159', 'L')
    check('\u11F9', 'T')
    check('\uD788', 'LV')
    check('\uD7A3', 'LVT')
    # New in 5.0.0
    check('\u05BA', 'Extend')
    check('\u20EF', 'Extend')
    # New in 5.1.0
    check('\u2064', 'Control')
    check('\uAA4D', 'SpacingMark')
    # New in 5.2.0
    check('\u0816', 'Extend')
    check('\uA97C', 'L')
    check('\uD7C6', 'V')
    check('\uD7FB', 'T')
    # New in 6.0.0
    check('\u093A', 'Extend')
    check('\U00011002', 'SpacingMark')
    # New in 6.1.0
    check('\U000E0FFF', 'Control')
    check('\U00016F7E', 'SpacingMark')
    # New in 6.2.0
    check('\U0001F1E6', 'Regional_Indicator')
    check('\U0001F1FF', 'Regional_Indicator')
    # New in 6.3.0
    check('\u180E', 'Control')
    check('\u1A1B', 'Extend')
    # New in 7.0.0
    check('\u0E33', 'SpacingMark')
    check('\u0EB3', 'SpacingMark')
    check('\U0001BCA3', 'Control')
    check('\U0001E8D6', 'Extend')
    check('\U0001163E', 'SpacingMark')
    # New in 8.0.0
    check('\u08E3', 'Extend')
    check('\U00011726', 'SpacingMark')
    # New in 9.0.0
    check('\u0600', 'Prepend')
    check('\U000E007F', 'Extend')
    check('\U00011CB4', 'SpacingMark')
    check('\u200D', 'ZWJ')
    # New in 10.0.0
    check('\U00011D46', 'Prepend')
    check('\U00011D47', 'Extend')
    check('\U00011A97', 'SpacingMark')
    # New in 11.0.0
    check('\U000110CD', 'Prepend')
    check('\u07FD', 'Extend')
    check('\U00011EF6', 'SpacingMark')
    # New in 12.0.0
    check('\U00011A84', 'Prepend')
    check('\U00013438', 'Control')
    check('\U0001E2EF', 'Extend')
    check('\U00016F87', 'SpacingMark')
    # New in 13.0.0
    check('\U00011941', 'Prepend')
    check('\U00016FE4', 'Extend')
    check('\U00011942', 'SpacingMark')
    # New in 14.0.0
    check('\u0891', 'Prepend')
    check('\U0001E2AE', 'Extend')
    # New in 15.0.0
    check('\U00011F02', 'Prepend')
    check('\U0001343F', 'Control')
    check('\U0001E4EF', 'Extend')
    check('\U00011F3F', 'SpacingMark')
    # New in 16.0.0
    check('\U000113D1', 'Prepend')
    check('\U0001E5EF', 'Extend')
    check('\U0001612C', 'SpacingMark')
    check('\U00016D63', 'V')
    # New in 17.0.0
    check('\u1AEB', 'Extend')
    check('\U00011B67', 'SpacingMark')

    # No argument, wrong types, and strings that are not one character.
    for bad_args in ((), (b'x',), (120,), ('',), ('xx',)):
        self.assertRaises(TypeError, lookup, *bad_args)
715+
def test_indic_conjunct_break(self):
    """Check the values returned by ``self.db.indic_conjunct_break``.

    The samples are grouped by the Unicode version named in each
    comment.  The final loop checks that a missing argument, a
    non-str argument, and strings whose length is not exactly one
    all raise TypeError.
    """
    lookup = self.db.indic_conjunct_break

    def check(char, expected):
        self.assertEqual(lookup(char), expected)

    check(' ', 'None')
    check('x', 'None')
    check('\U0010FFFF', 'None')
    # New in 15.1.0
    check('\u094D', 'Linker')
    check('\u0D4D', 'Linker')
    check('\u0915', 'Consonant')
    check('\u0D3A', 'Consonant')
    check('\u0300', 'Extend')
    check('\U0001E94A', 'Extend')
    # New in 16.0.0
    check('\u034F', 'Extend')
    check('\U000E01EF', 'Extend')
    # New in 17.0.0
    check('\u1039', 'Linker')
    check('\U00011F42', 'Linker')
    check('\u1000', 'Consonant')
    check('\U00011F33', 'Consonant')
    check('\U0001E6F5', 'Extend')

    # No argument, wrong types, and strings that are not one character.
    for bad_args in ((), (b'x',), (120,), ('',), ('xx',)):
        self.assertRaises(TypeError, lookup, *bad_args)
743+
def test_extended_pictographic(self):
    """Check that ``self.db.extended_pictographic`` returns real bools.

    ``assertIs`` is used, so the result must be exactly ``True`` or
    ``False``.  The final loop checks that a missing argument, a
    non-str argument, and strings whose length is not exactly one
    all raise TypeError.
    """
    lookup = self.db.extended_pictographic

    def check(char, expected):
        self.assertIs(lookup(char), expected)

    check(' ', False)
    check('x', False)
    check('\U0010FFFF', False)
    # New in 13.0.0
    check('\xA9', True)
    check('\u203C', True)
    check('\U0001FAD6', True)
    check('\U0001FFFD', True)
    # New in 17.0.0
    check('\u2388', False)
    check('\U0001FA6D', False)

    # No argument, wrong types, and strings that are not one character.
    for bad_args in ((), (b'x',), (120,), ('',), ('xx',)):
        self.assertRaises(TypeError, lookup, *bad_args)
763+
def test_grapheme_break(self):
    """Exercise ``self.db.iter_graphemes``.

    Covers argument checking, the optional start/end positions, and
    at least one sample for each grapheme cluster boundary rule of
    UAX #29 named in the comments below.
    """
    iter_graphemes = self.db.iter_graphemes

    def split(*args):
        # Text of every cluster produced for the given arguments.
        return [str(cluster) for cluster in iter_graphemes(*args)]

    def check(expected, *args):
        self.assertEqual(split(*args), expected)

    def check_unbroken(text):
        # The whole of *text* must come back as a single cluster.
        check([text], text)

    with self.assertRaises(TypeError):
        iter_graphemes()
    with self.assertRaises(TypeError):
        iter_graphemes(b'x')
    with self.assertRaises(TypeError):
        iter_graphemes('x', 0, 0, 0)

    check([], '')
    check(['a', 'b', 'c', 'd'], 'abcd')
    check(['b', 'c', 'd'], 'abcd', 1)
    check(['b', 'c'], 'abcd', 1, 3)
    check(['b', 'c', 'd'], 'abcd', -3)
    check(['b', 'c'], 'abcd', 1, -1)
    check([], 'abcd', 3, 1)
    check([], 'abcd', 5)
    check(['a', 'b', 'c', 'd'], 'abcd', 0, 5)
    check(['a', 'b', 'c', 'd'], 'abcd', -5)
    check([], 'abcd', 0, -5)
    # GB3
    check_unbroken('\r\n')
    # GB4
    check(['\r', '\u0308'], '\r\u0308')
    check(['\n', '\u0308'], '\n\u0308')
    check(['\0', '\u0308'], '\0\u0308')
    # GB5
    check(['\u06dd', '\r'], '\u06dd\r')
    check(['\u06dd', '\n'], '\u06dd\n')
    check(['\u06dd', '\0'], '\u06dd\0')
    # GB6
    check_unbroken('\u1100\u1160')
    check_unbroken('\u1100\uAC00')
    check_unbroken('\u1100\uAC01')
    # GB7
    check_unbroken('\uAC00\u1160')
    check_unbroken('\uAC00\u11A8')
    check_unbroken('\u1160\u1160')
    check_unbroken('\u1160\u11A8')
    # GB8
    check_unbroken('\uAC01\u11A8')
    check_unbroken('\u11A8\u11A8')
    # GB9
    check_unbroken('a\u0300')
    check_unbroken('a\u200D')
    # GB9a
    check_unbroken('\u0905\u0903')
    # GB9b
    check_unbroken('\u06dd\u0661')
    # GB9c
    check_unbroken('\u0915\u094d\u0924')
    check_unbroken('\u0915\u094D\u094D\u0924')
    check_unbroken('\u0915\u094D\u0924\u094D\u092F')
    # GB11
    check_unbroken('\U0001F9D1\U0001F3FE\u200D\u2764\uFE0F'
                   '\u200D\U0001F48B\u200D\U0001F9D1\U0001F3FC')
    # GB12 (a pair of regional indicators)
    check_unbroken('\U0001F1FA\U0001F1E6')
828+
619829
620830class Unicode_3_2_0_FunctionsTest (UnicodeFunctionsTest ):
621831 db = unicodedata .ucd_3_2_0
@@ -624,6 +834,11 @@ class Unicode_3_2_0_FunctionsTest(UnicodeFunctionsTest):
624834 if quicktest else
625835 'f217b8688d7bdff31db4207e078a96702f091597' )
626836
# Rebinding these inherited test names to None means unittest no longer
# collects them for this subclass (the loader only picks up callables).
# NOTE(review): presumably the ucd_3_2_0 database object does not provide
# the grapheme-related APIs these tests exercise -- confirm.
837+ test_grapheme_cluster_break = None
838+ test_indic_conjunct_break = None
839+ test_extended_pictographic = None
840+ test_grapheme_break = None
841+
627842
628843class UnicodeMiscTest (unittest .TestCase ):
629844 db = unicodedata
@@ -848,5 +1063,61 @@ class MyStr(str):
8481063 self .assertIs (type (normalize (form , MyStr (input_str ))), str )
8491064
8501065
class GraphemeBreakTest(unittest.TestCase):
    """Check ``iter_graphemes`` against the GraphemeBreakTest.txt data file."""

    @staticmethod
    def check_version(testfile):
        """Return True if the first line of *testfile* names our UCD version.

        Passed as the ``check`` callback of ``open_urlresource``.
        """
        header = testfile.readline()
        return unicodedata.unidata_version in header

    @requires_resource('network')
    def test_grapheme_break(self):
        """Fetch the data file for the current UCD version and run it.

        The test is skipped when the file cannot be downloaded or stored.
        """
        TESTDATAFILE = "auxiliary/GraphemeBreakTest.txt"
        TESTDATAURL = ("https://www.unicode.org/Public/"
                       f"{unicodedata.unidata_version}/ucd/{TESTDATAFILE}")

        # Hit the exception early
        try:
            testdata = open_urlresource(TESTDATAURL, encoding="utf-8",
                                        check=self.check_version)
        except PermissionError:
            self.skipTest(f"Permission error when downloading {TESTDATAURL} "
                          "into the test data directory")
        except (OSError, HTTPException) as exc:
            self.skipTest(f"Failed to download {TESTDATAURL}: {exc}")

        with testdata:
            self.run_grapheme_break_tests(testdata, unicodedata)

    def run_grapheme_break_tests(self, testdata, ucd):
        """Check ``ucd.iter_graphemes`` against every line of *testdata*.

        *testdata* is an iterable of text lines.  Anything after ``#`` is
        a comment (used as the assertion message); lines that are empty
        once the comment is removed are skipped.  The remaining fields
        are hexadecimal code points separated by ``÷`` (break) or ``×``
        (no break).

        *ucd* is the object whose ``iter_graphemes`` is tested.
        """
        for line in testdata:
            line, _, comment = line.partition('#')
            line = line.strip()
            if not line:
                continue
            comment = comment.strip()

            chunks = []  # text of each expected cluster
            breaks = []  # code point offset of each expected boundary
            pos = 0
            for field in line.replace('×', ' ').split():
                if field == '÷':
                    chunks.append('')
                    breaks.append(pos)
                else:
                    chunks[-1] += chr(int(field, 16))
                    pos += 1
            # The closing '÷' opened one more chunk, which must be empty.
            self.assertEqual(chunks.pop(), '', line)
            text = ''.join(chunks)
            with self.subTest(line):
                self._check_graphemes(ucd.iter_graphemes(text),
                                      chunks, breaks, comment)
                # Starting at any interior boundary must yield exactly
                # the remaining clusters.
                for i in range(1, len(breaks) - 1):
                    self._check_graphemes(
                        ucd.iter_graphemes(text, breaks[i]),
                        chunks[i:], breaks[i:], comment)

    def _check_graphemes(self, segments, chunks, breaks, comment):
        """Assert that *segments* have the expected text, start and end."""
        result = list(segments)
        self.assertEqual([str(x) for x in result], chunks, comment)
        self.assertEqual([x.start for x in result], breaks[:-1], comment)
        self.assertEqual([x.end for x in result], breaks[1:], comment)
1120+
1121+
8511122if __name__ == "__main__" :
8521123 unittest .main ()
0 commit comments