Skip to content

Commit c0511cc

Browse files
Commit
1 parent fbf0843 commit c0511cc

File tree

7 files changed

+198
-1
lines changed

7 files changed

+198
-1
lines changed

Doc/library/unicodedata.rst

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,32 @@ following functions:
144144
1
145145

146146

147+
.. function:: isidstart(chr, /)
148+
149+
Return ``True`` if the character has the ``XID_Start`` property, ``False``
150+
otherwise. For example::
151+
152+
>>> unicodedata.isidstart('S')
153+
True
154+
>>> unicodedata.isidstart('0')
155+
False
156+
157+
.. versionadded:: next
158+
159+
160+
.. function:: isidcontinue(chr, /)
161+
162+
Return ``True`` if the character has the ``XID_Continue`` property, ``False``
163+
otherwise. For example::
164+
165+
>>> unicodedata.isidcontinue('S')
166+
True
167+
>>> unicodedata.isidcontinue(' ')
168+
False
169+
170+
.. versionadded:: next
171+
172+
147173
.. function:: decomposition(chr)
148174

149175
Returns the character decomposition mapping assigned to the character

Doc/whatsnew/3.15.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -768,6 +768,10 @@ unicodedata
768768

769769
* The Unicode database has been updated to Unicode 17.0.0.
770770

771+
* Add :func:`unicodedata.isidstart` and :func:`unicodedata.isidcontinue`
772+
functions.
773+
(Contributed by Stan Ulbrych in :gh:`129117`.)
774+
771775

772776
wave
773777
----

Include/cpython/unicodeobject.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -733,6 +733,14 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha(
733733
Py_UCS4 ch /* Unicode character */
734734
);
735735

736+
PyAPI_FUNC(int) _PyUnicode_IsXidStart(
737+
Py_UCS4 ch /* Unicode character */
738+
);
739+
740+
PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
741+
Py_UCS4 ch /* Unicode character */
742+
);
743+
736744
// Helper array used by Py_UNICODE_ISSPACE().
737745
PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
738746

Lib/test/test_unicodedata.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,33 @@ def test_east_asian_width_9_0_changes(self):
276276
self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N')
277277
self.assertEqual(self.db.east_asian_width('\u231a'), 'W')
278278

279+
def test_isidstart(self):
280+
self.assertTrue(self.db.isidstart('S'))
281+
self.assertTrue(self.db.isidstart('\u0AD0')) # GUJARATI OM
282+
self.assertTrue(self.db.isidstart('\u0EC6')) # LAO KO LA
283+
self.assertTrue(self.db.isidstart('\u17DC')) # KHMER SIGN AVAKRAHASANYA
284+
self.assertTrue(self.db.isidstart('\uA015')) # YI SYLLABLE WU
285+
self.assertTrue(self.db.isidstart('\uFE7B')) # ARABIC KASRA MEDIAL FORM
286+
287+
self.assertFalse(self.db.isidstart(' '))
288+
self.assertRaises(TypeError, self.db.isidstart)
289+
self.assertRaises(TypeError, self.db.isidstart, 'xx')
290+
291+
def test_isidcontinue(self):
292+
self.assertTrue(self.db.isidcontinue('S'))
293+
self.assertTrue(self.db.isidcontinue('_'))
294+
self.assertTrue(self.db.isidcontinue('0'))
295+
self.assertTrue(self.db.isidcontinue('\u00BA')) # MASCULINE ORDINAL INDICATOR
296+
self.assertTrue(self.db.isidcontinue('\u0640')) # ARABIC TATWEEL
297+
self.assertTrue(self.db.isidcontinue('\u0710')) # SYRIAC LETTER ALAPH
298+
self.assertTrue(self.db.isidcontinue('\u0B3E')) # ORIYA VOWEL SIGN AA
299+
self.assertTrue(self.db.isidcontinue('\u17D7')) # KHMER SIGN LEK TOO
300+
301+
self.assertFalse(self.db.isidcontinue(' '))
302+
self.assertFalse(self.db.isidstart('0'))
303+
self.assertRaises(TypeError, self.db.isidcontinue)
304+
self.assertRaises(TypeError, self.db.isidcontinue, 'xx')
305+
279306
class UnicodeMiscTest(UnicodeDatabaseTest):
280307

281308
@cpython_only
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
:mod:`unicodedata`: Add :func:`~unicodedata.isidstart` and
2+
:func:`~unicodedata.isidcontinue` functions.

Modules/clinic/unicodedata.c.h

Lines changed: 73 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Modules/unicodedata.c

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1525,6 +1525,62 @@ unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
15251525
return PyUnicode_FromString(name);
15261526
}
15271527

1528+
/*[clinic input]
1529+
unicodedata.UCD.isidstart
1530+
1531+
self: self
1532+
chr: int(accept={str})
1533+
/
1534+
1535+
Return True if the character has the XID_Start property, else False.
1536+
1537+
[clinic start generated code]*/
1538+
1539+
static PyObject *
unicodedata_UCD_isidstart_impl(PyObject *self, int chr)
/*[clinic end generated code: output=29fbeaf6491d9f85 input=b71b6b1b2db3c16d]*/
{
    const Py_UCS4 ch = (Py_UCS4)chr;

    /* When self passes UCD_Check(), look at its change record first:
       category_changed == 0 marks the code point as unassigned there,
       and an unassigned code point is reported as not XID_Start. */
    if (UCD_Check(self) && get_old_record(self, ch)->category_changed == 0) {
        Py_RETURN_FALSE;
    }

    /* Otherwise the answer comes straight from _PyUnicode_IsXidStart(). */
    return PyBool_FromLong(_PyUnicode_IsXidStart(ch));
}
1555+
1556+
/*[clinic input]
1557+
unicodedata.UCD.isidcontinue
1558+
1559+
self: self
1560+
chr: int(accept={str})
1561+
/
1562+
1563+
Return True if the character has the XID_Continue property, else False.
1564+
1565+
[clinic start generated code]*/
1566+
1567+
static PyObject *
unicodedata_UCD_isidcontinue_impl(PyObject *self, int chr)
/*[clinic end generated code: output=5ae694da0ee16534 input=01b4ccd399484e6b]*/
{
    const Py_UCS4 ch = (Py_UCS4)chr;

    /* When self passes UCD_Check(), look at its change record first:
       category_changed == 0 marks the code point as unassigned there,
       and an unassigned code point is reported as not XID_Continue. */
    if (UCD_Check(self) && get_old_record(self, ch)->category_changed == 0) {
        Py_RETURN_FALSE;
    }

    /* Otherwise the answer comes straight from _PyUnicode_IsXidContinue(). */
    return PyBool_FromLong(_PyUnicode_IsXidContinue(ch));
}
1583+
15281584
/*[clinic input]
15291585
unicodedata.UCD.lookup
15301586
@@ -1590,6 +1646,8 @@ static PyMethodDef unicodedata_functions[] = {
15901646
UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
15911647
UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
15921648
UNICODEDATA_UCD_NAME_METHODDEF
1649+
UNICODEDATA_UCD_ISIDSTART_METHODDEF
1650+
UNICODEDATA_UCD_ISIDCONTINUE_METHODDEF
15931651
UNICODEDATA_UCD_LOOKUP_METHODDEF
15941652
UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
15951653
UNICODEDATA_UCD_NORMALIZE_METHODDEF

0 commit comments

Comments
 (0)