Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Lib/test/test_unicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -617,7 +617,7 @@ def test_issue10254(self):
self.assertEqual(self.db.normalize('NFC', a), b)

def test_long_combining_mark_run(self):
-# GH-XXXXX: avoid quadratic canonical ordering.
+# gh-149079: avoid quadratic canonical ordering.
payload = "a" + ("\u0300\u0327" * 32)
nfd = "a" + ("\u0327" * 32) + ("\u0300" * 32)
nfc = "\u00e0" + ("\u0327" * 32) + ("\u0300" * 31)
Expand All @@ -628,7 +628,7 @@ def test_long_combining_mark_run(self):
self.assertEqual(self.db.normalize("NFKC", payload), nfc)

def test_combining_mark_run_fast_paths(self):
-# GH-XXXXX: cover short runs and already-sorted long runs.
+# gh-149079: cover short runs and already-sorted long runs.
short_payload = "a" + ("\u0300\u0327" * 9) + "\u0300"
short_nfd = "a" + ("\u0327" * 9) + ("\u0300" * 10)
short_nfc = "\u00e0" + ("\u0327" * 9) + ("\u0300" * 9)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Fix a potential denial of service in :func:`unicodedata.normalize`. The
-canonical ordering step of Unicode normalization used an O(n²) insertion
+canonical ordering step of Unicode normalization used a quadratic-time insertion
sort for reordering combining characters, which could be exploited with
crafted input containing many combining characters in non-canonical order.
Replaced with a linear-time counting sort for long runs.
30 changes: 11 additions & 19 deletions Modules/unicodedata.c
Original file line number Diff line number Diff line change
Expand Up @@ -586,22 +586,14 @@ canonical_ordering_sort_counting(int kind, void *data,
Py_ssize_t counts[256] = {0};
Py_ssize_t run_length = end - start;
Py_ssize_t total = 0;
unsigned char min_combining = 255;
unsigned char max_combining = 0;

for (Py_ssize_t i = start; i < end; i++) {
Py_UCS4 code = PyUnicode_READ(kind, data, i);
unsigned char combining = _getrecord_ex(code)->combining;
counts[combining]++;
if (combining < min_combining) {
min_combining = combining;
}
if (combining > max_combining) {
max_combining = combining;
}
}

for (Py_ssize_t i = min_combining; i <= max_combining; i++) {
for (size_t i = 0; i < Py_ARRAY_LENGTH(counts); i++) {
Py_ssize_t count = counts[i];
counts[i] = total;
total += count;
Expand Down Expand Up @@ -629,7 +621,7 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
void *result_data;
/* Longest decomposition in Unicode 3.2: U+FDFA */
Py_UCS4 stack[20];
Py_ssize_t space, isize, length;
Py_ssize_t space, isize;
int index, prefix, count, stackptr;
unsigned char prev, cur;
Py_UCS4 *sortbuf = NULL;
Expand Down Expand Up @@ -727,25 +719,24 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)

result_kind = PyUnicode_KIND(result);
result_data = PyUnicode_DATA(result);
length = PyUnicode_GET_LENGTH(result);

/* Sort each consecutive combining-character run canonically. */
i = 0;
while (i < length) {
while (i < o) {
Py_ssize_t run_length, run_start;
int needs_sort = 0;

prev = _getrecord_ex(
PyUnicode_READ(result_kind, result_data, i))->combining;
Py_UCS4 ch = PyUnicode_READ(result_kind, result_data, i);
prev = _getrecord_ex(ch)->combining;
if (prev == 0) {
i++;
continue;
}

run_start = i++;
while (i < length) {
cur = _getrecord_ex(
PyUnicode_READ(result_kind, result_data, i))->combining;
while (i < o) {
Py_UCS4 ch = PyUnicode_READ(result_kind, result_data, i);
cur = _getrecord_ex(ch)->combining;
if (cur == 0) {
break;
}
Expand All @@ -767,8 +758,9 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
}

if (run_length > sortbuflen) {
Py_UCS4 *new_sortbuf = PyMem_Realloc(sortbuf,
run_length * sizeof(Py_UCS4));
Py_UCS4 *new_sortbuf = PyMem_Resize(sortbuf,
Py_UCS4,
run_length);
if (new_sortbuf == NULL) {
PyErr_NoMemory();
PyMem_Free(sortbuf);
Expand Down