Skip to content

Commit 8138c2e

Browse files
committed
Fold in Stefan's suggestions, and minor cleanup.
1 parent 835190d commit 8138c2e

File tree

2 files changed

+88
-42
lines changed

2 files changed

+88
-42
lines changed

Objects/listobject.c

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1747,10 +1747,9 @@ struct s_MergeState {
17471747
int (*tuple_elem_compare)(PyObject *, PyObject *, MergeState *);
17481748

17491749
/* Variables used for minrun computation. The "ideal" minrun length is
1750-
* the infinite precision listlen / 2**e, which is represented as the
1751-
* marhematical value of mr_int + mr_frac / 2**e.
1750+
* the infinite precision listlen / 2**e. See listsort.txt.
17521751
*/
1753-
Py_ssize_t mr_int, mr_frac, mr_current_frac, mr_e, mr_mask;
1752+
Py_ssize_t mr_current, mr_e, mr_mask;
17541753
};
17551754

17561755
/* binarysort is the best method for sorting small arrays: it does few
@@ -2213,15 +2212,13 @@ merge_init(MergeState *ms, Py_ssize_t list_size, int has_keyfunc,
22132212
ms->listlen = list_size;
22142213
ms->basekeys = lo->keys;
22152214

2216-
ms->mr_int = list_size;
2215+
/* State for generating minrun values. See listsort.txt. */
22172216
ms->mr_e = 0;
2218-
while (ms->mr_int >= MAX_MINRUN) {
2219-
ms->mr_int >>= 1;
2217+
while (list_size >> ms->mr_e >= MAX_MINRUN) {
22202218
++ms->mr_e;
22212219
}
22222220
ms->mr_mask = (1 << ms->mr_e) - 1;
2223-
ms->mr_frac = list_size & ms->mr_mask;
2224-
ms->mr_current_frac = 0;
2221+
ms->mr_current = 0;
22252222
}
22262223

22272224
/* Free all the temp memory owned by the MergeState. This must be called
@@ -2700,13 +2697,13 @@ merge_force_collapse(MergeState *ms)
27002697
}
27012698

27022699
/* Return the next minrun value to use. See listsort.txt. */
2703-
static inline Py_ssize_t
2700+
Py_LOCAL_INLINE(Py_ssize_t)
27042701
minrun_next(MergeState *ms)
27052702
{
2706-
ms->mr_current_frac += ms->mr_frac;
2707-
assert(ms->mr_current_frac >> ms->mr_e <= 1);
2708-
Py_ssize_t result = ms->mr_int + (ms->mr_current_frac >> ms->mr_e);
2709-
ms->mr_current_frac &= ms->mr_mask;
2703+
ms->mr_current += ms->listlen;
2704+
assert(ms->mr_current >= 0); /* no overflow */
2705+
Py_ssize_t result = ms->mr_current >> ms->mr_e;
2706+
ms->mr_current &= ms->mr_mask;
27102707
return result;
27112708
}
27122709

Objects/listsort.txt

Lines changed: 78 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -316,24 +316,78 @@ merge tree:
316316
So, in all respects, as perfectly balanced as possible.
317317

318318
For the 2112 case, that also keeps minrun at 33, but we were lucky there
319-
that 2112 is a power of 2 times 33. The new approach doesn't rely on luck.
319+
that 2112 is 33 times a power of 2. The new approach doesn't rely on luck.
320320

321-
The basic idea is to conceive of the ideal run length as being a real number
322-
rather than just an integer. For an array of length `n`, let `e` be the
323-
smallest int such that n/2**e < MAX_MINRUN. Then mr = n/2**e is the ideal
324-
run length, and obviously mr * 2**e is n, so there are exactly 2**e runs.
321+
For example, with 315 random elements, the old scheme uses fixed minrun=40 and
322+
produces runs of length 40, except for the last. The new scheme produces a
323+
mix of lengths 39 and 40:
325324

326-
Of course runs can't have a fractional length, so we start the i'th (zero-
327-
based) run at index int(mr * i), for i in range(2**e). The differences between
328-
adjacent starting indices are the run lengths, and it's left as an exercise
329-
for the reader to show that they have the nice properties listed above. See
330-
note MINRUN CODE for an executable Python implementation to help make it all
331-
concrete.
325+
old: 40 40 40 40 40 40 40 35
326+
new: 39 39 40 39 39 40 39 40
332327

333-
The code doesn't actually compute the starting indices, or use floats. Instead
334-
mr is represented as a pair of integers such that the infinite precision mr is
335-
equal to mr_int + mr_frac / 2**e, and only the delta (run length) from one
336-
index to the next is computed.
328+
Both schemes produce eight runs, a power of 2. That's good for a balanced
329+
merge tree. But the new scheme allows merges where left and right lengths
330+
never differ by more than 1:
331+
332+
39 39 40 39 39 40 39 40
333+
78 79 79 79
334+
157 158
335+
315
336+
337+
(This shows merges downward, e.g., two runs of length 39 are merged and
338+
become a run of length 78.)
339+
340+
With larger lists, the old scheme can get even more unbalanced. For example,
341+
with 32769 elements (that's 2**15 + 1), it uses minrun=33 and produces 993
342+
runs (of length 33). That's not even a power of 2. The new scheme instead
343+
produces 1024 runs, all with length 32 except for the last one with length 33.
344+
345+
How does it work? Ideally, all runs would be exactly equally long. For the
346+
above example, each run would have 315/8 = 39.375 elements. Which of course
347+
doesn't work. But we can get close:
348+
349+
For the first run, we'd like 39.375 elements. Since that's impossible, we
350+
instead use 39 (the floor) and remember the current leftover fraction 0.375.
351+
For the second run, we add 0.375 + 39.375 = 39.75. Again impossible, so we
352+
instead use 39 and remember 0.75. For the third run, we add 0.75 + 39.375 =
353+
40.125. This time we get 40 and remember 0.125. And so on. Here's a Python
354+
generator doing that:
355+
356+
def gen_minruns_with_floats(n):
357+
mr = n
358+
while mr >= MAX_MINRUN:
359+
mr /= 2
360+
361+
mr_current = 0
362+
while True:
363+
mr_current += mr
364+
yield int(mr_current)
365+
mr_current %= 1
366+
367+
But while all arithmetic here can be done exactly using binary floating point,
368+
floats have less precision than a Py_ssize_t, and mixing floats with ints is
369+
needlessly expensive anyway.
370+
371+
So here's an integer version, where the internal numbers are scaled up by
372+
2**e, or rather not divided by 2**e. Instead, only each yielded minrun gets
373+
divided (by right-shifting). For example instead of adding 39.375 and
374+
reducing modulo 1, it just adds 315 and reduces modulo 8. And always divides
375+
by 8 to get each actual minrun value:
376+
377+
def gen_minruns_simpler(n):
378+
e = 0
379+
while (n >> e) >= MAX_MINRUN:
380+
e += 1
381+
mask = (1 << e) - 1
382+
383+
mr_current = 0
384+
while True:
385+
mr_current += n
386+
yield mr_current >> e
387+
mr_current &= mask
388+
389+
See note MINRUN CODE for a full implementation and a driver that exhaustively
390+
verifies the claims above for all list lengths through 2 million.
337391

338392

339393
The Merge Pattern
@@ -852,23 +906,18 @@ except ImportError:
852906
MAX_MINRUN = 64
853907

854908
def gen_minruns(n):
855-
# mr_int = minrun's integral part
856-
# mr_frac = minrun's fractional part with mr_e bits and
857-
# mask mr_mask
858-
mr_int = n
909+
# In listobject.c, initialization is done in merge_init(), and
910+
# the body of the loop in minrun_next().
859911
mr_e = 0
860-
while mr_int >= MAX_MINRUN:
861-
mr_int >>= 1
912+
while (n >> mr_e) >= MAX_MINRUN:
862913
mr_e += 1
863914
mr_mask = (1 << mr_e) - 1
864-
mr_frac = n & mr_mask
865915

866-
mr_current_frac = 0
916+
mr_current = 0
867917
while True:
868-
mr_current_frac += mr_frac
869-
assert mr_current_frac >> mr_e <= 1
870-
yield mr_int + (mr_current_frac >> mr_e)
871-
mr_current_frac &= mr_mask
918+
mr_current += n
919+
yield mr_current >> mr_e
920+
mr_current &= mr_mask
872921

873922
def chew(n, show=False):
874923
if n < 1:
@@ -884,7 +933,7 @@ def chew(n, show=False):
884933
assert tot == n
885934
print(n, len(sizes))
886935

887-
small, large = 32, 64
936+
small, large = MAX_MINRUN // 2, MAX_MINRUN
888937
while len(sizes) > 1:
889938
assert not len(sizes) & 1
890939
assert len(sizes).bit_count() == 1 # i.e., power of 2
@@ -913,4 +962,4 @@ def chew(n, show=False):
913962
assert sizes[0] == n
914963

915964
for n in range(2_000_001):
916-
chew(n)
965+
chew(n)

0 commit comments

Comments
 (0)