Skip to content

Commit 20730d6

Browse files
committed
balancing score work
1 parent 5b0bbbf commit 20730d6

File tree

1 file changed

+56
-17
lines changed

1 file changed

+56
-17
lines changed

Lib/difflib.py

Lines changed: 56 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2755,6 +2755,41 @@ def _calc_skew(i, j, k, alo, ahi, blo, bhi):
27552755
return apos - bpos
27562756

27572757

2758+
# Exponent p chosen such that c^p == (0.9c)^p + (0.2c)^p, i.e. 0.9^p + 0.2^p == 1
2759+
_BALANCE_SCORE_POWER = 1.284320049734199
2760+
2761+
2762+
def _calc_candidate_score(block0, block1, block2):
    """Return the balancing score of a candidate made of 1-3 blocks.

    Each argument is indexed at position 2 to obtain the block length;
    `block1` is the middle block, `block0` and `block2` are its neighbours.
    A neighbour of length 0 is treated as "not found" and is ignored.

    The score favours one long match over several short ones, but only
    mildly, so that the procedure can still escape skewed positions:

        total = sum(length ** p)
        where p satisfies c^p == (0.9c)^p + (0.2c)^p

    With only the middle block present, that sum is the final score.
    Every additional block earns a bonus of a third of the shortest
    block length, as it has potential to recurse further to each side.

    Raises ValueError if the middle block has zero length.
    """
    left_len, mid_len, right_len = block0[2], block1[2], block2[2]
    if not mid_len:
        raise ValueError('Middle block should not be null')

    # Middle block first, then the non-empty neighbours (left, right)
    sizes = [mid_len] + [n for n in (left_len, right_len) if n]
    score = sum(n**_BALANCE_SCORE_POWER for n in sizes)

    # Bonus applies only when at least one neighbour was found
    extra_blocks = len(sizes) - 1
    if extra_blocks:
        score += extra_blocks * min(sizes) / 3
    return score
2791+
2792+
27582793
class GestaltSequenceMatcher(SequenceMatcherBase):
27592794
"""
27602795
GestaltSequenceMatcher is a flexible class for comparing pairs
@@ -2831,6 +2866,9 @@ def __init__(self, isjunk=None, a='', b='', balancing=0):
28312866
### - 3 (only the candidate)
28322867
xx - 4 (xx + yy)
28332868
2869+
NOTE: Selection score is slightly more involved than sum of blocks.
2870+
See `_calc_candidate_score` for details.
2871+
28342872
Thus, for this example, xx is picked.
28352873
28362874
Comparison to SequenceMatcher:
@@ -2882,7 +2920,6 @@ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None, *, quick_only=Fal
28822920

28832921
if bjunk:
28842922
# Extend match to surrounding junk
2885-
# [2026-02-07@dgpb]: Note, expanding will happen even when no-match
28862923
block = _expand_block_to_junk(
28872924
bjunk, block, a, b, alo, ahi, blo, bhi, inverse=False)
28882925

@@ -2944,7 +2981,7 @@ def _modifier(self, depth, block, alo, ahi, blo, bhi):
29442981
29452982
Note, one can get `a`, `b`, `automaton`, etc from self
29462983
"""
2947-
pass
2984+
return None
29482985

29492986
def _get_matching_blocks(self):
29502987
balancing = self.balancing
@@ -2991,10 +3028,14 @@ def _get_matching_blocks(self):
29913028
if mtype is REPLACEBLOCK:
29923029
i, j, k = data
29933030
data = [(i, j, k)]
3031+
29943032
elif mtype is ANCHORBLOCKS or mtype is RESULTBLOCKS:
2995-
data = sorted(data)
3033+
data = list(data)
3034+
if len(data) > 1:
3035+
data.sort()
3036+
29963037
else:
2997-
msg = 'Unknown `_modifier` return type: {!r}'
3038+
msg = 'Unknown `self._modifier(...)` return type: {!r}'
29983039
raise RuntimeError(msg.format(mtype))
29993040

30003041
# 2.1.2. Validate
@@ -3005,11 +3046,11 @@ def _get_matching_blocks(self):
30053046
continue
30063047
if not (i0 <= i <= i + k <= ahi) or not (j0 <= j <= j + k <= bhi):
30073048
msg = (
3008-
'`_modifier` returned invalid block, which '
3049+
'`self._modifier(...)` returned invalid block, which '
30093050
'is either out of bounds or overlaps with a nearby one'
3010-
'block={}, while current interval is {}'
3051+
'block={}, last_bound={}, current_interval={}'
30113052
)
3012-
raise RuntimeError(msg.format(data, bounds))
3053+
raise RuntimeError(msg.format(data, (i0, j0), bounds))
30133054
validated.append((i, j, k))
30143055
i0 = i + k
30153056
j0 = j + k
@@ -3071,17 +3112,15 @@ def _get_matching_blocks(self):
30713112
block_list = self.batch_find_longest_match(jobs)
30723113
job_results = dict(zip(jobs, block_list))
30733114

3074-
# 2.2.3. Pick middle block of best tripple
3115+
# 2.2.3. Pick middle block of the best triple
30753116
for triple in triples:
3076-
triple[0] = job_results[triple[0]]
3077-
triple[2] = job_results[triple[2]]
3078-
# NOTE: k**1.3 is empirically tuned
3079-
# prefers one long match to many small ones
3080-
# but not too aggressively, so to be able to jump
3081-
# out of skewed positions
3082-
total = sum(t[2]**1.3 for t in triple)
3083-
skew = _calc_skew(*triple[1], *bounds)
3084-
triple.append((total, -abs(skew)))
3117+
triple[0] = t0 = job_results[triple[0]]
3118+
triple[2] = t2 = job_results[triple[2]]
3119+
t1 = triple[1]
3120+
score = _calc_candidate_score(t0, t1, t2)
3121+
# NOTE: secondary key is `skew` as above
3122+
mid_skew = _calc_skew(*t1, *bounds)
3123+
triple.append((score, -abs(mid_skew)))
30853124
best = max(triples, key=lambda x: x[-1])
30863125
tail_blocks = best[1:2]
30873126

0 commit comments

Comments
 (0)