@@ -2755,6 +2755,41 @@ def _calc_skew(i, j, k, alo, ahi, blo, bhi):
27552755 return apos - bpos
27562756
27572757
2758+ # Exponent p solving c^p == (0.9c)^p + (0.2c)^p: one match of length c scores the same as two matches of lengths 0.9c and 0.2c
2759+ _BALANCE_SCORE_POWER = 1.284320049734199
2760+
2761+
def _calc_candidate_score(block0, block1, block2):
    """Score a candidate made of one to three blocks for balancing.

    The third item (index 2) of each block is taken as its length.
    ``block1`` is the middle block and must have a non-zero length;
    ``block0`` and ``block2`` are the side blocks and only count
    when their length is non-zero.

    The score favours one long match over several short ones, but
    only mildly, so that the search is still able to jump out of
    skewed positions:

        total = ∑ length^p
        where p is such that c^p == (0.9c)^p + (0.2c)^p

    A lone middle block gets exactly that total.  Each additional
    block adds a bonus of a third of the shortest counted length,
    since it has the potential to recurse further to each side.

    Raises ValueError when the middle block has no length.
    """
    left_len, mid_len, right_len = block0[2], block1[2], block2[2]
    if not mid_len:
        raise ValueError('Middle block should not be null')

    # Middle block first, then whichever side blocks are non-empty.
    sizes = [mid_len] + [size for size in (left_len, right_len) if size]

    score = 0
    for size in sizes:
        score += size ** _BALANCE_SCORE_POWER

    # Bonus only applies when at least one side block was counted.
    extra_blocks = len(sizes) - 1
    if extra_blocks:
        score += extra_blocks * min(sizes) / 3
    return score
2791+
2792+
27582793class GestaltSequenceMatcher (SequenceMatcherBase ):
27592794 """
27602795 GestaltSequenceMatcher is a flexible class for comparing pairs
@@ -2831,6 +2866,9 @@ def __init__(self, isjunk=None, a='', b='', balancing=0):
28312866 ### - 3 (only the candidate)
28322867 xx - 4 (xx + yy)
28332868
2869+ NOTE: Selection score is slightly more involved than sum of blocks.
2870+ See `_calc_candidate_score` for details.
2871+
28342872 Thus, for this example, xx is picked.
28352873
28362874 Comparison to SequenceMatcher:
@@ -2882,7 +2920,6 @@ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None, *, quick_only=Fal
28822920
28832921 if bjunk :
28842922 # Extend match to surrounding junk
2885- # [2026-02-07@dgpb]: Note, expanding will happen even when no-match
28862923 block = _expand_block_to_junk (
28872924 bjunk , block , a , b , alo , ahi , blo , bhi , inverse = False )
28882925
@@ -2944,7 +2981,7 @@ def _modifier(self, depth, block, alo, ahi, blo, bhi):
29442981
29452982 Note, one can get `a`, `b`, `automaton`, etc from self
29462983 """
2947- pass
2984+ return None
29482985
29492986 def _get_matching_blocks (self ):
29502987 balancing = self .balancing
@@ -2991,10 +3028,14 @@ def _get_matching_blocks(self):
29913028 if mtype is REPLACEBLOCK :
29923029 i , j , k = data
29933030 data = [(i , j , k )]
3031+
29943032 elif mtype is ANCHORBLOCKS or mtype is RESULTBLOCKS :
2995- data = sorted (data )
3033+ data = list (data )
3034+ if len (data ) > 1 :
3035+ data .sort ()
3036+
29963037 else :
2997- msg = 'Unknown `_modifier` return type: {!r}'
3038+ msg = 'Unknown `self. _modifier(...) ` return type: {!r}'
29983039 raise RuntimeError (msg .format (mtype ))
29993040
30003041 # 2.1.2. Validate
@@ -3005,11 +3046,11 @@ def _get_matching_blocks(self):
30053046 continue
30063047 if not (i0 <= i <= i + k <= ahi ) or not (j0 <= j <= j + k <= bhi ):
30073048 msg = (
3008- '`_modifier` returned invalid block, which '
3049+ '`self. _modifier(...) ` returned invalid block, which '
30093050 'is either out of bounds or overlaps with a nearby one'
3010- 'block={}, while current interval is {}'
3051+ 'block={}, last_bound={}, current_interval= {}'
30113052 )
3012- raise RuntimeError (msg .format (data , bounds ))
3053+ raise RuntimeError (msg .format (data , ( i0 , j0 ), bounds ))
30133054 validated .append ((i , j , k ))
30143055 i0 = i + k
30153056 j0 = j + k
@@ -3071,17 +3112,15 @@ def _get_matching_blocks(self):
30713112 block_list = self .batch_find_longest_match (jobs )
30723113 job_results = dict (zip (jobs , block_list ))
30733114
3074- # 2.2.3. Pick middle block of best tripple
3115+ # 2.2.3. Pick middle block of the best triple
30753116 for triple in triples :
3076- triple [0 ] = job_results [triple [0 ]]
3077- triple [2 ] = job_results [triple [2 ]]
3078- # NOTE: k**1.3 is empirically tuned
3079- # prefers one long match to many small ones
3080- # but not too aggressively, so to be able to jump
3081- # out of skewed positions
3082- total = sum (t [2 ]** 1.3 for t in triple )
3083- skew = _calc_skew (* triple [1 ], * bounds )
3084- triple .append ((total , - abs (skew )))
3117+ triple [0 ] = t0 = job_results [triple [0 ]]
3118+ triple [2 ] = t2 = job_results [triple [2 ]]
3119+ t1 = triple [1 ]
3120+ score = _calc_candidate_score (t0 , t1 , t2 )
3121+ # NOTE: secondary key is `skew` as above
3122+ mid_skew = _calc_skew (* t1 , * bounds )
3123+ triple .append ((score , - abs (mid_skew )))
30853124 best = max (triples , key = lambda x : x [- 1 ])
30863125 tail_blocks = best [1 :2 ]
30873126
0 commit comments