@@ -2301,7 +2301,8 @@ def restore(delta, which):
23012301
23022302
23032303class _LCSUBAutomaton :
2304- """Suffix Automaton for finding longest common substring.
2304+ """
2305+ Suffix Automaton for finding longest common substring.
23052306
23062307 Complexity:
23072308 T: O(n1 + n2) ~ n1 + 5 × n2
@@ -2381,9 +2382,10 @@ def build(self, start2=0, stop2=None):
23812382 self .cache = key
23822383
23832384 def find (self , seq1 , start1 = 0 , stop1 = None , start2 = 0 , stop2 = None ):
2384- """Find leftmost longest match
2385- Firstly, it will be leftmost in seq1
2386- Secondly, it will be leftmost in seq2 if more than one occurrence
2385+ """
2386+ Find leftmost longest match
2387+ Firstly, it will be leftmost in seq1
2388+ Secondly, it will be leftmost in seq2 if more than one occurrence
23872389
23882390 Returns:
23892391 match: (start_in_seq1, start_in_seq2, match_length)
@@ -2396,9 +2398,11 @@ def find(self, seq1, start1=0, stop1=None, start2=0, stop2=None):
23962398 return res
23972399
23982400 def batchfind (self , seq1 , bounds_list ):
2399- """Performance method for many `find` calls
2400- It calls `find` in order that aims to minimize builds needed
2401- Also, does not evaluate same range twice
2401+ """
2402+ Performance method for many `find` calls
2403+ It calls `find` in order that aims to minimize builds needed
2404+ Also, does not evaluate same range twice
2405+
24022406 Args:
24032407 bounds_list : list[tuple[int, int, int, int]]
24042408 list of tuples: (start1, stop1, start2, stop2)
@@ -2423,11 +2427,12 @@ def batchfind(self, seq1, bounds_list):
24232427 # ---------------------------------
24242428
24252429 def _try_find (self , seq1 , start1 , stop1 , start2 , stop2 ):
2426- """Attempts to find match without building
2427- Querying in exactly the same range will always succeed
2428- Also, it might be possible if (start2, stop2) is within cached range
2430+ """
2431+ Attempts to find match without building
2432+ Querying in exactly the same range will always succeed
2433+ Also, it might be possible if (start2, stop2) is within cached range
24292434
2430- returns None on fail
2435+ returns None on fail
24312436 """
24322437 if start1 >= stop1 or start2 >= stop2 :
24332438 return (start1 , start2 , 0 )
@@ -2447,9 +2452,10 @@ def _try_find(self, seq1, start1, stop1, start2, stop2):
24472452 return (e1 + 1 - k , start_in_seq2 , k )
24482453
24492454 def _find (self , seq1 , start1 , stop1 , start2 , stop2 ):
2450- """Returns lefmost longest match
2451- Does not attempt to retrieve from inexactly built range
2452- Always returns an answer
2455+ """
2456+ Returns leftmost longest match
2457+ Does not attempt to retrieve from inexactly built range
2458+ Always returns an answer
24532459 """
24542460 if start1 >= stop1 or start2 >= stop2 :
24552461 return (start1 , start2 , 0 )
@@ -2599,7 +2605,9 @@ def _build(self, start2, stop2):
25992605 return nodes
26002606
26012607 def _finditer (self , seq1 , start1 , stop1 , best = False ):
2602- """Core scanning routine
2608+ """
2609+ Core scanning routine
2610+
26032611 Args:
26042612 best : bool
26052613 False - return all matches, including non-maximal
@@ -2691,8 +2699,10 @@ def __repr__(self):
26912699
26922700
26932701def _calc_skew (i , j , k , alo , ahi , blo , bhi ):
2694- """Difference in normalized positions of block mid-points
2695- Returns skew : float, where -1 < skew < 1
2702+ """
2703+ Difference in normalized positions of block mid-points
2704+
2705+ Returns skew : float, where -1 < skew < 1
26962706 """
26972707 k_div_2 = k / 2
26982708 apos = (i + k_div_2 - alo ) / (ahi - alo )
@@ -2705,18 +2715,19 @@ def _calc_skew(i, j, k, alo, ahi, blo, bhi):
27052715
27062716
27072717def _calc_candidate_score (block0 , block1 , block2 ):
2708- """Calculates the score for 1-3 block candidates for balancing procedure
2718+ """
2719+ Calculates the score for 1-3 block candidates for balancing procedure
27092720
27102721 Score is calculated so that long match is preferred
2711- to many small ones but not too aggressively,
2712- so to be able to jump out of skewed positions.
2722+ to many small ones but not too aggressively,
2723+ so to be able to jump out of skewed positions.
27132724
2714- total = ∑ length^p
2715- where p is such that c^p == (0.9c)^p + (0.2c)^p
2725+ total = ∑ length^p
2726+ where p is such that c^p == (0.9c)^p + (0.2c)^p
27162727
2717- If only 1 block found, it is a definitive score.
2718- Otherwise, it gets bonus for each additional block
2719- as it has poential to recurse further to each side
2728+ If only 1 block found, it is a definitive score.
2729+ Otherwise, it gets bonus for each additional block
2730+ as it has potential to recurse further to each side
27202731 """
27212732 k1 = block1 [2 ]
27222733 if not k1 :
@@ -2748,10 +2759,10 @@ class GestaltSequenceMatcher(SequenceMatcherBase):
27482759
27492760 However, while `SequenceMatcher` is able to obtain same result,
27502761 it is only practical to use with `autojunk` set to False due to
2751- quadratic worst case complexity.
2762+ quadratic worst case complexity of Longest Common Substring algorithm.
27522763
27532764 `GestaltSequenceMatcher`, on the other hand, implements Suffix Automaton,
2754- which has guaranteed O(n) complexity, making it possible to use exact
2765+ which has O(n) complexity guaranteed, making it possible to use exact
27552766 calculation on long sequences.
27562767
27572768 Furthermore, `GestaltSequenceMatcher` has `balancing` parameter.
@@ -2760,6 +2771,20 @@ class GestaltSequenceMatcher(SequenceMatcherBase):
27602771 It does so by sometimes selecting shorter matches by looking 1 step ahead.
27612772 It produces more concise diffs with more lines matched, while retaining
27622773 block-oriented nature.
2774+
2775+ Time Complexity:
2776+ find_longest_match : O(n)
2777+ get_matching_blocks : O(n) average for common diff case
2778+ O(n^2) worst case.
2779+
2780+ Example of a worst-case input for `get_matching_blocks`:
2781+ chars = ['ab'[i % 2] for i in range(100)]
2782+ seq1 = '+'.join(chars)
2783+ seq2 = '-'.join(chars)
2784+
2785+ Space Complexity:
2786+ find_longest_match: c × O(n), c ~ 3x (compared to `SequenceMatcher`)
2787+ get_matching_blocks: c × O(n), c ~ 3x (compared to `SequenceMatcher`)
27632788 """
27642789
27652790 def __init__ (self , isjunk = None , a = '' , b = '' , balancing = 0 ):
@@ -2826,11 +2851,11 @@ def __init__(self, isjunk=None, a='', b='', balancing=0):
28262851 Examples:
28272852 >>> seq1 = 'aaaa_aaaa_bbbbb'
28282853 >>> seq2 = 'bbbbb-aaaa-aaaa'
2829- >>> m1 = GestaltSequenceMatcher(None, seq1, seq2)
2830- >>> m2 = GestaltSequenceMatcher(None, seq1, seq2, balancing=2/3)
2831- >>> list(map(tuple, m1 .get_matching_blocks()))
2854+ >>> gsm1 = GestaltSequenceMatcher(None, seq1, seq2)
2855+ >>> gsm2 = GestaltSequenceMatcher(None, seq1, seq2, balancing=2/3)
2856+ >>> list(map(tuple, gsm1 .get_matching_blocks()))
28322857 [(10, 0, 5), (15, 15, 0)]
2833- >>> list(map(tuple, m2 .get_matching_blocks()))
2858+ >>> list(map(tuple, gsm2 .get_matching_blocks()))
28342859 [(0, 6, 4), (5, 11, 4), (15, 15, 0)]
28352860 """
28362861 balancing = float (balancing )
@@ -2848,14 +2873,15 @@ def _prepare_seq2(self):
28482873 self .automaton = _LCSUBAutomaton (b , junk = bjunk )
28492874
28502875 def find_longest_match (self , alo = 0 , ahi = None , blo = 0 , bhi = None , * , quick_only = False ):
2851- """Find longest matching block in a[alo:ahi] and b[blo:bhi].
2852- By default it will find the longest match in the entirety of a and b.
2876+ """
2877+ Find longest matching block in a[alo:ahi] and b[blo:bhi].
2878+ By default it will find the longest match in the entirety of a and b.
28532879
2854- Look up docstring of SequenceMatcher.find_longest_match
2855- for more information.
2880+ Look up docstring of SequenceMatcher.find_longest_match
2881+ for more information.
28562882
2857- The only difference is `quick_only` argument, which if set to True
2858- might not return a value if not possible with current build
2883+ The only difference is `quick_only` argument, which if set to True
2884+ might not return a value if not possible with current build
28592885 """
28602886 a , b , bjunk = self .a , self .b , self .bjunk
28612887 automaton = self .automaton
@@ -2873,9 +2899,11 @@ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None, *, quick_only=Fal
28732899 return Match ._make (block )
28742900
28752901 def batch_find_longest_match (self , bounds_list ):
2876- """Performance method for many `find_longest_match` calls
2877- It calls `find` in order that aims to minimize builds needed
2878- Also, does not evaluate same range twice
2902+ """
2903+ Performance method for many `find_longest_match` calls
2904+ It calls `find` in order that aims to minimize builds needed
2905+ Also, does not evaluate same range twice
2906+
28792907 Args:
28802908 bounds_list : list[tuple[int, int, int, int]]
28812909 list of tuples: (alo, ahi, blo, bhi)
@@ -2890,11 +2918,12 @@ def batch_find_longest_match(self, bounds_list):
28902918 yield _make_match (block )
28912919
28922920 def _modifier (self , depth , block , alo , ahi , blo , bhi ):
2893- """An entry point for intercepting `_get_matching_blocks` algorithm
2894- It is subject to be implemented by derived class.
2895- It can be used for:
2896- a) quick peak into what algorithm is doing
2897- b) modification of divide-and-conquer algorithm
2921+ """
2922+ An entry point for intercepting `_get_matching_blocks` algorithm
2923+ It is subject to be implemented by derived class.
2924+ It can be used for:
2925+ a) quick peek into what algorithm is doing
2926+ b) modification of divide-and-conquer algorithm
28982927
28992928 Args:
29002929 depth : int
@@ -2931,6 +2960,18 @@ def _modifier(self, depth, block, alo, ahi, blo, bhi):
29312960 return None
29322961
29332962 def _get_matching_blocks (self ):
2963+ """
2964+ Return list of triples describing matching subsequences.
2965+
2966+ Each triple is of the form (i, j, n), s.t. a[i:i+n] == b[j:j+n].
2967+
2968+ The last triple is a dummy, (len(a), len(b), 0), and is the only
2969+ triple with n==0.
2970+
2971+ >>> gsm = GestaltSequenceMatcher(None, "abxcd", "abcd")
2972+ >>> list(gsm.get_matching_blocks())
2973+ [Match(a=0, b=0, size=2), Match(a=3, b=2, size=2), Match(a=5, b=4, size=0)]
2974+ """
29342975 balancing = self .balancing
29352976 alo , ahi , blo , bhi = 0 , len (self .a ), 0 , len (self .b )
29362977 if alo >= ahi or blo >= bhi :
@@ -3061,12 +3102,12 @@ def _get_matching_blocks(self):
30613102
30623103 # 2.2.3. Pick middle block of the best tripple
30633104 for triple in triples :
3064- triple [0 ] = t0 = job_results [triple [0 ]]
3065- triple [2 ] = t2 = job_results [triple [2 ]]
3066- t1 = triple [1 ]
3067- score = _calc_candidate_score (t0 , t1 , t2 )
3105+ triple [0 ] = block0 = job_results [triple [0 ]]
3106+ triple [2 ] = block2 = job_results [triple [2 ]]
3107+ block1 = triple [1 ]
3108+ score = _calc_candidate_score (block0 , block1 , block2 )
30683109 # NOTE: secondary key is `skew` as above
3069- mid_skew = _calc_skew (* t1 , * bounds )
3110+ mid_skew = _calc_skew (* block1 , * bounds )
30703111 triple .append ((score , - abs (mid_skew )))
30713112 best = max (triples , key = lambda x : x [- 1 ])
30723113 tail_blocks = best [1 :2 ]
0 commit comments