Skip to content

Commit 3b2d26b

Browse files
committed
docstrings and variable names
1 parent 081e18e commit 3b2d26b

File tree

1 file changed

+91
-50
lines changed

1 file changed

+91
-50
lines changed

Lib/difflib.py

Lines changed: 91 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -2301,7 +2301,8 @@ def restore(delta, which):
23012301

23022302

23032303
class _LCSUBAutomaton:
2304-
"""Suffix Automaton for finding longest common substring.
2304+
"""
2305+
Suffix Automaton for finding longest common substring.
23052306
23062307
Complexity:
23072308
T: O(n1 + n2) ~ n1 + 5 × n2
@@ -2381,9 +2382,10 @@ def build(self, start2=0, stop2=None):
23812382
self.cache = key
23822383

23832384
def find(self, seq1, start1=0, stop1=None, start2=0, stop2=None):
2384-
"""Find leftmost longest match
2385-
Firstly, it will be leftmost in seq1
2386-
Secondly, it will be leftmost in seq2 if more than one occurrence
2385+
"""
2386+
Find leftmost longest match
2387+
Firstly, it will be leftmost in seq1
2388+
Secondly, it will be leftmost in seq2 if more than one occurrence
23872389
23882390
Returns:
23892391
match: (start_in_seq1, start_in_seq2, match_length)
@@ -2396,9 +2398,11 @@ def find(self, seq1, start1=0, stop1=None, start2=0, stop2=None):
23962398
return res
23972399

23982400
def batchfind(self, seq1, bounds_list):
2399-
"""Performance method for many `find` calls
2400-
It calls `find` in order that aims to minimize builds needed
2401-
Also, does not evaluate same range twice
2401+
"""
2402+
Performance method for many `find` calls
2403+
It calls `find` in order that aims to minimize builds needed
2404+
Also, does not evaluate same range twice
2405+
24022406
Args:
24032407
bounds_list : list[tuple[int, int, int, int]]
24042408
list of tuples: (start1, stop1, start2, stop2)
@@ -2423,11 +2427,12 @@ def batchfind(self, seq1, bounds_list):
24232427
# ---------------------------------
24242428

24252429
def _try_find(self, seq1, start1, stop1, start2, stop2):
2426-
"""Attempts to find match without building
2427-
Querying in exactly the same range will always succeed
2428-
Also, it might be possible if (start2, stop2) is within cached range
2430+
"""
2431+
Attempts to find match without building
2432+
Querying in exactly the same range will always succeed
2433+
Also, it might be possible if (start2, stop2) is within cached range
24292434
2430-
returns None on fail
2435+
returns None on fail
24312436
"""
24322437
if start1 >= stop1 or start2 >= stop2:
24332438
return (start1, start2, 0)
@@ -2447,9 +2452,10 @@ def _try_find(self, seq1, start1, stop1, start2, stop2):
24472452
return (e1 + 1 - k, start_in_seq2, k)
24482453

24492454
def _find(self, seq1, start1, stop1, start2, stop2):
2450-
"""Returns lefmost longest match
2451-
Does not attempt to retrieve from inexactly built range
2452-
Always returns an answer
2455+
"""
2456+
Returns leftmost longest match
2457+
Does not attempt to retrieve from inexactly built range
2458+
Always returns an answer
24532459
"""
24542460
if start1 >= stop1 or start2 >= stop2:
24552461
return (start1, start2, 0)
@@ -2599,7 +2605,9 @@ def _build(self, start2, stop2):
25992605
return nodes
26002606

26012607
def _finditer(self, seq1, start1, stop1, best=False):
2602-
"""Core scanning routine
2608+
"""
2609+
Core scanning routine
2610+
26032611
Args:
26042612
best : bool
26052613
False - return all matches, including non-maximal
@@ -2691,8 +2699,10 @@ def __repr__(self):
26912699

26922700

26932701
def _calc_skew(i, j, k, alo, ahi, blo, bhi):
2694-
"""Difference in normalized positions of block mid-points
2695-
Returns skew : float, where -1 < skew < 1
2702+
"""
2703+
Difference in normalized positions of block mid-points
2704+
2705+
Returns skew : float, where -1 < skew < 1
26962706
"""
26972707
k_div_2 = k / 2
26982708
apos = (i + k_div_2 - alo) / (ahi - alo)
@@ -2705,18 +2715,19 @@ def _calc_skew(i, j, k, alo, ahi, blo, bhi):
27052715

27062716

27072717
def _calc_candidate_score(block0, block1, block2):
2708-
"""Calculates the score for 1-3 block candidates for balancing procedure
2718+
"""
2719+
Calculates the score for 1-3 block candidates for balancing procedure
27092720
27102721
Score is calculated so that long match is preferred
2711-
to many small ones but not too aggressively,
2712-
so to be able to jump out of skewed positions.
2722+
to many small ones but not too aggressively,
2723+
so to be able to jump out of skewed positions.
27132724
2714-
total = ∑ length^p
2715-
where p is such that c^p == (0.9c)^p + (0.2c)^p
2725+
total = ∑ length^p
2726+
where p is such that c^p == (0.9c)^p + (0.2c)^p
27162727
2717-
If only 1 block found, it is a definitive score.
2718-
Otherwise, it gets bonus for each additional block
2719-
as it has poential to recurse further to each side
2728+
If only 1 block found, it is a definitive score.
2729+
Otherwise, it gets bonus for each additional block
2730+
as it has potential to recurse further to each side
27202731
"""
27212732
k1 = block1[2]
27222733
if not k1:
@@ -2748,10 +2759,10 @@ class GestaltSequenceMatcher(SequenceMatcherBase):
27482759
27492760
However, while `SequenceMatcher` is able to obtain same result,
27502761
it is only practical to use with `autojunk` set to False due to
2751-
quadratic worst case complexity.
2762+
quadratic worst case complexity of Longest Common Substring algorithm.
27522763
27532764
`GestaltSequenceMatcher`, on the other hand, implements Suffix Automaton,
2754-
which has guaranteed O(n) complexity, making it possible to use exact
2765+
which has O(n) complexity guaranteed, making it possible to use exact
27552766
calculation on long sequences.
27562767
27572768
Furthermore, `GestaltSequenceMatcher` has `balancing` parameter.
@@ -2760,6 +2771,20 @@ class GestaltSequenceMatcher(SequenceMatcherBase):
27602771
It does so by sometimes selecting shorter matches by looking 1 step ahead.
27612772
It produces more concise diffs with more lines matched, while retaining
27622773
block-oriented nature.
2774+
2775+
Time Complexity:
2776+
find_longest_match : O(n)
2777+
get_matching_blocks : O(n) average for common diff case
2778+
O(n^2) worst case.
2779+
2780+
Example of a worst-case input for `get_matching_blocks`:
2781+
chars = ['ab'[i % 2] for i in range(100)]
2782+
seq1 = '+'.join(chars)
2783+
seq2 = '-'.join(chars)
2784+
2785+
Space Complexity:
2786+
find_longest_match: c × O(n), c ~ 3x (compared to `SequenceMatcher`)
2787+
get_matching_blocks: c × O(n), c ~ 3x (compared to `SequenceMatcher`)
27632788
"""
27642789

27652790
def __init__(self, isjunk=None, a='', b='', balancing=0):
@@ -2826,11 +2851,11 @@ def __init__(self, isjunk=None, a='', b='', balancing=0):
28262851
Examples:
28272852
>>> seq1 = 'aaaa_aaaa_bbbbb'
28282853
>>> seq2 = 'bbbbb-aaaa-aaaa'
2829-
>>> m1 = GestaltSequenceMatcher(None, seq1, seq2)
2830-
>>> m2 = GestaltSequenceMatcher(None, seq1, seq2, balancing=2/3)
2831-
>>> list(map(tuple, m1.get_matching_blocks()))
2854+
>>> gsm1 = GestaltSequenceMatcher(None, seq1, seq2)
2855+
>>> gsm2 = GestaltSequenceMatcher(None, seq1, seq2, balancing=2/3)
2856+
>>> list(map(tuple, gsm1.get_matching_blocks()))
28322857
[(10, 0, 5), (15, 15, 0)]
2833-
>>> list(map(tuple, m2.get_matching_blocks()))
2858+
>>> list(map(tuple, gsm2.get_matching_blocks()))
28342859
[(0, 6, 4), (5, 11, 4), (15, 15, 0)]
28352860
"""
28362861
balancing = float(balancing)
@@ -2848,14 +2873,15 @@ def _prepare_seq2(self):
28482873
self.automaton = _LCSUBAutomaton(b, junk=bjunk)
28492874

28502875
def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None, *, quick_only=False):
2851-
"""Find longest matching block in a[alo:ahi] and b[blo:bhi].
2852-
By default it will find the longest match in the entirety of a and b.
2876+
"""
2877+
Find longest matching block in a[alo:ahi] and b[blo:bhi].
2878+
By default it will find the longest match in the entirety of a and b.
28532879
2854-
Look up docstring of SequenceMatcher.find_longest_match
2855-
for more information.
2880+
Look up docstring of SequenceMatcher.find_longest_match
2881+
for more information.
28562882
2857-
The only difference is `quick_only` argument, which if set to True
2858-
might not return a value if not possible with current build
2883+
The only difference is `quick_only` argument, which if set to True
2884+
might not return a value if not possible with current build
28592885
"""
28602886
a, b, bjunk = self.a, self.b, self.bjunk
28612887
automaton = self.automaton
@@ -2873,9 +2899,11 @@ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None, *, quick_only=Fal
28732899
return Match._make(block)
28742900

28752901
def batch_find_longest_match(self, bounds_list):
2876-
"""Performance method for many `find_longest_match` calls
2877-
It calls `find` in order that aims to minimize builds needed
2878-
Also, does not evaluate same range twice
2902+
"""
2903+
Performance method for many `find_longest_match` calls
2904+
It calls `find` in order that aims to minimize builds needed
2905+
Also, does not evaluate same range twice
2906+
28792907
Args:
28802908
bounds_list : list[tuple[int, int, int, int]]
28812909
list of tuples: (alo, ahi, blo, bhi)
@@ -2890,11 +2918,12 @@ def batch_find_longest_match(self, bounds_list):
28902918
yield _make_match(block)
28912919

28922920
def _modifier(self, depth, block, alo, ahi, blo, bhi):
2893-
"""An entry point for intercepting `_get_matching_blocks` algorithm
2894-
It is subject to be implemented by derived class.
2895-
It can be used for:
2896-
a) quick peak into what algorithm is doing
2897-
b) modification of divide-and-conquer algorithm
2921+
"""
2922+
An entry point for intercepting `_get_matching_blocks` algorithm
2923+
It is subject to be implemented by derived class.
2924+
It can be used for:
2925+
a) quick peek into what algorithm is doing
2926+
b) modification of divide-and-conquer algorithm
28982927
28992928
Args:
29002929
depth : int
@@ -2931,6 +2960,18 @@ def _modifier(self, depth, block, alo, ahi, blo, bhi):
29312960
return None
29322961

29332962
def _get_matching_blocks(self):
2963+
"""
2964+
Return list of triples describing matching subsequences.
2965+
2966+
Each triple is of the form (i, j, n), s.t. a[i:i+n] == b[j:j+n].
2967+
2968+
The last triple is a dummy, (len(a), len(b), 0), and is the only
2969+
triple with n==0.
2970+
2971+
>>> gsm = GestaltSequenceMatcher(None, "abxcd", "abcd")
2972+
>>> list(gsm.get_matching_blocks())
2973+
[Match(a=0, b=0, size=2), Match(a=3, b=2, size=2), Match(a=5, b=4, size=0)]
2974+
"""
29342975
balancing = self.balancing
29352976
alo, ahi, blo, bhi = 0, len(self.a), 0, len(self.b)
29362977
if alo >= ahi or blo >= bhi:
@@ -3061,12 +3102,12 @@ def _get_matching_blocks(self):
30613102

30623103
# 2.2.3. Pick middle block of the best triple
30633104
for triple in triples:
3064-
triple[0] = t0 = job_results[triple[0]]
3065-
triple[2] = t2 = job_results[triple[2]]
3066-
t1 = triple[1]
3067-
score = _calc_candidate_score(t0, t1, t2)
3105+
triple[0] = block0 = job_results[triple[0]]
3106+
triple[2] = block2 = job_results[triple[2]]
3107+
block1 = triple[1]
3108+
score = _calc_candidate_score(block0, block1, block2)
30683109
# NOTE: secondary key is `skew` as above
3069-
mid_skew = _calc_skew(*t1, *bounds)
3110+
mid_skew = _calc_skew(*block1, *bounds)
30703111
triple.append((score, -abs(mid_skew)))
30713112
best = max(triples, key=lambda x: x[-1])
30723113
tail_blocks = best[1:2]

0 commit comments

Comments
 (0)