8000 gh-119105: Differ.compare is too slow [for degenerate cases] by tim-one · Pull Request #119492 · python/cpython · GitHub
[go: up one dir, main page]

Skip to content

gh-119105: Differ.compare is too slow [for degenerate cases] #119492

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
May 25, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
WIP - many problems.
  • Loading branch information
tim-one committed May 23, 2024
commit e77d0fe0c7d87dcc27242e0dc704bb4ffa698573
64 changes: 23 additions & 41 deletions Lib/difflib.py
Original file line number Diff line number Diff line change
Expand Up @@ -908,21 +908,16 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
+ abcdefGhijkl
? ^ ^ ^
"""
from operator import ge, gt
# Don't synch up unless the lines have a similarity score of at
# least cutoff; best_ratio tracks the best score seen so far.
# Keep track of all index pairs achieving the best ratio and
# deal with them here. Previously only the smallest pair was
# handled here, and if there are many pairs with the best ratio,
# recursion could grow very deep, and runtime cubic. See:
# Don't synch up unless the lines have a similarity score above
# cutoff. Previously only the smallest pair was handled here,
# and if there are many pairs with the best ratio, recursion
# could grow very deep, and runtime cubic. See:
# https://github.com/python/cpython/issues/119105
best_ratio, cutoff = 0.74, 0.75
cutoff = 0.74999
cruncher = SequenceMatcher(self.charjunk)
eqi, eqj = None, None # 1st indices of equal lines (if any)
# List of index pairs achieving best_ratio. Strictly increasing
# in both index positions.
max_pairs = []
maxi = -1 # `i` index of last pair in max_pairs

# search for the pair that matches best without being identical
# (identical lines must be junk lines, & we don't want to synch
Expand All @@ -935,52 +930,39 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
cruncher.set_seq2(bj)
# Find new best, if possible. Else search for the smallest i
# (if any) > maxi that equals the best ratio
search_equal = True
best_ratio = cutoff
best_i = None
for i in range(alo, ahi):
ai = a[i]
if ai == bj:
if eqi is None:
eqi, eqj = i, j
if best_i is None:
best_i = i
continue
cruncher.set_seq1(ai)
# computing similarity is expensive, so use the quick
# upper bounds first -- have seen this speed up messy
# compares by a factor of 3.
cmp = ge if search_equal and i > maxi else gt
if (cmp(crqr(), best_ratio)
and cmp(cqr(), best_ratio)
and cmp((ratio := cr()), best_ratio)):
if ratio > best_ratio:
best_ratio = ratio
max_pairs.clear()
else:
assert best_ratio == ratio and search_equal
assert i > maxi
max_pairs.append((i, j))
maxi = i
search_equal = False
if best_ratio < cutoff:
assert not max_pairs
# no non-identical "pretty close" pair
if eqi is None:
# no identical pair either -- treat it as a straight replace
yield from self._plain_replace(a, alo, ahi, b, blo, bhi)
return
# no close pair, but an identical pair -- synch up on that
max_pairs = [(eqi, eqj)]
else:
# there's a close pair, so forget the identical pair (if any)
assert max_pairs
eqi = None

if (crqr() > best_ratio
and cqr() > best_ratio
and (ratio := cr()) > best_ratio):
best_ratio = ratio
best_i = i
if best_i is not None:
#assert not max_pairs or best_i > max_pairs[-1][0], (max_pairs, i, j)
if not max_pairs or best_i > max_pairs[-1][0]:
max_pairs.append((best_i, j))
if not max_pairs:
yield from self._plain_replace(a, alo, ahi, b, blo, bhi)

print(max_pairs)
last_i, last_j = alo, blo
for this_i, this_j in max_pairs:
# pump out diffs from before the synch point
yield from self._fancy_helper(a, last_i, this_i,
b, last_j, this_j)
# do intraline marking on the synch pair
aelt, belt = a[this_i], b[this_j]
if eqi is None:
if aelt != belt:
# pump out a '-', '?', '+', '?' quad for the synched lines
atags = btags = ""
cruncher.set_seqs(aelt, belt)
Expand Down
0