8000 gh-119105: difflib.py Differ.compare is too slow [for degenerate case… · python/cpython@07df93d · GitHub
[go: up one dir, main page]

Skip to content

Commit 07df93d

Browse files
authored
gh-119105: difflib.py Differ.compare is too slow [for degenerate cases] (#119376)
Track all pairs achieving the best ratio in Differ(). This repairs the "very deep recursion and cubic time" bad cases in a way that preserves previous output.
1 parent e3bf538 commit 07df93d

File tree

2 files changed

+89
-59
lines changed

2 files changed

+89
-59
lines changed

Lib/difflib.py

Lines changed: 69 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -908,95 +908,105 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
908908
+ abcdefGhijkl
909909
? ^ ^ ^
910910
"""
911-
912-
# don't synch up unless the lines have a similarity score of at
913-
# least cutoff; best_ratio tracks the best score seen so far
914-
# best_ratio is a tuple storing the best .ratio() seen so far, and
915-
# a measure of how far the indices are from their index range
916-
# midpoints. The latter is used to resolve ratio ties. Favoring
917-
# indices near the midpoints tends to cut the ranges in half. Else,
918-
# if there are many pairs with the best ratio, recursion can grow
919-
# very deep, and runtime becomes cubic. See:
911+
from operator import ge, gt
912+
# Don't synch up unless the lines have a similarity score of at
913+
# least cutoff; best_ratio tracks the best score seen so far.
914+
# Keep track of all index pairs achieving the best ratio and
915+
# deal with them here. Previously only the smallest pair was
916+
# handled here, and if there are many pairs with the best ratio,
917+
# recursion could grow very deep, and runtime cubic. See:
920918
# https://github.com/python/cpython/issues/119105
921-
best_ratio, cutoff = (0.74, 0), 0.75
919+
best_ratio, cutoff = 0.74, 0.75
922920
cruncher = SequenceMatcher(self.charjunk)
923921
eqi, eqj = None, None # 1st indices of equal lines (if any)
922+
# List of index pairs achieving best_ratio. Strictly increasing
923+
# in both index positions.
924+
max_pairs = []
925+
maxi = -1 # `i` index of last pair in max_pairs
924926

925927
# search for the pair that matches best without being identical
926-
# (identical lines must be junk lines, & we don't want to synch up
927-
# on junk -- unless we have to)
928-
amid = (alo + ahi - 1) / 2
929-
bmid = (blo + bhi - 1) / 2
928+
# (identical lines must be junk lines, & we don't want to synch
929+
# up on junk -- unless we have to)
930+
crqr = cruncher.real_quick_ratio
931+
cqr = cruncher.quick_ratio
932+
cr = cruncher.ratio
930933
for j in range(blo, bhi):
931934
bj = b[j]
932935
cruncher.set_seq2(bj)
933-
weight_j = - abs(j - bmid)
936+
# Find new best, if possible. Else search for the smallest i
937+
# (if any) > maxi that equals the best ratio
938+
search_equal = True
934939
for i in range(alo, ahi):
935940
ai = a[i]
936941
if ai == bj:
937942
if eqi is None:
938943
eqi, eqj = i, j
939944
continue
940945
cruncher.set_seq1(ai)
941-
# weight is used to balance the recursion by prioritizing
942-
# i and j in the middle of their ranges
943-
weight = weight_j - abs(i - amid)
944946
# computing similarity is expensive, so use the quick
945947
# upper bounds first -- have seen this speed up messy
946948
# compares by a factor of 3.
947-
# note that ratio() is only expensive to compute the first
948-
# time it's called on a sequence pair; the expensive part
949-
# of the computation is cached by cruncher
950-
if (cruncher.real_quick_ratio(), weight) > best_ratio and \
951-
(cruncher.quick_ratio(), weight) > best_ratio and \
952-
(cruncher.ratio(), weight) > best_ratio:
953-
best_ratio, best_i, best_j = (cruncher.ratio(), weight), i, j
954-
best_ratio, _ = best_ratio
949+
cmp = ge if search_equal and i > maxi else gt
950+
if (cmp(crqr(), best_ratio)
951+
and cmp(cqr(), best_ratio)
952+
and cmp((ratio := cr()), best_ratio)):
953+
if ratio > best_ratio:
954+
best_ratio = ratio
955+
max_pairs.clear()
956+
else:
957+
assert best_ratio == ratio and search_equal
958+
assert i > maxi
959+
max_pairs.append((i, j))
960+
maxi = i
961+
search_equal = False
955962
if best_ratio < cutoff:
963+
assert not max_pairs
956964
# no non-identical "pretty close" pair
957965
if eqi is None:
958966
# no identical pair either -- treat it as a straight replace
959967
yield from self._plain_replace(a, alo, ahi, b, blo, bhi)
960968
return
961969
# no close pair, but an identical pair -- synch up on that
962-
best_i, best_j, best_ratio = eqi, eqj, 1.0
970+
max_pairs = [(eqi, eqj)]
963971
else:
964972
# there's a close pair, so forget the identical pair (if any)
973+
assert max_pairs
965974
eqi = None
966975

967-
# a[best_i] very similar to b[best_j]; eqi is None iff they're not
968-
# identical
969-
970-
# pump out diffs from before the synch point
971-
yield from self._fancy_helper(a, alo, best_i, b, blo, best_j)
972-
973-
# do intraline marking on the synch pair
974-
aelt, belt = a[best_i], b[best_j]
975-
if eqi is None:
976-
# pump out a '-', '?', '+', '?' quad for the synched lines
977-
atags = btags = ""
978-
cruncher.set_seqs(aelt, belt)
979-
for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
980-
la, lb = ai2 - ai1, bj2 - bj1
981-
if tag == 'replace':
982-
atags += '^' * la
983-
btags += '^' * lb
984-
elif tag == 'delete':
985-
atags += '-' * la
986-
elif tag == 'insert':
987-
btags += '+' * lb
988-
elif tag == 'equal':
989-
atags += ' ' * la
990-
btags += ' ' * lb
991-
else:
992-
raise ValueError('unknown tag %r' % (tag,))
993-
yield from self._qformat(aelt, belt, atags, btags)
994-
else:
995-
# the synch pair is identical
996-
yield ' ' + aelt
976+
last_i, last_j = alo, blo
977+
for this_i, this_j in max_pairs:
978+
# pump out diffs from before the synch point
979+
yield from self._fancy_helper(a, last_i, this_i,
980+
b, last_j, this_j)
981+
# do intraline marking on the synch pair
982+
aelt, belt = a[this_i], b[this_j]
983+
if eqi is None:
984+
# pump out a '-', '?', '+', '?' quad for the synched lines
985+
atags = btags = ""
986+
cruncher.set_seqs(aelt, belt)
987+
for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
988+
la, lb = ai2 - ai1, bj2 - bj1
989+
if tag == 'replace':
990+
atags += '^' * la
991+
btags += '^' * lb
992+
elif tag == 'delete':
993+
atags += '-' * la
994+
elif tag == 'insert':
995+
btags += '+' * lb
996+
elif tag == 'equal':
997+
atags += ' ' * la
998+
btags += ' ' * lb
999+
else:
1000+
raise ValueError('unknown tag %r' % (tag,))
1001+
yield from self._qformat(aelt, belt, atags, btags)
1002+
else:
1003+
# the synch pair is identical
1004+
yield ' ' + aelt
1005+
last_i, last_j = this_i + 1, this_j + 1
9971006

998-
# pump out diffs from after the synch point
999-
yield from self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)
1007+
# pump out diffs from after the last synch point
1008+
yield from self._fancy_helper(a, last_i, ahi,
1009+
b, last_j, bhi)
10001010

10011011
def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
10021012
g = []

Lib/test/test_difflib.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,26 @@ def test_make_file_usascii_charset_with_nonascii_input(self):
272272
self.assertIn('content="text/html; charset=us-ascii"', output)
273273
self.assertIn('&#305;mpl&#305;c&#305;t', output)
274274

275+
class TestDiffer(unittest.TestCase):
276+
def test_close_matches_aligned(self):
277+
# Of the 4 closely matching pairs, we want 1 to match with 3,
278+
# and 2 with 4, to align with a "top to bottom" mental model.
279+
a = ["cat\n", "dog\n", "close match 1\n", "close match 2\n"]
280+
b = ["close match 3\n", "close match 4\n", "kitten\n", "puppy\n"]
281+
m = difflib.Differ().compare(a, b)
282+
self.assertEqual(list(m),
283+
['- cat\n',
284+
'- dog\n',
285+
'- close match 1\n',
286+
'? ^\n',
287+
'+ close match 3\n',
288+
'? ^\n',
289+
'- close match 2\n',
290+
'? ^\n',
291+
'+ close match 4\n',
292+
'? ^\n',
293+
'+ kitten\n',
294+
'+ puppy\n'])
275295

276296
class TestOutputFormat(unittest.TestCase):
277297
def test_tab_delimiter(self):

0 commit comments

Comments
 (0)
0