From 9bb454c80efdfd493257c67e71cafd76cf048ec9 Mon Sep 17 00:00:00 2001 From: pulkin Date: Fri, 17 May 2024 22:59:32 +0200 Subject: [PATCH 01/10] difflib.py: fix recursion branching for degenerate cases --- Lib/difflib.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/Lib/difflib.py b/Lib/difflib.py index ba0b256969ebff..b5563dd9480cdb 100644 --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -915,12 +915,24 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi): cruncher = SequenceMatcher(self.charjunk) eqi, eqj = None, None # 1st indices of equal lines (if any) + max_len_a = max(map(len, a)) + max_len_b = max(map(len, b)) + # twice epsilon can be safely added to distinguish equal ratios + # given ratios are 2 * integer / (len_i + len_j) + # the smallest possible non-zero difference between two arbitrary + # ratios is no less than twice the epsilon + epsilon = 0.99 / (max_len_a + max_len_b) ** 2 + # we use sub-epsilon weights to promote otherwise equal ratios + # that split sequences closer to their midpoints + # this way, we balance the recursion tree in degenerate cases + # search for the pair that matches best without being identical # (identical lines must be junk lines, & we don't want to synch up # on junk -- unless we have to) for j in range(blo, bhi): bj = b[j] cruncher.set_seq2(bj) + weight_b = min(j - blo, bhi - 1 - j) / (bhi - blo) for i in range(alo, ahi): ai = a[i] if ai == bj: @@ -928,16 +940,18 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi): eqi, eqj = i, j continue cruncher.set_seq1(ai) + weight_ab = weight_b + min(i - alo, ahi - 1 - i) / (ahi - alo) + weight_ab *= epsilon # computing similarity is expensive, so use the quick # upper bounds first -- have seen this speed up messy # compares by a factor of 3. # note that ratio() is only expensive to compute the first # time it's called on a sequence pair; the expensive part # of the computation is cached by cruncher - if cruncher.real_quick_ratio() > best_ratio and \ - cruncher.quick_ratio() > best_ratio and \ - cruncher.ratio() > best_ratio: - best_ratio, best_i, best_j = cruncher.ratio(), i, j + if cruncher.real_quick_ratio() + weight_ab > best_ratio and \ + cruncher.quick_ratio() + weight_ab > best_ratio and \ + cruncher.ratio() + weight_ab > best_ratio: + best_ratio, best_i, best_j = cruncher.ratio() + weight_ab, i, j if best_ratio < cutoff: # no non-identical "pretty close" pair if eqi is None: From a159954afc78d44ba18a172eb845fdc84cc894e1 Mon Sep 17 00:00:00 2001 From: pulkin Date: Fri, 17 May 2024 23:04:44 +0200 Subject: [PATCH 02/10] difflib.py: grammar --- Lib/difflib.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/difflib.py b/Lib/difflib.py index b5563dd9480cdb..72ff67ce89e917 100644 --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -917,13 +917,13 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi): max_len_a = max(map(len, a)) max_len_b = max(map(len, b)) - # twice epsilon can be safely added to distinguish equal ratios - # given ratios are 2 * integer / (len_i + len_j) + # twice the epsilon can be safely added to distinguish otherwise + # equal ratios. given ratios are 2 * integer / (len_i + len_j) # the smallest possible non-zero difference between two arbitrary # ratios is no less than twice the epsilon epsilon = 0.99 / (max_len_a + max_len_b) ** 2 - # we use sub-epsilon weights to promote otherwise equal ratios - # that split sequences closer to their midpoints + # we use sub-epsilon weights to promote i, j that split the + # input range more equally. # this way, we balance the recursion tree in degenerate cases # search for the pair that matches best without being identical From 81b7dae4846386ba663ce9207bbb9f9e3d5b067a Mon Sep 17 00:00:00 2001 From: pulkin Date: Sat, 18 May 2024 10:34:54 +0200 Subject: [PATCH 03/10] difflib.py: weights-free implementation --- Lib/difflib.py | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/Lib/difflib.py b/Lib/difflib.py index 72ff67ce89e917..8d6abceed41b82 100644 --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -911,28 +911,26 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi): # don't synch up unless the lines have a similarity score of at # least cutoff; best_ratio tracks the best score seen so far - best_ratio, cutoff = 0.74, 0.75 + best_ratio, cutoff = (0.74, 0), 0.75 cruncher = SequenceMatcher(self.charjunk) eqi, eqj = None, None # 1st indices of equal lines (if any) - max_len_a = max(map(len, a)) - max_len_b = max(map(len, b)) - # twice the epsilon can be safely added to distinguish otherwise - # equal ratios. given ratios are 2 * integer / (len_i + len_j) - # the smallest possible non-zero difference between two arbitrary - # ratios is no less than twice the epsilon - epsilon = 0.99 / (max_len_a + max_len_b) ** 2 - # we use sub-epsilon weights to promote i, j that split the - # input range more equally. - # this way, we balance the recursion tree in degenerate cases - # search for the pair that matches best without being identical # (identical lines must be junk lines, & we don't want to synch up # on junk -- unless we have to) + alen = alo + ahi - 1 + blen = blo + bhi - 1 + # weight is used to balance the recursion by prioritizing + # i and j in the middle of their ranges + weight = 0 for j in range(blo, bhi): bj = b[j] cruncher.set_seq2(bj) - weight_b = min(j - blo, bhi - 1 - j) / (bhi - blo) + if j < blen / 2: + weight += alen + elif j > blen / 2: + weight -= alen + weight = min(j - blo, bhi - 1 - j) * (ahi - alo - 1) for i in range(alo, ahi): ai = a[i] if ai == bj: @@ -940,18 +938,22 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi): eqi, eqj = i, j continue cruncher.set_seq1(ai) - weight_ab = weight_b + min(i - alo, ahi - 1 - i) / (ahi - alo) - weight_ab *= epsilon + if i < alen / 2: + weight += blen + elif i > alen / 2: + weight -= blen # computing similarity is expensive, so use the quick # upper bounds first -- have seen this speed up messy # compares by a factor of 3. # note that ratio() is only expensive to compute the first # time it's called on a sequence pair; the expensive part # of the computation is cached by cruncher - if cruncher.real_quick_ratio() + weight_ab > best_ratio and \ - cruncher.quick_ratio() + weight_ab > best_ratio and \ - cruncher.ratio() + weight_ab > best_ratio: - best_ratio, best_i, best_j = cruncher.ratio() + weight_ab, i, j + if (cruncher.real_quick_ratio(), weight) > best_ratio and \ + (cruncher.quick_ratio(), weight) > best_ratio and \ + (cruncher.ratio(), weight) > best_ratio: + best_ratio, best_i, best_j = (cruncher.ratio(), weight), i, j + # assert weight == 0, weight + best_ratio, _ = best_ratio if best_ratio < cutoff: # no non-identical "pretty close" pair if eqi is None: From 2184911512a97be851bd12fdaa71c5327a82330b Mon Sep 17 00:00:00 2001 From: pulkin Date: Sat, 18 May 2024 10:36:05 +0200 Subject: [PATCH 04/10] difflib.py: rm left-over --- Lib/difflib.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Lib/difflib.py b/Lib/difflib.py index 8d6abceed41b82..4b90660a4aa244 100644 --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -930,7 +930,6 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi): weight += alen elif j > blen / 2: weight -= alen - weight = min(j - blo, bhi - 1 - j) * (ahi - alo - 1) for i in range(alo, ahi): ai = a[i] if ai == bj: From f9e3480e42654cf2ba09c6fd2ccb1385347604e7 Mon Sep 17 00:00:00 2001 From: pulkin Date: Sun, 19 May 2024 14:18:34 +0200 Subject: [PATCH 05/10] difflib.py: even simpler impl --- Lib/difflib.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/Lib/difflib.py b/Lib/difflib.py index 4b90660a4aa244..3536000a455f30 100644 --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -918,18 +918,12 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi): # search for the pair that matches best without being identical # (identical lines must be junk lines, & we don't want to synch up # on junk -- unless we have to) - alen = alo + ahi - 1 - blen = blo + bhi - 1 - # weight is used to balance the recursion by prioritizing - # i and j in the middle of their ranges - weight = 0 + amid = (alo + ahi - 1) / 2 + bmid = (blo + bhi - 1) / 2 for j in range(blo, bhi): bj = b[j] cruncher.set_seq2(bj) - if j < blen / 2: - weight += alen - elif j > blen / 2: - weight -= alen + weight_j = - abs(j - bmid) for i in range(alo, ahi): ai = a[i] if ai == bj: @@ -937,10 +931,9 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi): eqi, eqj = i, j continue cruncher.set_seq1(ai) - if i < alen / 2: - weight += blen - elif i > alen / 2: - weight -= blen + # weight is used to balance the recursion by prioritizing + # i and j in the middle of their ranges + weight = weight_j - abs(i - amid) # computing similarity is expensive, so use the quick # upper bounds first -- have seen this speed up messy # compares by a factor of 3. @@ -951,7 +944,6 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi): (cruncher.quick_ratio(), weight) > best_ratio and \ (cruncher.ratio(), weight) > best_ratio: best_ratio, best_i, best_j = (cruncher.ratio(), weight), i, j - # assert weight == 0, weight best_ratio, _ = best_ratio if best_ratio < cutoff: # no non-identical "pretty close" pair From b0be97f1cb2f8cf616a458dc2aafc23d09f10522 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Sun, 19 May 2024 12:25:36 +0000 Subject: [PATCH 06/10] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?= =?UTF-8?q?lurb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst diff --git a/Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst b/Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst new file mode 100644 index 00000000000000..30b71eaef53194 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst @@ -0,0 +1 @@ +`difflib.Differ` is faster for some class of diffs From bea603e09c8e4ca483145e9ff77cfef1bee8db0e Mon Sep 17 00:00:00 2001 From: pulkin Date: Sun, 19 May 2024 14:27:34 +0200 Subject: [PATCH 07/10] difflib.py: add a desc Co-authored-by: Tim Peters --- Lib/difflib.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Lib/difflib.py b/Lib/difflib.py index 3536000a455f30..54ca33d5615f8d 100644 --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -911,6 +911,13 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi): # don't synch up unless the lines have a similarity score of at # least cutoff; best_ratio tracks the best score seen so far + # best_ratio is a tuple storing the best .ratio() seen so far, and + # a measure of how far the indices are from their index range + # midpoints. The latter is used to resolve ratio ties. Favoring + # indices near the midpoints tends to cut the ranges in half. Else, + # if there are many pairs with the best ratio, recursion can grow + # very deep, and runtime becomes cubic. See: + # https://github.com/python/cpython/issues/119105 best_ratio, cutoff = (0.74, 0), 0.75 cruncher = SequenceMatcher(self.charjunk) eqi, eqj = None, None # 1st indices of equal lines (if any) From df83f505d09fde507b7b6cbe39395b84931891cc Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Sun, 19 May 2024 12:42:52 +0000 Subject: [PATCH 08/10] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?= =?UTF-8?q?lurb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../next/Library/2024-05-19-12-42-51.gh-issue-119105.N2nNLm.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2024-05-19-12-42-51.gh-issue-119105.N2nNLm.rst diff --git a/Misc/NEWS.d/next/Library/2024-05-19-12-42-51.gh-issue-119105.N2nNLm.rst b/Misc/NEWS.d/next/Library/2024-05-19-12-42-51.gh-issue-119105.N2nNLm.rst new file mode 100644 index 00000000000000..2d973a524bb2d0 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-05-19-12-42-51.gh-issue-119105.N2nNLm.rst @@ -0,0 +1 @@ +difflib.Differ is faster for some class of diffs From 0953ede1b68fb62dd9e079ee7c09a2642e90b912 Mon Sep 17 00:00:00 2001 From: Artem Pulkin Date: Sun, 19 May 2024 14:51:44 +0200 Subject: [PATCH 09/10] news: attempt fix build --- .../next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst | 2 +- .../next/Library/2024-05-19-12-42-51.gh-issue-119105.N2nNLm.rst | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) delete mode 100644 Misc/NEWS.d/next/Library/2024-05-19-12-42-51.gh-issue-119105.N2nNLm.rst diff --git a/Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst b/Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst index 30b71eaef53194..b9729e6cb38c57 100644 --- a/Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst +++ b/Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst @@ -1 +1 @@ -`difflib.Differ` is faster for some class of diffs +``difflib.Differ`` is faster for some class of diffs diff --git a/Misc/NEWS.d/next/Library/2024-05-19-12-42-51.gh-issue-119105.N2nNLm.rst b/Misc/NEWS.d/next/Library/2024-05-19-12-42-51.gh-issue-119105.N2nNLm.rst deleted file mode 100644 index 2d973a524bb2d0..00000000000000 --- a/Misc/NEWS.d/next/Library/2024-05-19-12-42-51.gh-issue-119105.N2nNLm.rst +++ /dev/null @@ -1 +0,0 @@ -difflib.Differ is faster for some class of diffs From 7237b48dc3416279285a9cf57833aabc10bf3221 Mon Sep 17 00:00:00 2001 From: pulkin Date: Sun, 19 May 2024 19:07:35 +0200 Subject: [PATCH 10/10] difflib.py: NEWS message Co-authored-by: Tim Peters --- .../next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst b/Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst index b9729e6cb38c57..30b5f97b8059f9 100644 --- a/Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst +++ b/Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst @@ -1 +1 @@ -``difflib.Differ`` is faster for some class of diffs +``difflib.Differ`` is much faster for some cases of diffs where many pairs of lines are equally similar.