From 9bb454c80efdfd493257c67e71cafd76cf048ec9 Mon Sep 17 00:00:00 2001
From: pulkin <gpulkin@gmail.com>
Date: Fri, 17 May 2024 22:59:32 +0200
Subject: [PATCH 01/10] difflib.py: fix recursion branching for degenerate
 cases

---
 Lib/difflib.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/Lib/difflib.py b/Lib/difflib.py
index ba0b256969ebff..b5563dd9480cdb 100644
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@@ -915,12 +915,24 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
         cruncher = SequenceMatcher(self.charjunk)
         eqi, eqj = None, None   # 1st indices of equal lines (if any)
 
+        max_len_a = max(map(len, a))
+        max_len_b = max(map(len, b))
+        # twice epsilon can be safely added to distinguish equal ratios
+        # given ratios are 2 * integer / (len_i + len_j)
+        # the smallest possible non-zero difference between two arbitrary
+        # ratios is no less than twice the epsilon
+        epsilon = 0.99 / (max_len_a + max_len_b) ** 2
+        # we use sub-epsilon weights to promote otherwise equal ratios
+        # that split sequences closer to their midpoints
+        # this way, we balance the recursion tree in degenerate cases
+
         # search for the pair that matches best without being identical
         # (identical lines must be junk lines, & we don't want to synch up
         # on junk -- unless we have to)
         for j in range(blo, bhi):
             bj = b[j]
             cruncher.set_seq2(bj)
+            weight_b = min(j - blo, bhi - 1 - j) / (bhi - blo)
             for i in range(alo, ahi):
                 ai = a[i]
                 if ai == bj:
@@ -928,16 +940,18 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
                         eqi, eqj = i, j
                     continue
                 cruncher.set_seq1(ai)
+                weight_ab = weight_b + min(i - alo, ahi - 1 - i) / (ahi - alo)
+                weight_ab *= epsilon
                 # computing similarity is expensive, so use the quick
                 # upper bounds first -- have seen this speed up messy
                 # compares by a factor of 3.
                 # note that ratio() is only expensive to compute the first
                 # time it's called on a sequence pair; the expensive part
                 # of the computation is cached by cruncher
-                if cruncher.real_quick_ratio() > best_ratio and \
-                      cruncher.quick_ratio() > best_ratio and \
-                      cruncher.ratio() > best_ratio:
-                    best_ratio, best_i, best_j = cruncher.ratio(), i, j
+                if cruncher.real_quick_ratio() + weight_ab > best_ratio and \
+                      cruncher.quick_ratio() + weight_ab > best_ratio and \
+                      cruncher.ratio() + weight_ab > best_ratio:
+                    best_ratio, best_i, best_j = cruncher.ratio() + weight_ab, i, j
         if best_ratio < cutoff:
             # no non-identical "pretty close" pair
             if eqi is None:

From a159954afc78d44ba18a172eb845fdc84cc894e1 Mon Sep 17 00:00:00 2001
From: pulkin <gpulkin@gmail.com>
Date: Fri, 17 May 2024 23:04:44 +0200
Subject: [PATCH 02/10] difflib.py: grammar

---
 Lib/difflib.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Lib/difflib.py b/Lib/difflib.py
index b5563dd9480cdb..72ff67ce89e917 100644
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@@ -917,13 +917,13 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
 
         max_len_a = max(map(len, a))
         max_len_b = max(map(len, b))
-        # twice epsilon can be safely added to distinguish equal ratios
-        # given ratios are 2 * integer / (len_i + len_j)
+        # twice the epsilon can be safely added to distinguish otherwise
+        # equal ratios. given ratios are 2 * integer / (len_i + len_j)
         # the smallest possible non-zero difference between two arbitrary
         # ratios is no less than twice the epsilon
         epsilon = 0.99 / (max_len_a + max_len_b) ** 2
-        # we use sub-epsilon weights to promote otherwise equal ratios
-        # that split sequences closer to their midpoints
+        # we use sub-epsilon weights to promote i, j that split the
+        # input range more equally.
         # this way, we balance the recursion tree in degenerate cases
 
         # search for the pair that matches best without being identical

From 81b7dae4846386ba663ce9207bbb9f9e3d5b067a Mon Sep 17 00:00:00 2001
From: pulkin <gpulkin@gmail.com>
Date: Sat, 18 May 2024 10:34:54 +0200
Subject: [PATCH 03/10] difflib.py: weights-free implementation

---
 Lib/difflib.py | 40 +++++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/Lib/difflib.py b/Lib/difflib.py
index 72ff67ce89e917..8d6abceed41b82 100644
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@@ -911,28 +911,26 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
 
         # don't synch up unless the lines have a similarity score of at
         # least cutoff; best_ratio tracks the best score seen so far
-        best_ratio, cutoff = 0.74, 0.75
+        best_ratio, cutoff = (0.74, 0), 0.75
         cruncher = SequenceMatcher(self.charjunk)
         eqi, eqj = None, None   # 1st indices of equal lines (if any)
 
-        max_len_a = max(map(len, a))
-        max_len_b = max(map(len, b))
-        # twice the epsilon can be safely added to distinguish otherwise
-        # equal ratios. given ratios are 2 * integer / (len_i + len_j)
-        # the smallest possible non-zero difference between two arbitrary
-        # ratios is no less than twice the epsilon
-        epsilon = 0.99 / (max_len_a + max_len_b) ** 2
-        # we use sub-epsilon weights to promote i, j that split the
-        # input range more equally.
-        # this way, we balance the recursion tree in degenerate cases
-
         # search for the pair that matches best without being identical
         # (identical lines must be junk lines, & we don't want to synch up
         # on junk -- unless we have to)
+        alen = alo + ahi - 1
+        blen = blo + bhi - 1
+        # weight is used to balance the recursion by prioritizing
+        # i and j in the middle of their ranges
+        weight = 0
         for j in range(blo, bhi):
             bj = b[j]
             cruncher.set_seq2(bj)
-            weight_b = min(j - blo, bhi - 1 - j) / (bhi - blo)
+            if j < blen / 2:
+                weight += alen
+            elif j > blen / 2:
+                weight -= alen
+            weight = min(j - blo, bhi - 1 - j) * (ahi - alo - 1)
             for i in range(alo, ahi):
                 ai = a[i]
                 if ai == bj:
@@ -940,18 +938,22 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
                         eqi, eqj = i, j
                     continue
                 cruncher.set_seq1(ai)
-                weight_ab = weight_b + min(i - alo, ahi - 1 - i) / (ahi - alo)
-                weight_ab *= epsilon
+                if i < alen / 2:
+                    weight += blen
+                elif i > alen / 2:
+                    weight -= blen
                 # computing similarity is expensive, so use the quick
                 # upper bounds first -- have seen this speed up messy
                 # compares by a factor of 3.
                 # note that ratio() is only expensive to compute the first
                 # time it's called on a sequence pair; the expensive part
                 # of the computation is cached by cruncher
-                if cruncher.real_quick_ratio() + weight_ab > best_ratio and \
-                      cruncher.quick_ratio() + weight_ab > best_ratio and \
-                      cruncher.ratio() + weight_ab > best_ratio:
-                    best_ratio, best_i, best_j = cruncher.ratio() + weight_ab, i, j
+                if (cruncher.real_quick_ratio(), weight) > best_ratio and \
+                      (cruncher.quick_ratio(), weight) > best_ratio and \
+                      (cruncher.ratio(), weight) > best_ratio:
+                    best_ratio, best_i, best_j = (cruncher.ratio(), weight), i, j
+        # assert weight == 0, weight
+        best_ratio, _ = best_ratio
         if best_ratio < cutoff:
             # no non-identical "pretty close" pair
             if eqi is None:

From 2184911512a97be851bd12fdaa71c5327a82330b Mon Sep 17 00:00:00 2001
From: pulkin <gpulkin@gmail.com>
Date: Sat, 18 May 2024 10:36:05 +0200
Subject: [PATCH 04/10] difflib.py: rm left-over

---
 Lib/difflib.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Lib/difflib.py b/Lib/difflib.py
index 8d6abceed41b82..4b90660a4aa244 100644
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@@ -930,7 +930,6 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
                 weight += alen
             elif j > blen / 2:
                 weight -= alen
-            weight = min(j - blo, bhi - 1 - j) * (ahi - alo - 1)
             for i in range(alo, ahi):
                 ai = a[i]
                 if ai == bj:

From f9e3480e42654cf2ba09c6fd2ccb1385347604e7 Mon Sep 17 00:00:00 2001
From: pulkin <gpulkin@gmail.com>
Date: Sun, 19 May 2024 14:18:34 +0200
Subject: [PATCH 05/10] difflib.py: even simpler impl

---
 Lib/difflib.py | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/Lib/difflib.py b/Lib/difflib.py
index 4b90660a4aa244..3536000a455f30 100644
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@@ -918,18 +918,12 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
         # search for the pair that matches best without being identical
         # (identical lines must be junk lines, & we don't want to synch up
         # on junk -- unless we have to)
-        alen = alo + ahi - 1
-        blen = blo + bhi - 1
-        # weight is used to balance the recursion by prioritizing
-        # i and j in the middle of their ranges
-        weight = 0
+        amid = (alo + ahi - 1) / 2
+        bmid = (blo + bhi - 1) / 2
         for j in range(blo, bhi):
             bj = b[j]
             cruncher.set_seq2(bj)
-            if j < blen / 2:
-                weight += alen
-            elif j > blen / 2:
-                weight -= alen
+            weight_j = - abs(j - bmid)
             for i in range(alo, ahi):
                 ai = a[i]
                 if ai == bj:
@@ -937,10 +931,9 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
                         eqi, eqj = i, j
                     continue
                 cruncher.set_seq1(ai)
-                if i < alen / 2:
-                    weight += blen
-                elif i > alen / 2:
-                    weight -= blen
+                # weight is used to balance the recursion by prioritizing
+                # i and j in the middle of their ranges
+                weight = weight_j - abs(i - amid)
                 # computing similarity is expensive, so use the quick
                 # upper bounds first -- have seen this speed up messy
                 # compares by a factor of 3.
@@ -951,7 +944,6 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
                       (cruncher.quick_ratio(), weight) > best_ratio and \
                       (cruncher.ratio(), weight) > best_ratio:
                     best_ratio, best_i, best_j = (cruncher.ratio(), weight), i, j
-        # assert weight == 0, weight
         best_ratio, _ = best_ratio
         if best_ratio < cutoff:
             # no non-identical "pretty close" pair

From b0be97f1cb2f8cf616a458dc2aafc23d09f10522 Mon Sep 17 00:00:00 2001
From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com>
Date: Sun, 19 May 2024 12:25:36 +0000
Subject: [PATCH 06/10] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?=
 =?UTF-8?q?lurb=5Fit.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst  | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst

diff --git a/Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst b/Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst
new file mode 100644
index 00000000000000..30b71eaef53194
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst
@@ -0,0 +1 @@
+`difflib.Differ` is faster for some class of diffs

From bea603e09c8e4ca483145e9ff77cfef1bee8db0e Mon Sep 17 00:00:00 2001
From: pulkin <gpulkin@gmail.com>
Date: Sun, 19 May 2024 14:27:34 +0200
Subject: [PATCH 07/10] difflib.py: add a desc

Co-authored-by: Tim Peters <tim.peters@gmail.com>
---
 Lib/difflib.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/Lib/difflib.py b/Lib/difflib.py
index 3536000a455f30..54ca33d5615f8d 100644
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@@ -911,6 +911,13 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
 
         # don't synch up unless the lines have a similarity score of at
         # least cutoff; best_ratio tracks the best score seen so far
+        # best_ratio is a tuple storing the best .ratio() seen so far, and
+        # the negated total distance of the indices from their index range
+        # midpoints (larger is closer). The latter resolves ties. Favoring
+        # indices near the midpoints tends to cut the ranges in half. Else,
+        # if there are many pairs with the best ratio, recursion can grow
+        # very deep, and runtime becomes cubic. See:
+        # https://github.com/python/cpython/issues/119105
         best_ratio, cutoff = (0.74, 0), 0.75
         cruncher = SequenceMatcher(self.charjunk)
         eqi, eqj = None, None   # 1st indices of equal lines (if any)

From df83f505d09fde507b7b6cbe39395b84931891cc Mon Sep 17 00:00:00 2001
From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com>
Date: Sun, 19 May 2024 12:42:52 +0000
Subject: [PATCH 08/10] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?=
 =?UTF-8?q?lurb=5Fit.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../next/Library/2024-05-19-12-42-51.gh-issue-119105.N2nNLm.rst  | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 Misc/NEWS.d/next/Library/2024-05-19-12-42-51.gh-issue-119105.N2nNLm.rst

diff --git a/Misc/NEWS.d/next/Library/2024-05-19-12-42-51.gh-issue-119105.N2nNLm.rst b/Misc/NEWS.d/next/Library/2024-05-19-12-42-51.gh-issue-119105.N2nNLm.rst
new file mode 100644
index 00000000000000..2d973a524bb2d0
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-05-19-12-42-51.gh-issue-119105.N2nNLm.rst
@@ -0,0 +1 @@
+difflib.Differ is faster for some class of diffs

From 0953ede1b68fb62dd9e079ee7c09a2642e90b912 Mon Sep 17 00:00:00 2001
From: Artem Pulkin <gpulkin@gmail.com>
Date: Sun, 19 May 2024 14:51:44 +0200
Subject: [PATCH 09/10] news: attempt fix build

---
 .../next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst | 2 +-
 .../next/Library/2024-05-19-12-42-51.gh-issue-119105.N2nNLm.rst | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)
 delete mode 100644 Misc/NEWS.d/next/Library/2024-05-19-12-42-51.gh-issue-119105.N2nNLm.rst

diff --git a/Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst b/Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst
index 30b71eaef53194..b9729e6cb38c57 100644
--- a/Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst
+++ b/Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst
@@ -1 +1 @@
-`difflib.Differ` is faster for some class of diffs
+``difflib.Differ`` is faster for some class of diffs
diff --git a/Misc/NEWS.d/next/Library/2024-05-19-12-42-51.gh-issue-119105.N2nNLm.rst b/Misc/NEWS.d/next/Library/2024-05-19-12-42-51.gh-issue-119105.N2nNLm.rst
deleted file mode 100644
index 2d973a524bb2d0..00000000000000
--- a/Misc/NEWS.d/next/Library/2024-05-19-12-42-51.gh-issue-119105.N2nNLm.rst
+++ /dev/null
@@ -1 +0,0 @@
-difflib.Differ is faster for some class of diffs

From 7237b48dc3416279285a9cf57833aabc10bf3221 Mon Sep 17 00:00:00 2001
From: pulkin <gpulkin@gmail.com>
Date: Sun, 19 May 2024 19:07:35 +0200
Subject: [PATCH 10/10] difflib.py: NEWS message

Co-authored-by: Tim Peters <tim.peters@gmail.com>
---
 .../next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst b/Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst
index b9729e6cb38c57..30b5f97b8059f9 100644
--- a/Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst
+++ b/Misc/NEWS.d/next/Library/2024-05-19-12-25-36.gh-issue-119105.VcR4ig.rst
@@ -1 +1 @@
-``difflib.Differ`` is faster for some class of diffs
+``difflib.Differ`` is much faster for some cases of diffs where many pairs of lines are equally similar.