HypothesisWorks
diff --git a/‎hypothesis-python/RELEASE.rst
Lines changed: 3 additions & 0 deletions b/‎hypothesis-python/RELEASE.rst
Lines changed: 3 additions & 0 deletions
diff --git a/‎hypothesis-python/src/hypothesis/internal/conjecture/datatree.py
Lines changed: 35 additions & 29 deletions b/‎hypothesis-python/src/hypothesis/internal/conjecture/datatree.py
Lines changed: 35 additions & 29 deletions
diff --git a/‎hypothesis-python/tests/conjecture/test_choice.py
Lines changed: 6 additions & 2 deletions b/‎hypothesis-python/tests/conjecture/test_choice.py
Lines changed: 6 additions & 2 deletions
@@ -0,0 +1,3 @@
+RELEASE_TYPE: patch
+
+Improve performance of an internal method to count possible choices.
@@ -158,31 +158,47 @@ def _repr_pretty_(self, p: "RepresentationPrinter", cycle: bool) -> None:
 # The one case where this may be detrimental is fuzzing, where the throughput of
 # examples is so high that it really may saturate important nodes. We'll cross
 # that bridge when we come to it.
-MAX_CHILDREN_EFFECTIVELY_INFINITE: Final[int] = 100_000
+MAX_CHILDREN_EFFECTIVELY_INFINITE: Final[int] = 10_000_000
 
 
 def _count_distinct_strings(*, alphabet_size: int, min_size: int, max_size: int) -> int:
     # We want to estimate if we're going to have more children than
     # MAX_CHILDREN_EFFECTIVELY_INFINITE, without computing a potentially
-    # extremely expensive pow. We'll check if the number of strings in
-    # the largest string size alone is enough to put us over this limit.
-    # We'll also employ a trick of estimating against log, which is cheaper
-    # than computing a pow.
-    #
-    # x = max_size
-    # y = alphabet_size
-    # n = MAX_CHILDREN_EFFECTIVELY_INFINITE
-    #
-    #     x**y > n
-    # <=> log(x**y)  > log(n)
-    # <=> y * log(x) > log(n)
-    definitely_too_large = max_size * math.log(alphabet_size) > math.log(
-        MAX_CHILDREN_EFFECTIVELY_INFINITE
-    )
-    if definitely_too_large:
-        return MAX_CHILDREN_EFFECTIVELY_INFINITE
+    # extremely expensive pow. We'll check the two extreme cases - if the
+    # number of strings in the largest string size alone is enough to put us
+    # over this limit (at alphabet_size >= 2), and if the variation in sizes
+    # (at alphabet_size == 1) is enough. If neither result in an early return,
+    # the exact result should be reasonably cheap to compute.
+    if alphabet_size == 0:
+        # Special-case the empty string, avoid error in math.log(0).
+        return 1
+    elif alphabet_size == 1:
+        # Special-case the constant alphabet, invalid in the geom-series sum.
+        return max_size - min_size + 1
+    else:
+        # Estimate against log, which is cheaper than computing a pow.
+        #
+        #   m = max_size
+        #   a = alphabet_size
+        #   N = MAX_CHILDREN_EFFECTIVELY_INFINITE
+        #
+        #           a**m > N
+        # <=> m * log(a) > log(N)
+        log_max_sized_children = max_size * math.log(alphabet_size)
+        if log_max_sized_children > math.log(MAX_CHILDREN_EFFECTIVELY_INFINITE):
+            return MAX_CHILDREN_EFFECTIVELY_INFINITE
 
-    return sum(alphabet_size**k for k in range(min_size, max_size + 1))
+    # The sum of a geometric series is given by (ref: wikipedia):
+    #     ᵐ∑ₖ₌₀ aᵏ = (aᵐ⁺¹ - 1) / (a - 1)
+    #               = S(m) / S(0)
+    # assuming a != 1 and using the definition
+    #         S(m) := aᵐ⁺¹ - 1.
+    # The sum we want, starting from a number n [0 <= n <= m] rather than zero, is
+    #     ᵐ∑ₖ₌ₙ aᵏ = ᵐ∑ₖ₌₀ aᵏ - ⁿ⁻¹∑ₖ₌₀ aᵏ = S(m) / S(0) - S(n - 1) / S(0)
+    def S(n):
+        return alphabet_size ** (n + 1) - 1
+
+    return (S(max_size) - S(min_size - 1)) // S(0)
 
 
 def compute_max_children(
@@ -226,16 +242,6 @@ def compute_max_children(
         max_size = constraints["max_size"]
         intervals = constraints["intervals"]
 
-        if len(intervals) == 0:
-            # Special-case the empty alphabet to avoid an error in math.log(0).
-            # Only possibility is the empty string.
-            return 1
-
-        # avoid math.log(1) == 0 and incorrectly failing our effectively_infinite
-        # estimate, even when we definitely are too large.
-        if len(intervals) == 1 and max_size > MAX_CHILDREN_EFFECTIVELY_INFINITE:
-            return MAX_CHILDREN_EFFECTIVELY_INFINITE
-
         return _count_distinct_strings(
             alphabet_size=len(intervals), min_size=min_size, max_size=max_size
         )
 
@@ -110,7 +110,7 @@ def test_compute_max_children_is_positive(choice_type_and_constraints):
                 "max_size": COLLECTION_DEFAULT_MAX_SIZE,
                 "intervals": IntervalSet.from_string("a"),
             },
-            MAX_CHILDREN_EFFECTIVELY_INFINITE,
+            COLLECTION_DEFAULT_MAX_SIZE + 1,
         ),
         (
             "string",
@@ -169,7 +169,11 @@ def test_compute_max_children_is_positive(choice_type_and_constraints):
     ],
 )
 def test_compute_max_children(choice_type, constraints, count_children):
-    assert compute_max_children(choice_type, constraints) == count_children
+    candidates = [count_children]
+    if count_children > MAX_CHILDREN_EFFECTIVELY_INFINITE:
+        # Acceptable to return either the exact value or the estimate
+        candidates.append(MAX_CHILDREN_EFFECTIVELY_INFINITE)
+    assert compute_max_children(choice_type, constraints) in candidates
 
 
 @given(st.text(min_size=1, max_size=1), st.integers(0, 100))
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+RELEASE_TYPE: patch`
	`2`	`+`
	`3`	`+Improve performance of an internal method to count possible choices.`