8000 Merge pull request #4389 from jobh/geom-series · HypothesisWorks/hypothesis@b4bd587 · GitHub
[go: up one dir, main page]

Skip to content

Commit b4bd587

Browse files
authored
Merge pull request #4389 from jobh/geom-series
Improve performance of string max-children count
2 parents 2483531 + 7759770 commit b4bd587

File tree

3 files changed

+44
-31
lines changed

3 files changed

+44
-31
lines changed

hypothesis-python/RELEASE.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
RELEASE_TYPE: patch
2+
3+
Improve performance of an internal method to count possible choices.

hypothesis-python/src/hypothesis/internal/conjecture/datatree.py

Lines changed: 35 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -158,31 +158,47 @@ def _repr_pretty_(self, p: "RepresentationPrinter", cycle: bool) -> None:
158158
# The one case where this may be detrimental is fuzzing, where the throughput of
159159
# examples is so high that it really may saturate important nodes. We'll cross
160160
# that bridge when we come to it.
161-
MAX_CHILDREN_EFFECTIVELY_INFINITE: Final[int] = 100_000
161+
MAX_CHILDREN_EFFECTIVELY_INFINITE: Final[int] = 10_000_000
162162

163163

164164
def _count_distinct_strings(*, alphabet_size: int, min_size: int, max_size: int) -> int:
165165
# We want to estimate if we're going to have more children than
166166
# MAX_CHILDREN_EFFECTIVELY_INFINITE, without computing a potentially
167-
# extremely expensive pow. We'll check if the number of strings in
168-
# the largest string size alone is enough to put us over this limit.
169-
# We'll also employ a trick of estimating against log, which is cheaper
170-
# than computing a pow.
171-
#
172-
# x = max_size
173-
# y = alphabet_size
174-
# n = MAX_CHILDREN_EFFECTIVELY_INFINITE
175-
#
176-
# x**y > n
177-
# <=> log(x**y) > log(n)
178-
# <=> y * log(x) > log(n)
179-
definitely_too_large = max_size * math.log(alphabet_size) > math.log(
180-
MAX_CHILDREN_EFFECTIVELY_INFINITE
181-
)
182-
if definitely_too_large:
183-
return MAX_CHILDREN_EFFECTIVELY_INFINITE
167+
# extremely expensive pow. We'll check the two extreme cases - if the
168+
# number of strings in the largest string size alone is enough to put us
169+
# over this limit (at alphabet_size >= 2), and if the variation in sizes
170+
# (at alphabet_size == 1) is enough. If neither result in an early return,
171+
# the exact result should be reasonably cheap to compute.
172+
if alphabet_size == 0:
173+
# Special-case the empty string, avoid error in math.log(0).
174+
return 1
175+
elif alphabet_size == 1:
176+
# Special-case the constant alphabet, invalid in the geom-series sum.
177+
return max_size - min_size + 1
178+
else:
179+
# Estimate against log, which is cheaper than computing a pow.
180+
#
181+
# m = max_size
182+
# a = alphabet_size
183+
# N = MAX_CHILDREN_EFFECTIVELY_INFINITE
184+
#
185+
# a**m > N
186+
# <=> m * log(a) > log(N)
187+
log_max_sized_children = max_size * math.log(alphabet_size)
188+
if log_max_sized_children > math.log(MAX_CHILDREN_EFFECTIVELY_INFINITE):
189+
return MAX_CHILDREN_EFFECTIVELY_INFINITE
184190

185-
return sum(alphabet_size**k for k in range(min_size, max_size + 1))
191+
# The sum of a geometric series is given by (ref: wikipedia):
192+
# ᵐ∑ₖ₌₀ aᵏ = (aᵐ⁺¹ - 1) / (a - 1)
193+
# = S(m) / S(0)
194+
# assuming a != 1 and using the definition
195+
# S(m) := aᵐ⁺¹ - 1.
196+
# The sum we want, starting from a number n [0 <= n <= m] rather than zero, is
197+
# ᵐ∑ₖ₌ₙ aᵏ = ᵐ∑ₖ₌₀ aᵏ - ⁿ⁻¹∑ₖ₌₀ aᵏ = S(m) / S(0) - S(n - 1) / S(0)
198+
def S(n):
199+
return alphabet_size ** (n + 1) - 1
200+
201+
return (S(max_size) - S(min_size - 1)) // S(0)
186202

187203

188204
def compute_max_children(
@@ -226,16 +242,6 @@ def compute_max_children(
226242
max_size = constraints["max_size"]
227243
intervals = constraints["intervals"]
228244

229-
if len(intervals) == 0:
230-
# Special-case the empty alphabet to avoid an error in math.log(0).
231-
# Only possibility is the empty string.
232-
return 1
233-
234-
# avoid math.log(1) == 0 and incorrectly failing our effectively_infinite
235-
# estimate, even when we definitely are too large.
236-
if len(intervals) == 1 and max_size > MAX_CHILDREN_EFFECTIVELY_INFINITE:
237-
return MAX_CHILDREN_EFFECTIVELY_INFINITE
238-
239245
return _count_distinct_strings(
240246
alphabet_size=len(intervals), min_size=min_size, max_size=max_size
241247
)

hypothesis-python/tests/conjecture/test_choice.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ def test_compute_max_children_is_positive(choice_type_and_constraints):
110110
"max_size": COLLECTION_DEFAULT_MAX_SIZE,
111111
"intervals": IntervalSet.from_string("a"),
112112
},
113-
MAX_CHILDREN_EFFECTIVELY_INFINITE,
113+
COLLECTION_DEFAULT_MAX_SIZE + 1,
114114
),
115115
(
116116
"string",
@@ -169,7 +169,11 @@ def test_compute_max_children_is_positive(choice_type_and_constraints):
169169
],
170170
)
171171
def test_compute_max_children(choice_type, constraints, count_children):
172-
assert compute_max_children(choice_type, constraints) == count_children
172+
candidates = [count_children]
173+
if count_children > MAX_CHILDREN_EFFECTIVELY_INFINITE:
174+
# Acceptable to return either the exact value or the estimate
175+
candidates.append(MAX_CHILDREN_EFFECTIVELY_INFINITE)
176+
assert compute_max_children(choice_type, constraints) in candidates
173177

174178

175179
@given(st.text(min_size=1, max_size=1), st.integers(0, 100))

0 commit comments

Comments
 (0)
0