Fix underflow issues due to float precision in TSNE (#19472) · scikit-learn/scikit-learn@e9c6fca · GitHub
[go: up one dir, main page]

Skip to content

Commit e9c6fca

Browse files
authored
Fix underflow issues due to float precision in TSNE (#19472)
1 parent 6a6217f commit e9c6fca

File tree

3 files changed

+30
-10
lines changed

3 files changed

+30
-10
lines changed

doc/whats_new/v1.0.rst

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ parameters, may produce different models from the previous version. This often
2222
occurs due to changes in the modelling logic (bug fixes or enhancements), or in
2323
random sampling procedures.
2424

25-
25+
- |Fix| :class:`manifold.TSNE` now avoids numerical underflow issues during
26+
affinity matrix computation.
2627

2728
Details are listed in the changelog below.
2829

@@ -134,6 +135,13 @@ Changelog
134135
:pr:`17743` by :user:`Maria Telenczuk <maikia>` and
135136
:user:`Alexandre Gramfort <agramfort>`.
136137

138+
:mod:`sklearn.manifold`
139+
.......................
140+
141+
- |Fix| Change numerical precision to prevent underflow issues
142+
during affinity matrix computation for :class:`manifold.TSNE`.
143+
:pr:`19472` by :user:`Dmitry Kobak <dkobak>`.
144+
137145
:mod:`sklearn.metrics`
138146
......................
139147

sklearn/manifold/_utils.pyx

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -51,18 +51,18 @@ cpdef np.ndarray[np.float32_t, ndim=2] _binary_search_perplexity(
5151
cdef long n_neighbors = sqdistances.shape[1]
5252
cdef int using_neighbors = n_neighbors < n_samples
5353
# Precisions of conditional Gaussian distributions
54-
cdef float beta
55-
cdef float beta_min
56-
cdef float beta_max
57-
cdef float beta_sum = 0.0
54+
cdef double beta
55+
cdef double beta_min
56+
cdef double beta_max
57+
cdef double beta_sum = 0.0
5858

5959
# Use log scale
60-
cdef float desired_entropy = math.log(desired_perplexity)
61-
cdef float entropy_diff
60+
cdef double desired_entropy = math.log(desired_perplexity)
61+
cdef double entropy_diff
6262

63-
cdef float entropy
64-
cdef float sum_Pi
65-
cdef float sum_disti_Pi
63+
cdef double entropy
64+
cdef double sum_Pi
65+
cdef double sum_disti_Pi
6666
cdef long i, j, k, l
6767

6868
# This array is later used as a 32bit array. It has multiple intermediate

sklearn/manifold/tests/test_t_sne.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,18 @@ def test_binary_search():
116116
assert_almost_equal(mean_perplexity, desired_perplexity, decimal=3)
117117

118118

119+
def test_binary_search_underflow():
120+
# Test if the binary search finds Gaussians with desired perplexity.
121+
# A more challenging case than the one above, producing numeric
122+
# underflow in float precision (see issue #19471 and PR #19472).
123+
random_state = check_random_state(42)
124+
data = random_state.randn(1, 90).astype(np.float32) + 100
125+
desired_perplexity = 30.0
126+
P = _binary_search_perplexity(data, desired_perplexity, verbose=0)
127+
perplexity = 2 ** -np.nansum(P[0, 1:] * np.log2(P[0, 1:]))
128+
assert_almost_equal(perplexity, desired_perplexity, decimal=3)
129+
130+
119131
def test_binary_search_neighbors():
120132
# Binary perplexity search approximation.
121133
# Should be approximately equal to the slow method when we use

0 commit comments

Comments
 (0)
0