DOC Accelerate plot_johnson_lindenstrauss_bound.py example (#21795) · scikit-learn/scikit-learn@0bc9186 · GitHub

Commit 0bc9186

lisacsn committed
DOC Accelerate plot_johnson_lindenstrauss_bound.py example (#21795)
Co-authored-by: Adrin Jalali <adrin.jalali@gmail.com>
Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
1 parent eef0949 commit 0bc9186

examples/miscellaneous/plot_johnson_lindenstrauss_bound.py

Lines changed: 21 additions & 23 deletions
@@ -33,9 +33,9 @@
 # .. math::
 #    (1 - eps) \|u - v\|^2 < \|p(u) - p(v)\|^2 < (1 + eps) \|u - v\|^2
 #
-# Where u and v are any rows taken from a dataset of shape (n_samples,
-# n_features) and p is a projection by a random Gaussian N(0, 1) matrix
-# of shape (n_components, n_features) (or a sparse Achlioptas matrix).
+# Where `u` and `v` are any rows taken from a dataset of shape `(n_samples,
+# n_features)` and `p` is a projection by a random Gaussian `N(0, 1)` matrix
+# of shape `(n_components, n_features)` (or a sparse Achlioptas matrix).
 #
 # The minimum number of components to guarantees the eps-embedding is
 # given by:
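
The bound referenced in this hunk is the one evaluated by `johnson_lindenstrauss_min_dim`, which the example calls throughout. A minimal sketch of that helper, assuming scikit-learn is installed (the sample count and eps value are illustrative, not the example's):

import numpy as np
from sklearn.random_projection import johnson_lindenstrauss_min_dim

# Per the scikit-learn docs, the helper evaluates the closed-form bound
# n_components >= 4 * log(n_samples) / (eps**2 / 2 - eps**3 / 3).
n_samples, eps = 300, 0.5
print(johnson_lindenstrauss_min_dim(n_samples, eps=eps))
# Manual evaluation of the same bound (up to integer rounding):
print(4 * np.log(n_samples) / (eps**2 / 2 - eps**3 / 3))
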
@@ -60,7 +60,7 @@
     min_n_components = johnson_lindenstrauss_min_dim(n_samples_range, eps=eps)
     plt.loglog(n_samples_range, min_n_components, color=color)
 
-plt.legend(["eps = %0.1f" % eps for eps in eps_range], loc="lower right")
+plt.legend([f"eps = {eps:0.1f}" for eps in eps_range], loc="lower right")
 plt.xlabel("Number of observations to eps-embed")
 plt.ylabel("Minimum number of dimensions")
 plt.title("Johnson-Lindenstrauss bounds:\nn_samples vs n_components")
@@ -84,7 +84,7 @@
     min_n_components = johnson_lindenstrauss_min_dim(n_samples, eps=eps_range)
     plt.semilogy(eps_range, min_n_components, color=color)
 
-plt.legend(["n_samples = %d" % n for n in n_samples_range], loc="upper right")
+plt.legend([f"n_samples = {n}" for n in n_samples_range], loc="upper right")
 plt.xlabel("Distortion eps")
 plt.ylabel("Minimum number of dimensions")
 plt.title("Johnson-Lindenstrauss bounds:\nn_components vs eps")
@@ -97,12 +97,12 @@
 # We validate the above bounds on the 20 newsgroups text document
 # (TF-IDF word frequencies) dataset or on the digits dataset:
 #
-# - for the 20 newsgroups dataset some 500 documents with 100k
+# - for the 20 newsgroups dataset some 300 documents with 100k
 #   features in total are projected using a sparse random matrix to smaller
 #   euclidean spaces with various values for the target number of dimensions
 #   ``n_components``.
 #
-# - for the digits dataset, some 8x8 gray level pixels data for 500
+# - for the digits dataset, some 8x8 gray level pixels data for 300
 #   handwritten digits pictures are randomly projected to spaces for various
 #   larger number of dimensions ``n_components``.
 #
@@ -111,25 +111,25 @@
 # this script.
 
 if "--use-digits-dataset" in sys.argv:
-    data = load_digits().data[:500]
+    data = load_digits().data[:300]
 else:
-    data = fetch_20newsgroups_vectorized().data[:500]
+    data = fetch_20newsgroups_vectorized().data[:300]
 
 # %%
 # For each value of ``n_components``, we plot:
 #
 # - 2D distribution of sample pairs with pairwise distances in original
-#   and projected spaces as x and y axis respectively.
+#   and projected spaces as x- and y-axis respectively.
 #
 # - 1D histogram of the ratio of those distances (projected / original).
 
 n_samples, n_features = data.shape
 print(
-    "Embedding %d samples with dim %d using various random projections"
-    % (n_samples, n_features)
+    f"Embedding {n_samples} samples with dim {n_features} using various "
+    "random projections"
 )
 
-n_components_range = np.array([300, 1000, 10000])
+n_components_range = np.array([300, 1_000, 10_000])
 dists = euclidean_distances(data, squared=True).ravel()
 
 # select only non-identical samples pairs
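
The 500-to-300 sample reduction in this hunk is what accelerates the example. A minimal sketch of the cheaper digits branch and the distance computation it feeds, assuming scikit-learn is installed (no download needed, unlike the 20 newsgroups branch):

from sklearn.datasets import load_digits
from sklearn.metrics.pairwise import euclidean_distances

data = load_digits().data[:300]
print(data.shape)  # (300, 64): 8x8 gray-level images flattened to 64 features

# Squared pairwise distances flattened to 1D, as in the example:
dists = euclidean_distances(data, squared=True).ravel()
print(dists.shape)  # (90000,): 300 * 300 pairs, self-pairs included
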
@@ -141,13 +141,13 @@
     rp = SparseRandomProjection(n_components=n_components)
     projected_data = rp.fit_transform(data)
     print(
-        "Projected %d samples from %d to %d in %0.3fs"
-        % (n_samples, n_features, n_components, time() - t0)
+        f"Projected {n_samples} samples from {n_features} to {n_components} in "
+        f"{time() - t0:0.3f}s"
     )
     if hasattr(rp, "components_"):
         n_bytes = rp.components_.data.nbytes
         n_bytes += rp.components_.indices.nbytes
-        print("Random matrix with size: %0.3fMB" % (n_bytes / 1e6))
+        print(f"Random matrix with size: {n_bytes / 1e6:0.3f} MB")
 
     projected_dists = euclidean_distances(projected_data, squared=True).ravel()[nonzero]
 
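The memory accounting in this hunk relies on the fitted `components_` being a SciPy sparse matrix, whose payload lives in its `.data` and `.indices` arrays. A self-contained sketch of the same measurement, with illustrative shapes rather than the example's data:

import numpy as np
from sklearn.random_projection import SparseRandomProjection

rng = np.random.RandomState(42)
X = rng.rand(100, 5_000)  # illustrative stand-in for the real dataset

rp = SparseRandomProjection(n_components=300, random_state=42)
rp.fit(X)

# Sum the two dominant buffers of the sparse matrix; the small .indptr
# array is ignored, as in the example's own accounting.
n_bytes = rp.components_.data.nbytes + rp.components_.indices.nbytes
print(f"Random matrix with size: {n_bytes / 1e6:0.3f} MB")
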
@@ -168,7 +168,7 @@
     cb.set_label("Sample pairs counts")
 
     rates = projected_dists / dists
-    print("Mean distances rate: %0.2f (%0.2f)" % (np.mean(rates), np.std(rates)))
+    print(f"Mean distances rate: {np.mean(rates):.2f} ({np.std(rates):.2f})")
 
     plt.figure()
     plt.hist(rates, bins=50, range=(0.0, 2.0), edgecolor="k", density=True)
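
The statistic printed in this hunk is just the mean and standard deviation of per-pair distance ratios; values concentrated around 1.0 mean the projection preserves distances. A tiny synthetic illustration (the distances below are fabricated for demonstration only):

import numpy as np

rng = np.random.RandomState(0)
dists = rng.rand(1_000) + 0.5  # fake original squared distances
projected_dists = dists * rng.normal(loc=1.0, scale=0.1, size=1_000)

rates = projected_dists / dists
print(f"Mean distances rate: {np.mean(rates):.2f} ({np.std(rates):.2f})")
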
@@ -186,15 +186,13 @@
 # We can see that for low values of ``n_components`` the distribution is wide
 # with many distorted pairs and a skewed distribution (due to the hard
 # limit of zero ratio on the left as distances are always positives)
-# while for larger values of n_components the distortion is controlled
+# while for larger values of `n_components` the distortion is controlled
 # and the distances are well preserved by the random projection.
-
-
-# %%
+#
 # Remarks
 # =======
 #
-# According to the JL lemma, projecting 500 samples without too much distortion
+# According to the JL lemma, projecting 300 samples without too much distortion
 # will require at least several thousands dimensions, irrespective of the
 # number of features of the original dataset.
 #
@@ -203,5 +201,5 @@
 # for dimensionality reduction in this case.
 #
 # On the twenty newsgroups on the other hand the dimensionality can be
-# decreased from 56436 down to 10000 while reasonably preserving
+# decreased from 56,436 down to 10,000 while reasonably preserving
 # pairwise distances.
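
The "several thousands dimensions" remark updated above can be sanity-checked against the lemma's bound. A quick sketch, assuming scikit-learn is installed:

from sklearn.random_projection import johnson_lindenstrauss_min_dim

# Even for only 300 samples, the minimal safe n_components grows rapidly
# as the tolerated distortion eps shrinks:
for eps in (0.5, 0.2, 0.1):
    print(eps, johnson_lindenstrauss_min_dim(300, eps=eps))
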

0 commit comments
