DOC Accelerate plot_johnson_lindenstrauss_bound.py example (#21795) · scikit-learn/scikit-learn@5d58d9d · GitHub

Commit 5d58d9d

lisacsn, adrinjalali, and glemaitre authored

DOC Accelerate plot_johnson_lindenstrauss_bound.py example (#21795)

Co-authored-by: Adrin Jalali <adrin.jalali@gmail.com>
Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
1 parent f2cbedb commit 5d58d9d


examples/miscellaneous/plot_johnson_lindenstrauss_bound.py

Lines changed: 21 additions & 23 deletions
@@ -33,9 +33,9 @@
 # .. math::
 #    (1 - eps) \|u - v\|^2 < \|p(u) - p(v)\|^2 < (1 + eps) \|u - v\|^2
 #
-# Where u and v are any rows taken from a dataset of shape (n_samples,
-# n_features) and p is a projection by a random Gaussian N(0, 1) matrix
-# of shape (n_components, n_features) (or a sparse Achlioptas matrix).
+# Where `u` and `v` are any rows taken from a dataset of shape `(n_samples,
+# n_features)` and `p` is a projection by a random Gaussian `N(0, 1)` matrix
+# of shape `(n_components, n_features)` (or a sparse Achlioptas matrix).
 #
 # The minimum number of components to guarantee the eps-embedding is
 # given by:
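
Note: the hunk cuts off just before the bound itself. As documented for scikit-learn's johnson_lindenstrauss_min_dim, the closed form is n_components >= 4 log(n_samples) / (eps^2 / 2 - eps^3 / 3). A minimal sketch, separate from this commit, checks the helper against that expression:

import numpy as np
from sklearn.random_projection import johnson_lindenstrauss_min_dim

n_samples, eps = 300, 0.1  # the sample count this commit switches to

# Closed-form JL bound; the helper evaluates the same expression, so the
# two printed values should agree up to integer rounding (~4889 here).
manual = 4 * np.log(n_samples) / (eps**2 / 2 - eps**3 / 3)
print(int(manual), johnson_lindenstrauss_min_dim(n_samples, eps=eps))
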
@@ -60,7 +60,7 @@
     min_n_components = johnson_lindenstrauss_min_dim(n_samples_range, eps=eps)
     plt.loglog(n_samples_range, min_n_components, color=color)

-plt.legend(["eps = %0.1f" % eps for eps in eps_range], loc="lower right")
+plt.legend([f"eps = {eps:0.1f}" for eps in eps_range], loc="lower right")
 plt.xlabel("Number of observations to eps-embed")
 plt.ylabel("Minimum number of dimensions")
 plt.title("Johnson-Lindenstrauss bounds:\nn_samples vs n_components")
@@ -84,7 +84,7 @@
     min_n_components = johnson_lindenstrauss_min_dim(n_samples, eps=eps_range)
     plt.semilogy(eps_range, min_n_components, color=color)

-plt.legend(["n_samples = %d" % n for n in n_samples_range], loc="upper right")
+plt.legend([f"n_samples = {n}" for n in n_samples_range], loc="upper right")
 plt.xlabel("Distortion eps")
 plt.ylabel("Minimum number of dimensions")
 plt.title("Johnson-Lindenstrauss bounds:\nn_components vs eps")
@@ -97,12 +97,12 @@
 # We validate the above bounds on the 20 newsgroups text document
 # (TF-IDF word frequencies) dataset or on the digits dataset:
 #
-# - for the 20 newsgroups dataset some 500 documents with 100k
+# - for the 20 newsgroups dataset some 300 documents with 100k
 #   features in total are projected using a sparse random matrix to smaller
 #   euclidean spaces with various values for the target number of dimensions
 #   ``n_components``.
 #
-# - for the digits dataset, some 8x8 gray level pixels data for 500
+# - for the digits dataset, some 8x8 gray level pixels data for 300
 #   handwritten digits pictures are randomly projected to spaces for various
 #   larger number of dimensions ``n_components``.
 #
@@ -111,25 +111,25 @@
 # this script.

 if "--use-digits-dataset" in sys.argv:
-    data = load_digits().data[:500]
+    data = load_digits().data[:300]
 else:
-    data = fetch_20newsgroups_vectorized().data[:500]
+    data = fetch_20newsgroups_vectorized().data[:300]

 # %%
 # For each value of ``n_components``, we plot:
 #
 # - 2D distribution of sample pairs with pairwise distances in original
-#   and projected spaces as x and y axis respectively.
+#   and projected spaces as x- and y-axis respectively.
 #
 # - 1D histogram of the ratio of those distances (projected / original).

 n_samples, n_features = data.shape
 print(
-    "Embedding %d samples with dim %d using various random projections"
-    % (n_samples, n_features)
+    f"Embedding {n_samples} samples with dim {n_features} using various "
+    "random projections"
 )

-n_components_range = np.array([300, 1000, 10000])
+n_components_range = np.array([300, 1_000, 10_000])
 dists = euclidean_distances(data, squared=True).ravel()

 # select only non-identical samples pairs
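
Note: the selection step this comment announces falls outside the hunk; conceptually it is just a boolean mask over the flattened distance matrix, and it produces the `nonzero` index the next hunk reuses. A tiny sketch on a stand-in array, not the example's exact lines:

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

X = np.random.RandomState(0).rand(5, 3)  # stand-in for `data`

# ravel() flattens the (n, n) squared-distance matrix; self-pairs sit on
# the diagonal with distance 0, so masking zeros keeps distinct pairs only.
dists = euclidean_distances(X, squared=True).ravel()
nonzero = dists != 0
dists = dists[nonzero]
print(dists.shape)  # (20,): 5 * 5 pairs minus the 5 self-pairs
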
@@ -141,13 +141,13 @@
     rp = SparseRandomProjection(n_components=n_components)
     projected_data = rp.fit_transform(data)
     print(
-        "Projected %d samples from %d to %d in %0.3fs"
-        % (n_samples, n_features, n_components, time() - t0)
+        f"Projected {n_samples} samples from {n_features} to {n_components} in "
+        f"{time() - t0:0.3f}s"
     )
     if hasattr(rp, "components_"):
         n_bytes = rp.components_.data.nbytes
         n_bytes += rp.components_.indices.nbytes
-        print("Random matrix with size: %0.3fMB" % (n_bytes / 1e6))
+        print(f"Random matrix with size: {n_bytes / 1e6:0.3f} MB")

     projected_dists = euclidean_distances(projected_data, squared=True).ravel()[nonzero]

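Note: for readers skimming the timing code above, SparseRandomProjection draws its sparse Achlioptas-style matrix once at fit time and stores it in `components_` as a scipy sparse matrix, which is why the diff can measure its size from the `data` and `indices` buffers. A standalone sketch on toy data, independent of this example:

import numpy as np
from sklearn.random_projection import SparseRandomProjection

X = np.random.RandomState(42).rand(100, 10_000)  # toy stand-in dataset

rp = SparseRandomProjection(n_components=300, random_state=42)
X_proj = rp.fit_transform(X)

print(X_proj.shape)  # (100, 300)
print(rp.density_)  # 'auto' density resolves to 1 / sqrt(n_features) = 0.01
n_bytes = rp.components_.data.nbytes + rp.components_.indices.nbytes
print(f"Random matrix with size: {n_bytes / 1e6:0.3f} MB")
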
@@ -168,7 +168,7 @@
     cb.set_label("Sample pairs counts")

     rates = projected_dists / dists
-    print("Mean distances rate: %0.2f (%0.2f)" % (np.mean(rates), np.std(rates)))
+    print(f"Mean distances rate: {np.mean(rates):.2f} ({np.std(rates):.2f})")

     plt.figure()
     plt.hist(rates, bins=50, range=(0.0, 2.0), edgecolor="k", density=True)
@@ -186,15 +186,13 @@
 # We can see that for low values of ``n_components`` the distribution is wide
 # with many distorted pairs and a skewed distribution (due to the hard
 # limit of zero ratio on the left as distances are always positive)
-# while for larger values of n_components the distortion is controlled
+# while for larger values of `n_components` the distortion is controlled
 # and the distances are well preserved by the random projection.
-
-
-# %%
+#
 # Remarks
 # =======
 #
-# According to the JL lemma, projecting 500 samples without too much distortion
+# According to the JL lemma, projecting 300 samples without too much distortion
 # will require at least several thousand dimensions, irrespective of the
 # number of features of the original dataset.
 #
@@ -203,5 +201,5 @@
 # for dimensionality reduction in this case.
 #
 # On the twenty newsgroups on the other hand the dimensionality can be
-# decreased from 56436 down to 10000 while reasonably preserving
+# decreased from 56,436 down to 10,000 while reasonably preserving
 # pairwise distances.
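
Note: a quick numeric check of these remarks, using the public helper rather than anything in this commit: for the 300 samples the example now keeps, the lemma's minimum dimension is indeed in the thousands at moderate distortion, above the digits' 64 features at every reasonable eps yet well below the newsgroups' 56,436.

from sklearn.random_projection import johnson_lindenstrauss_min_dim

# Minimum safe n_components for 300 samples at increasing distortion eps;
# prints roughly 4889, 633, and 273 respectively.
for eps in (0.1, 0.3, 0.5):
    print(eps, johnson_lindenstrauss_min_dim(300, eps=eps))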
