# .. math::
#    (1 - eps) \|u - v\|^2 < \|p(u) - p(v)\|^2 < (1 + eps) \|u - v\|^2
#
# Where `u` and `v` are any rows taken from a dataset of shape `(n_samples,
# n_features)` and `p` is a projection by a random Gaussian `N(0, 1)` matrix
# of shape `(n_components, n_features)` (or a sparse Achlioptas matrix).
#
# The minimum number of components needed to guarantee the eps-embedding is
# given by:
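#
# .. math::
#    n\_components \geq 4 \log(n\_samples) / (\frac{eps^2}{2} - \frac{eps^3}{3})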
min_n_components = johnson_lindenstrauss_min_dim(n_samples_range, eps=eps)
plt.loglog(n_samples_range, min_n_components, color=color)

plt.legend([f"eps = {eps:0.1f}" for eps in eps_range], loc="lower right")
plt.xlabel("Number of observations to eps-embed")
plt.ylabel("Minimum number of dimensions")
plt.title("Johnson-Lindenstrauss bounds:\nn_samples vs n_components")
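
# %%
# A quick numeric check of the closed-form bound above (an illustrative
# sketch; the ``eps`` and ``n`` values below are arbitrary choices, not part
# of the original example):
import numpy as np

from sklearn.random_projection import johnson_lindenstrauss_min_dim

eps, n = 0.1, 1_000
# closed-form JL bound: 4 log(n) / (eps^2 / 2 - eps^3 / 3)
bound = 4 * np.log(n) / (eps**2 / 2 - eps**3 / 3)
print(int(bound))  # several thousand dimensions for these inputs
print(johnson_lindenstrauss_min_dim(n, eps=eps))  # should match the value above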
min_n_components = johnson_lindenstrauss_min_dim(n_samples, eps=eps_range)
plt.semilogy(eps_range, min_n_components, color=color)

plt.legend([f"n_samples = {n}" for n in n_samples_range], loc="upper right")
plt.xlabel("Distortion eps")
plt.ylabel("Minimum number of dimensions")
plt.title("Johnson-Lindenstrauss bounds:\nn_components vs eps")
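
# %%
# Putting numbers on the second plot (an illustrative sketch reusing
# ``johnson_lindenstrauss_min_dim``; the sample size of 100,000 is an
# arbitrary choice): relaxing the admissible distortion ``eps`` shrinks the
# required number of dimensions drastically.
for eps in (0.1, 0.5, 0.9):
    print(eps, johnson_lindenstrauss_min_dim(100_000, eps=eps))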

# %%
# We validate the above bounds on the 20 newsgroups text document
# (TF-IDF word frequencies) dataset or on the digits dataset:
#
# - for the 20 newsgroups dataset some 300 documents with 100k
#   features in total are projected using a sparse random matrix to smaller
#   euclidean spaces with various values for the target number of dimensions
#   ``n_components``.
#
# - for the digits dataset, the 8x8 gray level pixel data for 300
#   handwritten digit pictures are randomly projected to spaces of various
#   larger numbers of dimensions ``n_components``.
#
# The digits dataset can be selected instead by passing the
# ``--use-digits-dataset`` command line argument to this script.

if "--use-digits-dataset" in sys.argv:
    data = load_digits().data[:300]
else:
    data = fetch_20newsgroups_vectorized().data[:300]


# %%
# For each value of ``n_components``, we plot:
#
# - 2D distribution of sample pairs with pairwise distances in original
#   and projected spaces as x- and y-axis respectively.
#
# - 1D histogram of the ratio of those distances (projected / original).

n_samples, n_features = data.shape
print(
    f"Embedding {n_samples} samples with dim {n_features} using various "
    "random projections"
)

n_components_range = np.array([300, 1_000, 10_000])
dists = euclidean_distances(data, squared=True).ravel()

# select only non-identical sample pairs
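nonzero = dists != 0
dists = dists[nonzero]


# perform sparse random projection for several target dimensionalities
for n_components in n_components_range:
    t0 = time()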
    rp = SparseRandomProjection(n_components=n_components)
    projected_data = rp.fit_transform(data)
    print(
        f"Projected {n_samples} samples from {n_features} to {n_components} in "
        f"{time() - t0:0.3f}s"
    )
    if hasattr(rp, "components_"):
        # memory used by the sparse CSR matrix (nonzero values + their indices)
        n_bytes = rp.components_.data.nbytes
        n_bytes += rp.components_.indices.nbytes
        print(f"Random matrix with size: {n_bytes / 1e6:0.3f} MB")

    projected_dists = euclidean_distances(projected_data, squared=True).ravel()[nonzero]
    cb.set_label("Sample pairs counts")

    rates = projected_dists / dists
    print(f"Mean distances rate: {np.mean(rates):.2f} ({np.std(rates):.2f})")

    plt.figure()
    plt.hist(rates, bins=50, range=(0.0, 2.0), edgecolor="k", density=True)

# %%
# We can see that for low values of ``n_components`` the distribution is wide
# with many distorted pairs and a skewed distribution (due to the hard
# limit of zero ratio on the left as distances are always positive)
# while for larger values of ``n_components`` the distortion is controlled
# and the distances are well preserved by the random projection.
#
# Remarks
# =======
#
# According to the JL lemma, projecting 300 samples without too much distortion
# will require at least several thousand dimensions, irrespective of the
# number of features of the original dataset.
#
# The digits dataset only has 64 features (8x8 pixels) to begin with, so
# random projections to thousands of dimensions cannot be used
# for dimensionality reduction in this case.
#
# On the twenty newsgroups dataset, on the other hand, the dimensionality can be
# decreased from 56,436 down to 10,000 while reasonably preserving
# pairwise distances.
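
# %%
# An illustrative check of this remark (a sketch added here, reusing
# ``johnson_lindenstrauss_min_dim`` from above):
# the printed value is in the thousands.
print(johnson_lindenstrauss_min_dim(300, eps=0.1))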