33 | 33 | # .. math::
34 | 34 | #    (1 - eps) \|u - v\|^2 < \|p(u) - p(v)\|^2 < (1 + eps) \|u - v\|^2
35 | 35 | #
36 |    | -# Where u and v are any rows taken from a dataset of shape (n_samples,
37 |    | -# n_features) and p is a projection by a random Gaussian N(0, 1) matrix
38 |    | -# of shape (n_components, n_features) (or a sparse Achlioptas matrix).
   | 36 | +# Where `u` and `v` are any rows taken from a dataset of shape `(n_samples,
   | 37 | +# n_features)` and `p` is a projection by a random Gaussian `N(0, 1)` matrix
   | 38 | +# of shape `(n_components, n_features)` (or a sparse Achlioptas matrix).
39 | 39 | #
40 | 40 | # The minimum number of components to guarantee the eps-embedding is
41 | 41 | # given by:
60 | 60 |     min_n_components = johnson_lindenstrauss_min_dim(n_samples_range, eps=eps)
61 | 61 |     plt.loglog(n_samples_range, min_n_components, color=color)
62 | 62 |
63 |    | -plt.legend(["eps = %0.1f" % eps for eps in eps_range], loc="lower right")
   | 63 | +plt.legend([f"eps = {eps:0.1f}" for eps in eps_range], loc="lower right")
64 | 64 | plt.xlabel("Number of observations to eps-embed")
65 | 65 | plt.ylabel("Minimum number of dimensions")
66 | 66 | plt.title("Johnson-Lindenstrauss bounds:\nn_samples vs n_components")
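The same helper can also be queried directly to read exact values off these curves; a minimal sketch (the sample sizes below are illustrative, not the ones used in the plot above):

```python
from sklearn.random_projection import johnson_lindenstrauss_min_dim

# Minimum n_components guaranteeing an eps = 0.1 embedding for a few sample
# sizes; the bound depends only on n_samples and eps, not on n_features.
for n in (1_000, 100_000, 10_000_000):
    print(n, johnson_lindenstrauss_min_dim(n, eps=0.1))
```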
84 | 84 |     min_n_components = johnson_lindenstrauss_min_dim(n_samples, eps=eps_range)
85 | 85 |     plt.semilogy(eps_range, min_n_components, color=color)
86 | 86 |
87 |    | -plt.legend(["n_samples = %d" % n for n in n_samples_range], loc="upper right")
   | 87 | +plt.legend([f"n_samples = {n}" for n in n_samples_range], loc="upper right")
88 | 88 | plt.xlabel("Distortion eps")
89 | 89 | plt.ylabel("Minimum number of dimensions")
90 | 90 | plt.title("Johnson-Lindenstrauss bounds:\nn_components vs eps")
97 | 97 | # We validate the above bounds on the 20 newsgroups text documents
98 | 98 | # (TF-IDF word frequencies) dataset or on the digits dataset:
99 | 99 | #
100 |     | -# - for the 20 newsgroups dataset some 500 documents with 100k
    | 100 | +# - for the 20 newsgroups dataset, some 300 documents with 100k
101 | 101 | #   features in total are projected using a sparse random matrix to smaller
102 | 102 | #   euclidean spaces with various values for the target number of dimensions
103 | 103 | #   ``n_components``.
104 | 104 | #
105 |     | -# - for the digits dataset, some 8x8 gray level pixels data for 500
    | 105 | +# - for the digits dataset, the 8x8 gray level pixel data of 300
106 | 106 | #   handwritten digit pictures are randomly projected to spaces with various
107 | 107 | #   larger numbers of dimensions ``n_components``.
108 | 108 | #
111 | 111 | # this script.
112 | 112 |
113 | 113 | if "--use-digits-dataset" in sys.argv:
114 |     | -    data = load_digits().data[:500]
    | 114 | +    data = load_digits().data[:300]
115 | 115 | else:
116 |     | -    data = fetch_20newsgroups_vectorized().data[:500]
    | 116 | +    data = fetch_20newsgroups_vectorized().data[:300]
117 | 117 |
118 | 118 | # %%
119 | 119 | # For each value of ``n_components``, we plot:
120 | 120 | #
121 | 121 | # - 2D distribution of sample pairs with pairwise distances in original
122 |     | -#   and projected spaces as x and y axis respectively.
    | 122 | +#   and projected spaces on the x- and y-axis respectively.
123 | 123 | #
124 | 124 | # - 1D histogram of the ratio of those distances (projected / original).
125 | 125 |
126 | 126 | n_samples, n_features = data.shape
127 | 127 | print(
128 |     | -    "Embedding %d samples with dim %d using various random projections"
129 |     | -    % (n_samples, n_features)
    | 128 | +    f"Embedding {n_samples} samples with dim {n_features} using various "
    | 129 | +    "random projections"
130 | 130 | )
131 | 131 |
132 |     | -n_components_range = np.array([300, 1000, 10000])
    | 132 | +n_components_range = np.array([300, 1_000, 10_000])
133 | 133 | dists = euclidean_distances(data, squared=True).ravel()
134 | 134 |
135 | 135 | # select only non-identical sample pairs
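The lines collapsed between this hunk and the next presumably build the ``nonzero`` mask used further down and open the loop that defines ``n_components`` and ``t0``; a sketch of what they likely contain (an assumption, since they are not shown in this diff):

```python
# Presumed content of the collapsed lines: keep only distinct sample pairs,
# then loop over the target dimensionalities, timing each projection.
nonzero = dists != 0
dists = dists[nonzero]

for n_components in n_components_range:
    t0 = time()
```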
141 | 141 |     rp = SparseRandomProjection(n_components=n_components)
142 | 142 |     projected_data = rp.fit_transform(data)
143 | 143 |     print(
144 |     | -        "Projected %d samples from %d to %d in %0.3fs"
145 |     | -        % (n_samples, n_features, n_components, time() - t0)
    | 144 | +        f"Projected {n_samples} samples from {n_features} to {n_components} in "
    | 145 | +        f"{time() - t0:0.3f}s"
146 | 146 |     )
147 | 147 |     if hasattr(rp, "components_"):
148 | 148 |         n_bytes = rp.components_.data.nbytes
149 | 149 |         n_bytes += rp.components_.indices.nbytes
150 |     | -        print("Random matrix with size: %0.3fMB" % (n_bytes / 1e6))
    | 150 | +        print(f"Random matrix with size: {n_bytes / 1e6:0.3f} MB")
151 | 151 |
152 | 152 |     projected_dists = euclidean_distances(projected_data, squared=True).ravel()[nonzero]
153 | 153 |
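The collapsed lines just below presumably draw the 2D distribution described earlier; the colorbar whose label is set in the next visible line suggests a density plot. A sketch of what that code likely looks like, still inside the loop (an assumption, not shown in this diff):

```python
    # Presumed sketch of the collapsed plotting code: 2D density of original
    # vs projected pairwise squared distances for the current n_components.
    plt.figure()
    plt.hexbin(dists, projected_dists, gridsize=100, cmap=plt.cm.PuBu)
    plt.xlabel("Pairwise squared distances in original space")
    plt.ylabel("Pairwise squared distances in projected space")
    cb = plt.colorbar()
```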
168 | 168 |     cb.set_label("Sample pairs counts")
169 | 169 |
170 | 170 |     rates = projected_dists / dists
171 |     | -    print("Mean distances rate: %0.2f (%0.2f)" % (np.mean(rates), np.std(rates)))
    | 171 | +    print(f"Mean distances rate: {np.mean(rates):.2f} ({np.std(rates):.2f})")
172 | 172 |
173 | 173 |     plt.figure()
174 | 174 |     plt.hist(rates, bins=50, range=(0.0, 2.0), edgecolor="k", density=True)
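Since the guarantee stated at the top of this example is that squared distances stay within a factor of ``1 ± eps``, a natural extra check inside this loop is the fraction of pairs that actually fall in that band (not part of the original script; ``eps_check`` is an illustrative tolerance):

```python
    # Fraction of sample pairs whose squared-distance ratio stays within 1 +/- eps_check
    eps_check = 0.1  # illustrative tolerance, not tied to the eps values plotted above
    within_band = np.mean((rates > 1 - eps_check) & (rates < 1 + eps_check))
    print(f"{within_band:.1%} of pairs preserved within eps = {eps_check}")
```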
186 | 186 | # We can see that for low values of ``n_components`` the distribution is wide
187 | 187 | # with many distorted pairs and a skewed distribution (due to the hard
188 | 188 | # limit of zero ratio on the left as distances are always positive),
189 |     | -# while for larger values of n_components the distortion is controlled
    | 189 | +# while for larger values of `n_components` the distortion is controlled
190 | 190 | # and the distances are well preserved by the random projection.
191 |     | -
192 |     | -
193 |     | -# %%
    | 191 | +#
194 | 192 | # Remarks
195 | 193 | # =======
196 | 194 | #
197 |     | -# According to the JL lemma, projecting 500 samples without too much distortion
    | 195 | +# According to the JL lemma, projecting 300 samples without too much distortion
198 | 196 | # will require at least several thousand dimensions, irrespective of the
199 | 197 | # number of features of the original dataset.
200 | 198 | #
203 | 201 | # for dimensionality reduction in this case.
204 | 202 | #
205 | 203 | # On the twenty newsgroups, on the other hand, the dimensionality can be
206 |     | -# decreased from 56436 down to 10000 while reasonably preserving
    | 204 | +# decreased from 56,436 down to 10,000 while reasonably preserving
207 | 205 | # pairwise distances.
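A quick numeric cross-check of the two quantitative claims in these remarks, using the same helper as above (a sketch; these prints are not part of the original script):

```python
from sklearn.random_projection import johnson_lindenstrauss_min_dim

# Minimum dimensions needed to eps-embed the 300 projected samples, and
# whether a 10,000-dimensional target space is enough at each tolerance.
for eps in (0.05, 0.1, 0.2, 0.5):
    needed = johnson_lindenstrauss_min_dim(300, eps=eps)
    enough = "enough" if needed <= 10_000 else "not enough"
    print(f"eps={eps}: need >= {needed} components (10,000 is {enough})")
```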