From 498ccced2ae2c241949f53374425184682b85558 Mon Sep 17 00:00:00 2001 From: "Ramana.S" Date: Sun, 27 Dec 2015 00:07:11 +0530 Subject: [PATCH 01/96] Reverted the change, added regression test reverted the comment Resolved merge conflicts --- sklearn/metrics/cluster/tests/test_unsupervised.py | 3 +++ sklearn/metrics/cluster/unsupervised.py | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/tests/test_unsupervised.py b/sklearn/metrics/cluster/tests/test_unsupervised.py index b346c7e54632e..7abd1db2957f1 100644 --- a/sklearn/metrics/cluster/tests/test_unsupervised.py +++ b/sklearn/metrics/cluster/tests/test_unsupervised.py @@ -3,6 +3,7 @@ from sklearn import datasets from sklearn.metrics.cluster.unsupervised import silhouette_score +from sklearn.metrics.cluster.unsupervised import silhouette_samples from sklearn.metrics import pairwise_distances from sklearn.utils.testing import assert_false from sklearn.utils.testing import assert_almost_equal @@ -50,6 +51,8 @@ def test_no_nan(): D = np.random.RandomState(0).rand(len(labels), len(labels)) silhouette = silhouette_score(D, labels, metric='precomputed') assert_false(np.isnan(silhouette)) + ss = silhouette_samples(D, labels, metric='precomputed') + assert_false(np.isnan(ss).any()) def test_correct_labelsize(): diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py index a0d2aaa24ef38..e26031b791ce4 100644 --- a/sklearn/metrics/cluster/unsupervised.py +++ b/sklearn/metrics/cluster/unsupervised.py @@ -200,4 +200,6 @@ def silhouette_samples(X, labels, metric='euclidean', **kwds): sil_samples = inter_clust_dists - intra_clust_dists sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists) - return sil_samples + + # nan values are for clusters of size 1, and should be 0 + return np.nan_to_num(sil_samples) From 3a4dd6818422e3ea4c4ea387469e75802b15ec40 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 11 Aug 2016 21:22:03 +1000 Subject: [PATCH 02/96] ENH block_size for memory efficiency in silhouette --- .../cluster/tests/test_unsupervised.py | 7 ++ sklearn/metrics/cluster/unsupervised.py | 87 +++++++++++-------- 2 files changed, 57 insertions(+), 37 deletions(-) diff --git a/sklearn/metrics/cluster/tests/test_unsupervised.py b/sklearn/metrics/cluster/tests/test_unsupervised.py index 04bdf8b6f60fe..79ae315dffb45 100644 --- a/sklearn/metrics/cluster/tests/test_unsupervised.py +++ b/sklearn/metrics/cluster/tests/test_unsupervised.py @@ -33,6 +33,13 @@ def test_silhouette(): score_euclidean = silhouette_score(X, y, metric='euclidean') assert_almost_equal(score_precomputed, score_euclidean) + score_batched = silhouette_score(X, y, block_size=17, + metric='euclidean') + assert_almost_equal(score_batched, score_euclidean) + score_batched = silhouette_score(D, y, block_size=17, + metric='precomputed') + assert_almost_equal(score_batched, score_euclidean) + if X is X_dense: score_dense_without_sampling = score_precomputed else: diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py index c1cca77ded2e6..f2a76d5eacf91 100644 --- a/sklearn/metrics/cluster/unsupervised.py +++ b/sklearn/metrics/cluster/unsupervised.py @@ -106,7 +106,7 @@ def silhouette_score(X, labels, metric='euclidean', sample_size=None, return np.mean(silhouette_samples(X, labels, metric=metric, **kwds)) -def silhouette_samples(X, labels, metric='euclidean', **kwds): +def silhouette_samples(X, labels, metric='euclidean', block_size=None, **kwds): """Compute the 
Silhouette Coefficient for each sample. The Silhouette Coefficient is a measure of how well samples are clustered @@ -144,6 +144,10 @@ def silhouette_samples(X, labels, metric='euclidean', **kwds): allowed by :func:`sklearn.metrics.pairwise.pairwise_distances`. If X is the distance array itself, use "precomputed" as the metric. + block_size : int, optional + The number of rows to process at a time to limit memory usage to + O(block_size * n_samples). Default is n_samples. + `**kwds` : optional keyword parameters Any further parameters are passed directly to the distance function. If using a ``scipy.spatial.distance`` metric, the parameters are still @@ -168,44 +172,53 @@ def silhouette_samples(X, labels, metric='euclidean', **kwds): """ le = LabelEncoder() labels = le.fit_transform(labels) - - distances = pairwise_distances(X, metric=metric, **kwds) - unique_labels = le.classes_ - - # For sample i, store the mean distance of the cluster to which - # it belongs in intra_clust_dists[i] - intra_clust_dists = np.ones(distances.shape[0], dtype=distances.dtype) - - # For sample i, store the mean distance of the second closest - # cluster in inter_clust_dists[i] - inter_clust_dists = np.inf * intra_clust_dists - - for curr_label in unique_labels: - - # Find inter_clust_dist for all samples belonging to the same - # label. - mask = labels == curr_label - current_distances = distances[mask] - - # Leave out current sample. - n_samples_curr_lab = np.sum(mask) - 1 - if n_samples_curr_lab != 0: - intra_clust_dists[mask] = np.sum( - current_distances[:, mask], axis=1) / n_samples_curr_lab - - # Now iterate over all other labels, finding the mean - # cluster distance that is closest to every sample. - for other_label in unique_labels: - if other_label != curr_label: - other_mask = labels == other_label - other_distances = np.mean( - current_distances[:, other_mask], axis=1) - inter_clust_dists[mask] = np.minimum( - inter_clust_dists[mask], other_distances) + n_samples = len(labels) + n_clusters = len(le.classes_) + class_freqs = np.bincount(labels) + class_freqs_minus_1 = class_freqs - 1 + + if block_size is None: + block_size = n_samples + + intra_clust_dists = [] + inter_clust_dists = [] + + # TODO: replace tile by np.broadcast_to + add_at_0 = np.repeat(np.arange(block_size), n_samples) + add_at_1 = np.tile(labels, block_size) + block_range = np.arange(block_size) + + for start in range(0, n_samples, block_size): + stop = min(start + block_size, n_samples) + # TODO: perhaps ensure pairwise_distances args are identical if + # block_size is None + block_dists = pairwise_distances(X[start:stop], X, + metric=metric, **kwds) + clust_dists = np.zeros((stop - start, n_clusters)) + np.add.at(clust_dists, + (add_at_0[:block_dists.size], add_at_1[:block_dists.size]), + block_dists.ravel()) + intra_index = (block_range[:len(clust_dists)], labels[start:stop]) + + denom = class_freqs_minus_1.take(labels[start:stop], mode='clip') + with np.errstate(divide="ignore", invalid="ignore"): + intra_clust_dists.append(clust_dists[intra_index] / denom) + # FIXME: deal with 0 denominator + clust_dists[intra_index] = np.inf + clust_dists /= class_freqs + inter_clust_dists.append(clust_dists.min(axis=1)) + + if len(intra_clust_dists) == 1: + intra_clust_dists = intra_clust_dists[0] + inter_clust_dists = inter_clust_dists[0] + else: + intra_clust_dists = np.hstack(intra_clust_dists) + inter_clust_dists = np.hstack(inter_clust_dists) sil_samples = inter_clust_dists - intra_clust_dists - sil_samples /= 
np.maximum(intra_clust_dists, inter_clust_dists) - return sil_samples + with np.errstate(divide="ignore", invalid="ignore"): + sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists) + return np.nan_to_num(sil_samples) def calinski_harabaz_score(X, labels): From c6edfbbb72760ddd042e9aea36462b94fa3bdc9b Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 11 Aug 2016 21:45:40 +1000 Subject: [PATCH 03/96] DOC add versionadded to new parameter --- sklearn/metrics/cluster/unsupervised.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py index e1522e19bb7e7..310e016d4e239 100644 --- a/sklearn/metrics/cluster/unsupervised.py +++ b/sklearn/metrics/cluster/unsupervised.py @@ -148,6 +148,8 @@ def silhouette_samples(X, labels, metric='euclidean', block_size=None, **kwds): The number of rows to process at a time to limit memory usage to O(block_size * n_samples). Default is n_samples. + .. versionadded:: 0.18 + `**kwds` : optional keyword parameters Any further parameters are passed directly to the distance function. If using a ``scipy.spatial.distance`` metric, the parameters are still From 3b726aaace1a72169bb150c6b3dc7a9b7fe92daf Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 11 Aug 2016 22:59:25 +1000 Subject: [PATCH 04/96] FIX use bincount instead of np.add.at for old numpy --- sklearn/metrics/cluster/unsupervised.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py index 310e016d4e239..102a60b74d03d 100644 --- a/sklearn/metrics/cluster/unsupervised.py +++ b/sklearn/metrics/cluster/unsupervised.py @@ -181,13 +181,15 @@ def silhouette_samples(X, labels, metric='euclidean', block_size=None, **kwds): if block_size is None: block_size = n_samples + elif block_size > n_samples: + block_size = min(block_size, n_samples) intra_clust_dists = [] inter_clust_dists = [] - # TODO: replace tile by np.broadcast_to - add_at_0 = np.repeat(np.arange(block_size), n_samples) - add_at_1 = np.tile(labels, block_size) + add_at = np.ravel_multi_index((np.repeat(np.arange(block_size), n_samples), + np.tile(labels, block_size)), + dims=(block_size, n_clusters)) block_range = np.arange(block_size) for start in range(0, n_samples, block_size): @@ -196,10 +198,9 @@ def silhouette_samples(X, labels, metric='euclidean', block_size=None, **kwds): # block_size is None block_dists = pairwise_distances(X[start:stop], X, metric=metric, **kwds) - clust_dists = np.zeros((stop - start, n_clusters)) - np.add.at(clust_dists, - (add_at_0[:block_dists.size], add_at_1[:block_dists.size]), - block_dists.ravel()) + clust_dists = np.bincount(add_at[:block_dists.size], + block_dists.ravel()) + clust_dists = clust_dists.reshape((stop - start, n_clusters)) intra_index = (block_range[:len(clust_dists)], labels[start:stop]) denom = class_freqs_minus_1.take(labels[start:stop], mode='clip') From 2301fde9dded38f0b25431f509c883a7bfada614 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 11 Aug 2016 23:01:31 +1000 Subject: [PATCH 05/96] ENH use unary pairwise_distances where possible --- sklearn/metrics/cluster/unsupervised.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py index 102a60b74d03d..e43fefeb0ca59 100644 --- a/sklearn/metrics/cluster/unsupervised.py +++ b/sklearn/metrics/cluster/unsupervised.py @@ -194,10 +194,11 @@ 
def silhouette_samples(X, labels, metric='euclidean', block_size=None, **kwds): for start in range(0, n_samples, block_size): stop = min(start + block_size, n_samples) - # TODO: perhaps ensure pairwise_distances args are identical if - # block_size is None - block_dists = pairwise_distances(X[start:stop], X, - metric=metric, **kwds) + if stop - start == n_samples: + block_dists = pairwise_distances(X, metric=metric, **kwds) + else: + block_dists = pairwise_distances(X[start:stop], X, + metric=metric, **kwds) clust_dists = np.bincount(add_at[:block_dists.size], block_dists.ravel()) clust_dists = clust_dists.reshape((stop - start, n_clusters)) From 85d5971df60513abe7daa1e406610ad21e338579 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Fri, 12 Aug 2016 00:07:06 +1000 Subject: [PATCH 06/96] DOC explicit block_size parameter in silhouette_score --- sklearn/metrics/cluster/unsupervised.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py index e43fefeb0ca59..b430d46aea667 100644 --- a/sklearn/metrics/cluster/unsupervised.py +++ b/sklearn/metrics/cluster/unsupervised.py @@ -20,7 +20,7 @@ def check_number_of_labels(n_labels, n_samples): def silhouette_score(X, labels, metric='euclidean', sample_size=None, - random_state=None, **kwds): + block_size=None, random_state=None, **kwds): """Compute the mean Silhouette Coefficient of all samples. The Silhouette Coefficient is calculated using the mean intra-cluster @@ -56,6 +56,12 @@ def silhouette_score(X, labels, metric='euclidean', sample_size=None, `. If X is the distance array itself, use ``metric="precomputed"``. + block_size : int, optional + The number of rows to process at a time to limit memory usage to + O(block_size * n_samples). Default is n_samples. + + .. versionadded:: 0.18 + sample_size : int or None The size of the sample to use when computing the Silhouette Coefficient on a random subset of the data. 
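
[Usage sketch, not part of the diff: how the ``block_size`` parameter
documented above might be exercised once this series is applied. At this
point in the series ``block_size`` counts rows; later patches redefine it
in MiB. The data and sizes below are illustrative.]

    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.datasets import make_blobs
    from sklearn.metrics import silhouette_score

    X, _ = make_blobs(n_samples=1000, centers=4, random_state=0)
    labels = KMeans(n_clusters=4, random_state=0).fit_predict(X)

    # The default (block_size=None) computes all pairwise distances at once;
    # block_size=100 processes 100 rows of the distance matrix at a time,
    # trading extra pairwise_distances calls for O(100 * n_samples) memory.
    full = silhouette_score(X, labels)
    blocked = silhouette_score(X, labels, block_size=100)
    assert np.isclose(full, blocked)
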
@@ -103,7 +109,8 @@ def silhouette_score(X, labels, metric='euclidean', sample_size=None, X, labels = X[indices].T[indices].T, labels[indices] else: X, labels = X[indices], labels[indices] - return np.mean(silhouette_samples(X, labels, metric=metric, **kwds)) + return np.mean(silhouette_samples(X, labels, metric=metric, + block_size=block_size, **kwds)) def silhouette_samples(X, labels, metric='euclidean', block_size=None, **kwds): From 53fa8d93541b91231ec7204dd57e201f266e7c23 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Fri, 12 Aug 2016 00:16:20 +1000 Subject: [PATCH 07/96] TST test silhouette_samples explicitly --- .../cluster/tests/test_unsupervised.py | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/sklearn/metrics/cluster/tests/test_unsupervised.py b/sklearn/metrics/cluster/tests/test_unsupervised.py index 89bb181b8e71f..83a46e6a0bb7f 100644 --- a/sklearn/metrics/cluster/tests/test_unsupervised.py +++ b/sklearn/metrics/cluster/tests/test_unsupervised.py @@ -78,6 +78,49 @@ def test_no_nan(): assert_false(np.isnan(ss).any()) +def test_silhouette_paper_example(): + # Explicitly check per-sample results against Rousseeuw (1987) + lower = [5.58, + 7.00, 6.50, + 7.08, 7.00, 3.83, + 4.83, 5.08, 8.17, 5.83, + 2.17, 5.75, 6.67, 6.92, 4.92, + 6.42, 5.00, 5.58, 6.00, 4.67, 6.42, + 3.42, 5.50, 6.42, 6.42, 5.00, 3.92, 6.17, + 2.50, 4.92, 6.25, 7.33, 4.50, 2.25, 6.33, 2.75, + 6.08, 6.67, 4.25, 2.67, 6.00, 6.17, 6.17, 6.92, 6.17, + 5.25, 6.83, 4.50, 3.75, 5.75, 5.42, 6.08, 5.83, 6.67, 3.67, + 4.75, 3.00, 6.08, 6.67, 5.00, 5.58, 4.83, 6.17, 5.67, 6.50, 6.92] + D = np.zeros((12, 12)) + D[np.tril_indices(12, -1)] = lower + D += D.T + + names = ['BEL', 'BRA', 'CHI', 'CUB', 'EGY', 'FRA', 'IND', 'ISR', 'USA', + 'USS', 'YUG', 'ZAI'] + + labels1 = [1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 1] + labels2 = [1, 2, 3, 3, 1, 1, 2, 1, 1, 3, 3, 2] + + expected1 = {'USA': .43, 'BEL': .39, 'FRA': .35, 'ISR': .30, 'BRA': .22, + 'EGY': .20, 'ZAI': .19, 'CUB': .40, 'USS': .34, 'CHI': .33, + 'YUG': .26, 'IND': -.04} + score1 = .28 + expected2 = {'USA': .47, 'FRA': .44, 'BEL': .42, 'ISR': .37, 'EGY': .02, + 'ZAI': .28, 'BRA': .25, 'IND': .17, 'CUB': .48, 'USS': .44, + 'YUG': .31, 'CHI': .31} + score2 = .33 + + for labels, expected, score in [(labels1, expected1, score1), + (labels2, expected2, score2)]: + expected = [expected[name] for name in names] + assert_almost_equal(expected, silhouette_samples(D, np.array(labels), + metric='precomputed'), + decimal=2) + assert_almost_equal(score, silhouette_score(D, np.array(labels), + metric='precomputed'), + decimal=2) + + def test_correct_labelsize(): # Assert 1 < n_labels < n_samples dataset = datasets.load_iris() From 682864660e08e94880311452c420d51bf905bc93 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Fri, 12 Aug 2016 12:12:14 +1000 Subject: [PATCH 08/96] ENH support n_jobs in silhouette_score and add comments --- .../cluster/tests/test_unsupervised.py | 12 +++ sklearn/metrics/cluster/unsupervised.py | 94 ++++++++++++++----- 2 files changed, 80 insertions(+), 26 deletions(-) diff --git a/sklearn/metrics/cluster/tests/test_unsupervised.py b/sklearn/metrics/cluster/tests/test_unsupervised.py index 83a46e6a0bb7f..5019ac037132f 100644 --- a/sklearn/metrics/cluster/tests/test_unsupervised.py +++ b/sklearn/metrics/cluster/tests/test_unsupervised.py @@ -34,12 +34,24 @@ def test_silhouette(): score_euclidean = silhouette_score(X, y, metric='euclidean') assert_almost_equal(score_precomputed, score_euclidean) + # test block_size score_batched = 
silhouette_score(X, y, block_size=17, metric='euclidean') assert_almost_equal(score_batched, score_euclidean) score_batched = silhouette_score(D, y, block_size=17, metric='precomputed') assert_almost_equal(score_batched, score_euclidean) + score_batched = silhouette_score(D, y, block_size=len(y) + 10, + metric='precomputed') + assert_almost_equal(score_batched, score_euclidean) + + # smoke test n_jobs with and without block_size + score_parallel = silhouette_score(X, y, block_size=None, + n_jobs=2, metric='euclidean') + assert_almost_equal(score_parallel, score_euclidean) + score_parallel = silhouette_score(X, y, block_size=50, + n_jobs=2, metric='euclidean') + assert_almost_equal(score_parallel, score_euclidean) if X is X_dense: score_dense_without_sampling = score_precomputed diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py index b430d46aea667..49da04283f748 100644 --- a/sklearn/metrics/cluster/unsupervised.py +++ b/sklearn/metrics/cluster/unsupervised.py @@ -5,10 +5,14 @@ # Thierry Guillemot # License: BSD 3 clause +from __future__ import division + import numpy as np from ...utils import check_random_state from ...utils import check_X_y +from ...utils import _get_n_jobs +from ...externals.joblib import Parallel, delayed from ..pairwise import pairwise_distances from ...preprocessing import LabelEncoder @@ -20,7 +24,7 @@ def check_number_of_labels(n_labels, n_samples): def silhouette_score(X, labels, metric='euclidean', sample_size=None, - block_size=None, random_state=None, **kwds): + block_size=None, n_jobs=1, random_state=None, **kwds): """Compute the mean Silhouette Coefficient of all samples. The Silhouette Coefficient is calculated using the mean intra-cluster @@ -62,6 +66,12 @@ def silhouette_score(X, labels, metric='euclidean', sample_size=None, .. versionadded:: 0.18 + n_jobs : int, optional (default = 1) + The number of parallel jobs to run. + If ``-1``, then the number of jobs is set to the number of CPU cores. + + .. versionadded:: 0.18 + sample_size : int or None The size of the sample to use when computing the Silhouette Coefficient on a random subset of the data. 
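
[Sketch of the parallel path added in this patch, assuming the patched
signature; the data and job counts are illustrative:]

    from sklearn.datasets import make_blobs
    from sklearn.metrics import silhouette_samples

    X, labels = make_blobs(n_samples=500, centers=3, random_state=0)

    # With n_jobs=2 and the default block_size=None, each worker processes
    # roughly n_samples / 2 rows of the pairwise-distance computation.
    sil = silhouette_samples(X, labels, n_jobs=2)

    # An explicit block_size caps memory at O(block_size * n_samples) per
    # job while still fanning the blocks out across both CPUs.
    sil = silhouette_samples(X, labels, block_size=100, n_jobs=2)
    assert sil.shape == (500,)
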
@@ -113,7 +123,35 @@ def silhouette_score(X, labels, metric='euclidean', sample_size=None, block_size=block_size, **kwds)) -def silhouette_samples(X, labels, metric='euclidean', block_size=None, **kwds): +def _process_block(X, labels, start, block_size, block_range, add_at, + label_freqs, metric, kwds): + # get distances from block to every other sample + stop = min(start + block_size, X.shape[0]) + if stop - start == X.shape[0]: + # allow pairwise_distances to use fast paths + block_dists = pairwise_distances(X, metric=metric, **kwds) + else: + block_dists = pairwise_distances(X[start:stop], X, + metric=metric, **kwds) + + # accumulate distances from each sample to each cluster + clust_dists = np.bincount(add_at[:block_dists.size], + block_dists.ravel()) + clust_dists = clust_dists.reshape((stop - start, len(label_freqs))) + + # intra_index selects intra-cluster distances within clust_dists + intra_index = (block_range[:len(clust_dists)], labels[start:stop]) + # intra_clust_dists are averaged over cluster size outside this function + intra_clust_dists = clust_dists[intra_index] + # of the remaining distances we normalise and extract the minimum + clust_dists[intra_index] = np.inf + clust_dists /= label_freqs + inter_clust_dists = clust_dists.min(axis=1) + return intra_clust_dists, inter_clust_dists + + +def silhouette_samples(X, labels, metric='euclidean', block_size=None, + n_jobs=1, **kwds): """Compute the Silhouette Coefficient for each sample. The Silhouette Coefficient is a measure of how well samples are clustered @@ -153,7 +191,13 @@ def silhouette_samples(X, labels, metric='euclidean', block_size=None, **kwds): block_size : int, optional The number of rows to process at a time to limit memory usage to - O(block_size * n_samples). Default is n_samples. + O(block_size * n_samples). Default is n_samples / n_jobs. + + .. versionadded:: 0.18 + + n_jobs : int, optional (default = 1) + The number of parallel jobs to run. + If ``-1``, then the number of jobs is set to the number of CPU cores. .. versionadded:: 0.18 @@ -179,45 +223,39 @@ def silhouette_samples(X, labels, metric='euclidean', block_size=None, **kwds): `_ """ + X, labels = check_X_y(X, labels, accept_sparse=['csc', 'csr']) le = LabelEncoder() labels = le.fit_transform(labels) n_samples = len(labels) - n_clusters = len(le.classes_) - class_freqs = np.bincount(labels) - class_freqs_minus_1 = class_freqs - 1 + label_freqs = np.bincount(labels) + n_jobs = _get_n_jobs(n_jobs) if block_size is None: - block_size = n_samples + block_size = int(np.ceil(n_samples / n_jobs)) elif block_size > n_samples: block_size = min(block_size, n_samples) + # note block_size > (n_samples / n_jobs) just means not all + # available CPUs are used intra_clust_dists = [] inter_clust_dists = [] + # We use these indices as bins to accumulate distances from each sample in + # a block to each cluster. + # NB: we currently use np.bincount but could use np.add.at when Numpy >=1.8 + # is minimum dependency, which would avoid materialising this index. 
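    # (Illustration, not in the original patch: with block_size=2 and
    # labels=[0, 1, 1], add_at == [0, 1, 1, 2, 3, 3], i.e. the flattened
    # (row, cluster) indices, so np.bincount(add_at[:d.size], d.ravel())
    # sums each block row's distances into per-cluster bins, emulating
    # np.add.at on a (block_size, n_clusters) array.)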
add_at = np.ravel_multi_index((np.repeat(np.arange(block_size), n_samples), np.tile(labels, block_size)), - dims=(block_size, n_clusters)) + dims=(block_size, len(label_freqs))) block_range = np.arange(block_size) + parallel = Parallel(n_jobs=n_jobs) - for start in range(0, n_samples, block_size): - stop = min(start + block_size, n_samples) - if stop - start == n_samples: - block_dists = pairwise_distances(X, metric=metric, **kwds) - else: - block_dists = pairwise_distances(X[start:stop], X, - metric=metric, **kwds) - clust_dists = np.bincount(add_at[:block_dists.size], - block_dists.ravel()) - clust_dists = clust_dists.reshape((stop - start, n_clusters)) - intra_index = (block_range[:len(clust_dists)], labels[start:stop]) - - denom = class_freqs_minus_1.take(labels[start:stop], mode='clip') - with np.errstate(divide="ignore", invalid="ignore"): - intra_clust_dists.append(clust_dists[intra_index] / denom) - clust_dists[intra_index] = np.inf - clust_dists /= class_freqs - inter_clust_dists.append(clust_dists.min(axis=1)) + results = parallel(delayed(_process_block)(X, labels, start, block_size, + block_range, add_at, + label_freqs, metric, kwds) + for start in range(0, n_samples, block_size)) + intra_clust_dists, inter_clust_dists = zip(*results) if len(intra_clust_dists) == 1: intra_clust_dists = intra_clust_dists[0] inter_clust_dists = inter_clust_dists[0] @@ -225,6 +263,10 @@ def silhouette_samples(X, labels, metric='euclidean', block_size=None, **kwds): intra_clust_dists = np.hstack(intra_clust_dists) inter_clust_dists = np.hstack(inter_clust_dists) + denom = (label_freqs - 1).take(labels, mode='clip') + with np.errstate(divide="ignore", invalid="ignore"): + intra_clust_dists /= denom + sil_samples = inter_clust_dists - intra_clust_dists with np.errstate(divide="ignore", invalid="ignore"): sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists) From 969eab309b1426cb0ac24eba7a0212dc755f3f65 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Fri, 12 Aug 2016 12:23:51 +1000 Subject: [PATCH 09/96] DOC update block_size description given n_jobs --- sklearn/metrics/cluster/unsupervised.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py index 49da04283f748..0639dcd3f6268 100644 --- a/sklearn/metrics/cluster/unsupervised.py +++ b/sklearn/metrics/cluster/unsupervised.py @@ -62,7 +62,7 @@ def silhouette_score(X, labels, metric='euclidean', sample_size=None, block_size : int, optional The number of rows to process at a time to limit memory usage to - O(block_size * n_samples). Default is n_samples. + O(block_size * n_jobs * n_samples). Default is n_samples / n_jobs. .. versionadded:: 0.18 @@ -191,7 +191,7 @@ def silhouette_samples(X, labels, metric='euclidean', block_size=None, block_size : int, optional The number of rows to process at a time to limit memory usage to - O(block_size * n_samples). Default is n_samples / n_jobs. + O(block_size * n_jobs * n_samples). Default is n_samples / n_jobs. .. 
versionadded:: 0.18 From bfbde51fd3936f5e9a755c05205cb8112abaa6ab Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sat, 13 Aug 2016 21:42:35 +1000 Subject: [PATCH 10/96] DOC docstring formatting --- sklearn/metrics/cluster/unsupervised.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py index 0639dcd3f6268..bbfb7a6b33ddb 100644 --- a/sklearn/metrics/cluster/unsupervised.py +++ b/sklearn/metrics/cluster/unsupervised.py @@ -62,7 +62,8 @@ def silhouette_score(X, labels, metric='euclidean', sample_size=None, block_size : int, optional The number of rows to process at a time to limit memory usage to - O(block_size * n_jobs * n_samples). Default is n_samples / n_jobs. + ``O(block_size * n_jobs * n_samples)``. + Default is ``n_samples / n_jobs``. .. versionadded:: 0.18 @@ -191,7 +192,8 @@ def silhouette_samples(X, labels, metric='euclidean', block_size=None, block_size : int, optional The number of rows to process at a time to limit memory usage to - O(block_size * n_jobs * n_samples). Default is n_samples / n_jobs. + ``O(block_size * n_jobs * n_samples)``. + Default is ``n_samples / n_jobs``. .. versionadded:: 0.18 From 03a73abf931fd3d5636d797e0e828ed72051760a Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sat, 13 Aug 2016 22:19:58 +1000 Subject: [PATCH 11/96] ENH specify silhouette block size in bytes --- .../cluster/tests/test_unsupervised.py | 21 +++++--- sklearn/metrics/cluster/unsupervised.py | 51 ++++++++++--------- 2 files changed, 43 insertions(+), 29 deletions(-) diff --git a/sklearn/metrics/cluster/tests/test_unsupervised.py b/sklearn/metrics/cluster/tests/test_unsupervised.py index 5019ac037132f..1ac3428978751 100644 --- a/sklearn/metrics/cluster/tests/test_unsupervised.py +++ b/sklearn/metrics/cluster/tests/test_unsupervised.py @@ -35,21 +35,22 @@ def test_silhouette(): assert_almost_equal(score_precomputed, score_euclidean) # test block_size - score_batched = silhouette_score(X, y, block_size=17, + score_batched = silhouette_score(X, y, block_size=10000, metric='euclidean') assert_almost_equal(score_batched, score_euclidean) - score_batched = silhouette_score(D, y, block_size=17, + score_batched = silhouette_score(D, y, block_size=10000, metric='precomputed') assert_almost_equal(score_batched, score_euclidean) - score_batched = silhouette_score(D, y, block_size=len(y) + 10, + # absurdly large block_size + score_batched = silhouette_score(D, y, block_size=1e100, metric='precomputed') assert_almost_equal(score_batched, score_euclidean) - # smoke test n_jobs with and without block_size - score_parallel = silhouette_score(X, y, block_size=None, + # smoke test n_jobs with and without explicit block_size + score_parallel = silhouette_score(X, y, n_jobs=2, metric='euclidean') assert_almost_equal(score_parallel, score_euclidean) - score_parallel = silhouette_score(X, y, block_size=50, + score_parallel = silhouette_score(X, y, block_size=5000, n_jobs=2, metric='euclidean') assert_almost_equal(score_parallel, score_euclidean) @@ -76,6 +77,14 @@ def test_silhouette(): assert_almost_equal(score_euclidean, score_dense_with_sampling) +def test_silhouette_invalid_block_size(): + X = [[0], [0], [1]] + y = [1, 1, 2] + assert_raise_message(ValueError, 'block_size should be at least n_samples ' + '* 8 = 24 bytes, got 1', + silhouette_score, X, y, block_size=1) + + def test_no_nan(): # Assert Silhouette Coefficient != nan when there is 1 sample in a class. 
# This tests for the condition that caused issue #960. diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py index bbfb7a6b33ddb..b62aff4891794 100644 --- a/sklearn/metrics/cluster/unsupervised.py +++ b/sklearn/metrics/cluster/unsupervised.py @@ -23,8 +23,13 @@ def check_number_of_labels(n_labels, n_samples): "to n_samples - 1 (inclusive)" % n_labels) +DEFAULT_BLOCK_SIZE = 2 ** 26 +BYTES_PER_FLOAT = 8 + + def silhouette_score(X, labels, metric='euclidean', sample_size=None, - block_size=None, n_jobs=1, random_state=None, **kwds): + block_size=DEFAULT_BLOCK_SIZE, n_jobs=1, + random_state=None, **kwds): """Compute the mean Silhouette Coefficient of all samples. The Silhouette Coefficient is calculated using the mean intra-cluster @@ -61,9 +66,8 @@ def silhouette_score(X, labels, metric='euclidean', sample_size=None, array itself, use ``metric="precomputed"``. block_size : int, optional - The number of rows to process at a time to limit memory usage to - ``O(block_size * n_jobs * n_samples)``. - Default is ``n_samples / n_jobs``. + The maximum number of bytes of memory per job (see ``n_jobs``) to use + at a time for calculating pairwise distances. Default is 64MiB. .. versionadded:: 0.18 @@ -124,10 +128,10 @@ def silhouette_score(X, labels, metric='euclidean', sample_size=None, block_size=block_size, **kwds)) -def _process_block(X, labels, start, block_size, block_range, add_at, +def _process_block(X, labels, start, block_n_rows, block_range, add_at, label_freqs, metric, kwds): # get distances from block to every other sample - stop = min(start + block_size, X.shape[0]) + stop = min(start + block_n_rows, X.shape[0]) if stop - start == X.shape[0]: # allow pairwise_distances to use fast paths block_dists = pairwise_distances(X, metric=metric, **kwds) @@ -151,8 +155,8 @@ def _process_block(X, labels, start, block_size, block_range, add_at, return intra_clust_dists, inter_clust_dists -def silhouette_samples(X, labels, metric='euclidean', block_size=None, - n_jobs=1, **kwds): +def silhouette_samples(X, labels, metric='euclidean', + block_size=DEFAULT_BLOCK_SIZE, n_jobs=1, **kwds): """Compute the Silhouette Coefficient for each sample. The Silhouette Coefficient is a measure of how well samples are clustered @@ -191,9 +195,8 @@ def silhouette_samples(X, labels, metric='euclidean', block_size=None, the distance array itself, use "precomputed" as the metric. block_size : int, optional - The number of rows to process at a time to limit memory usage to - ``O(block_size * n_jobs * n_samples)``. - Default is ``n_samples / n_jobs``. + The maximum number of bytes of memory per job (see ``n_jobs``) to use + at a time for calculating pairwise distances. Default is 64MiB. .. 
versionadded:: 0.18 @@ -232,12 +235,14 @@ def silhouette_samples(X, labels, metric='euclidean', block_size=None, label_freqs = np.bincount(labels) n_jobs = _get_n_jobs(n_jobs) - if block_size is None: - block_size = int(np.ceil(n_samples / n_jobs)) - elif block_size > n_samples: - block_size = min(block_size, n_samples) - # note block_size > (n_samples / n_jobs) just means not all - # available CPUs are used + block_n_rows = block_size // (BYTES_PER_FLOAT * n_samples) + if block_n_rows > n_samples: + block_n_rows = min(block_n_rows, n_samples) + if block_n_rows < 1: + raise ValueError('block_size should be at least n_samples * %d = %d ' + 'bytes, got %r' % (BYTES_PER_FLOAT, + n_samples * BYTES_PER_FLOAT, + block_size)) intra_clust_dists = [] inter_clust_dists = [] @@ -246,16 +251,16 @@ def silhouette_samples(X, labels, metric='euclidean', block_size=None, # a block to each cluster. # NB: we currently use np.bincount but could use np.add.at when Numpy >=1.8 # is minimum dependency, which would avoid materialising this index. - add_at = np.ravel_multi_index((np.repeat(np.arange(block_size), n_samples), - np.tile(labels, block_size)), - dims=(block_size, len(label_freqs))) - block_range = np.arange(block_size) + block_range = np.arange(block_n_rows) + add_at = np.ravel_multi_index((np.repeat(block_range, n_samples), + np.tile(labels, block_n_rows)), + dims=(block_n_rows, len(label_freqs))) parallel = Parallel(n_jobs=n_jobs) - results = parallel(delayed(_process_block)(X, labels, start, block_size, + results = parallel(delayed(_process_block)(X, labels, start, block_n_rows, block_range, add_at, label_freqs, metric, kwds) - for start in range(0, n_samples, block_size)) + for start in range(0, n_samples, block_n_rows)) intra_clust_dists, inter_clust_dists = zip(*results) if len(intra_clust_dists) == 1: From 51640c0eae75388e1e197475e78f6372e4ca3b1c Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 14 Aug 2016 11:54:32 +1000 Subject: [PATCH 12/96] block_size specified in MiB --- .../cluster/tests/test_unsupervised.py | 12 +++++----- sklearn/metrics/cluster/unsupervised.py | 24 +++++++++---------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/sklearn/metrics/cluster/tests/test_unsupervised.py b/sklearn/metrics/cluster/tests/test_unsupervised.py index 1ac3428978751..486aec8089e6d 100644 --- a/sklearn/metrics/cluster/tests/test_unsupervised.py +++ b/sklearn/metrics/cluster/tests/test_unsupervised.py @@ -35,14 +35,14 @@ def test_silhouette(): assert_almost_equal(score_precomputed, score_euclidean) # test block_size - score_batched = silhouette_score(X, y, block_size=10000, + score_batched = silhouette_score(X, y, block_size=10, metric='euclidean') assert_almost_equal(score_batched, score_euclidean) - score_batched = silhouette_score(D, y, block_size=10000, + score_batched = silhouette_score(D, y, block_size=10, metric='precomputed') assert_almost_equal(score_batched, score_euclidean) # absurdly large block_size - score_batched = silhouette_score(D, y, block_size=1e100, + score_batched = silhouette_score(D, y, block_size=10000, metric='precomputed') assert_almost_equal(score_batched, score_euclidean) @@ -50,7 +50,7 @@ def test_silhouette(): score_parallel = silhouette_score(X, y, n_jobs=2, metric='euclidean') assert_almost_equal(score_parallel, score_euclidean) - score_parallel = silhouette_score(X, y, block_size=5000, + score_parallel = silhouette_score(X, y, block_size=10, n_jobs=2, metric='euclidean') assert_almost_equal(score_parallel, score_euclidean) @@ -81,8 +81,8 @@ def 
test_silhouette_invalid_block_size(): X = [[0], [0], [1]] y = [1, 1, 2] assert_raise_message(ValueError, 'block_size should be at least n_samples ' - '* 8 = 24 bytes, got 1', - silhouette_score, X, y, block_size=1) + '* 8 bytes = 1 MiB, got 0', + silhouette_score, X, y, block_size=0) def test_no_nan(): diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py index b62aff4891794..ea126a8a183e9 100644 --- a/sklearn/metrics/cluster/unsupervised.py +++ b/sklearn/metrics/cluster/unsupervised.py @@ -23,7 +23,7 @@ def check_number_of_labels(n_labels, n_samples): "to n_samples - 1 (inclusive)" % n_labels) -DEFAULT_BLOCK_SIZE = 2 ** 26 +DEFAULT_BLOCK_SIZE = 64 BYTES_PER_FLOAT = 8 @@ -65,9 +65,9 @@ def silhouette_score(X, labels, metric='euclidean', sample_size=None, `. If X is the distance array itself, use ``metric="precomputed"``. - block_size : int, optional - The maximum number of bytes of memory per job (see ``n_jobs``) to use - at a time for calculating pairwise distances. Default is 64MiB. + block_size : int, optional, default=64 + The maximum number of mebibytes (MiB) of memory per job (see + ``n_jobs``) to use at a time for calculating pairwise distances. .. versionadded:: 0.18 @@ -194,9 +194,9 @@ def silhouette_samples(X, labels, metric='euclidean', allowed by :func:`sklearn.metrics.pairwise.pairwise_distances`. If X is the distance array itself, use "precomputed" as the metric. - block_size : int, optional - The maximum number of bytes of memory per job (see ``n_jobs``) to use - at a time for calculating pairwise distances. Default is 64MiB. + block_size : int, optional, default=64 + The maximum number of mebibytes (MiB) of memory per job (see + ``n_jobs``) to use at a time for calculating pairwise distances. .. 
versionadded:: 0.18 @@ -235,14 +235,14 @@ def silhouette_samples(X, labels, metric='euclidean', label_freqs = np.bincount(labels) n_jobs = _get_n_jobs(n_jobs) - block_n_rows = block_size // (BYTES_PER_FLOAT * n_samples) + block_n_rows = block_size * (2 ** 20) // (BYTES_PER_FLOAT * n_samples) if block_n_rows > n_samples: block_n_rows = min(block_n_rows, n_samples) if block_n_rows < 1: - raise ValueError('block_size should be at least n_samples * %d = %d ' - 'bytes, got %r' % (BYTES_PER_FLOAT, - n_samples * BYTES_PER_FLOAT, - block_size)) + min_block_mib = np.ceil(n_samples * BYTES_PER_FLOAT * 2 ** -20) + raise ValueError('block_size should be at least n_samples * %d bytes ' + '= %.0f MiB, got %r' % (BYTES_PER_FLOAT, + min_block_mib, block_size)) intra_clust_dists = [] inter_clust_dists = [] From eb4619d66cc545a2735e2fa1bc592d98e6ce2e83 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 16 Aug 2016 23:54:19 +1000 Subject: [PATCH 13/96] document parameters to silhouette helper --- sklearn/metrics/cluster/unsupervised.py | 38 +++++++++++++++++++------ 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py index ea126a8a183e9..0bfc65a220ada 100644 --- a/sklearn/metrics/cluster/unsupervised.py +++ b/sklearn/metrics/cluster/unsupervised.py @@ -128,16 +128,37 @@ def silhouette_score(X, labels, metric='euclidean', sample_size=None, block_size=block_size, **kwds)) -def _process_block(X, labels, start, block_n_rows, block_range, add_at, - label_freqs, metric, kwds): +def _silhouette_block(X, labels, label_freqs, start, block_n_rows, + block_range, add_at, dist_kwds): + """Accumulate silhouette statistics for X[start:start+block_n_rows] + + Parameters + ---------- + X : shape (n_samples, n_features) or precomputed (n_samples, n_samples) + data + labels : array, shape (n_samples,) + corresponding cluster labels, encoded as {0, ..., n_clusters-1} + label_freqs : array + distribution of cluster labels in ``labels`` + start : int + first index in block + block_n_rows : int + length of block + block_range : array + precomputed range ``0..(block_n_rows-1)`` + add_at : array, shape (block_n_rows * n_clusters,) + indices into a flattened array of shape (block_n_rows, n_clusters) + where distances from block points to each cluster are accumulated + dist_kwds : dict + kwargs for ``pairwise_distances`` + """ # get distances from block to every other sample stop = min(start + block_n_rows, X.shape[0]) if stop - start == X.shape[0]: # allow pairwise_distances to use fast paths - block_dists = pairwise_distances(X, metric=metric, **kwds) + block_dists = pairwise_distances(X, **dist_kwds) else: - block_dists = pairwise_distances(X[start:stop], X, - metric=metric, **kwds) + block_dists = pairwise_distances(X[start:stop], X, **dist_kwds) # accumulate distances from each sample to each cluster clust_dists = np.bincount(add_at[:block_dists.size], @@ -257,9 +278,10 @@ def silhouette_samples(X, labels, metric='euclidean', dims=(block_n_rows, len(label_freqs))) parallel = Parallel(n_jobs=n_jobs) - results = parallel(delayed(_process_block)(X, labels, start, block_n_rows, - block_range, add_at, - label_freqs, metric, kwds) + kwds['metric'] = metric + results = parallel(delayed(_silhouette_block)(X, labels, label_freqs, + start, block_n_rows, + block_range, add_at, kwds) for start in range(0, n_samples, block_n_rows)) intra_clust_dists, inter_clust_dists = zip(*results) From 71ac9948c0e3b893b8a829775c8af592045cf8fb Mon Sep 17 
00:00:00 2001 From: Joel Nothman Date: Thu, 18 Aug 2016 00:24:37 +1000 Subject: [PATCH 14/96] FIX pass n_jobs from silhouette_score Also use threading for parallelism --- sklearn/metrics/cluster/unsupervised.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py index 0bfc65a220ada..dcde47e27a8d9 100644 --- a/sklearn/metrics/cluster/unsupervised.py +++ b/sklearn/metrics/cluster/unsupervised.py @@ -125,7 +125,8 @@ def silhouette_score(X, labels, metric='euclidean', sample_size=None, else: X, labels = X[indices], labels[indices] return np.mean(silhouette_samples(X, labels, metric=metric, - block_size=block_size, **kwds)) + block_size=block_size, n_jobs=n_jobs, + **kwds)) def _silhouette_block(X, labels, label_freqs, start, block_n_rows, @@ -276,7 +277,7 @@ def silhouette_samples(X, labels, metric='euclidean', add_at = np.ravel_multi_index((np.repeat(block_range, n_samples), np.tile(labels, block_n_rows)), dims=(block_n_rows, len(label_freqs))) - parallel = Parallel(n_jobs=n_jobs) + parallel = Parallel(n_jobs=n_jobs, backend='threading') kwds['metric'] = metric results = parallel(delayed(_silhouette_block)(X, labels, label_freqs, From 7cfcd435660dfcb87bd0bbedb645bc360015fa2f Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Mon, 5 Dec 2016 14:30:42 +0530 Subject: [PATCH 15/96] ENH: Added template for pairwise_distances_blockwise with docstring changes --- sklearn/metrics/pairwise.py | 94 +++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 2258f070018d2..2273004e932c3 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1130,6 +1130,100 @@ def _pairwise_callable(X, Y, metric, **kwds): 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule', "wminkowski"] +DEFAULT_BLOCK_SIZE = 64 + + +def pairwise_distances_blockwise(X, Y=None, metric='euclidean', n_jobs=1, + block_size=DEFAULT_BLOCK_SIZE, **kwds): + """ Compute the distance matrix from a vector array X and optional Y. + + This method takes either a vector array or a distance matrix, and returns + a distance matrix. If the input is a vector array, the distances are + computed. If the input is a distances matrix, it is returned instead. + + This is equivalent to calling: + + pairwise_distances(X, y, metric) + + but uses much less memory. + + Parameters + ---------- + X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \ + [n_samples_a, n_features] otherwise + Array of pairwise distances between samples, or a feature array. + + Y : array [n_samples_b, n_features], optional + An optional second feature array. Only allowed if metric != "precomputed". + + metric : string, or callable + The metric to use when calculating distance between instances in a + feature array. If metric is a string, it must be one of the options + allowed by scipy.spatial.distance.pdist for its metric parameter, or + a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS. + If metric is "precomputed", X is assumed to be a distance matrix. + Alternatively, if metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays from X as input and return a value indicating + the distance between them. + + n_jobs : int + The number of jobs to use for the computation. 
This works by breaking + down the pairwise matrix into n_jobs even slices and computing them in + parallel. + + If -1 all CPUs are used. If 1 is given, no parallel computing code is + used at all, which is useful for debugging. For n_jobs below -1, + (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one + are used. + + block_size : int, default=64 + The maximum number of mebibytes (MiB) of memory per job (see``n_jobs``) + to use at a time for calculating pairwise distances. + + `**kwds` : optional keyword parameters + Any further parameters are passed directly to the distance function. + If using a scipy.spatial.distance metric, the parameters are still + metric dependent. See the scipy docs for usage examples. + + Returns + ------- + D : array [n_samples_a, n_samples_a] or [n_samples_a, n_samples_b] + A distance matrix D such that D_{i, j} is the distance between the + ith and jth vectors of the given matrix X, if Y is None. + If Y is not None, then D_{i, j} is the distance between the ith array + from X and the jth array from Y. + + """ + if (metric not in _VALID_METRICS and + not callable(metric) and metric != "precomputed"): + raise ValueError("Unknown metric %s. " + "Valid metrics are %s, or 'precomputed', or a " + "callable" % (metric, _VALID_METRICS)) + + if metric == "precomputed": + X, _ = check_pairwise_arrays(X, Y, precomputed=True) + return X + elif metric in PAIRWISE_DISTANCE_FUNCTIONS: + func = PAIRWISE_DISTANCE_FUNCTIONS[metric] + elif callable(metric): + func = partial(_pairwise_callable, metric=metric, **kwds) + else: + if issparse(X) or issparse(Y): + raise TypeError("scipy distance metrics do not" + " support sparse matrices.") + + dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None + + X, Y = check_pairwise_arrays(X, Y, dtype=dtype) + + if n_jobs == 1 and X is Y: + return distance.squareform(distance.pdist(X, metric=metric, + **kwds)) + func = partial(distance.cdist, metric=metric, **kwds) + + return _parallel_pairwise(X, Y, func, n_jobs, **kwds) + def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): """ Compute the distance matrix from a vector array X and optional Y. From dfb99fcc190b71e2862f13b4eac4fb6f90ecd78f Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Wed, 7 Dec 2016 12:28:29 +0530 Subject: [PATCH 16/96] ENH: added generator of blocks based on block_size --- sklearn/metrics/pairwise.py | 56 +++++++++++++++---------------------- 1 file changed, 22 insertions(+), 34 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 2273004e932c3..bd201602a4fc6 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1131,6 +1131,7 @@ def _pairwise_callable(X, Y, metric, **kwds): 'sokalsneath', 'sqeuclidean', 'yule', "wminkowski"] DEFAULT_BLOCK_SIZE = 64 +BYTES_PER_FLOAT = 64 def pairwise_distances_blockwise(X, Y=None, metric='euclidean', n_jobs=1, @@ -1143,7 +1144,7 @@ def pairwise_distances_blockwise(X, Y=None, metric='euclidean', n_jobs=1, This is equivalent to calling: - pairwise_distances(X, y, metric) + pairwise_distances(X, y, metric, n_jobs) but uses much less memory. @@ -1188,41 +1189,28 @@ def pairwise_distances_blockwise(X, Y=None, metric='euclidean', n_jobs=1, Returns ------- - D : array [n_samples_a, n_samples_a] or [n_samples_a, n_samples_b] - A distance matrix D such that D_{i, j} is the distance between the - ith and jth vectors of the given matrix X, if Y is None. 
- If Y is not None, then D_{i, j} is the distance between the ith array - from X and the jth array from Y. + D : generator of blocks based on the ``block_size`` parameter. The blocks, + when concatenated, produce a distance matrix D such that D_{i, j} is the + distance between the ith and jth vectors of the given matrix X, if Y is + None. If Y is not None, then D_{i, j} is the distance between the ith + array from X and the jth array from Y. """ - if (metric not in _VALID_METRICS and - not callable(metric) and metric != "precomputed"): - raise ValueError("Unknown metric %s. " - "Valid metrics are %s, or 'precomputed', or a " - "callable" % (metric, _VALID_METRICS)) - - if metric == "precomputed": - X, _ = check_pairwise_arrays(X, Y, precomputed=True) - return X - elif metric in PAIRWISE_DISTANCE_FUNCTIONS: - func = PAIRWISE_DISTANCE_FUNCTIONS[metric] - elif callable(metric): - func = partial(_pairwise_callable, metric=metric, **kwds) - else: - if issparse(X) or issparse(Y): - raise TypeError("scipy distance metrics do not" - " support sparse matrices.") - - dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None - - X, Y = check_pairwise_arrays(X, Y, dtype=dtype) - - if n_jobs == 1 and X is Y: - return distance.squareform(distance.pdist(X, metric=metric, - **kwds)) - func = partial(distance.cdist, metric=metric, **kwds) - - return _parallel_pairwise(X, Y, func, n_jobs, **kwds) + n_samples = X.shape[0] + block_n_rows = block_size * (2 ** 20) // (BYTES_PER_FLOAT * n_samples) + if block_n_rows > n_samples: + block_n_rows = min(block_n_rows, n_samples) + if block_n_rows < 1: + min_block_mib = np.ceil(n_samples * BYTES_PER_FLOAT * 2 ** -20) + raise ValueError('block_size should be at least n_samples * %d bytes ' + '= %.0f MiB, got %r' % (BYTES_PER_FLOAT, + min_block_mib, block_size)) + block_range = np.arange(block_n_rows) + kwds['metric'] = metric + for start in range(0, n_samples, block_n_rows): + # get distances from block to every other sample + stop = min(start + block_n_rows, X.shape[0]) + yield (pairwise_distances, X[start:stop], X, metric, n_jobs, kwds) def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): From 1e687d101ee515b5cb966b04cf915ff0d5536606 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Wed, 7 Dec 2016 12:45:39 +0530 Subject: [PATCH 17/96] FIX: removed errors and extra value for metric --- sklearn/metrics/pairwise.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index bd201602a4fc6..aadc4642b0efb 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1206,11 +1206,10 @@ def pairwise_distances_blockwise(X, Y=None, metric='euclidean', n_jobs=1, '= %.0f MiB, got %r' % (BYTES_PER_FLOAT, min_block_mib, block_size)) block_range = np.arange(block_n_rows) - kwds['metric'] = metric for start in range(0, n_samples, block_n_rows): # get distances from block to every other sample stop = min(start + block_n_rows, X.shape[0]) - yield (pairwise_distances, X[start:stop], X, metric, n_jobs, kwds) + yield pairwise_distances(X[start:stop], X, metric, n_jobs, **kwds) def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): From 172e7f57d9e99ab27eeaac5e6755da1509d9d985 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Wed, 7 Dec 2016 13:01:35 +0530 Subject: [PATCH 18/96] FIX: remove redundant variables --- sklearn/metrics/pairwise.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 
aadc4642b0efb..ca4ace0a3dbe2 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1205,7 +1205,6 @@ def pairwise_distances_blockwise(X, Y=None, metric='euclidean', n_jobs=1, raise ValueError('block_size should be at least n_samples * %d bytes ' '= %.0f MiB, got %r' % (BYTES_PER_FLOAT, min_block_mib, block_size)) - block_range = np.arange(block_n_rows) for start in range(0, n_samples, block_n_rows): # get distances from block to every other sample stop = min(start + block_n_rows, X.shape[0]) From 686d0d227a735f55761fd3481b3486af06586d95 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Thu, 8 Dec 2016 11:02:00 +0530 Subject: [PATCH 19/96] FIX: remove flake8 errors --- sklearn/metrics/pairwise.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index ca4ace0a3dbe2..9e82a99767e3a 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1155,7 +1155,8 @@ def pairwise_distances_blockwise(X, Y=None, metric='euclidean', n_jobs=1, Array of pairwise distances between samples, or a feature array. Y : array [n_samples_b, n_features], optional - An optional second feature array. Only allowed if metric != "precomputed". + An optional second feature array. Only allowed if + metric != "precomputed". metric : string, or callable The metric to use when calculating distance between instances in a @@ -1190,10 +1191,10 @@ def pairwise_distances_blockwise(X, Y=None, metric='euclidean', n_jobs=1, Returns ------- D : generator of blocks based on the ``block_size`` parameter. The blocks, - when concatenated, produce a distance matrix D such that D_{i, j} is the - distance between the ith and jth vectors of the given matrix X, if Y is - None. If Y is not None, then D_{i, j} is the distance between the ith - array from X and the jth array from Y. + when concatenated, produce a distance matrix D such that D_{i, j} is + the distance between the ith and jth vectors of the given matrix X, if + Y is None. If Y is not None, then D_{i, j} is the distance between the + ith array from X and the jth array from Y. """ n_samples = X.shape[0] @@ -1253,7 +1254,8 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): Array of pairwise distances between samples, or a feature array. Y : array [n_samples_b, n_features], optional - An optional second feature array. Only allowed if metric != "precomputed". + An optional second feature array. Only allowed if + metric != "precomputed". 
metric : string, or callable The metric to use when calculating distance between instances in a From c7de82068e5a6e4fa3a98eb9c7f8862f93588aa5 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Thu, 8 Dec 2016 12:00:37 +0530 Subject: [PATCH 20/96] BUG: added fix for Y=None --- sklearn/metrics/pairwise.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 9e82a99767e3a..4b16d2e90c65a 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1206,10 +1206,13 @@ def pairwise_distances_blockwise(X, Y=None, metric='euclidean', n_jobs=1, raise ValueError('block_size should be at least n_samples * %d bytes ' '= %.0f MiB, got %r' % (BYTES_PER_FLOAT, min_block_mib, block_size)) + if Y is None: + Y = X + for start in range(0, n_samples, block_n_rows): # get distances from block to every other sample stop = min(start + block_n_rows, X.shape[0]) - yield pairwise_distances(X[start:stop], X, metric, n_jobs, **kwds) + yield pairwise_distances(X[start:stop], Y, metric, n_jobs, **kwds) def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): From 0fb992fc2fd19ae4e5d4dc84cfa7b2dad057d33a Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Thu, 8 Dec 2016 12:02:12 +0530 Subject: [PATCH 21/96] FIX: remove whitespace --- sklearn/metrics/pairwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 4b16d2e90c65a..6e8866cacc7dd 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1208,7 +1208,7 @@ def pairwise_distances_blockwise(X, Y=None, metric='euclidean', n_jobs=1, min_block_mib, block_size)) if Y is None: Y = X - + for start in range(0, n_samples, block_n_rows): # get distances from block to every other sample stop = min(start + block_n_rows, X.shape[0]) From 9b804911d4b45b210b33be52b95101b439bafefb Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Thu, 8 Dec 2016 12:18:21 +0530 Subject: [PATCH 22/96] FIX: fix typo --- sklearn/metrics/pairwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 6e8866cacc7dd..33a3f19c054d9 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1131,7 +1131,7 @@ def _pairwise_callable(X, Y, metric, **kwds): 'sokalsneath', 'sqeuclidean', 'yule', "wminkowski"] DEFAULT_BLOCK_SIZE = 64 -BYTES_PER_FLOAT = 64 +BYTES_PER_FLOAT = 8 def pairwise_distances_blockwise(X, Y=None, metric='euclidean', n_jobs=1, From 8e900e303ed5fb18bb2b707f53c7a4c82ab245c1 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Thu, 8 Dec 2016 12:38:21 +0530 Subject: [PATCH 23/96] TST: added tests for pairwise_distances_blockwise --- sklearn/metrics/tests/test_pairwise.py | 45 ++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index d8b64b58ca481..382f1cb200ef2 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -10,6 +10,7 @@ from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_raises +from sklearn.utils.testing import assert_raise_message from sklearn.utils.testing import assert_raises_regexp from sklearn.utils.testing import assert_true from sklearn.utils.testing import ignore_warnings @@ -27,6 +28,7 @@ from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.metrics.pairwise import cosine_distances from sklearn.metrics.pairwise import pairwise_distances +from sklearn.metrics.pairwise import pairwise_distances_blockwise from sklearn.metrics.pairwise import pairwise_distances_argmin_min from sklearn.metrics.pairwise import pairwise_distances_argmin from sklearn.metrics.pairwise import pairwise_kernels @@ -370,6 +372,49 @@ def test_pairwise_distances_argmin_min(): np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7) +def check_invalid_block_size_generator(generator): + for i in generator: + return i + + +def test_pairwise_distances_blockwise_invalid_block_size(): + rng = np.random.RandomState(0) + X = rng.random_sample((400, 4)) + y = rng.random_sample((200, 4)) + gen = pairwise_distances_blockwise(X, y, block_size=0, metric='euclidean') + assert_raise_message(ValueError, 'block_size should be at least n_samples ' + '* 8 bytes = 1 MiB, got 0', + check_invalid_block_size_generator, gen) + + +def test_pairwise_distances_blockwise(): + # Test the pairwise_distance helper function. + rng = np.random.RandomState(0) + # Euclidean distance should be equivalent to calling the function. + X = rng.random_sample((400, 4)) + gen = pairwise_distances_blockwise(X, block_size=1, metric="euclidean") + S = np.empty((0, X.shape[0])) + for row in gen: + S = np.vstack((S, row)) + S2 = euclidean_distances(X) + assert_array_almost_equal(S, S2) + # Euclidean distance, with Y != X. + Y = rng.random_sample((200, 4)) + gen = pairwise_distances_blockwise(X, Y, block_size=1, metric="euclidean") + S = np.empty((0, Y.shape[0])) + for row in gen: + S = np.vstack((S, row)) + S2 = euclidean_distances(X, Y) + assert_array_almost_equal(S, S2) + # absurdly large block_size + gen = pairwise_distances_blockwise(X, Y, block_size=10000, + metric='euclidean') + S = np.empty((0, Y.shape[0])) + for row in gen: + S = np.vstack((S, row)) + assert_almost_equal(S, S2) + + def test_euclidean_distances(): # Check the pairwise Euclidean distances computation X = [[0]] From 6be6ea2adbf165a79091ddfb86203d3f69bb9633 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Mon, 12 Dec 2016 11:14:54 +0530 Subject: [PATCH 24/96] FIX: removed errors and modified pairwise_distances_blockwise with tests --- sklearn/metrics/pairwise.py | 93 ++++++++++++++++++++------ sklearn/metrics/tests/test_pairwise.py | 81 ++++++++++++++-------- 2 files changed, 127 insertions(+), 47 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 33a3f19c054d9..6c41abc321783 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -915,7 +915,8 @@ def cosine_similarity(X, Y=None, dense_output=True): else: Y_normalized = normalize(Y, copy=True) - K = safe_sparse_dot(X_normalized, Y_normalized.T, dense_output=dense_output) + K = safe_sparse_dot(X_normalized, Y_normalized.T, + dense_output=dense_output) return K @@ -1134,24 +1135,73 @@ def _pairwise_callable(X, Y, metric, **kwds): BYTES_PER_FLOAT = 8 +def _generate_pairwise_distances_blockwise(X, Y=None, metric='euclidean', + n_jobs=1, + block_size=DEFAULT_BLOCK_SIZE, + block_n_rows=1, + **kwds): + """Generate blocks of the distance matrix from X and optional Y. + + Parameters + ---------- + X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, + [n_samples_a, n_features] otherwise + Array of pairwise distances between samples, or a feature array. + + Y : array [n_samples_b, n_features], optional + An optional second feature array. Only allowed if + metric != "precomputed". 
+ + metric : string, or callable + The metric to use when calculating distance between instances in a + feature array. + + n_jobs : int + The number of jobs to use for the computation. + + block_size : int, default=64 + The maximum number of mebibytes (MiB) of memory per job (see``n_jobs``) + to use at a time for calculating pairwise distances. + + block_n_rows : int + Number of rows to be computed for each block. + + `**kwds` : optional keyword parameters + Any further parameters are passed directly to the distance function. + + Returns + ------- + D : generator of blocks based on the ``block_size`` parameter. + + """ + if metric != 'precomputed' and Y is None: + Y = X + n_samples = X.shape[0] + for start in range(0, n_samples, block_n_rows): + # get distances from block to every other sample + stop = min(start + block_n_rows, X.shape[0]) + yield pairwise_distances(X[start:stop], Y, metric, n_jobs, **kwds) + + def pairwise_distances_blockwise(X, Y=None, metric='euclidean', n_jobs=1, block_size=DEFAULT_BLOCK_SIZE, **kwds): - """ Compute the distance matrix from a vector array X and optional Y. + """Compute the distance matrix from a vector array X and optional Y. - This method takes either a vector array or a distance matrix, and returns - a distance matrix. If the input is a vector array, the distances are - computed. If the input is a distances matrix, it is returned instead. + This method takes either a vector array or a distance matrix, and generates + blocks of a distance matrix. If the input is a vector array, the distances + are computed. If the input is a distances matrix, it is returned in blocks + instead. This is equivalent to calling: pairwise_distances(X, y, metric, n_jobs) - but uses much less memory. + but may use less memory. Parameters ---------- - X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \ - [n_samples_a, n_features] otherwise + X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, + [n_samples_a, n_features] otherwise Array of pairwise distances between samples, or a feature array. Y : array [n_samples_b, n_features], optional @@ -1190,13 +1240,19 @@ def pairwise_distances_blockwise(X, Y=None, metric='euclidean', n_jobs=1, Returns ------- - D : generator of blocks based on the ``block_size`` parameter. The blocks, - when concatenated, produce a distance matrix D such that D_{i, j} is - the distance between the ith and jth vectors of the given matrix X, if - Y is None. If Y is not None, then D_{i, j} is the distance between the - ith array from X and the jth array from Y. + D : generator of blocks based on the ``block_size`` parameter. + The blocks, when concatenated, produce a distance matrix D such that + D_{i, j} is the distance between the ith and jth vectors of the given + matrix X, if Y is None. If Y is not None, then D_{i, j} is the distance + between the ith array from X and the jth array from Y. """ + if (metric not in _VALID_METRICS and + not callable(metric) and metric != "precomputed"): + raise ValueError("Unknown metric %s. 
" + "Valid metrics are %s, or 'precomputed', or a " + "callable" % (metric, _VALID_METRICS)) + n_samples = X.shape[0] block_n_rows = block_size * (2 ** 20) // (BYTES_PER_FLOAT * n_samples) if block_n_rows > n_samples: @@ -1206,13 +1262,12 @@ def pairwise_distances_blockwise(X, Y=None, metric='euclidean', n_jobs=1, raise ValueError('block_size should be at least n_samples * %d bytes ' '= %.0f MiB, got %r' % (BYTES_PER_FLOAT, min_block_mib, block_size)) - if Y is None: - Y = X - for start in range(0, n_samples, block_n_rows): - # get distances from block to every other sample - stop = min(start + block_n_rows, X.shape[0]) - yield pairwise_distances(X[start:stop], Y, metric, n_jobs, **kwds) + return _generate_pairwise_distances_blockwise(X, Y, metric=metric, + n_jobs=n_jobs, + block_size=block_size, + block_n_rows=block_n_rows, + **kwds) def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 382f1cb200ef2..43ba34a20adcb 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -372,19 +372,23 @@ def test_pairwise_distances_argmin_min(): np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7) -def check_invalid_block_size_generator(generator): - for i in generator: - return i - - def test_pairwise_distances_blockwise_invalid_block_size(): - rng = np.random.RandomState(0) - X = rng.random_sample((400, 4)) - y = rng.random_sample((200, 4)) - gen = pairwise_distances_blockwise(X, y, block_size=0, metric='euclidean') + X = np.empty((400, 4)) + y = np.empty((200, 4)) assert_raise_message(ValueError, 'block_size should be at least n_samples ' '* 8 bytes = 1 MiB, got 0', - check_invalid_block_size_generator, gen) + pairwise_distances_blockwise, X, y, block_size=0, + metric='euclidean') + + +def check_pairwise_distances_blockwise(X, Y, block_size, metric, + true_distances): + if Y is None: + Y = X + gen = pairwise_distances_blockwise(X, Y, block_size=block_size, + metric=metric) + blockwise_distances = np.vstack(list(gen)) + assert_array_almost_equal(blockwise_distances, true_distances) def test_pairwise_distances_blockwise(): @@ -392,27 +396,48 @@ def test_pairwise_distances_blockwise(): rng = np.random.RandomState(0) # Euclidean distance should be equivalent to calling the function. X = rng.random_sample((400, 4)) - gen = pairwise_distances_blockwise(X, block_size=1, metric="euclidean") - S = np.empty((0, X.shape[0])) - for row in gen: - S = np.vstack((S, row)) - S2 = euclidean_distances(X) - assert_array_almost_equal(S, S2) + S = euclidean_distances(X) + check_pairwise_distances_blockwise(X, None, block_size=1, + metric='euclidean', true_distances=S) # Euclidean distance, with Y != X. 
Y = rng.random_sample((200, 4)) - gen = pairwise_distances_blockwise(X, Y, block_size=1, metric="euclidean") - S = np.empty((0, Y.shape[0])) - for row in gen: - S = np.vstack((S, row)) - S2 = euclidean_distances(X, Y) - assert_array_almost_equal(S, S2) + S = euclidean_distances(X, Y) + check_pairwise_distances_blockwise(X, Y, block_size=1, + metric='euclidean', true_distances=S) # absurdly large block_size - gen = pairwise_distances_blockwise(X, Y, block_size=10000, - metric='euclidean') - S = np.empty((0, Y.shape[0])) - for row in gen: - S = np.vstack((S, row)) - assert_almost_equal(S, S2) + check_pairwise_distances_blockwise(X, Y, block_size=10000, + metric='euclidean', true_distances=S) + # "cityblock" uses scikit-learn metric, cityblock (function) is + # scipy.spatial. + S = pairwise_distances(X, Y, metric=cityblock) + check_pairwise_distances_blockwise(X, Y, block_size=1, + metric='cityblock', true_distances=S) + # The manhattan metric should be equivalent to cityblock. + check_pairwise_distances_blockwise(X, Y, block_size=1, + metric='manhattan', true_distances=S) + # Test cosine as a string metric versus cosine callable + # The string "cosine" uses sklearn.metric, + # while the function cosine is scipy.spatial + S = pairwise_distances(X, Y, metric=cosine) + check_pairwise_distances_blockwise(X, Y, block_size=1, + metric='cosine', true_distances=S) + # Test with sparse X and Y, + # currently only supported for Euclidean, L1 and cosine. + X_sparse = csr_matrix(X) + Y_sparse = csr_matrix(Y) + S = euclidean_distances(X_sparse, Y_sparse) + check_pairwise_distances_blockwise(X_sparse, Y_sparse, block_size=1, + metric='euclidean', true_distances=S) + S = cosine_distances(X_sparse, Y_sparse) + check_pairwise_distances_blockwise(X_sparse, Y_sparse, block_size=1, + metric='cosine', true_distances=S) + S = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo()) + check_pairwise_distances_blockwise(X_sparse, Y_sparse.tocsc(), + block_size=1, metric='manhattan', + true_distances=S) + # Test that a value error is raised if the metric is unknown + assert_raises(ValueError, pairwise_distances_blockwise, X, Y, + metric="blah") def test_euclidean_distances(): From 6d79bdd0b2920bac6ba62a2388e027b72bf2ac67 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Mon, 12 Dec 2016 11:17:36 +0530 Subject: [PATCH 25/96] FIX: fix typo --- sklearn/metrics/pairwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 6c41abc321783..b09dbd5c466a1 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1140,7 +1140,7 @@ def _generate_pairwise_distances_blockwise(X, Y=None, metric='euclidean', block_size=DEFAULT_BLOCK_SIZE, block_n_rows=1, **kwds): - """Generate blocks of the distance matrix from X and optional Y. + """Generates blocks of the distance matrix from X and optional Y. 
Parameters ---------- From 54ff9f505ed4bde70946ce8818879252d6734563 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Thu, 5 Jan 2017 12:48:40 +0530 Subject: [PATCH 26/96] WIP: support for nearest neighbors --- sklearn/metrics/__init__.py | 2 ++ sklearn/metrics/pairwise.py | 45 ++++++++++++++++++++++++++++++-- sklearn/neighbors/base.py | 52 ++++++++++++++++++++++--------------- 3 files changed, 76 insertions(+), 23 deletions(-) diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index cae8f9b6c7d03..83d67686eef81 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -50,6 +50,7 @@ from .pairwise import pairwise_distances_argmin from .pairwise import pairwise_distances_argmin_min from .pairwise import pairwise_kernels +from .pairwise import pairwise_distances_reduce from .regression import explained_variance_score from .regression import mean_absolute_error @@ -96,6 +97,7 @@ 'mutual_info_score', 'normalized_mutual_info_score', 'pairwise_distances', + 'pairwise_distances_reduce', 'pairwise_distances_argmin', 'pairwise_distances_argmin_min', 'pairwise_distances_argmin_min', diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index b09dbd5c466a1..c5fb16cb01900 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1138,8 +1138,7 @@ def _pairwise_callable(X, Y, metric, **kwds): def _generate_pairwise_distances_blockwise(X, Y=None, metric='euclidean', n_jobs=1, block_size=DEFAULT_BLOCK_SIZE, - block_n_rows=1, - **kwds): + block_n_rows=1, **kwds): """Generates blocks of the distance matrix from X and optional Y. Parameters @@ -1183,6 +1182,48 @@ def _generate_pairwise_distances_blockwise(X, Y=None, metric='euclidean', yield pairwise_distances(X[start:stop], Y, metric, n_jobs, **kwds) +def _generate_pairwise_distances_reduce(X, Y=None, metric='euclidean', + n_jobs=1, reduce_func=None, + block_size=DEFAULT_BLOCK_SIZE, + block_n_rows=1, **kwds): + if metric != 'precomputed' and Y is None: + Y = X + n_samples = X.shape[0] + if reduce_func: + for start in range(0, n_samples, block_n_rows): + # get distances from block to every other sample + stop = min(start + block_n_rows, X.shape[0]) + dist = pairwise_distances(X[start:stop], Y, metric, n_jobs, **kwds) + yield reduce_func(dist, stop-start, *kwds) + + +def pairwise_distances_reduce(X, Y=None, metric='euclidean', n_jobs=1, + reduce_func=None, block_size=DEFAULT_BLOCK_SIZE, + **kwds): + if (metric not in _VALID_METRICS and + not callable(metric) and metric != "precomputed"): + raise ValueError("Unknown metric %s. " + "Valid metrics are %s, or 'precomputed', or a " + "callable" % (metric, _VALID_METRICS)) + + n_samples = X.shape[0] + block_n_rows = block_size * (2 ** 20) // (BYTES_PER_FLOAT * n_samples) + if block_n_rows > n_samples: + block_n_rows = min(block_n_rows, n_samples) + if block_n_rows < 1: + min_block_mib = np.ceil(n_samples * BYTES_PER_FLOAT * 2 ** -20) + raise ValueError('block_size should be at least n_samples * %d bytes ' + '= %.0f MiB, got %r' % (BYTES_PER_FLOAT, + min_block_mib, block_size)) + + return _generate_pairwise_distances_reduce(X, Y, metric=metric, + n_jobs=n_jobs, + reduce_func=reduce_func, + block_size=block_size, + block_n_rows=block_n_rows, + **kwds) + + def pairwise_distances_blockwise(X, Y=None, metric='euclidean', n_jobs=1, block_size=DEFAULT_BLOCK_SIZE, **kwds): """Compute the distance matrix from a vector array X and optional Y. 
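The shape of the API PATCH 26 is reaching for: compute distances one row
block at a time and immediately fold each block through reduce_func, so at
most block_n_rows * n_samples float64 values are alive at once. A minimal
standalone sketch of that pattern, assuming only NumPy and the public
pairwise_distances (names here are illustrative, not the scikit-learn API,
and the per-block callback is simplified to take just the distance block,
the form later patches settle on):

    import numpy as np
    from sklearn.metrics import pairwise_distances

    def reduce_blockwise(X, Y, reduce_func, block_n_rows, metric='euclidean'):
        # Fold each [block_n_rows, len(Y)] slab through reduce_func.
        for start in range(0, X.shape[0], block_n_rows):
            stop = min(start + block_n_rows, X.shape[0])
            dist = pairwise_distances(X[start:stop], Y, metric=metric)
            yield reduce_func(dist)

    rng = np.random.RandomState(0)
    X, Y = rng.rand(400, 4), rng.rand(200, 4)
    # A row-wise argmin, for instance, never materialises the 400 x 200 matrix:
    argmins = np.concatenate(list(
        reduce_blockwise(X, Y, lambda d: d.argmin(axis=1), block_n_rows=64)))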
diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 0cf8bc04ae230..e23ce08702f15 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -15,7 +15,7 @@ from .ball_tree import BallTree from .kd_tree import KDTree from ..base import BaseEstimator -from ..metrics import pairwise_distances +from ..metrics import pairwise_distances, pairwise_distances_reduce from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS from ..utils import check_X_y, check_array, _get_n_jobs, gen_even_slices from ..utils.fixes import argpartition @@ -325,10 +325,8 @@ class from an array representing our data set and ask who's if n_neighbors is None: n_neighbors = self.n_neighbors - if X is not None: - query_is_train = False - X = check_array(X, accept_sparse='csr') - else: + query_is_train = True + X = self._fit_X query_is_train = True X = self._fit_X # Include an extra neighbor to account for the sample itself being @@ -349,26 +347,18 @@ class from an array representing our data set and ask who's if self._fit_method == 'brute': # for efficiency, use squared euclidean distances if self.effective_metric_ == 'euclidean': - dist = pairwise_distances(X, self._fit_X, 'euclidean', - n_jobs=n_jobs, squared=True) + result = pairwise_distances_reduce( + X, self._fit_X, 'euclidean', n_jobs=n_jobs, + reduce_func=self._reduce_func, block_size=1, squared=True, + n_neighbors=n_neighbors, return_distance=return_distance) else: - dist = pairwise_distances( + result = pairwise_distances_reduce( X, self._fit_X, self.effective_metric_, n_jobs=n_jobs, + reduce_func=self._reduce_func, block_size=1, + n_neighbors=n_neighbors, return_distance=return_distance, **self.effective_metric_params_) - neigh_ind = argpartition(dist, n_neighbors - 1, axis=1) - neigh_ind = neigh_ind[:, :n_neighbors] - # argpartition doesn't guarantee sorted order, so we sort again - neigh_ind = neigh_ind[ - sample_range, np.argsort(dist[sample_range, neigh_ind])] - - if return_distance: - if self.effective_metric_ == 'euclidean': - result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind - else: - result = dist[sample_range, neigh_ind], neigh_ind - else: - result = neigh_ind + result = np.vstack(list(result)) elif self._fit_method in ['ball_tree', 'kd_tree']: if issparse(X): @@ -417,6 +407,26 @@ class from an array representing our data set and ask who's return dist, neigh_ind return neigh_ind + def _reduce_func(self, dist, n_samples, **kwds): + sample_range = np.arange(n_samples)[:, None] + n_neighbors = kwds['n_neighbors'] + return_distance = kwds['return_distance'] + neigh_ind = argpartition(dist, n_neighbors - 1, axis=1) + neigh_ind = neigh_ind[:, :n_neighbors] + # argpartition doesn't guarantee sorted order, so we sort again + neigh_ind = neigh_ind[ + sample_range, np.argsort(dist[sample_range, neigh_ind])] + + if return_distance: + if self.effective_metric_ == 'euclidean': + result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind + else: + result = dist[sample_range, neigh_ind], neigh_ind + else: + result = neigh_ind + + return result + def kneighbors_graph(self, X=None, n_neighbors=None, mode='connectivity'): """Computes the (weighted) graph of k-Neighbors for points in X From 8986965884018844da924304050d576d51859b31 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Sat, 7 Jan 2017 07:12:35 +0530 Subject: [PATCH 27/96] ENH: passing arguments to reduce_func via partial --- sklearn/metrics/pairwise.py | 2 +- sklearn/metrics/tests/test_pairwise.py | 2 -- sklearn/neighbors/base.py | 20 +++++++++++--------- 3 files 
changed, 12 insertions(+), 12 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index c5fb16cb01900..590909c9fd4a9 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1194,7 +1194,7 @@ def _generate_pairwise_distances_reduce(X, Y=None, metric='euclidean', # get distances from block to every other sample stop = min(start + block_n_rows, X.shape[0]) dist = pairwise_distances(X[start:stop], Y, metric, n_jobs, **kwds) - yield reduce_func(dist, stop-start, *kwds) + yield reduce_func(dist=dist) def pairwise_distances_reduce(X, Y=None, metric='euclidean', n_jobs=1, diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 43ba34a20adcb..343c415b88dae 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -383,8 +383,6 @@ def test_pairwise_distances_blockwise_invalid_block_size(): def check_pairwise_distances_blockwise(X, Y, block_size, metric, true_distances): - if Y is None: - Y = X gen = pairwise_distances_blockwise(X, Y, block_size=block_size, metric=metric) blockwise_distances = np.vstack(list(gen)) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index e23ce08702f15..a6ab5c7e60637 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -6,6 +6,8 @@ # Multi-output support by Arnaud Joly # # License: BSD 3 clause (C) INRIA, University of Amsterdam +from functools import partial + import warnings from abc import ABCMeta, abstractmethod @@ -346,19 +348,21 @@ class from an array representing our data set and ask who's n_jobs = _get_n_jobs(self.n_jobs) if self._fit_method == 'brute': # for efficiency, use squared euclidean distances + reduce_func = partial(self._reduce_func, n_neighbors=n_neighbors, + return_distance=return_distance) if self.effective_metric_ == 'euclidean': result = pairwise_distances_reduce( X, self._fit_X, 'euclidean', n_jobs=n_jobs, - reduce_func=self._reduce_func, block_size=1, squared=True, - n_neighbors=n_neighbors, return_distance=return_distance) + reduce_func=reduce_func, block_size=1, squared=True) else: result = pairwise_distances_reduce( X, self._fit_X, self.effective_metric_, n_jobs=n_jobs, - reduce_func=self._reduce_func, block_size=1, - n_neighbors=n_neighbors, return_distance=return_distance, + reduce_func=reduce_func, block_size=1, **self.effective_metric_params_) - result = np.vstack(list(result)) + result = np.hstack(list(result)) + if return_distance: + result = result[0], result[1] elif self._fit_method in ['ball_tree', 'kd_tree']: if issparse(X): @@ -407,10 +411,8 @@ class from an array representing our data set and ask who's return dist, neigh_ind return neigh_ind - def _reduce_func(self, dist, n_samples, **kwds): - sample_range = np.arange(n_samples)[:, None] - n_neighbors = kwds['n_neighbors'] - return_distance = kwds['return_distance'] + def _reduce_func(self, n_neighbors, return_distance, dist): + sample_range = np.arange(dist.shape[0])[:, None] neigh_ind = argpartition(dist, n_neighbors - 1, axis=1) neigh_ind = neigh_ind[:, :n_neighbors] # argpartition doesn't guarantee sorted order, so we sort again From a072f118d313ae176d6a622aae89b9cdb437fa55 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Sat, 7 Jan 2017 12:09:39 +0530 Subject: [PATCH 28/96] FIX: remove true_distances as parameter --- sklearn/metrics/tests/test_pairwise.py | 40 +++++--------------------- sklearn/neighbors/base.py | 4 ++- 2 files changed, 10 insertions(+), 34 deletions(-) diff --git 
a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 343c415b88dae..6e1a7e8d6d77a 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -381,12 +381,12 @@ def test_pairwise_distances_blockwise_invalid_block_size(): metric='euclidean') -def check_pairwise_distances_blockwise(X, Y, block_size, metric, - true_distances): +def check_pairwise_distances_blockwise(X, Y, block_size, metric): gen = pairwise_distances_blockwise(X, Y, block_size=block_size, metric=metric) blockwise_distances = np.vstack(list(gen)) - assert_array_almost_equal(blockwise_distances, true_distances) + S = pairwise_distances(X, Y, metric=metric) + assert_array_almost_equal(blockwise_distances, S) def test_pairwise_distances_blockwise(): @@ -394,45 +394,19 @@ def test_pairwise_distances_blockwise(): rng = np.random.RandomState(0) # Euclidean distance should be equivalent to calling the function. X = rng.random_sample((400, 4)) - S = euclidean_distances(X) check_pairwise_distances_blockwise(X, None, block_size=1, - metric='euclidean', true_distances=S) + metric='euclidean') # Euclidean distance, with Y != X. Y = rng.random_sample((200, 4)) - S = euclidean_distances(X, Y) check_pairwise_distances_blockwise(X, Y, block_size=1, - metric='euclidean', true_distances=S) + metric='euclidean') # absurdly large block_size check_pairwise_distances_blockwise(X, Y, block_size=10000, - metric='euclidean', true_distances=S) + metric='euclidean') # "cityblock" uses scikit-learn metric, cityblock (function) is # scipy.spatial. - S = pairwise_distances(X, Y, metric=cityblock) check_pairwise_distances_blockwise(X, Y, block_size=1, - metric='cityblock', true_distances=S) - # The manhattan metric should be equivalent to cityblock. - check_pairwise_distances_blockwise(X, Y, block_size=1, - metric='manhattan', true_distances=S) - # Test cosine as a string metric versus cosine callable - # The string "cosine" uses sklearn.metric, - # while the function cosine is scipy.spatial - S = pairwise_distances(X, Y, metric=cosine) - check_pairwise_distances_blockwise(X, Y, block_size=1, - metric='cosine', true_distances=S) - # Test with sparse X and Y, - # currently only supported for Euclidean, L1 and cosine. 
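    # (The removals here follow from PATCH 28's reworked helper: it now
    # recomputes the expected matrix itself via
    # pairwise_distances(X, Y, metric=metric), so callers no longer pass
    # true_distances; the cosine and sparse cases are dropped as well.)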
- X_sparse = csr_matrix(X) - Y_sparse = csr_matrix(Y) - S = euclidean_distances(X_sparse, Y_sparse) - check_pairwise_distances_blockwise(X_sparse, Y_sparse, block_size=1, - metric='euclidean', true_distances=S) - S = cosine_distances(X_sparse, Y_sparse) - check_pairwise_distances_blockwise(X_sparse, Y_sparse, block_size=1, - metric='cosine', true_distances=S) - S = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo()) - check_pairwise_distances_blockwise(X_sparse, Y_sparse.tocsc(), - block_size=1, metric='manhattan', - true_distances=S) + metric='cityblock') # Test that a value error is raised if the metric is unknown assert_raises(ValueError, pairwise_distances_blockwise, X, Y, metric="blah") diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index a6ab5c7e60637..b2f73763e16a0 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -347,9 +347,11 @@ class from an array representing our data set and ask who's n_jobs = _get_n_jobs(self.n_jobs) if self._fit_method == 'brute': - # for efficiency, use squared euclidean distances + reduce_func = partial(self._reduce_func, n_neighbors=n_neighbors, return_distance=return_distance) + + # for efficiency, use squared euclidean distances if self.effective_metric_ == 'euclidean': result = pairwise_distances_reduce( X, self._fit_X, 'euclidean', n_jobs=n_jobs, From e0fb4c521c88127aa3b9468da2e091efba819054 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Sat, 7 Jan 2017 12:15:16 +0530 Subject: [PATCH 29/96] FIX: revert unintended change --- sklearn/neighbors/base.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index b2f73763e16a0..2bbd780e9074e 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -327,8 +327,10 @@ class from an array representing our data set and ask who's if n_neighbors is None: n_neighbors = self.n_neighbors - query_is_train = True - X = self._fit_X + if X is not None: + query_is_train = False + X = check_array(X, accept_sparse='csr') + else: query_is_train = True X = self._fit_X # Include an extra neighbor to account for the sample itself being From 508afaeeb4357cee1739effff89e3c317d3eab93 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Sat, 7 Jan 2017 12:31:02 +0530 Subject: [PATCH 30/96] FIX: convert float indices to int --- sklearn/neighbors/base.py | 6 ++---- sklearn/neighbors/classification.py | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 2bbd780e9074e..2de5c13aadf0b 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -363,10 +363,10 @@ class from an array representing our data set and ask who's X, self._fit_X, self.effective_metric_, n_jobs=n_jobs, reduce_func=reduce_func, block_size=1, **self.effective_metric_params_) - result = np.hstack(list(result)) if return_distance: - result = result[0], result[1] + dist, neigh_ind = result[0], np.array(result[1], dtype=np.int) + result = dist, neigh_ind elif self._fit_method in ['ball_tree', 'kd_tree']: if issparse(X): @@ -422,7 +422,6 @@ def _reduce_func(self, n_neighbors, return_distance, dist): # argpartition doesn't guarantee sorted order, so we sort again neigh_ind = neigh_ind[ sample_range, np.argsort(dist[sample_range, neigh_ind])] - if return_distance: if self.effective_metric_ == 'euclidean': result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind @@ -430,7 +429,6 @@ def _reduce_func(self, n_neighbors, return_distance, dist): result = 
dist[sample_range, neigh_ind], neigh_ind else: result = neigh_ind - return result def kneighbors_graph(self, X=None, n_neighbors=None, diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index 07c8fa320102f..d9ea8275eae52 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -143,7 +143,7 @@ def predict(self, X): X = check_array(X, accept_sparse='csr') neigh_dist, neigh_ind = self.kneighbors(X) - + print(neigh_ind.dtype) classes_ = self.classes_ _y = self._y if not self.outputs_2d_: From b515105c23bcd7a85d5737fe48c2f140df878a05 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Sat, 7 Jan 2017 13:17:56 +0530 Subject: [PATCH 31/96] FIX: removed debug lines --- sklearn/neighbors/classification.py | 1 - sklearn/neighbors/tests/test_neighbors.py | 1 - 2 files changed, 2 deletions(-) diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index d9ea8275eae52..5573c96d0c4cb 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -143,7 +143,6 @@ def predict(self, X): X = check_array(X, accept_sparse='csr') neigh_dist, neigh_ind = self.kneighbors(X) - print(neigh_ind.dtype) classes_ = self.classes_ _y = self._y if not self.outputs_2d_: diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 0e827d9bb886e..a53b2760fcd79 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -149,7 +149,6 @@ def test_precomputed(random_state=42): neighbors.RadiusNeighborsClassifier, neighbors.KNeighborsRegressor, neighbors.RadiusNeighborsRegressor): - print(Est) est = Est(metric='euclidean') est.radius = est.n_neighbors = 1 pred_X = est.fit(X, target).predict(Y) From f1f7348a98ffedbac18c1d82ea2e4115b0f96629 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Sun, 8 Jan 2017 09:03:35 +0530 Subject: [PATCH 32/96] ENH: added pairwise_distances_reduce for radius_neighbors --- sklearn/neighbors/base.py | 64 ++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 2de5c13aadf0b..dd6de2856426a 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -17,7 +17,7 @@ from .ball_tree import BallTree from .kd_tree import KDTree from ..base import BaseEstimator -from ..metrics import pairwise_distances, pairwise_distances_reduce +from ..metrics import pairwise_distances_reduce from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS from ..utils import check_X_y, check_array, _get_n_jobs, gen_even_slices from ..utils.fixes import argpartition @@ -592,40 +592,25 @@ class from an array representing our data set and ask who's if radius is None: radius = self.radius - n_samples = X.shape[0] if self._fit_method == 'brute': # for efficiency, use squared euclidean distances if self.effective_metric_ == 'euclidean': - dist = pairwise_distances(X, self._fit_X, 'euclidean', - n_jobs=self.n_jobs, squared=True) radius *= radius - else: - dist = pairwise_distances(X, self._fit_X, - self.effective_metric_, - n_jobs=self.n_jobs, - **self.effective_metric_params_) - - neigh_ind_list = [np.where(d <= radius)[0] for d in dist] + reduce_func = partial(self.reduce_func, radius=radius, + return_distance=return_distance) - # See https://github.com/numpy/numpy/issues/5456 - # if you want to understand why this is initialized this way. 
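The comment above points at a real NumPy pitfall: building an array
directly from a list of per-row index arrays can silently produce a 2-d
array whenever all rows happen to have the same length, so the code
pre-allocates an object array and slice-assigns into it instead
(numpy/numpy#5456), a trick the new reduce_func below retains. A minimal
sketch of the trick, assuming only NumPy:

    import numpy as np

    neigh_ind_list = [np.array([0, 2]), np.array([1])]   # ragged rows
    neigh_ind = np.empty(len(neigh_ind_list), dtype='object')
    neigh_ind[:] = neigh_ind_list
    # Always a 1-d object array, one entry per query row:
    # neigh_ind[0] -> array([0, 2]), neigh_ind[1] -> array([1])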
- neigh_ind = np.empty(n_samples, dtype='object') - neigh_ind[:] = neigh_ind_list - - if return_distance: - dist_array = np.empty(n_samples, dtype='object') - if self.effective_metric_ == 'euclidean': - dist_list = [np.sqrt(d[neigh_ind[i]]) - for i, d in enumerate(dist)] - else: - dist_list = [d[neigh_ind[i]] - for i, d in enumerate(dist)] - dist_array[:] = dist_list - - results = dist_array, neigh_ind + results = pairwise_distances_reduce( + X, self._fit_X, 'euclidean', n_jobs=self.n_jobs, + reduce_func=reduce_func, block_size=1, squared=True) else: - results = neigh_ind + reduce_func = partial(self.reduce_func, radius=radius, + return_distance=return_distance) + results = pairwise_distances_reduce( + X, self._fit_X, self.effective_metric_, n_jobs=self.n_jobs, + reduce_func=reduce_func, block_size=1, + **self.effective_metric_params_) + results = np.hstack(list(results)) elif self._fit_method in ['ball_tree', 'kd_tree']: if issparse(X): raise ValueError( @@ -660,6 +645,29 @@ class from an array representing our data set and ask who's return dist, neigh_ind return neigh_ind + def reduce_func(self, radius, return_distance, dist): + neigh_ind_list = [np.where(d <= radius)[0] for d in dist] + + # See https://github.com/numpy/numpy/issues/5456 + # if you want to understand why this is initialized this way. + neigh_ind = np.empty(dist.shape[0], dtype='object') + neigh_ind[:] = neigh_ind_list + + if return_distance: + dist_array = np.empty(dist.shape[0], dtype='object') + if self.effective_metric_ == 'euclidean': + dist_list = [np.sqrt(d[neigh_ind[i]]) + for i, d in enumerate(dist)] + else: + dist_list = [d[neigh_ind[i]] + for i, d in enumerate(dist)] + dist_array[:] = dist_list + + results = dist_array, neigh_ind + else: + results = neigh_ind + return results + def radius_neighbors_graph(self, X=None, radius=None, mode='connectivity'): """Computes the (weighted) graph of Neighbors for points in X From d54def130f6840eb366deea54c930e6ea3198382 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Sat, 4 Feb 2017 10:41:52 +0530 Subject: [PATCH 33/96] FIX: changed order of reduce_func --- sklearn/metrics/pairwise.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 590909c9fd4a9..79e72f89f1035 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1183,7 +1183,7 @@ def _generate_pairwise_distances_blockwise(X, Y=None, metric='euclidean', def _generate_pairwise_distances_reduce(X, Y=None, metric='euclidean', - n_jobs=1, reduce_func=None, + reduce_func=None, n_jobs=1, block_size=DEFAULT_BLOCK_SIZE, block_n_rows=1, **kwds): if metric != 'precomputed' and Y is None: @@ -1197,9 +1197,8 @@ def _generate_pairwise_distances_reduce(X, Y=None, metric='euclidean', yield reduce_func(dist=dist) -def pairwise_distances_reduce(X, Y=None, metric='euclidean', n_jobs=1, - reduce_func=None, block_size=DEFAULT_BLOCK_SIZE, - **kwds): +def pairwise_distances_reduce(X, Y=None, metric='euclidean', reduce_func=None, + n_jobs=1, block_size=DEFAULT_BLOCK_SIZE, **kwds): if (metric not in _VALID_METRICS and not callable(metric) and metric != "precomputed"): raise ValueError("Unknown metric %s. 
" From 3e8adfcbe0d084fb1ad2fce7ffaf155f96f0d382 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Mon, 6 Feb 2017 17:30:21 +0530 Subject: [PATCH 34/96] FIX: get pairwise_distances_reduce to work correctly --- sklearn/metrics/pairwise.py | 38 ++---------------------- sklearn/neighbors/base.py | 58 ++++++++++++++++++++----------------- 2 files changed, 33 insertions(+), 63 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 79e72f89f1035..4384064a94c34 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1182,45 +1182,11 @@ def _generate_pairwise_distances_blockwise(X, Y=None, metric='euclidean', yield pairwise_distances(X[start:stop], Y, metric, n_jobs, **kwds) -def _generate_pairwise_distances_reduce(X, Y=None, metric='euclidean', - reduce_func=None, n_jobs=1, - block_size=DEFAULT_BLOCK_SIZE, - block_n_rows=1, **kwds): - if metric != 'precomputed' and Y is None: - Y = X - n_samples = X.shape[0] - if reduce_func: - for start in range(0, n_samples, block_n_rows): - # get distances from block to every other sample - stop = min(start + block_n_rows, X.shape[0]) - dist = pairwise_distances(X[start:stop], Y, metric, n_jobs, **kwds) - yield reduce_func(dist=dist) - - def pairwise_distances_reduce(X, Y=None, metric='euclidean', reduce_func=None, n_jobs=1, block_size=DEFAULT_BLOCK_SIZE, **kwds): - if (metric not in _VALID_METRICS and - not callable(metric) and metric != "precomputed"): - raise ValueError("Unknown metric %s. " - "Valid metrics are %s, or 'precomputed', or a " - "callable" % (metric, _VALID_METRICS)) - - n_samples = X.shape[0] - block_n_rows = block_size * (2 ** 20) // (BYTES_PER_FLOAT * n_samples) - if block_n_rows > n_samples: - block_n_rows = min(block_n_rows, n_samples) - if block_n_rows < 1: - min_block_mib = np.ceil(n_samples * BYTES_PER_FLOAT * 2 ** -20) - raise ValueError('block_size should be at least n_samples * %d bytes ' - '= %.0f MiB, got %r' % (BYTES_PER_FLOAT, - min_block_mib, block_size)) - return _generate_pairwise_distances_reduce(X, Y, metric=metric, - n_jobs=n_jobs, - reduce_func=reduce_func, - block_size=block_size, - block_n_rows=block_n_rows, - **kwds) + return [reduce_func(dist=D) for D in pairwise_distances_blockwise(X, + Y, metric, n_jobs, block_size, **kwds)] def pairwise_distances_blockwise(X, Y=None, metric='euclidean', n_jobs=1, diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index dd6de2856426a..b37cc70a41a58 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -270,7 +270,6 @@ class KNeighborsMixin(object): def kneighbors(self, X=None, n_neighbors=None, return_distance=True): """Finds the K-neighbors of a point. - Returns indices of and distances to the neighbors of each point. 
Parameters @@ -363,10 +362,13 @@ class from an array representing our data set and ask who's X, self._fit_X, self.effective_metric_, n_jobs=n_jobs, reduce_func=reduce_func, block_size=1, **self.effective_metric_params_) - result = np.hstack(list(result)) + if return_distance: - dist, neigh_ind = result[0], np.array(result[1], dtype=np.int) + result = np.hstack(result) + dist, neigh_ind = result[0], result[1].astype(np.int) result = dist, neigh_ind + else: + result = np.vstack(result) elif self._fit_method in ['ball_tree', 'kd_tree']: if issparse(X): @@ -513,6 +515,29 @@ def kneighbors_graph(self, X=None, n_neighbors=None, class RadiusNeighborsMixin(object): """Mixin for radius-based neighbors searches""" + def reduce_func(self, radius, return_distance, dist): + neigh_ind_list = [np.where(d <= radius)[0] for d in dist] + + # See https://github.com/numpy/numpy/issues/5456 + # if you want to understand why this is initialized this way. + neigh_ind = np.empty(dist.shape[0], dtype='object') + neigh_ind[:] = neigh_ind_list + + if return_distance: + dist_array = np.empty(dist.shape[0], dtype='object') + if self.effective_metric_ == 'euclidean': + dist_list = [np.sqrt(d[neigh_ind[i]]) + for i, d in enumerate(dist)] + else: + dist_list = [d[neigh_ind[i]] + for i, d in enumerate(dist)] + dist_array[:] = dist_list + + results = dist_array, neigh_ind + else: + results = neigh_ind + return results + def radius_neighbors(self, X=None, radius=None, return_distance=True): """Finds the neighbors within a given radius of a point or points. @@ -610,7 +635,9 @@ class from an array representing our data set and ask who's X, self._fit_X, self.effective_metric_, n_jobs=self.n_jobs, reduce_func=reduce_func, block_size=1, **self.effective_metric_params_) - results = np.hstack(list(results)) + + results = np.hstack(results) + elif self._fit_method in ['ball_tree', 'kd_tree']: if issparse(X): raise ValueError( @@ -645,29 +672,6 @@ class from an array representing our data set and ask who's return dist, neigh_ind return neigh_ind - def reduce_func(self, radius, return_distance, dist): - neigh_ind_list = [np.where(d <= radius)[0] for d in dist] - - # See https://github.com/numpy/numpy/issues/5456 - # if you want to understand why this is initialized this way. 
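The reducer PATCH 34 hoists to the top of RadiusNeighborsMixin (added
above; the old copy inside radius_neighbors is being deleted here) keeps,
for each query row, just the indices whose distance falls within radius,
so rows naturally come out with different lengths. Note also the
squared-Euclidean shortcut kept from PATCH 32: radius is squared once up
front so blocks of squared distances can be filtered directly, and np.sqrt
is taken only over the surviving entries. A minimal sketch of the
filtering step, assuming plain NumPy:

    import numpy as np

    def radius_reduce(dist, radius):
        # One variable-length index array per query row.
        return [np.where(row <= radius)[0] for row in dist]

    dist = np.array([[0.0, 0.5, 2.0],
                     [1.5, 0.2, 0.9]])
    radius_reduce(dist, 1.0)   # -> [array([0, 1]), array([1, 2])]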
- neigh_ind = np.empty(dist.shape[0], dtype='object') - neigh_ind[:] = neigh_ind_list - - if return_distance: - dist_array = np.empty(dist.shape[0], dtype='object') - if self.effective_metric_ == 'euclidean': - dist_list = [np.sqrt(d[neigh_ind[i]]) - for i, d in enumerate(dist)] - else: - dist_list = [d[neigh_ind[i]] - for i, d in enumerate(dist)] - dist_array[:] = dist_list - - results = dist_array, neigh_ind - else: - results = neigh_ind - return results - def radius_neighbors_graph(self, X=None, radius=None, mode='connectivity'): """Computes the (weighted) graph of Neighbors for points in X From 4b8c7b2cec2a92f076ed3e853537218d93163273 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Mon, 6 Feb 2017 19:09:51 +0530 Subject: [PATCH 35/96] FIX: remove flake8 errors --- sklearn/metrics/pairwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 4384064a94c34..bcbcdc29df087 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1186,7 +1186,7 @@ def pairwise_distances_reduce(X, Y=None, metric='euclidean', reduce_func=None, n_jobs=1, block_size=DEFAULT_BLOCK_SIZE, **kwds): return [reduce_func(dist=D) for D in pairwise_distances_blockwise(X, - Y, metric, n_jobs, block_size, **kwds)] + Y, metric, n_jobs, block_size, **kwds)] def pairwise_distances_blockwise(X, Y=None, metric='euclidean', n_jobs=1, From 97486f6d286371adca43da080a6fe6bc3254e14c Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Tue, 7 Feb 2017 12:03:14 +0530 Subject: [PATCH 36/96] FIX: rename reduce_func --- sklearn/neighbors/base.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index b37cc70a41a58..d41cd3467ec15 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -349,7 +349,8 @@ class from an array representing our data set and ask who's n_jobs = _get_n_jobs(self.n_jobs) if self._fit_method == 'brute': - reduce_func = partial(self._reduce_func, n_neighbors=n_neighbors, + reduce_func = partial(self._kneighbors_reduce_func, + n_neighbors=n_neighbors, return_distance=return_distance) # for efficiency, use squared euclidean distances @@ -417,7 +418,7 @@ class from an array representing our data set and ask who's return dist, neigh_ind return neigh_ind - def _reduce_func(self, n_neighbors, return_distance, dist): + def _kneighbors_reduce_func(self, dist, n_neighbors, return_distance): sample_range = np.arange(dist.shape[0])[:, None] neigh_ind = argpartition(dist, n_neighbors - 1, axis=1) neigh_ind = neigh_ind[:, :n_neighbors] @@ -515,7 +516,7 @@ def kneighbors_graph(self, X=None, n_neighbors=None, class RadiusNeighborsMixin(object): """Mixin for radius-based neighbors searches""" - def reduce_func(self, radius, return_distance, dist): + def _radius_neighbors_reduce_func(self, dist, radius, return_distance): neigh_ind_list = [np.where(d <= radius)[0] for d in dist] # See https://github.com/numpy/numpy/issues/5456 @@ -621,14 +622,16 @@ class from an array representing our data set and ask who's # for efficiency, use squared euclidean distances if self.effective_metric_ == 'euclidean': radius *= radius - reduce_func = partial(self.reduce_func, radius=radius, + reduce_func = partial(self._radius_neighbors_reduce_func, + radius=radius, return_distance=return_distance) results = pairwise_distances_reduce( X, self._fit_X, 'euclidean', n_jobs=self.n_jobs, reduce_func=reduce_func, block_size=1, squared=True) else: - reduce_func = 
partial(self.reduce_func, radius=radius, + reduce_func = partial(self._radius_neighbors_reduce_func, + radius=radius, return_distance=return_distance) results = pairwise_distances_reduce( From 722a9eb2eb960eda59dea40eb1e079f19e81656c Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Sat, 18 Feb 2017 17:51:41 +0530 Subject: [PATCH 37/96] FIX: return stacked distances from pairwise_distances_reduce --- sklearn/metrics/pairwise.py | 186 +++++++++++++++++++++++++++++++++++- sklearn/neighbors/base.py | 41 ++++---- 2 files changed, 199 insertions(+), 28 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index bcbcdc29df087..16fb21a993011 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -13,6 +13,7 @@ from functools import partial import numpy as np +from scipy import sparse from scipy.spatial import distance from scipy.sparse import csr_matrix from scipy.sparse import issparse @@ -54,6 +55,108 @@ def _return_float_dtype(X, Y): return X, Y, dtype +def flexible_concatenate(it, final_len=None): + """Concatenate the elements of an iterable + Supports generators of arrays, lists, sparse matrices or tuples thereof + >>> import numpy as np + >>> from scipy import sparse + >>> + >>> def make_example(typ): + ... yield typ([1, 2]) + ... yield typ([3]) + ... yield typ([4, 5, 6]) + ... + >>> flexible_concatenate(make_example(list)) + [1, 2, 3, 4, 5, 6] + >>> flexible_concatenate(make_example(np.array)) + array([1, 2, 3, 4, 5, 6]) + >>> flexible_concatenate(zip(make_example(list), make_example(np.array))) + ([1, 2, 3, 4, 5, 6], array([1, 2, 3, 4, 5, 6])) + >>> flexible_concatenate(make_example(np.array)) + array([1, 2, 3, 4, 5, 6]) + >>> flexible_concatenate(make_example(np.array), final_len=6) + array([1, 2, 3, 4, 5, 6]) + >>> flexible_concatenate(make_example( + ... lambda x: np.array(x).reshape(-1, 1))) + ... # doctest: +NORMALIZE_WHITESPACE + array([[1], [2], [3], [4], [5], [6]]) + >>> M = flexible_concatenate(make_example( + ... lambda x: sparse.csr_matrix(np.array(x).reshape(-1, 1)))) + ... # doctest: +NORMALIZE_WHITESPACE + >>> M.format + 'csr' + >>> M.A # doctest: +NORMALIZE_WHITESPACE + array([[1], [2], [3], [4], [5], [6]], dtype=int64) + >>> M = flexible_concatenate(make_example( + ... lambda x: sparse.csc_matrix(np.array(x).reshape(-1, 1)))) + ... 
# doctest: +NORMALIZE_WHITESPACE + >>> M.format + 'csc' + >>> M.A # doctest: +NORMALIZE_WHITESPACE + array([[1], [2], [3], [4], [5], [6]], dtype=int64) + """ + + def make_accumulator(prototype): + if isinstance(prototype, tuple): + return tuple(make_accumulator(y_proto) for y_proto in prototype) + if isinstance(prototype, np.ndarray) and final_len is not None: + return np.empty((final_len,) + prototype.shape[1:], + dtype=prototype.dtype) + else: + return [] + + def accumulate(x, accumulator, prototype): + if isinstance(prototype, tuple): + for y, y_acc, y_prototype in zip(x, accumulator, prototype): + n_rows = accumulate(y, y_acc, y_prototype) + # XXX: could assert all n_rows are identical + return n_rows + elif isinstance(prototype, np.ndarray) and final_len is not None: + accumulator[offset:offset + len(x)] = x + return len(x) + elif isinstance(prototype, list): + accumulator.extend(x) + return len(x) + else: + accumulator.append(x) + if hasattr(x, 'shape'): + return x.shape[0] + return len(x) + + def finalize(accumulator, prototype): + if isinstance(prototype, tuple): + return tuple(finalize(y_acc, y_prototype) + for y_acc, y_prototype in zip(accumulator, prototype)) + elif isinstance(prototype, list): + return accumulator + elif isinstance(prototype, np.ndarray) and final_len is not None: + return accumulator + elif isinstance(prototype, np.ndarray): + return np.concatenate(accumulator, axis=0) + elif sparse.isspmatrix(prototype): + return sparse.vstack(accumulator).asformat(prototype.format) + else: + raise NotImplementedError('No finalizing for accumulation of %s' + % type(prototype)) + + it = iter(it) + try: + first = next(it) + except StopIteration: + raise ValueError('Require at least one output from the iterator') + + accumulator = make_accumulator(first) + offset = 0 + offset = accumulate(first, accumulator, first) + for x in it: + offset += accumulate(x, accumulator, first) + + if final_len is not None: + assert offset == final_len, 'Expected %d, got %d' % (final_len, offset) + + return finalize(accumulator, first) + + def check_pairwise_arrays(X, Y, precomputed=False, dtype=None): """ Set X and Y appropriately and checks inputs @@ -1182,11 +1285,88 @@ def _generate_pairwise_distances_blockwise(X, Y=None, metric='euclidean', yield pairwise_distances(X[start:stop], Y, metric, n_jobs, **kwds) -def pairwise_distances_reduce(X, Y=None, metric='euclidean', reduce_func=None, +def pairwise_distances_reduce(X, Y=None, reduce_func=None, metric='euclidean', n_jobs=1, block_size=DEFAULT_BLOCK_SIZE, **kwds): + """Compute the distance matrix from a vector array X and optional Y. + + This method takes either a vector array or a distance matrix, and a + reducing function to reduce each block of the distance matrix produced, + as per the block_size parameter. If the input is a vector array, the + distances are computed. If the input is a distances matrix, it is reduced + in size and returned instead. + + This is equivalent to calling: + + pairwise_distances(X, y, metric, n_jobs) + + but should use less memory. + + Parameters + ---------- + X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, + [n_samples_a, n_features] otherwise + Array of pairwise distances between samples, or a feature array. + + Y : array [n_samples_b, n_features], optional + An optional second feature array. Only allowed if + metric != "precomputed". + + reduce_func : function, callable + The function which is applied on each block of the distance matrix + reducing its size. 
It reduces the size of each block from + [n_block_samples, n_samples_a] or [n_block_samples, n_samples] to + [n_block_samples, n_reduced] where n_block_samples is the number of + samples in each block and n_reduced depends on the reduce_func defined + by the user. + + metric : string, or callable + The metric to use when calculating distance between instances in a + feature array. If metric is a string, it must be one of the options + allowed by scipy.spatial.distance.pdist for its metric parameter, or + a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS. + If metric is "precomputed", X is assumed to be a distance matrix. + Alternatively, if metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays from X as input and return a value indicating + the distance between them. + + n_jobs : int + The number of jobs to use for the computation. This works by breaking + down the pairwise matrix into n_jobs even slices and computing them in + parallel. + + If -1 all CPUs are used. If 1 is given, no parallel computing code is + used at all, which is useful for debugging. For n_jobs below -1, + (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one + are used. + + block_size : int, default=64 + The maximum number of mebibytes (MiB) of memory per job (see``n_jobs``) + to use at a time for calculating pairwise distances. + + `**kwds` : optional keyword parameters + Any further parameters are passed directly to the distance function. + If using a scipy.spatial.distance metric, the parameters are still + metric dependent. See the scipy docs for usage examples. + + Returns + ------- + D : array [n_samples_a, n_reduced] + A distance matrix D such that D_{i, j} is the distance between the + ith and jth vectors of the given matrix X, if Y is None. + If Y is not None, then D_{i, j} is the distance between the ith array + from X and the jth array from Y. Here n_reduced depends on the + reduce_func. + + """ + + if not reduce_func: + raise ValueError("'reduce_func' needs to be passed as an argument.") - return [reduce_func(dist=D) for D in pairwise_distances_blockwise(X, - Y, metric, n_jobs, block_size, **kwds)] + reduced_distances = [reduce_func(dist=D) for D in + pairwise_distances_blockwise(X, Y, metric, n_jobs, + block_size, **kwds)] + return flexible_concatenate(reduced_distances) def pairwise_distances_blockwise(X, Y=None, metric='euclidean', n_jobs=1, diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index d41cd3467ec15..fa574ab21fe1b 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -268,6 +268,22 @@ def _pairwise(self): class KNeighborsMixin(object): """Mixin for k-neighbors searches""" + def _kneighbors_reduce_func(self, dist, n_neighbors, return_distance): + sample_range = np.arange(dist.shape[0])[:, None] + neigh_ind = argpartition(dist, n_neighbors - 1, axis=1) + neigh_ind = neigh_ind[:, :n_neighbors] + # argpartition doesn't guarantee sorted order, so we sort again + neigh_ind = neigh_ind[ + sample_range, np.argsort(dist[sample_range, neigh_ind])] + if return_distance: + if self.effective_metric_ == 'euclidean': + result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind + else: + result = dist[sample_range, neigh_ind], neigh_ind + else: + result = neigh_ind + return result + def kneighbors(self, X=None, n_neighbors=None, return_distance=True): """Finds the K-neighbors of a point. Returns indices of and distances to the neighbors of each point. 
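_kneighbors_reduce_func, moved ahead of the public kneighbors in PATCH 37,
picks the k nearest per row with argpartition (an O(n) selection) and only
then sorts the k survivors, rather than fully sorting every row. A
standalone sketch of that selection, assuming plain NumPy (the patch itself
goes through the sklearn.utils.fixes.argpartition compatibility wrapper):

    import numpy as np

    def top_k(dist, k):
        rows = np.arange(dist.shape[0])[:, None]
        ind = np.argpartition(dist, k - 1, axis=1)[:, :k]  # unordered top-k
        ind = ind[rows, np.argsort(dist[rows, ind])]       # order the k kept
        return dist[rows, ind], ind

    dist = np.array([[0.0, 3.0, 1.0, 2.0]])
    top_k(dist, 2)   # -> (array([[0., 1.]]), array([[0, 2]]))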
@@ -364,13 +380,6 @@ class from an array representing our data set and ask who's reduce_func=reduce_func, block_size=1, **self.effective_metric_params_) - if return_distance: - result = np.hstack(result) - dist, neigh_ind = result[0], result[1].astype(np.int) - result = dist, neigh_ind - else: - result = np.vstack(result) - elif self._fit_method in ['ball_tree', 'kd_tree']: if issparse(X): raise ValueError( @@ -418,22 +427,6 @@ class from an array representing our data set and ask who's return dist, neigh_ind return neigh_ind - def _kneighbors_reduce_func(self, dist, n_neighbors, return_distance): - sample_range = np.arange(dist.shape[0])[:, None] - neigh_ind = argpartition(dist, n_neighbors - 1, axis=1) - neigh_ind = neigh_ind[:, :n_neighbors] - # argpartition doesn't guarantee sorted order, so we sort again - neigh_ind = neigh_ind[ - sample_range, np.argsort(dist[sample_range, neigh_ind])] - if return_distance: - if self.effective_metric_ == 'euclidean': - result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind - else: - result = dist[sample_range, neigh_ind], neigh_ind - else: - result = neigh_ind - return result - def kneighbors_graph(self, X=None, n_neighbors=None, mode='connectivity'): """Computes the (weighted) graph of k-Neighbors for points in X @@ -639,8 +632,6 @@ class from an array representing our data set and ask who's reduce_func=reduce_func, block_size=1, **self.effective_metric_params_) - results = np.hstack(results) - elif self._fit_method in ['ball_tree', 'kd_tree']: if issparse(X): raise ValueError( From 5ae169b668f5dce8422555df38f404cb1e636f06 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Sat, 18 Feb 2017 18:19:42 +0530 Subject: [PATCH 38/96] TST: added tests for pairwise_distances_reduce --- sklearn/metrics/pairwise.py | 4 ++-- sklearn/metrics/tests/test_pairwise.py | 23 +++++++++++++++++++++++ sklearn/neighbors/base.py | 20 +++++++++++--------- 3 files changed, 36 insertions(+), 11 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 16fb21a993011..9fafa2df75ff1 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1361,9 +1361,9 @@ def pairwise_distances_reduce(X, Y=None, reduce_func=None, metric='euclidean', """ if not reduce_func: - raise ValueError("'reduce_func' needs to be passed as an argument.") + raise ValueError("reduce_func needs to be passed as an argument.") - reduced_distances = [reduce_func(dist=D) for D in + reduced_distances = [reduce_func(D) for D in pairwise_distances_blockwise(X, Y, metric, n_jobs, block_size, **kwds)] return flexible_concatenate(reduced_distances) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 6e1a7e8d6d77a..54b0664b726e1 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -29,6 +29,7 @@ from sklearn.metrics.pairwise import cosine_distances from sklearn.metrics.pairwise import pairwise_distances from sklearn.metrics.pairwise import pairwise_distances_blockwise +from sklearn.metrics.pairwise import pairwise_distances_reduce from sklearn.metrics.pairwise import pairwise_distances_argmin_min from sklearn.metrics.pairwise import pairwise_distances_argmin from sklearn.metrics.pairwise import pairwise_kernels @@ -372,6 +373,28 @@ def test_pairwise_distances_argmin_min(): np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7) +def test_pairwise_distances_reduce_invalid_reduce_func(): + X = np.empty((400, 4)) + y = np.empty((200, 4)) + 
assert_raise_message(ValueError, 'reduce_func needs to be passed as an ' + 'argument', pairwise_distances_reduce, X, y, + block_size=0, metric='euclidean') + + +def _reduce_func(dist): + return dist[:, :100] + + +def test_pairwise_distances_reduce(): + rng = np.random.RandomState(0) + X = rng.random_sample((400, 4)) + # Reduced Euclidean distance + S = pairwise_distances(X)[:, :100] + S2 = pairwise_distances_reduce(X, None, reduce_func=_reduce_func, + block_size=1) + assert_array_almost_equal(S, S2) + + def test_pairwise_distances_blockwise_invalid_block_size(): X = np.empty((400, 4)) y = np.empty((200, 4)) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index fa574ab21fe1b..4549ee95d2968 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -372,12 +372,13 @@ class from an array representing our data set and ask who's # for efficiency, use squared euclidean distances if self.effective_metric_ == 'euclidean': result = pairwise_distances_reduce( - X, self._fit_X, 'euclidean', n_jobs=n_jobs, - reduce_func=reduce_func, block_size=1, squared=True) + X, self._fit_X, reduce_func=reduce_func, + metric='euclidean', n_jobs=n_jobs, + block_size=1, squared=True) else: result = pairwise_distances_reduce( - X, self._fit_X, self.effective_metric_, n_jobs=n_jobs, - reduce_func=reduce_func, block_size=1, + X, self._fit_X, reduce_func=reduce_func, + metric=self.effective_metric_, n_jobs=n_jobs, block_size=1, **self.effective_metric_params_) elif self._fit_method in ['ball_tree', 'kd_tree']: @@ -620,17 +621,18 @@ class from an array representing our data set and ask who's return_distance=return_distance) results = pairwise_distances_reduce( - X, self._fit_X, 'euclidean', n_jobs=self.n_jobs, - reduce_func=reduce_func, block_size=1, squared=True) + X, self._fit_X, reduce_func=reduce_func, + metric='euclidean', n_jobs=self.n_jobs, block_size=1, + squared=True) else: reduce_func = partial(self._radius_neighbors_reduce_func, radius=radius, return_distance=return_distance) results = pairwise_distances_reduce( - X, self._fit_X, self.effective_metric_, n_jobs=self.n_jobs, - reduce_func=reduce_func, block_size=1, - **self.effective_metric_params_) + X, self._fit_X, reduce_func=reduce_func, + metric=self.effective_metric_, n_jobs=self.n_jobs, + block_size=1, **self.effective_metric_params_) elif self._fit_method in ['ball_tree', 'kd_tree']: if issparse(X): From 1cd56a89ac36b0f788ab951f73dd0def14798cf4 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Sun, 19 Feb 2017 10:37:38 +0530 Subject: [PATCH 39/96] FIX: correct doctests --- sklearn/metrics/pairwise.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 9fafa2df75ff1..0192611ad4bb5 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -86,14 +86,24 @@ def flexible_concatenate(it, final_len=None): >>> M.format 'csr' >>> M.A # doctest: +NORMALIZE_WHITESPACE - array([[1], [2], [3], [4], [5], [6]], dtype=int64) + array([[1], + [2], + [3], + [4], + [5], + [6]]) >>> M = flexible_concatenate(make_example( ... lambda x: sparse.csc_matrix(np.array(x).reshape(-1, 1)))) ... 
# doctest: +NORMALIZE_WHITESPACE >>> M.format 'csc' >>> M.A # doctest: +NORMALIZE_WHITESPACE - array([[1], [2], [3], [4], [5], [6]], dtype=int64) + array([[1], + [2], + [3], + [4], + [5], + [6]]) """ def make_accumulator(prototype): From 855ea0a31ee0f5be67fefc90753d0e9edb1bf0db Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Sun, 19 Feb 2017 11:19:08 +0530 Subject: [PATCH 40/96] FIX: remove conflicting doctests for Python2 and Python3 --- sklearn/metrics/pairwise.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 0192611ad4bb5..9b1f2b416a8fa 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -80,30 +80,6 @@ def flexible_concatenate(it, final_len=None): ... lambda x: np.array(x).reshape(-1, 1))) ... # doctest: +NORMALIZE_WHITESPACE array([[1], [2], [3], [4], [5], [6]]) - >>> M = flexible_concatenate(make_example( - ... lambda x: sparse.csr_matrix(np.array(x).reshape(-1, 1)))) - ... # doctest: +NORMALIZE_WHITESPACE - >>> M.format - 'csr' - >>> M.A # doctest: +NORMALIZE_WHITESPACE - array([[1], - [2], - [3], - [4], - [5], - [6]]) - >>> M = flexible_concatenate(make_example( - ... lambda x: sparse.csc_matrix(np.array(x).reshape(-1, 1)))) - ... # doctest: +NORMALIZE_WHITESPACE - >>> M.format - 'csc' - >>> M.A # doctest: +NORMALIZE_WHITESPACE - array([[1], - [2], - [3], - [4], - [5], - [6]]) """ def make_accumulator(prototype): From 6dd1d36f3351730c122dd1890ad2694617ed6416 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Mon, 20 Feb 2017 18:18:39 +0530 Subject: [PATCH 41/96] FEAT: add new file for flexible_vstack --- sklearn/utils/stacking.py | 96 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 sklearn/utils/stacking.py diff --git a/sklearn/utils/stacking.py b/sklearn/utils/stacking.py new file mode 100644 index 0000000000000..67c3f575ffd15 --- /dev/null +++ b/sklearn/utils/stacking.py @@ -0,0 +1,96 @@ +def flexible_vstack(it, final_len=None): + """Helper that concatenates the elements of an iterable along axis=0. + + Supports iterables of arrays, lists, sparse matrices or tuples thereof. + + Parameters + ---------- + it : + + final_len : + + Examples + -------- + >>> import numpy as np + >>> from scipy import sparse + >>> + >>> def make_example(typ): + ... yield typ([1, 2]) + ... yield typ([3]) + ... yield typ([4, 5, 6]) + ... + >>> flexible_concatenate(make_example(list)) + [1, 2, 3, 4, 5, 6] + >>> flexible_concatenate(make_example(np.array)) + array([1, 2, 3, 4, 5, 6]) + >>> flexible_concatenate(zip(make_example(list), make_example(np.array))) + ([1, 2, 3, 4, 5, 6], array([1, 2, 3, 4, 5, 6])) + >>> flexible_concatenate(make_example(np.array)) + array([1, 2, 3, 4, 5, 6]) + >>> flexible_concatenate(make_example(np.array), final_len=6) + array([1, 2, 3, 4, 5, 6]) + >>> flexible_concatenate(make_example( + ... lambda x: np.array(x).reshape(-1, 1))) + ... 
# doctest: +NORMALIZE_WHITESPACE + array([[1], [2], [3], [4], [5], [6]]) + """ + + def make_accumulator(prototype): + if isinstance(prototype, tuple): + return tuple(make_accumulator(y_proto) for y_proto in prototype) + if isinstance(prototype, np.ndarray) and final_len is not None: + return np.empty((final_len,) + prototype.shape[1:], + dtype=prototype.dtype) + else: + return [] + + def accumulate(x, accumulator, prototype): + if isinstance(prototype, tuple): + for y, y_acc, y_prototype in zip(x, accumulator, prototype): + n_rows = accumulate(y, y_acc, y_prototype) + # XXX: could assert all n_rows are identical + return n_rows + elif isinstance(prototype, np.ndarray) and final_len is not None: + accumulator[offset:offset + len(x)] = x + return len(x) + elif isinstance(prototype, list): + accumulator.extend(x) + return len(x) + else: + accumulator.append(x) + if hasattr(x, 'shape'): + return x.shape[0] + return len(x) + + def finalize(accumulator, prototype): + if isinstance(prototype, tuple): + return tuple(finalize(y_acc, y_prototype) + for y_acc, y_prototype in zip(accumulator, prototype)) + elif isinstance(prototype, list): + return accumulator + elif isinstance(prototype, np.ndarray) and final_len is not None: + return accumulator + elif isinstance(prototype, np.ndarray): + return np.concatenate(accumulator, axis=0) + elif sparse.isspmatrix(prototype): + return sparse.vstack(accumulator).asformat(prototype.format) + else: + raise NotImplementedError('No finalizing for accumulation of %s' + % type(prototype)) + + it = iter(it) + try: + first = next(it) + except StopIteration: + raise ValueError('Require at least one output from the iterator') + + accumulator = make_accumulator(first) + offset = 0 + offset = accumulate(first, accumulator, first) + for x in it: + offset += accumulate(x, accumulator, first) + + if final_len is not None: + assert offset == final_len, 'Expected %d, got %d' % (final_len, offset) + +return finalize(accumulator, first) From d3f607d6bc2d9e43a4fb4d07b816578266bfa725 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Mon, 20 Feb 2017 18:27:11 +0530 Subject: [PATCH 42/96] FIX: resolve conflicts on tests --- sklearn/metrics/pairwise.py | 102 +++--------------------------------- sklearn/utils/__init__.py | 4 +- sklearn/utils/stacking.py | 5 +- 3 files changed, 14 insertions(+), 97 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 9b1f2b416a8fa..f220d2b218bfd 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -22,6 +22,7 @@ from ..utils import gen_even_slices from ..utils import gen_batches from ..utils.extmath import row_norms, safe_sparse_dot +from ..utils import flexible_vstack from ..preprocessing import normalize from ..externals.joblib import Parallel from ..externals.joblib import delayed @@ -55,94 +56,6 @@ def _return_float_dtype(X, Y): return X, Y, dtype -def flexible_concatenate(it, final_len=None): - """Concatenate the elements of an iterable - Supports generators of arrays, lists, sparse matrices or tuples thereof - >>> import numpy as np - >>> from scipy import sparse - >>> - >>> def make_example(typ): - ... yield typ([1, 2]) - ... yield typ([3]) - ... yield typ([4, 5, 6]) - ... 
- >>> flexible_concatenate(make_example(list)) - [1, 2, 3, 4, 5, 6] - >>> flexible_concatenate(make_example(np.array)) - array([1, 2, 3, 4, 5, 6]) - >>> flexible_concatenate(zip(make_example(list), make_example(np.array))) - ([1, 2, 3, 4, 5, 6], array([1, 2, 3, 4, 5, 6])) - >>> flexible_concatenate(make_example(np.array)) - array([1, 2, 3, 4, 5, 6]) - >>> flexible_concatenate(make_example(np.array), final_len=6) - array([1, 2, 3, 4, 5, 6]) - >>> flexible_concatenate(make_example( - ... lambda x: np.array(x).reshape(-1, 1))) - ... # doctest: +NORMALIZE_WHITESPACE - array([[1], [2], [3], [4], [5], [6]]) - """ - - def make_accumulator(prototype): - if isinstance(prototype, tuple): - return tuple(make_accumulator(y_proto) for y_proto in prototype) - if isinstance(prototype, np.ndarray) and final_len is not None: - return np.empty((final_len,) + prototype.shape[1:], - dtype=prototype.dtype) - else: - return [] - - def accumulate(x, accumulator, prototype): - if isinstance(prototype, tuple): - for y, y_acc, y_prototype in zip(x, accumulator, prototype): - n_rows = accumulate(y, y_acc, y_prototype) - # XXX: could assert all n_rows are identical - return n_rows - elif isinstance(prototype, np.ndarray) and final_len is not None: - accumulator[offset:offset + len(x)] = x - return len(x) - elif isinstance(prototype, list): - accumulator.extend(x) - return len(x) - else: - accumulator.append(x) - if hasattr(x, 'shape'): - return x.shape[0] - return len(x) - - def finalize(accumulator, prototype): - if isinstance(prototype, tuple): - return tuple(finalize(y_acc, y_prototype) - for y_acc, y_prototype in zip(accumulator, prototype)) - elif isinstance(prototype, list): - return accumulator - elif isinstance(prototype, np.ndarray) and final_len is not None: - return accumulator - elif isinstance(prototype, np.ndarray): - return np.concatenate(accumulator, axis=0) - elif sparse.isspmatrix(prototype): - return sparse.vstack(accumulator).asformat(prototype.format) - else: - raise NotImplementedError('No finalizing for accumulation of %s' - % type(prototype)) - - it = iter(it) - try: - first = next(it) - except StopIteration: - raise ValueError('Require at least one output from the iterator') - - accumulator = make_accumulator(first) - offset = 0 - offset = accumulate(first, accumulator, first) - for x in it: - offset += accumulate(x, accumulator, first) - - if final_len is not None: - assert offset == final_len, 'Expected %d, got %d' % (final_len, offset) - - return finalize(accumulator, first) - - def check_pairwise_arrays(X, Y, precomputed=False, dtype=None): """ Set X and Y appropriately and checks inputs @@ -1346,15 +1259,14 @@ def pairwise_distances_reduce(X, Y=None, reduce_func=None, metric='euclidean', """ - if not reduce_func: + if reduce_func is not None: + reduced_distances = [reduce_func(D) for D in + pairwise_distances_blockwise(X, Y, metric, n_jobs, + block_size, **kwds)] + return flexible_vstack(reduced_distances) + else: raise ValueError("reduce_func needs to be passed as an argument.") - reduced_distances = [reduce_func(D) for D in - pairwise_distances_blockwise(X, Y, metric, n_jobs, - block_size, **kwds)] - return flexible_concatenate(reduced_distances) - - def pairwise_distances_blockwise(X, Y=None, metric='euclidean', n_jobs=1, block_size=DEFAULT_BLOCK_SIZE, **kwds): """Compute the distance matrix from a vector array X and optional Y. 
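
[Editorial sketch] The hunk above makes `pairwise_distances_reduce` delegate blocking to `pairwise_distances_blockwise` and concatenation to `flexible_vstack`. A minimal standalone sketch of that reduce-then-stack pattern, assuming plain NumPy output and a fixed row count in place of the MiB-based `block_size` (the helper names here are illustrative, not the library API):

import numpy as np
from sklearn.metrics import pairwise_distances

def blockwise(X, Y, block_n_rows):
    # Yield the distance matrix one slab of rows at a time, so at most
    # block_n_rows * len(Y) floats are in memory at once.
    for start in range(0, X.shape[0], block_n_rows):
        yield pairwise_distances(X[start:start + block_n_rows], Y)

def reduce_then_stack(X, Y, reduce_func, block_n_rows=100):
    if reduce_func is None:
        raise ValueError("reduce_func needs to be passed as an argument.")
    # Reduce each slab, then concatenate the per-slab results along axis 0.
    return np.vstack([reduce_func(D) for D in blockwise(X, Y, block_n_rows)])

rng = np.random.RandomState(0)
X, Y = rng.random_sample((400, 4)), rng.random_sample((200, 4))
S = reduce_then_stack(X, Y, reduce_func=lambda D: D[:, :100])
assert S.shape == (400, 100)
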
diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index a4e5b6a4f3ea5..c1a804841f2d7 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -17,6 +17,7 @@ from ..externals.joblib import cpu_count from ..exceptions import DataConversionWarning from .deprecation import deprecated +from .stacking import flexible_vstack __all__ = ["murmurhash3_32", "as_float_array", @@ -25,7 +26,8 @@ "compute_class_weight", "compute_sample_weight", "column_or_1d", "safe_indexing", "check_consistent_length", "check_X_y", 'indexable', - "check_symmetric", "indices_to_mask", "deprecated"] + "check_symmetric", "indices_to_mask", "deprecated", + "flexible_vstack"] def safe_mask(X, mask): diff --git a/sklearn/utils/stacking.py b/sklearn/utils/stacking.py index 67c3f575ffd15..ff5d5fe24180d 100644 --- a/sklearn/utils/stacking.py +++ b/sklearn/utils/stacking.py @@ -1,3 +1,6 @@ +import numpy as np +from scipy import sparse + def flexible_vstack(it, final_len=None): """Helper that concatenates the elements of an iterable along axis=0. @@ -93,4 +96,4 @@ def finalize(accumulator, prototype): if final_len is not None: assert offset == final_len, 'Expected %d, got %d' % (final_len, offset) -return finalize(accumulator, first) + return finalize(accumulator, first) From c373ceec03a343fba2e0c33ecdc8b8ff7e84728f Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Mon, 20 Feb 2017 18:32:09 +0530 Subject: [PATCH 43/96] FIX: remove block_size placeholders from neighbors --- sklearn/neighbors/base.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 4549ee95d2968..d82a98c7179cb 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -373,12 +373,11 @@ class from an array representing our data set and ask who's if self.effective_metric_ == 'euclidean': result = pairwise_distances_reduce( X, self._fit_X, reduce_func=reduce_func, - metric='euclidean', n_jobs=n_jobs, - block_size=1, squared=True) + metric='euclidean', n_jobs=n_jobs, squared=True) else: result = pairwise_distances_reduce( X, self._fit_X, reduce_func=reduce_func, - metric=self.effective_metric_, n_jobs=n_jobs, block_size=1, + metric=self.effective_metric_, n_jobs=n_jobs, **self.effective_metric_params_) elif self._fit_method in ['ball_tree', 'kd_tree']: @@ -622,7 +621,7 @@ class from an array representing our data set and ask who's results = pairwise_distances_reduce( X, self._fit_X, reduce_func=reduce_func, - metric='euclidean', n_jobs=self.n_jobs, block_size=1, + metric='euclidean', n_jobs=self.n_jobs, squared=True) else: reduce_func = partial(self._radius_neighbors_reduce_func, @@ -632,7 +631,7 @@ class from an array representing our data set and ask who's results = pairwise_distances_reduce( X, self._fit_X, reduce_func=reduce_func, metric=self.effective_metric_, n_jobs=self.n_jobs, - block_size=1, **self.effective_metric_params_) + **self.effective_metric_params_) elif self._fit_method in ['ball_tree', 'kd_tree']: if issparse(X): From 21ad2b185c54b5ab2c4b77435e337f492f67eaf1 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Mon, 20 Feb 2017 18:43:14 +0530 Subject: [PATCH 44/96] FIX: use generator expressions --- sklearn/metrics/pairwise.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index f220d2b218bfd..623d4d1c91590 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1260,9 +1260,9 @@ def pairwise_distances_reduce(X, Y=None, 
reduce_func=None, metric='euclidean', """ if reduce_func is not None: - reduced_distances = [reduce_func(D) for D in + reduced_distances = (reduce_func(D) for D in \ pairwise_distances_blockwise(X, Y, metric, n_jobs, - block_size, **kwds)] + block_size, **kwds)) return flexible_vstack(reduced_distances) else: raise ValueError("reduce_func needs to be passed as an argument.") From 676c2725d31af71f36ba9ab0068f14e9f441edb0 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Mon, 20 Feb 2017 19:13:08 +0530 Subject: [PATCH 45/96] FIX: replace error on invalid block_size with warning --- sklearn/metrics/pairwise.py | 9 ++++++--- sklearn/metrics/tests/test_pairwise.py | 23 ++++++++++++++--------- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 623d4d1c91590..d8e3eb0456b69 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -11,6 +11,7 @@ import itertools from functools import partial +import warnings import numpy as np from scipy import sparse @@ -1343,9 +1344,11 @@ def pairwise_distances_blockwise(X, Y=None, metric='euclidean', n_jobs=1, block_n_rows = min(block_n_rows, n_samples) if block_n_rows < 1: min_block_mib = np.ceil(n_samples * BYTES_PER_FLOAT * 2 ** -20) - raise ValueError('block_size should be at least n_samples * %d bytes ' - '= %.0f MiB, got %r' % (BYTES_PER_FLOAT, - min_block_mib, block_size)) + warnings.warn('block_size should be at least n_samples * %d bytes ' + '= %.0f MiB, got %r' % (BYTES_PER_FLOAT, + min_block_mib, block_size)) + block_size = min_block_mib + block_n_rows = 1 return _generate_pairwise_distances_blockwise(X, Y, metric=metric, n_jobs=n_jobs, diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 54b0664b726e1..f69b1025eac16 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -14,6 +14,7 @@ from sklearn.utils.testing import assert_raises_regexp from sklearn.utils.testing import assert_true from sklearn.utils.testing import ignore_warnings +from sklearn.utils.testing import assert_warns_message from sklearn.externals.six import iteritems @@ -395,21 +396,25 @@ def test_pairwise_distances_reduce(): assert_array_almost_equal(S, S2) +def check_pairwise_distances_blockwise(X, Y, block_size, metric='euclidean'): + gen = pairwise_distances_blockwise(X, Y, block_size=block_size, + metric=metric) + blockwise_distances = list(gen) + for block in blockwise_distances: + + blockwise_distances = np.vstack(blockwise_distances) + S = pairwise_distances(X, Y, metric=metric) + assert_array_almost_equal(blockwise_distances, S) + + def test_pairwise_distances_blockwise_invalid_block_size(): X = np.empty((400, 4)) y = np.empty((200, 4)) - assert_raise_message(ValueError, 'block_size should be at least n_samples ' + assert_warns_message(UserWarning, 'block_size should be at least n_samples ' '* 8 bytes = 1 MiB, got 0', pairwise_distances_blockwise, X, y, block_size=0, metric='euclidean') - - -def check_pairwise_distances_blockwise(X, Y, block_size, metric): - gen = pairwise_distances_blockwise(X, Y, block_size=block_size, - metric=metric) - blockwise_distances = np.vstack(list(gen)) - S = pairwise_distances(X, Y, metric=metric) - assert_array_almost_equal(blockwise_distances, S) + check_pairwise_distances_blockwise(X, y, block_size=0) def test_pairwise_distances_blockwise(): From a3074bef9a05f004b6c23b9575ab5b396f22dc58 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Mon, 20 Feb 2017 19:25:09 
+0530
Subject: [PATCH 46/96] FIX: replace error on invalid block_size with warning

---
 sklearn/metrics/tests/test_pairwise.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py
index f69b1025eac16..9414f7f0d3737 100644
--- a/sklearn/metrics/tests/test_pairwise.py
+++ b/sklearn/metrics/tests/test_pairwise.py
@@ -401,6 +401,8 @@ def check_pairwise_distances_blockwise(X, Y, block_size, metric='euclidean'):
                                        metric=metric)
     blockwise_distances = list(gen)
     for block in blockwise_distances:
+        memory_used = len(block) * BYTES_PER_FLOAT
+        assert_true(memory_used <= block_size * 2 ** 20)
 
     blockwise_distances = np.vstack(blockwise_distances)
     S = pairwise_distances(X, Y, metric=metric)

From 3756a58e60d2f4b9fb14fedd92e9d13e6e147559 Mon Sep 17 00:00:00 2001
From: Aman Dalmia
Date: Mon, 20 Feb 2017 19:31:27 +0530
Subject: [PATCH 47/96] TST: check each component meets specified memory
 requirement

---
 sklearn/metrics/tests/test_pairwise.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py
index 9414f7f0d3737..90124429e4739 100644
--- a/sklearn/metrics/tests/test_pairwise.py
+++ b/sklearn/metrics/tests/test_pairwise.py
@@ -397,9 +397,14 @@ def test_pairwise_distances_reduce():
 
 
 def check_pairwise_distances_blockwise(X, Y, block_size, metric='euclidean'):
+    from sklearn.metrics.pairwise import BYTES_PER_FLOAT
     gen = pairwise_distances_blockwise(X, Y, block_size=block_size,
                                        metric=metric)
     blockwise_distances = list(gen)
+    min_block_mib = X.shape[0] * BYTES_PER_FLOAT * 2 ** -20
+    if block_size < min_block_mib:
+        block_size = min_block_mib
+
     for block in blockwise_distances:
         memory_used = len(block) * BYTES_PER_FLOAT
         assert_true(memory_used <= block_size * 2 ** 20)

From 2ab293b52c9cc2c57fc8c20a93fc7c71257ababa Mon Sep 17 00:00:00 2001
From: Aman Dalmia
Date: Mon, 20 Feb 2017 19:47:08 +0530
Subject: [PATCH 48/96] FIX: remove PEP8 errors

---
 sklearn/metrics/pairwise.py            | 4 ++--
 sklearn/metrics/tests/test_pairwise.py | 6 +++---
 sklearn/utils/stacking.py              | 1 +
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
index d8e3eb0456b69..30d0e5a1494cb 100644
--- a/sklearn/metrics/pairwise.py
+++ b/sklearn/metrics/pairwise.py
@@ -14,7 +14,6 @@
 import warnings
 
 import numpy as np
-from scipy import sparse
 from scipy.spatial import distance
 from scipy.sparse import csr_matrix
 from scipy.sparse import issparse
@@ -1261,13 +1260,14 @@ def pairwise_distances_reduce(X, Y=None, reduce_func=None, metric='euclidean',
 
     """
     if reduce_func is not None:
-        reduced_distances = (reduce_func(D) for D in \
+        reduced_distances = (reduce_func(D) for D in
                              pairwise_distances_blockwise(X, Y, metric, n_jobs,
                                                           block_size, **kwds))
         return flexible_vstack(reduced_distances)
     else:
         raise ValueError("reduce_func needs to be passed as an argument.")
 
+
 def pairwise_distances_blockwise(X, Y=None, metric='euclidean', n_jobs=1,
                                  block_size=DEFAULT_BLOCK_SIZE, **kwds):
     """Compute the distance matrix from a vector array X and optional Y.
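
[Editorial sketch] Patches 45-47 above clamp an undersized `block_size` to one row per block (with a warning) instead of raising, and assert the per-block memory bound in the tests. A hedged sketch of the MiB-budget-to-rows arithmetic they exercise, standalone and with an illustrative helper name:

import warnings
import numpy as np

BYTES_PER_FLOAT = 8

def block_n_rows_for(block_size, n_samples):
    # Each distance-matrix row holds n_samples float64 values, so a budget
    # of block_size MiB buys this many whole rows per block.
    n_rows = block_size * (2 ** 20) // (BYTES_PER_FLOAT * n_samples)
    n_rows = min(n_rows, n_samples)
    if n_rows < 1:
        min_block_mib = np.ceil(n_samples * BYTES_PER_FLOAT * 2 ** -20)
        warnings.warn('block_size should be at least n_samples * %d bytes '
                      '= %.0f MiB, got %r' % (BYTES_PER_FLOAT,
                                              min_block_mib, block_size))
        n_rows = 1
    return int(n_rows)

assert block_n_rows_for(64, 100000) == 83   # 64 MiB of 800 kB rows
assert block_n_rows_for(0, 400) == 1        # warns, then clamps to one row
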
diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 90124429e4739..27ff8b19a75b1 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -401,7 +401,7 @@ def check_pairwise_distances_blockwise(X, Y, block_size, metric='euclidean'): gen = pairwise_distances_blockwise(X, Y, block_size=block_size, metric=metric) blockwise_distances = list(gen) - min_block_mib = X.shape[0] * BYTES_PER_FLOAT * 2 ** -20 + min_block_mib = X.shape[0] * BYTES_PER_FLOAT * 2 ** -20 if block_size < min_block_mib: block_size = min_block_mib @@ -417,8 +417,8 @@ def check_pairwise_distances_blockwise(X, Y, block_size, metric='euclidean'): def test_pairwise_distances_blockwise_invalid_block_size(): X = np.empty((400, 4)) y = np.empty((200, 4)) - assert_warns_message(UserWarning, 'block_size should be at least n_samples ' - '* 8 bytes = 1 MiB, got 0', + assert_warns_message(UserWarning, 'block_size should be at least ' + 'n_samples * 8 bytes = 1 MiB, got 0', pairwise_distances_blockwise, X, y, block_size=0, metric='euclidean') check_pairwise_distances_blockwise(X, y, block_size=0) diff --git a/sklearn/utils/stacking.py b/sklearn/utils/stacking.py index ff5d5fe24180d..99294dd029159 100644 --- a/sklearn/utils/stacking.py +++ b/sklearn/utils/stacking.py @@ -1,6 +1,7 @@ import numpy as np from scipy import sparse + def flexible_vstack(it, final_len=None): """Helper that concatenates the elements of an iterable along axis=0. From 84d34e31304843df100856d910d4584a90308b75 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Mon, 20 Feb 2017 20:13:58 +0530 Subject: [PATCH 49/96] ENH: move flexible_vstack to __init__ --- sklearn/utils/__init__.py | 110 +++++++++++++++++++++++++++++++++++++- sklearn/utils/stacking.py | 100 ---------------------------------- 2 files changed, 108 insertions(+), 102 deletions(-) delete mode 100644 sklearn/utils/stacking.py diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index c1a804841f2d7..5e0d80af114f4 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -4,6 +4,7 @@ from collections import Sequence import numpy as np +from scipy import sparse from scipy.sparse import issparse import warnings @@ -17,8 +18,6 @@ from ..externals.joblib import cpu_count from ..exceptions import DataConversionWarning from .deprecation import deprecated -from .stacking import flexible_vstack - __all__ = ["murmurhash3_32", "as_float_array", "assert_all_finite", "check_array", @@ -438,3 +437,110 @@ def indices_to_mask(indices, mask_length): mask[indices] = True return mask + + +def flexible_vstack(it, final_len=None): + """Helper that concatenates the elements of an iterable. + + Supports iterables of arrays, lists, sparse matrices or tuples. + + Parameters + ---------- + it : iterable + Iterable whose elements are to be concatenated. + + final_len : int, default = None + If passed, specifies the expected length of 'stacked_results'. + + Returns + ------- + stacked_results : array-like or sparse-matrix or tuple + The result of concatenating the elements of 'it'. + + Examples + -------- + >>> import numpy as np + >>> from scipy import sparse + >>> + >>> def make_example(typ): + ... yield typ([1, 2]) + ... yield typ([3]) + ... yield typ([4, 5, 6]) + ... 
+ >>> flexible_vstack(make_example(list)) + [1, 2, 3, 4, 5, 6] + >>> flexible_vstack(make_example(np.array)) + array([1, 2, 3, 4, 5, 6]) + >>> flexible_vstack(zip(make_example(list), make_example(np.array))) + ([1, 2, 3, 4, 5, 6], array([1, 2, 3, 4, 5, 6])) + >>> flexible_vstack(make_example(np.array)) + array([1, 2, 3, 4, 5, 6]) + >>> flexible_vstack(make_example(np.array), final_len=6) + array([1, 2, 3, 4, 5, 6]) + >>> flexible_vstack(make_example( + ... lambda x: np.array(x).reshape(-1, 1))) + ... # doctest: +NORMALIZE_WHITESPACE + array([[1], [2], [3], [4], [5], [6]]) + """ + + def make_accumulator(prototype): + if isinstance(prototype, tuple): + return tuple(make_accumulator(y_proto) for y_proto in prototype) + if isinstance(prototype, np.ndarray) and final_len is not None: + return np.empty((final_len,) + prototype.shape[1:], + dtype=prototype.dtype) + else: + return [] + + def accumulate(x, accumulator, prototype): + if isinstance(prototype, tuple): + for y, y_acc, y_prototype in zip(x, accumulator, prototype): + n_rows = accumulate(y, y_acc, y_prototype) + # XXX: could assert all n_rows are identical + return n_rows + elif isinstance(prototype, np.ndarray) and final_len is not None: + accumulator[offset:offset + len(x)] = x + return len(x) + elif isinstance(prototype, list): + accumulator.extend(x) + return len(x) + else: + accumulator.append(x) + if hasattr(x, 'shape'): + return x.shape[0] + return len(x) + + def finalize(accumulator, prototype): + if isinstance(prototype, tuple): + return tuple(finalize(y_acc, y_prototype) + for y_acc, y_prototype in zip(accumulator, prototype)) + elif isinstance(prototype, list): + return accumulator + elif isinstance(prototype, np.ndarray) and final_len is not None: + return accumulator + elif isinstance(prototype, np.ndarray): + return np.concatenate(accumulator, axis=0) + elif sparse.isspmatrix(prototype): + return sparse.vstack(accumulator).asformat(prototype.format) + else: + raise NotImplementedError('No finalizing for accumulation of %s' + % type(prototype)) + + it = iter(it) + try: + # prototype + first = next(it) + except StopIteration: + raise ValueError('Require at least one output from the iterator') + + accumulator = make_accumulator(first) + offset = 0 + offset = accumulate(first, accumulator, first) + for x in it: + offset += accumulate(x, accumulator, first) + + if final_len is not None: + assert offset == final_len, 'Expected %d, got %d' % (final_len, offset) + + stacked_results = finalize(accumulator, first) + return stacked_results diff --git a/sklearn/utils/stacking.py b/sklearn/utils/stacking.py deleted file mode 100644 index 99294dd029159..0000000000000 --- a/sklearn/utils/stacking.py +++ /dev/null @@ -1,100 +0,0 @@ -import numpy as np -from scipy import sparse - - -def flexible_vstack(it, final_len=None): - """Helper that concatenates the elements of an iterable along axis=0. - - Supports iterables of arrays, lists, sparse matrices or tuples thereof. - - Parameters - ---------- - it : - - final_len : - - Examples - -------- - >>> import numpy as np - >>> from scipy import sparse - >>> - >>> def make_example(typ): - ... yield typ([1, 2]) - ... yield typ([3]) - ... yield typ([4, 5, 6]) - ... 
- >>> flexible_concatenate(make_example(list)) - [1, 2, 3, 4, 5, 6] - >>> flexible_concatenate(make_example(np.array)) - array([1, 2, 3, 4, 5, 6]) - >>> flexible_concatenate(zip(make_example(list), make_example(np.array))) - ([1, 2, 3, 4, 5, 6], array([1, 2, 3, 4, 5, 6])) - >>> flexible_concatenate(make_example(np.array)) - array([1, 2, 3, 4, 5, 6]) - >>> flexible_concatenate(make_example(np.array), final_len=6) - array([1, 2, 3, 4, 5, 6]) - >>> flexible_concatenate(make_example( - ... lambda x: np.array(x).reshape(-1, 1))) - ... # doctest: +NORMALIZE_WHITESPACE - array([[1], [2], [3], [4], [5], [6]]) - """ - - def make_accumulator(prototype): - if isinstance(prototype, tuple): - return tuple(make_accumulator(y_proto) for y_proto in prototype) - if isinstance(prototype, np.ndarray) and final_len is not None: - return np.empty((final_len,) + prototype.shape[1:], - dtype=prototype.dtype) - else: - return [] - - def accumulate(x, accumulator, prototype): - if isinstance(prototype, tuple): - for y, y_acc, y_prototype in zip(x, accumulator, prototype): - n_rows = accumulate(y, y_acc, y_prototype) - # XXX: could assert all n_rows are identical - return n_rows - elif isinstance(prototype, np.ndarray) and final_len is not None: - accumulator[offset:offset + len(x)] = x - return len(x) - elif isinstance(prototype, list): - accumulator.extend(x) - return len(x) - else: - accumulator.append(x) - if hasattr(x, 'shape'): - return x.shape[0] - return len(x) - - def finalize(accumulator, prototype): - if isinstance(prototype, tuple): - return tuple(finalize(y_acc, y_prototype) - for y_acc, y_prototype in zip(accumulator, prototype)) - elif isinstance(prototype, list): - return accumulator - elif isinstance(prototype, np.ndarray) and final_len is not None: - return accumulator - elif isinstance(prototype, np.ndarray): - return np.concatenate(accumulator, axis=0) - elif sparse.isspmatrix(prototype): - return sparse.vstack(accumulator).asformat(prototype.format) - else: - raise NotImplementedError('No finalizing for accumulation of %s' - % type(prototype)) - - it = iter(it) - try: - first = next(it) - except StopIteration: - raise ValueError('Require at least one output from the iterator') - - accumulator = make_accumulator(first) - offset = 0 - offset = accumulate(first, accumulator, first) - for x in it: - offset += accumulate(x, accumulator, first) - - if final_len is not None: - assert offset == final_len, 'Expected %d, got %d' % (final_len, offset) - - return finalize(accumulator, first) From 6d49c12f7b3124d90687f8b85610950a717bc647 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Mon, 20 Feb 2017 20:36:34 +0530 Subject: [PATCH 50/96] TST: add tests for flexible_vstack --- sklearn/utils/tests/test_utils.py | 48 +++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 13e78d46cb940..13bc084598860 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -18,6 +18,7 @@ from sklearn.utils import safe_indexing from sklearn.utils import shuffle from sklearn.utils import gen_even_slices +from sklearn.utils import flexible_vstack from sklearn.utils.extmath import pinvh from sklearn.utils.arpack import eigsh from sklearn.utils.mocking import MockDataFrame @@ -41,6 +42,48 @@ def test_make_rng(): assert_raises(ValueError, check_random_state, "some invalid seed") +def test_flexible_vstack(): + from scipy import sparse + + def make_example(typ): + yield typ([1, 2]) + 
yield typ([3]) + yield typ([4, 5, 6]) + + results = flexible_vstack(make_example(list)) + expected_results = [1, 2, 3, 4, 5, 6] + assert_equal(results, expected_results) + + results = flexible_vstack(make_example(np.array)) + expected_results = np.array([1, 2, 3, 4, 5, 6]) + assert_array_equal(results, expected_results) + + results = flexible_vstack(zip(make_example(list), make_example(np.array))) + expected_results = ([1, 2, 3, 4, 5, 6], np.array([1, 2, 3, 4, 5, 6])) + assert_array_equal(results, expected_results) + + results = flexible_vstack(make_example(np.array), final_len=6) + expected_results = np.array([1, 2, 3, 4, 5, 6]) + assert_array_equal(results, expected_results) + + results = flexible_vstack( + make_example(lambda x: np.array(x).reshape(-1, 1))) + expected_results = np.array([[1], [2], [3], [4], [5], [6]]) + assert_array_equal(results, expected_results) + + results = flexible_vstack( + make_example(lambda x: sparse.csr_matrix(np.array(x).reshape(-1, 1)))) + expected_results = np.array([[1], [2], [3], [4], [5], [6]], dtype=np.int64) + assert_equal(results.format, 'csr') + assert_array_equal(results.A, expected_results) + + results = flexible_vstack( + make_example(lambda x: sparse.csc_matrix(np.array(x).reshape(-1, 1)))) + expected_results = np.array([[1], [2], [3], [4], [5], [6]], dtype=np.int64) + assert_equal(results.format, 'csc') + assert_array_equal(results.A, expected_results) + + def test_deprecated(): # Test whether the deprecated decorator issues appropriate warnings # Copied almost verbatim from http://docs.python.org/library/warnings.html @@ -143,7 +186,7 @@ def test_arpack_eigsh_initialization(): # Test if eigsh is working correctly # New initialization [-1,1] (as in original ARPACK) # Was [0,1] before, with which this test could fail - v0 = random_state.uniform(-1,1, A.shape[0]) + v0 = random_state.uniform(-1, 1, A.shape[0]) w, _ = eigsh(A, k=k, sigma=0.0, v0=v0) # Eigenvalues of s.p.d. matrix should be nonnegative, w[0] is smallest @@ -258,7 +301,8 @@ def test_shuffle_dont_convert_to_array(): def test_gen_even_slices(): # check that gen_even_slices contains all samples some_range = range(10) - joined_range = list(chain(*[some_range[slice] for slice in gen_even_slices(10, 3)])) + joined_range = list(chain(*[some_range[slice] for slice in + gen_even_slices(10, 3)])) assert_array_equal(some_range, joined_range) # check that passing negative n_chunks raises an error From b3fb795ffd9d7e93d13ea7615ef590493e4f4c1f Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Mon, 20 Feb 2017 20:49:41 +0530 Subject: [PATCH 51/96] DOC: improve docstring for --- sklearn/metrics/pairwise.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 30d0e5a1494cb..874dc712a47ec 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1194,12 +1194,6 @@ def pairwise_distances_reduce(X, Y=None, reduce_func=None, metric='euclidean', distances are computed. If the input is a distances matrix, it is reduced in size and returned instead. - This is equivalent to calling: - - pairwise_distances(X, y, metric, n_jobs) - - but should use less memory. - Parameters ---------- X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, @@ -1212,11 +1206,7 @@ def pairwise_distances_reduce(X, Y=None, reduce_func=None, metric='euclidean', reduce_func : function, callable The function which is applied on each block of the distance matrix - reducing its size. 
It reduces the size of each block from - [n_block_samples, n_samples_a] or [n_block_samples, n_samples] to - [n_block_samples, n_reduced] where n_block_samples is the number of - samples in each block and n_reduced depends on the reduce_func defined - by the user. + reducing its size. metric : string, or callable The metric to use when calculating distance between instances in a @@ -1250,12 +1240,11 @@ def pairwise_distances_reduce(X, Y=None, reduce_func=None, metric='euclidean', Returns ------- - D : array [n_samples_a, n_reduced] + D : array-like or sparse matrix or tuple A distance matrix D such that D_{i, j} is the distance between the ith and jth vectors of the given matrix X, if Y is None. If Y is not None, then D_{i, j} is the distance between the ith array - from X and the jth array from Y. Here n_reduced depends on the - reduce_func. + from X and the jth array from Y. """ From f3d3a1aa5878176e42c59feb0b8d76102ba850f1 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Mon, 20 Feb 2017 23:48:07 +0530 Subject: [PATCH 52/96] FIX: correct X, y for Python3 --- sklearn/metrics/tests/test_pairwise.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 27ff8b19a75b1..a2afb79637ef5 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -415,8 +415,9 @@ def check_pairwise_distances_blockwise(X, Y, block_size, metric='euclidean'): def test_pairwise_distances_blockwise_invalid_block_size(): - X = np.empty((400, 4)) - y = np.empty((200, 4)) + rng = np.random.RandomState(0) + X = rng.random_sample((400, 4)) + y = rng.random_sample((200, 4)) assert_warns_message(UserWarning, 'block_size should be at least ' 'n_samples * 8 bytes = 1 MiB, got 0', pairwise_distances_blockwise, X, y, block_size=0, From f901d7e6785793088b5c1eeca488d2f00bea7998 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Fri, 24 Feb 2017 22:36:58 +0530 Subject: [PATCH 53/96] ENH: rewrote pairwise_distances_argmin_min using pairwise_distances_reduce --- sklearn/metrics/pairwise.py | 91 +++++++++----------------- sklearn/metrics/tests/test_pairwise.py | 8 ++- 2 files changed, 38 insertions(+), 61 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 874dc712a47ec..d10f1e6d1b87a 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -20,7 +20,6 @@ from ..utils import check_array from ..utils import gen_even_slices -from ..utils import gen_batches from ..utils.extmath import row_norms, safe_sparse_dot from ..utils import flexible_vstack from ..preprocessing import normalize @@ -258,8 +257,19 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, return distances if squared else np.sqrt(distances, out=distances) +def _argmin_min_reduce_min(dist): + indices = dist.argmin(axis=1) + values = dist[np.arange(dist.shape[0]), indices] + return indices, values + + +BYTES_PER_FLOAT = 8 +DEFAULT_BLOCK_SIZE = 64 + + def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", - batch_size=500, metric_kwargs=None): + block_size=DEFAULT_BLOCK_SIZE, + metric_kwargs=None, batch_size=None): """Compute minimum distances between one point and a set of points. This function computes for each row in X, the index of the row of Y which @@ -279,12 +289,9 @@ def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", Arrays containing points. 
Respective shapes (n_samples1, n_features) and (n_samples2, n_features) - batch_size : integer - To reduce memory consumption over the naive solution, data are - processed in batches, comprising batch_size rows of X and - batch_size rows of Y. The default value is quite conservative, but - can be changed for fine-tuning. The larger the number, the larger the - memory usage. + block_size : int, default=64 + The maximum number of mebibytes (MiB) of memory per job to use at a + time for calculating pairwise distances. metric : string or callable, default 'euclidean' metric to use for distance computation. Any metric from scikit-learn @@ -332,12 +339,9 @@ def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", sklearn.metrics.pairwise_distances sklearn.metrics.pairwise_distances_argmin """ - dist_func = None - if metric in PAIRWISE_DISTANCE_FUNCTIONS: - dist_func = PAIRWISE_DISTANCE_FUNCTIONS[metric] - elif not callable(metric) and not isinstance(metric, str): - raise ValueError("'metric' must be a string or a callable") - + if batch_size is not None: + warnings.warn("'batch_size' was deprecated in version 0.19 and will " + "be removed in version 0.21.", DeprecationWarning) X, Y = check_pairwise_arrays(X, Y) if metric_kwargs is None: @@ -346,47 +350,18 @@ def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", if axis == 0: X, Y = Y, X - # Allocate output arrays - indices = np.empty(X.shape[0], dtype=np.intp) - values = np.empty(X.shape[0]) - values.fill(np.infty) - - for chunk_x in gen_batches(X.shape[0], batch_size): - X_chunk = X[chunk_x, :] - - for chunk_y in gen_batches(Y.shape[0], batch_size): - Y_chunk = Y[chunk_y, :] - - if dist_func is not None: - if metric == 'euclidean': # special case, for speed - d_chunk = safe_sparse_dot(X_chunk, Y_chunk.T, - dense_output=True) - d_chunk *= -2 - d_chunk += row_norms(X_chunk, squared=True)[:, np.newaxis] - d_chunk += row_norms(Y_chunk, squared=True)[np.newaxis, :] - np.maximum(d_chunk, 0, d_chunk) - else: - d_chunk = dist_func(X_chunk, Y_chunk, **metric_kwargs) - else: - d_chunk = pairwise_distances(X_chunk, Y_chunk, - metric=metric, **metric_kwargs) - - # Update indices and minimum values using chunk - min_indices = d_chunk.argmin(axis=1) - min_values = d_chunk[np.arange(chunk_x.stop - chunk_x.start), - min_indices] - - flags = values[chunk_x] > min_values - indices[chunk_x][flags] = min_indices[flags] + chunk_y.start - values[chunk_x][flags] = min_values[flags] + indices, values = \ + pairwise_distances_reduce(X, Y, reduce_func=_argmin_min_reduce_min, + metric=metric, block_size=block_size, + **metric_kwargs) if metric == "euclidean" and not metric_kwargs.get("squared", False): np.sqrt(values, values) return indices, values -def pairwise_distances_argmin(X, Y, axis=1, metric="euclidean", - batch_size=500, metric_kwargs=None): +def pairwise_distances_argmin(X, Y, axis=1, metric="euclidean", block_size=500, + metric_kwargs=None, batch_size=None): """Compute minimum distances between one point and a set of points. This function computes for each row in X, the index of the row of Y which @@ -410,12 +385,9 @@ def pairwise_distances_argmin(X, Y, axis=1, metric="euclidean", Arrays containing points. Respective shapes (n_samples1, n_features) and (n_samples2, n_features) - batch_size : integer - To reduce memory consumption over the naive solution, data are - processed in batches, comprising batch_size rows of X and - batch_size rows of Y. The default value is quite conservative, but - can be changed for fine-tuning. 
The larger the number, the larger the - memory usage. + block_size : int, default=64 + The maximum number of mebibytes (MiB) of memory per job to use at a + time for calculating pairwise distances. metric : string or callable metric to use for distance computation. Any metric from scikit-learn @@ -459,10 +431,13 @@ def pairwise_distances_argmin(X, Y, axis=1, metric="euclidean", sklearn.metrics.pairwise_distances sklearn.metrics.pairwise_distances_argmin_min """ + if batch_size is not None: + warnings.warn("'batch_size' was deprecated in version 0.19 and will " + "be removed in version 0.21.", DeprecationWarning) if metric_kwargs is None: metric_kwargs = {} - return pairwise_distances_argmin_min(X, Y, axis, metric, batch_size, + return pairwise_distances_argmin_min(X, Y, axis, metric, block_size, metric_kwargs)[0] @@ -1133,9 +1108,6 @@ def _pairwise_callable(X, Y, metric, **kwds): 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule', "wminkowski"] -DEFAULT_BLOCK_SIZE = 64 -BYTES_PER_FLOAT = 8 - def _generate_pairwise_distances_blockwise(X, Y=None, metric='euclidean', n_jobs=1, @@ -1247,7 +1219,6 @@ def pairwise_distances_reduce(X, Y=None, reduce_func=None, metric='euclidean', from X and the jth array from Y. """ - if reduce_func is not None: reduced_distances = (reduce_func(D) for D in pairwise_distances_blockwise(X, Y, metric, n_jobs, diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index a2afb79637ef5..05f2756362dd7 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -369,10 +369,16 @@ def test_pairwise_distances_argmin_min(): dist_orig_val = dist[dist_orig_ind, range(len(dist_orig_ind))] dist_chunked_ind, dist_chunked_val = pairwise_distances_argmin_min( - X, Y, axis=0, metric="manhattan", batch_size=50) + X, Y, axis=0, metric="manhattan", block_size=50) np.testing.assert_almost_equal(dist_orig_ind, dist_chunked_ind, decimal=7) np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7) + # Test batch_size deprecation warning + assert_warns_message(DeprecationWarning, "'batch_size' was deprecated in " + "version 0.19 and will be removed in version 0.21.", + pairwise_distances_argmin_min, X, Y, batch_size=500, + metric='euclidean') + def test_pairwise_distances_reduce_invalid_reduce_func(): X = np.empty((400, 4)) From c48e9a14fa6db3987da33f577dcd0e4e739ba8b4 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 10 Dec 2017 21:22:55 +1100 Subject: [PATCH 54/96] [WIP] ENH Add working_memory global config for chunked operations --- sklearn/__init__.py | 76 +----- sklearn/metrics/__init__.py | 4 +- .../cluster/tests/test_unsupervised.py | 28 -- sklearn/metrics/cluster/unsupervised.py | 100 ++----- sklearn/metrics/pairwise.py | 246 +++++++----------- sklearn/metrics/tests/test_pairwise.py | 70 ++--- sklearn/neighbors/base.py | 16 +- sklearn/utils/__init__.py | 37 +++ 8 files changed, 193 insertions(+), 384 deletions(-) diff --git a/sklearn/__init__.py b/sklearn/__init__.py index 5f2278d1c8c37..42c562584a10d 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -15,84 +15,14 @@ import sys import re import warnings -import os -from contextlib import contextmanager as _contextmanager import logging +from ._config import get_config, set_config, config_context + logger = logging.getLogger(__name__) logger.addHandler(logging.StreamHandler()) logger.setLevel(logging.INFO) -_ASSUME_FINITE = bool(os.environ.get('SKLEARN_ASSUME_FINITE', False)) - - -def 
get_config(): - """Retrieve current values for configuration set by :func:`set_config` - - Returns - ------- - config : dict - Keys are parameter names that can be passed to :func:`set_config`. - """ - return {'assume_finite': _ASSUME_FINITE} - - -def set_config(assume_finite=None): - """Set global scikit-learn configuration - - Parameters - ---------- - assume_finite : bool, optional - If True, validation for finiteness will be skipped, - saving time, but leading to potential crashes. If - False, validation for finiteness will be performed, - avoiding error. - """ - global _ASSUME_FINITE - if assume_finite is not None: - _ASSUME_FINITE = assume_finite - - -@_contextmanager -def config_context(**new_config): - """Context manager for global scikit-learn configuration - - Parameters - ---------- - assume_finite : bool, optional - If True, validation for finiteness will be skipped, - saving time, but leading to potential crashes. If - False, validation for finiteness will be performed, - avoiding error. - - Notes - ----- - All settings, not just those presently modified, will be returned to - their previous values when the context manager is exited. This is not - thread-safe. - - Examples - -------- - >>> import sklearn - >>> from sklearn.utils.validation import assert_all_finite - >>> with sklearn.config_context(assume_finite=True): - ... assert_all_finite([float('nan')]) - >>> with sklearn.config_context(assume_finite=True): - ... with sklearn.config_context(assume_finite=False): - ... assert_all_finite([float('nan')]) - ... # doctest: +ELLIPSIS - Traceback (most recent call last): - ... - ValueError: Input contains NaN, ... - """ - old_config = get_config().copy() - set_config(**new_config) - - try: - yield - finally: - set_config(**old_config) - # Make sure that DeprecationWarning within this package always gets printed warnings.filterwarnings('always', category=DeprecationWarning, @@ -145,7 +75,7 @@ def config_context(**new_config): 'preprocessing', 'random_projection', 'semi_supervised', 'svm', 'tree', 'discriminant_analysis', # Non-modules: - 'clone'] + 'clone', 'get_config', 'set_config', 'config_context'] def setup_module(module): diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index 241fe0a051f2e..c034c3f3f4d9c 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -51,7 +51,7 @@ from .pairwise import pairwise_distances_argmin from .pairwise import pairwise_distances_argmin_min from .pairwise import pairwise_kernels -from .pairwise import pairwise_distances_reduce +from .pairwise import pairwise_distances_chunked from .regression import explained_variance_score from .regression import mean_absolute_error @@ -102,10 +102,10 @@ 'mutual_info_score', 'normalized_mutual_info_score', 'pairwise_distances', - 'pairwise_distances_reduce', 'pairwise_distances_argmin', 'pairwise_distances_argmin_min', 'pairwise_distances_argmin_min', + 'pairwise_distances_chunked', 'pairwise_kernels', 'precision_recall_curve', 'precision_recall_fscore_support', diff --git a/sklearn/metrics/cluster/tests/test_unsupervised.py b/sklearn/metrics/cluster/tests/test_unsupervised.py index ca04c6888f418..9d6e37843d438 100644 --- a/sklearn/metrics/cluster/tests/test_unsupervised.py +++ b/sklearn/metrics/cluster/tests/test_unsupervised.py @@ -35,26 +35,6 @@ def test_silhouette(): score_euclidean = silhouette_score(X, y, metric='euclidean') assert_almost_equal(score_precomputed, score_euclidean) - # test block_size - score_batched = silhouette_score(X, y, block_size=10, - 
metric='euclidean') - assert_almost_equal(score_batched, score_euclidean) - score_batched = silhouette_score(D, y, block_size=10, - metric='precomputed') - assert_almost_equal(score_batched, score_euclidean) - # absurdly large block_size - score_batched = silhouette_score(D, y, block_size=10000, - metric='precomputed') - assert_almost_equal(score_batched, score_euclidean) - - # smoke test n_jobs with and without explicit block_size - score_parallel = silhouette_score(X, y, - n_jobs=2, metric='euclidean') - assert_almost_equal(score_parallel, score_euclidean) - score_parallel = silhouette_score(X, y, block_size=10, - n_jobs=2, metric='euclidean') - assert_almost_equal(score_parallel, score_euclidean) - if X is X_dense: score_dense_without_sampling = score_precomputed else: @@ -101,14 +81,6 @@ def test_cluster_size_1(): assert_array_equal(ss, [0, .5, .5, 0, 1, 1]) -def test_silhouette_invalid_block_size(): - X = [[0], [0], [1]] - y = [1, 1, 2] - assert_raise_message(ValueError, 'block_size should be at least n_samples ' - '* 8 bytes = 1 MiB, got 0', - silhouette_score, X, y, block_size=0) - - def test_no_nan(): # Assert Silhouette Coefficient != nan when there is 1 sample in a class. # This tests for the condition that caused issue #960. diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py index 70530b18ac121..7f00470540a82 100644 --- a/sklearn/metrics/cluster/unsupervised.py +++ b/sklearn/metrics/cluster/unsupervised.py @@ -7,13 +7,14 @@ from __future__ import division +import functools + import numpy as np from ...utils import check_random_state from ...utils import check_X_y -from ...utils import _get_n_jobs -from ...externals.joblib import Parallel, delayed -from ..pairwise import pairwise_distances +from ...utils import get_block_n_rows +from ..pairwise import pairwise_distances_chunked from ...preprocessing import LabelEncoder @@ -23,12 +24,7 @@ def check_number_of_labels(n_labels, n_samples): "to n_samples - 1 (inclusive)" % n_labels) -DEFAULT_BLOCK_SIZE = 64 -BYTES_PER_FLOAT = 8 - - def silhouette_score(X, labels, metric='euclidean', sample_size=None, - block_size=DEFAULT_BLOCK_SIZE, n_jobs=1, random_state=None, **kwds): """Compute the mean Silhouette Coefficient of all samples. @@ -65,18 +61,6 @@ def silhouette_score(X, labels, metric='euclidean', sample_size=None, `. If X is the distance array itself, use ``metric="precomputed"``. - block_size : int, optional, default=64 - The maximum number of mebibytes (MiB) of memory per job (see - ``n_jobs``) to use at a time for calculating pairwise distances. - - .. versionadded:: 0.18 - - n_jobs : int, optional (default = 1) - The number of parallel jobs to run. - If ``-1``, then the number of jobs is set to the number of CPU cores. - - .. versionadded:: 0.18 - sample_size : int or None The size of the sample to use when computing the Silhouette Coefficient on a random subset of the data. 
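
[Editorial sketch] The hunks that follow rewrite the per-block worker as a chunk reducer: the chunked generator only ever calls `reduce_func(D_chunk, start)`, so extra state such as `labels` must be bound in advance with `functools.partial`, as is done for `_silhouette_reduce` below. A toy illustration of that binding pattern, with a hypothetical reducer rather than the one in the diff:

import functools
import numpy as np

def nearest_same_label(D_chunk, start, labels):
    # For each row of the chunk, distance to the closest other point that
    # shares its label (a hypothetical reducer, for illustration only).
    out = np.empty(len(D_chunk))
    for i, row in enumerate(D_chunk):
        mask = labels == labels[start + i]
        mask[start + i] = False          # exclude the point itself
        out[i] = row[mask].min() if mask.any() else np.nan
    return out

labels = np.array([0, 0, 1, 1])
reduce_func = functools.partial(nearest_same_label, labels=labels)
D_chunk = np.array([[0., 1., 5., 6.],
                    [1., 0., 4., 7.]])
print(reduce_func(D_chunk, 0))           # [1. 1.]
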
@@ -119,50 +103,33 @@ def silhouette_score(X, labels, metric='euclidean', sample_size=None, X, labels = X[indices].T[indices].T, labels[indices] else: X, labels = X[indices], labels[indices] - return np.mean(silhouette_samples(X, labels, metric=metric, - block_size=block_size, n_jobs=n_jobs, - **kwds)) + return np.mean(silhouette_samples(X, labels, metric=metric, **kwds)) -def _silhouette_block(X, labels, label_freqs, start, block_n_rows, - block_range, add_at, dist_kwds): +def _silhouette_reduce(D_chunk, start, labels, label_freqs, add_at): """Accumulate silhouette statistics for X[start:start+block_n_rows] Parameters ---------- - X : shape (n_samples, n_features) or precomputed (n_samples, n_samples) - data + D_chunk : shape (n_chunk_samples, n_samples) + precomputed distances for a chunk + start : int + first index in block labels : array, shape (n_samples,) corresponding cluster labels, encoded as {0, ..., n_clusters-1} label_freqs : array distribution of cluster labels in ``labels`` - start : int - first index in block - block_n_rows : int - length of block - block_range : array - precomputed range ``0..(block_n_rows-1)`` add_at : array, shape (block_n_rows * n_clusters,) indices into a flattened array of shape (block_n_rows, n_clusters) where distances from block points to each cluster are accumulated - dist_kwds : dict - kwargs for ``pairwise_distances`` """ - # get distances from block to every other sample - stop = min(start + block_n_rows, X.shape[0]) - if stop - start == X.shape[0]: - # allow pairwise_distances to use fast paths - block_dists = pairwise_distances(X, **dist_kwds) - else: - block_dists = pairwise_distances(X[start:stop], X, **dist_kwds) - # accumulate distances from each sample to each cluster - clust_dists = np.bincount(add_at[:block_dists.size], - block_dists.ravel()) - clust_dists = clust_dists.reshape((stop - start, len(label_freqs))) + clust_dists = np.bincount(add_at[:D_chunk.size], + D_chunk.ravel()) + clust_dists = clust_dists.reshape(-1, len(label_freqs)) # intra_index selects intra-cluster distances within clust_dists - intra_index = (block_range[:len(clust_dists)], labels[start:stop]) + intra_index = (np.arange(len(D_chunk)), labels[start:start + len(D_chunk)]) # intra_clust_dists are averaged over cluster size outside this function intra_clust_dists = clust_dists[intra_index] # of the remaining distances we normalise and extract the minimum @@ -172,8 +139,7 @@ def _silhouette_block(X, labels, label_freqs, start, block_n_rows, return intra_clust_dists, inter_clust_dists -def silhouette_samples(X, labels, metric='euclidean', - block_size=DEFAULT_BLOCK_SIZE, n_jobs=1, **kwds): +def silhouette_samples(X, labels, metric='euclidean', **kwds): """Compute the Silhouette Coefficient for each sample. The Silhouette Coefficient is a measure of how well samples are clustered @@ -211,18 +177,6 @@ def silhouette_samples(X, labels, metric='euclidean', allowed by :func:`sklearn.metrics.pairwise.pairwise_distances`. If X is the distance array itself, use "precomputed" as the metric. - block_size : int, optional, default=64 - The maximum number of mebibytes (MiB) of memory per job (see - ``n_jobs``) to use at a time for calculating pairwise distances. - - .. versionadded:: 0.20 - - n_jobs : int, optional (default = 1) - The number of parallel jobs to run. - If ``-1``, then the number of jobs is set to the number of CPU cores. - - .. versionadded:: 0.20 - `**kwds` : optional keyword parameters Any further parameters are passed directly to the distance function. 
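
[Editorial sketch] `_silhouette_reduce` above condenses each distance chunk into per-cluster sums with a single vectorised `np.bincount` call over precomputed flat indices (the `add_at` array built in the next hunk). The same trick in isolation, at toy sizes and checked against a naive loop:

import numpy as np

rng = np.random.RandomState(0)
n_samples, n_clusters, chunk_n_rows = 6, 2, 3
labels = rng.randint(n_clusters, size=n_samples)
D_chunk = rng.random_sample((chunk_n_rows, n_samples))

# Flat index mapping chunk element (i, j) to accumulator bin (i, labels[j]).
add_at = np.ravel_multi_index((np.repeat(np.arange(chunk_n_rows), n_samples),
                               np.tile(labels, chunk_n_rows)),
                              dims=(chunk_n_rows, n_clusters))
clust_dists = np.bincount(add_at, D_chunk.ravel(),
                          minlength=chunk_n_rows * n_clusters)
clust_dists = clust_dists.reshape(chunk_n_rows, n_clusters)

# The same per-cluster sums computed naively, for verification.
expected = np.zeros((chunk_n_rows, n_clusters))
for k in range(n_clusters):
    expected[:, k] = D_chunk[:, labels == k].sum(axis=1)
assert np.allclose(clust_dists, expected)
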
If using a ``scipy.spatial.distance`` metric, the parameters are still @@ -252,16 +206,8 @@ def silhouette_samples(X, labels, metric='euclidean', label_freqs = np.bincount(labels) check_number_of_labels(len(le.classes_), n_samples) - n_jobs = _get_n_jobs(n_jobs) - block_n_rows = block_size * (2 ** 20) // (BYTES_PER_FLOAT * n_samples) - if block_n_rows > n_samples: - block_n_rows = min(block_n_rows, n_samples) - if block_n_rows < 1: - min_block_mib = np.ceil(n_samples * BYTES_PER_FLOAT * 2 ** -20) - raise ValueError('block_size should be at least n_samples * %d bytes ' - '= %.0f MiB, got %r' % (BYTES_PER_FLOAT, - min_block_mib, block_size)) - + block_n_rows = get_block_n_rows(row_bytes=n_samples * 8, + max_n_rows=n_samples) intra_clust_dists = [] inter_clust_dists = [] @@ -273,15 +219,15 @@ def silhouette_samples(X, labels, metric='euclidean', add_at = np.ravel_multi_index((np.repeat(block_range, n_samples), np.tile(labels, block_n_rows)), dims=(block_n_rows, len(label_freqs))) - parallel = Parallel(n_jobs=n_jobs, backend='threading') kwds['metric'] = metric - results = parallel(delayed(_silhouette_block)(X, labels, label_freqs, - start, block_n_rows, - block_range, add_at, kwds) - for start in range(0, n_samples, block_n_rows)) + reduce_func = functools.partial(_silhouette_reduce, + labels=labels, label_freqs=label_freqs, + add_at=add_at) + results = pairwise_distances_chunked(X, reduce_func=reduce_func, **kwds) + print(len(results)) + intra_clust_dists, inter_clust_dists = results - intra_clust_dists, inter_clust_dists = zip(*results) if len(intra_clust_dists) == 1: intra_clust_dists = intra_clust_dists[0] inter_clust_dists = inter_clust_dists[0] diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 24e7447ca3fc2..3fc91e4a30b52 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -20,8 +20,10 @@ from ..utils import check_array from ..utils import gen_even_slices +from ..utils import get_block_n_rows from ..utils.extmath import row_norms, safe_sparse_dot from ..utils import flexible_vstack +from ..utils.validation import _num_samples from ..preprocessing import normalize from ..externals.joblib import Parallel from ..externals.joblib import delayed @@ -257,18 +259,16 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, return distances if squared else np.sqrt(distances, out=distances) -def _argmin_min_reduce_min(dist): +def _argmin_min_reduce_min(dist, start): indices = dist.argmin(axis=1) values = dist[np.arange(dist.shape[0]), indices] return indices, values BYTES_PER_FLOAT = 8 -DEFAULT_BLOCK_SIZE = 64 def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", - block_size=DEFAULT_BLOCK_SIZE, metric_kwargs=None, batch_size=None): """Compute minimum distances between one point and a set of points. @@ -320,19 +320,14 @@ def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", See the documentation for scipy.spatial.distance for details on these metrics. - block_size : int, default=64 - The maximum number of mebibytes (MiB) of memory per job to use at a - time for calculating pairwise distances. - batch_size : integer - To reduce memory consumption over the naive solution, data are - processed in batches, comprising batch_size rows of X and - batch_size rows of Y. The default value is quite conservative, but - can be changed for fine-tuning. The larger the number, the larger the - memory usage. - metric_kwargs : dict, optional Keyword arguments to pass to specified metric function. 
+ batch_size : integer + .. deprecated:: 0.20 + Deprecated for removal in 0.22. + Use sklearn.set_config(working_memory=...) instead. + Returns ------- argmin : numpy.ndarray @@ -348,8 +343,10 @@ def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", sklearn.metrics.pairwise_distances_argmin """ if batch_size is not None: - warnings.warn("'batch_size' was deprecated in version 0.19 and will " - "be removed in version 0.21.", DeprecationWarning) + warnings.warn("'batch_size' was deprecated in version 0.20 and will " + "be removed in version 0.22. " + "Use sklearn.set_config(working_memory=...) instead.", + DeprecationWarning) X, Y = check_pairwise_arrays(X, Y) if metric_kwargs is None: @@ -358,17 +355,16 @@ def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", if axis == 0: X, Y = Y, X - indices, values = \ - pairwise_distances_reduce(X, Y, reduce_func=_argmin_min_reduce_min, - metric=metric, block_size=block_size, - **metric_kwargs) + indices, values = pairwise_distances_chunked( + X, Y, reduce_func=_argmin_min_reduce_min, metric=metric, + **metric_kwargs) if metric == "euclidean" and not metric_kwargs.get("squared", False): np.sqrt(values, values) return indices, values -def pairwise_distances_argmin(X, Y, axis=1, metric="euclidean", block_size=500, +def pairwise_distances_argmin(X, Y, axis=1, metric="euclidean", metric_kwargs=None, batch_size=None): """Compute minimum distances between one point and a set of points. @@ -422,12 +418,8 @@ def pairwise_distances_argmin(X, Y, axis=1, metric="euclidean", block_size=500, See the documentation for scipy.spatial.distance for details on these metrics. - block_size : int, default=64 - The maximum number of mebibytes (MiB) of memory per job to use at a - time for calculating pairwise distances. - batch_size : integer - Deprecated. Use block_size instead. + Deprecated. Use sklearn.set_config(working_memory=...) instead. metric_kwargs : dict keyword arguments to pass to specified metric function. @@ -443,13 +435,12 @@ def pairwise_distances_argmin(X, Y, axis=1, metric="euclidean", block_size=500, sklearn.metrics.pairwise_distances_argmin_min """ if batch_size is not None: - warnings.warn("'batch_size' was deprecated in version 0.19 and will " - "be removed in version 0.21.", DeprecationWarning) + warnings.warn("'batch_size' was deprecated in version 0.20 and will " + "be removed in version 0.22.", DeprecationWarning) if metric_kwargs is None: metric_kwargs = {} - return pairwise_distances_argmin_min(X, Y, axis, metric, block_size, - metric_kwargs)[0] + return pairwise_distances_argmin_min(X, Y, axis, metric, metric_kwargs)[0] def manhattan_distances(X, Y=None, sum_over_features=True, @@ -1124,10 +1115,8 @@ def _pairwise_callable(X, Y, metric, **kwds): 'sokalsneath', 'sqeuclidean', 'yule', "wminkowski"] -def _generate_pairwise_distances_blockwise(X, Y=None, metric='euclidean', - n_jobs=1, - block_size=DEFAULT_BLOCK_SIZE, - block_n_rows=1, **kwds): +def _generate_pairwise_distances_chunked(X, Y, metric, reduce_func, + n_jobs, block_n_rows, **kwds): """Generates blocks of the distance matrix from X and optional Y. Parameters @@ -1140,6 +1129,15 @@ def _generate_pairwise_distances_blockwise(X, Y=None, metric='euclidean', An optional second feature array. Only allowed if metric != "precomputed". + reduce_func : callable, optional + The function which is applied on each chunk of the distance matrix, + reducing it to needed values. 
``reduce_func`` receives the distances + chunk, an array or sparse matrix of shape + ``(X_chunk_n_samples, Y_n_samples)``, + and also the index of the first row in X. It should return an array, + a list, or a sparse matrix of length ``X_chunk_n_samples``, or a tuple + of such objects. + metric : string, or callable The metric to use when calculating distance between instances in a feature array. @@ -1147,10 +1145,6 @@ def _generate_pairwise_distances_blockwise(X, Y=None, metric='euclidean', n_jobs : int The number of jobs to use for the computation. - block_size : int, default=64 - The maximum number of mebibytes (MiB) of memory per job (see``n_jobs``) - to use at a time for calculating pairwise distances. - block_n_rows : int Number of rows to be computed for each block. @@ -1159,8 +1153,8 @@ def _generate_pairwise_distances_blockwise(X, Y=None, metric='euclidean', Returns ------- - D : generator of blocks based on the ``block_size`` parameter. - + D : generator + Yields (distance_matrix_chunk, start_idx) matrices. """ if metric != 'precomputed' and Y is None: Y = X @@ -1168,18 +1162,40 @@ def _generate_pairwise_distances_blockwise(X, Y=None, metric='euclidean', for start in range(0, n_samples, block_n_rows): # get distances from block to every other sample stop = min(start + block_n_rows, X.shape[0]) - yield pairwise_distances(X[start:stop], Y, metric, n_jobs, **kwds) - - -def pairwise_distances_reduce(X, Y=None, reduce_func=None, metric='euclidean', - n_jobs=1, block_size=DEFAULT_BLOCK_SIZE, **kwds): - """Compute the distance matrix from a vector array X and optional Y. - - This method takes either a vector array or a distance matrix, and a - reducing function to reduce each block of the distance matrix produced, - as per the block_size parameter. If the input is a vector array, the - distances are computed. If the input is a distances matrix, it is reduced - in size and returned instead. + if start == 0 and stop >= n_samples: + X_chunk = X # allow fast paths in pairwise_distances + else: + X_chunk = X[start:stop] + D_chunk = pairwise_distances(X_chunk, Y, metric=metric, + n_jobs=n_jobs, **kwds) + if reduce_func is not None: + D_chunk = reduce_func(D_chunk, start) + _check_chunk_size(D_chunk, stop - start) + # TODO: check that len(flexible_vstack([D_chunk])) == stop - start + yield D_chunk + + +def _check_chunk_size(reduced, chunk_size): + is_tuple = isinstance(reduced, tuple) + if not is_tuple: + reduced = (reduced,) + if not np.all(len(r) == chunk_size for r in reduced): + actual_size = tuple(map(len, reduced)) if is_tuple else len(reduced) + raise ValueError('reduce_func returned object of length %s. ' + 'Expected same length as input: %d.' % + (actual_size, chunk_size)) + + +def pairwise_distances_chunked(X, Y=None, reduce_func=None, + metric='euclidean', n_jobs=1, + working_memory=None, **kwds): + """Generate a distance matrix chunk by chunk with optional reduction + + In cases where not all of a pairwise distance matrix needs to be stored at + once, this is used to calculate pairwise distances in + ``working_memory``-sized chunks. If ``reduce_func`` is given, it is run + on each chunk and its return values are concatenated into lists, arrays + or sparse matrices. Parameters ---------- @@ -1191,9 +1207,17 @@ def pairwise_distances_reduce(X, Y=None, reduce_func=None, metric='euclidean', An optional second feature array. Only allowed if metric != "precomputed". - reduce_func : function, callable - The function which is applied on each block of the distance matrix - reducing its size. 
+ reduce_func : callable, optional + The function which is applied on each chunk of the distance matrix, + reducing it to needed values. ``reduce_func`` receives the distances + chunk, an array or sparse matrix of shape + ``(X_chunk_n_samples, Y_n_samples)``, + and also the index of the first row in X. It should return an array, + a list, or a sparse matrix of length ``X_chunk_n_samples``, or a tuple + of such objects. + + If None, pairwise_distances_chunked returns a generator of vertical + chunks of the distance matrix. metric : string, or callable The metric to use when calculating distance between instances in a @@ -1216,9 +1240,9 @@ def pairwise_distances_reduce(X, Y=None, reduce_func=None, metric='euclidean', (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. - block_size : int, default=64 - The maximum number of mebibytes (MiB) of memory per job (see``n_jobs``) - to use at a time for calculating pairwise distances. + working_memory : int, optional + The sought maximum memory for temporary distance matrix chunks. + Defaults to sklearn.get_config()['working_memory']. `**kwds` : optional keyword parameters Any further parameters are passed directly to the distance function. @@ -1234,102 +1258,20 @@ def pairwise_distances_reduce(X, Y=None, reduce_func=None, metric='euclidean', from X and the jth array from Y. """ - if reduce_func is not None: - reduced_distances = (reduce_func(D) for D in - pairwise_distances_blockwise(X, Y, metric, n_jobs, - block_size, **kwds)) - return flexible_vstack(reduced_distances) + block_n_rows = get_block_n_rows(row_bytes=_num_samples(Y if Y is not None + else X) * 8, + max_n_rows=_num_samples(X), + working_memory=working_memory) + gen = _generate_pairwise_distances_chunked(X, Y, metric=metric, + n_jobs=n_jobs, + reduce_func=reduce_func, + block_n_rows=block_n_rows, + **kwds) + if reduce_func is None: + return gen else: - raise ValueError("reduce_func needs to be passed as an argument.") - - -def pairwise_distances_blockwise(X, Y=None, metric='euclidean', n_jobs=1, - block_size=DEFAULT_BLOCK_SIZE, **kwds): - """Compute the distance matrix from a vector array X and optional Y. - - This method takes either a vector array or a distance matrix, and generates - blocks of a distance matrix. If the input is a vector array, the distances - are computed. If the input is a distances matrix, it is returned in blocks - instead. - - This is equivalent to calling: - - pairwise_distances(X, y, metric, n_jobs) - - but may use less memory. - - Parameters - ---------- - X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, - [n_samples_a, n_features] otherwise - Array of pairwise distances between samples, or a feature array. - - Y : array [n_samples_b, n_features], optional - An optional second feature array. Only allowed if - metric != "precomputed". - - metric : string, or callable - The metric to use when calculating distance between instances in a - feature array. If metric is a string, it must be one of the options - allowed by scipy.spatial.distance.pdist for its metric parameter, or - a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS. - If metric is "precomputed", X is assumed to be a distance matrix. - Alternatively, if metric is a callable function, it is called on each - pair of instances (rows) and the resulting value recorded. The callable - should take two arrays from X as input and return a value indicating - the distance between them. - - n_jobs : int - The number of jobs to use for the computation. 
This works by breaking - down the pairwise matrix into n_jobs even slices and computing them in - parallel. - - If -1 all CPUs are used. If 1 is given, no parallel computing code is - used at all, which is useful for debugging. For n_jobs below -1, - (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one - are used. - - block_size : int, default=64 - The maximum number of mebibytes (MiB) of memory per job (see``n_jobs``) - to use at a time for calculating pairwise distances. - - `**kwds` : optional keyword parameters - Any further parameters are passed directly to the distance function. - If using a scipy.spatial.distance metric, the parameters are still - metric dependent. See the scipy docs for usage examples. - - Returns - ------- - D : generator of blocks based on the ``block_size`` parameter. - The blocks, when concatenated, produce a distance matrix D such that - D_{i, j} is the distance between the ith and jth vectors of the given - matrix X, if Y is None. If Y is not None, then D_{i, j} is the distance - between the ith array from X and the jth array from Y. - - """ - if (metric not in _VALID_METRICS and - not callable(metric) and metric != "precomputed"): - raise ValueError("Unknown metric %s. " - "Valid metrics are %s, or 'precomputed', or a " - "callable" % (metric, _VALID_METRICS)) - - n_samples = X.shape[0] - block_n_rows = block_size * (2 ** 20) // (BYTES_PER_FLOAT * n_samples) - if block_n_rows > n_samples: - block_n_rows = min(block_n_rows, n_samples) - if block_n_rows < 1: - min_block_mib = np.ceil(n_samples * BYTES_PER_FLOAT * 2 ** -20) - warnings.warn('block_size should be at least n_samples * %d bytes ' - '= %.0f MiB, got %r' % (BYTES_PER_FLOAT, - min_block_mib, block_size)) - block_size = min_block_mib - block_n_rows = 1 - - return _generate_pairwise_distances_blockwise(X, Y, metric=metric, - n_jobs=n_jobs, - block_size=block_size, - block_n_rows=block_n_rows, - **kwds) + # TODO: check that the shapes of objects in reduce_func are correct + return flexible_vstack(gen) def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 47836314f25b3..8eb15a8f5152c 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -30,8 +30,7 @@ from sklearn.metrics.pairwise import cosine_similarity from sklearn.metrics.pairwise import cosine_distances from sklearn.metrics.pairwise import pairwise_distances -from sklearn.metrics.pairwise import pairwise_distances_blockwise -from sklearn.metrics.pairwise import pairwise_distances_reduce +from sklearn.metrics.pairwise import pairwise_distances_chunked from sklearn.metrics.pairwise import pairwise_distances_argmin_min from sklearn.metrics.pairwise import pairwise_distances_argmin from sklearn.metrics.pairwise import pairwise_kernels @@ -370,89 +369,70 @@ def test_pairwise_distances_argmin_min(): dist_orig_val = dist[dist_orig_ind, range(len(dist_orig_ind))] dist_chunked_ind, dist_chunked_val = pairwise_distances_argmin_min( - X, Y, axis=0, metric="manhattan", block_size=50) + X, Y, axis=0, metric="manhattan") np.testing.assert_almost_equal(dist_orig_ind, dist_chunked_ind, decimal=7) np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7) # Test batch_size deprecation warning assert_warns_message(DeprecationWarning, "'batch_size' was deprecated in " - "version 0.19 and will be removed in version 0.21.", + "version 0.20 and will be removed in version 
0.22.", pairwise_distances_argmin_min, X, Y, batch_size=500, metric='euclidean') -def test_pairwise_distances_reduce_invalid_reduce_func(): - X = np.empty((400, 4)) - y = np.empty((200, 4)) - assert_raise_message(ValueError, 'reduce_func needs to be passed as an ' - 'argument', pairwise_distances_reduce, X, y, - block_size=0, metric='euclidean') - - -def _reduce_func(dist): +def _reduce_func(dist, start): return dist[:, :100] -def test_pairwise_distances_reduce(): +def test_pairwise_distances_chunked_reduce(): rng = np.random.RandomState(0) X = rng.random_sample((400, 4)) # Reduced Euclidean distance S = pairwise_distances(X)[:, :100] - S2 = pairwise_distances_reduce(X, None, reduce_func=_reduce_func, - block_size=1) + S2 = pairwise_distances_chunked(X, None, reduce_func=_reduce_func, + working_memory=1) assert_array_almost_equal(S, S2) -def check_pairwise_distances_blockwise(X, Y, block_size, metric='euclidean'): +def check_pairwise_distances_chunked(X, Y, working_memory, metric='euclidean'): from sklearn.metrics.pairwise import BYTES_PER_FLOAT - gen = pairwise_distances_blockwise(X, Y, block_size=block_size, - metric=metric) + gen = pairwise_distances_chunked(X, Y, working_memory=working_memory, + metric=metric) blockwise_distances = list(gen) min_block_mib = X.shape[0] * BYTES_PER_FLOAT * 2 ** -20 - if block_size < min_block_mib: - block_size = min_block_mib + if working_memory < min_block_mib: + working_memory = min_block_mib for block in blockwise_distances: memory_used = len(block) * BYTES_PER_FLOAT - assert_true(memory_used <= block_size * 2 ** 20) + assert_true(memory_used <= working_memory * 2 ** 20) blockwise_distances = np.vstack(blockwise_distances) S = pairwise_distances(X, Y, metric=metric) assert_array_almost_equal(blockwise_distances, S) -def test_pairwise_distances_blockwise_invalid_block_size(): - rng = np.random.RandomState(0) - X = rng.random_sample((400, 4)) - y = rng.random_sample((200, 4)) - assert_warns_message(UserWarning, 'block_size should be at least ' - 'n_samples * 8 bytes = 1 MiB, got 0', - pairwise_distances_blockwise, X, y, block_size=0, - metric='euclidean') - check_pairwise_distances_blockwise(X, y, block_size=0) - - -def test_pairwise_distances_blockwise(): +def test_pairwise_distances_chunked(): # Test the pairwise_distance helper function. rng = np.random.RandomState(0) # Euclidean distance should be equivalent to calling the function. X = rng.random_sample((400, 4)) - check_pairwise_distances_blockwise(X, None, block_size=1, - metric='euclidean') + check_pairwise_distances_chunked(X, None, working_memory=1, + metric='euclidean') # Euclidean distance, with Y != X. Y = rng.random_sample((200, 4)) - check_pairwise_distances_blockwise(X, Y, block_size=1, - metric='euclidean') - # absurdly large block_size - check_pairwise_distances_blockwise(X, Y, block_size=10000, - metric='euclidean') + check_pairwise_distances_chunked(X, Y, working_memory=1, + metric='euclidean') + # absurdly large working_memory + check_pairwise_distances_chunked(X, Y, working_memory=10000, + metric='euclidean') # "cityblock" uses scikit-learn metric, cityblock (function) is # scipy.spatial. 
- check_pairwise_distances_blockwise(X, Y, block_size=1, - metric='cityblock') + check_pairwise_distances_chunked(X, Y, working_memory=1, + metric='cityblock') # Test that a value error is raised if the metric is unknown - assert_raises(ValueError, pairwise_distances_blockwise, X, Y, - metric="blah") + assert_raises(ValueError, next, + pairwise_distances_chunked(X, Y, metric="blah")) def test_euclidean_distances(): diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index e9d98455f1f72..ed1518167989f 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -17,7 +17,7 @@ from .ball_tree import BallTree from .kd_tree import KDTree from ..base import BaseEstimator -from ..metrics import pairwise_distances_reduce +from ..metrics import pairwise_distances_chunked from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS from ..utils import check_X_y, check_array, _get_n_jobs, gen_even_slices from ..utils.multiclass import check_classification_targets @@ -274,7 +274,8 @@ def _pairwise(self): class KNeighborsMixin(object): """Mixin for k-neighbors searches""" - def _kneighbors_reduce_func(self, dist, n_neighbors, return_distance): + def _kneighbors_reduce_func(self, dist, start, + n_neighbors, return_distance): sample_range = np.arange(dist.shape[0])[:, None] neigh_ind = np.argpartition(dist, n_neighbors - 1, axis=1) neigh_ind = neigh_ind[:, :n_neighbors] @@ -377,11 +378,11 @@ class from an array representing our data set and ask who's # for efficiency, use squared euclidean distances if self.effective_metric_ == 'euclidean': - result = pairwise_distances_reduce( + result = pairwise_distances_chunked( X, self._fit_X, reduce_func=reduce_func, metric='euclidean', n_jobs=n_jobs, squared=True) else: - result = pairwise_distances_reduce( + result = pairwise_distances_chunked( X, self._fit_X, reduce_func=reduce_func, metric=self.effective_metric_, n_jobs=n_jobs, **self.effective_metric_params_) @@ -515,7 +516,8 @@ def kneighbors_graph(self, X=None, n_neighbors=None, class RadiusNeighborsMixin(object): """Mixin for radius-based neighbors searches""" - def _radius_neighbors_reduce_func(self, dist, radius, return_distance): + def _radius_neighbors_reduce_func(self, dist, start, + radius, return_distance): neigh_ind_list = [np.where(d <= radius)[0] for d in dist] # See https://github.com/numpy/numpy/issues/5456 @@ -625,7 +627,7 @@ class from an array representing our data set and ask who's radius=radius, return_distance=return_distance) - results = pairwise_distances_reduce( + results = pairwise_distances_chunked( X, self._fit_X, reduce_func=reduce_func, metric='euclidean', n_jobs=self.n_jobs, squared=True) @@ -634,7 +636,7 @@ class from an array representing our data set and ask who's radius=radius, return_distance=return_distance) - results = pairwise_distances_reduce( + results = pairwise_distances_chunked( X, self._fit_X, reduce_func=reduce_func, metric=self.effective_metric_, n_jobs=self.n_jobs, **self.effective_metric_params_) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 0d7b8a8fc872b..88d7a3062beff 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -18,6 +18,7 @@ from ..externals.joblib import cpu_count from ..exceptions import DataConversionWarning from .deprecation import deprecated +from .. 
import get_config __all__ = ["murmurhash3_32", "as_float_array", "assert_all_finite", "check_array", @@ -614,3 +615,39 @@ def finalize(accumulator, prototype): stacked_results = finalize(accumulator, first) return stacked_results + + +def get_block_n_rows(row_bytes, max_n_rows=None, + working_memory=None): + """Calculates the number of rows that fit in working_memory + + Parameters + ---------- + row_bytes : int + The number of bytes consumed by each row + max_n_rows : int, optional + The maximum return value. + working_memory : int, optional + The number of rows to fit inside this number of MiB will be returned. + Defaults to ``sklearn.get_config()['working_memory']``. + + Returns + ------- + int or n_samples + """ + + if working_memory is None: + working_memory = get_config()['working_memory'] + + if working_memory is None: + return max_n_rows + + block_n_rows = working_memory * (2 ** 20) // row_bytes + if max_n_rows is not None: + block_n_rows = min(block_n_rows, max_n_rows) + if block_n_rows < 1: + warnings.warn('Could not adhere to working_memory config. ' + 'Currently %dMiB, %.0fMiB required.' % + (working_memory, np.ceil(row_bytes * 2 ** -20))) + block_n_rows = 1 + return block_n_rows From 82bc06a6558c5c0acc29a20e3e9c406a6343e981 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 10 Dec 2017 21:26:09 +1100 Subject: [PATCH 55/96] Add to classes.rst --- doc/modules/classes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 3fe245113a562..6cd1cb168c505 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -903,7 +903,6 @@ See the :ref:`metrics` section of the user guide for further details. metrics.pairwise.laplacian_kernel metrics.pairwise.linear_kernel metrics.pairwise.manhattan_distances - metrics.pairwise.pairwise_distances metrics.pairwise.pairwise_kernels metrics.pairwise.polynomial_kernel metrics.pairwise.rbf_kernel @@ -915,6 +914,7 @@ See the :ref:`metrics` section of the user guide for further details. metrics.pairwise_distances metrics.pairwise_distances_argmin metrics.pairwise_distances_argmin_min + metrics.pairwise_distances_chunked .. _mixture_ref: From ec31fadeaf18eb57c38b1d218bbbd333665cccc9 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 10 Dec 2017 21:29:40 +1100 Subject: [PATCH 56/96] Add missing module --- sklearn/_config.py | 90 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 sklearn/_config.py diff --git a/sklearn/_config.py b/sklearn/_config.py new file mode 100644 index 0000000000000..31a5813f39d6a --- /dev/null +++ b/sklearn/_config.py @@ -0,0 +1,90 @@ +"""Global configuration state and functions for management +""" +import os +from contextlib import contextmanager as contextmanager + +_ASSUME_FINITE = bool(os.environ.get('SKLEARN_ASSUME_FINITE', False)) +_WORKING_MEMORY = int(os.environ.get('SKLEARN_WORKING_MEMORY', 64)) + + +def get_config(): + """Retrieve current values for configuration set by :func:`set_config` + + Returns + ------- + config : dict + Keys are parameter names that can be passed to :func:`set_config`. + """ + return {'assume_finite': _ASSUME_FINITE, + 'working_memory': _WORKING_MEMORY} + + +def set_config(assume_finite=None, working_memory=None): + """Set global scikit-learn configuration + + Parameters + ---------- + assume_finite : bool, optional + If True, validation for finiteness will be skipped, + saving time, but leading to potential crashes. 
If + False, validation for finiteness will be performed, + avoiding error. Global default: False. + + working_memory : int, optional + If set, scikit-learn will attempt to limit the size of temporary arrays + to this number of MiB (per job when parallelised), often saving both + computation time and memory on expensive operations that can be + performed in chunks. Global default: 64. + """ + global _ASSUME_FINITE, _WORKING_MEMORY + if assume_finite is not None: + _ASSUME_FINITE = assume_finite + if working_memory is not None: + _WORKING_MEMORY = working_memory + + +@contextmanager +def config_context(**new_config): + """Context manager for global scikit-learn configuration + + Parameters + ---------- + assume_finite : bool, optional + If True, validation for finiteness will be skipped, + saving time, but leading to potential crashes. If + False, validation for finiteness will be performed, + avoiding error. Global default: False. + + working_memory : int, optional + If set, scikit-learn will attempt to limit the size of temporary arrays + to this number of MiB (per job when parallelised), often saving both + computation time and memory on expensive operations that can be + performed in chunks. Global default: 64. + + Notes + ----- + All settings, not just those presently modified, will be returned to + their previous values when the context manager is exited. This is not + thread-safe. + + Examples + -------- + >>> import sklearn + >>> from sklearn.utils.validation import assert_all_finite + >>> with sklearn.config_context(assume_finite=True): + ... assert_all_finite([float('nan')]) + >>> with sklearn.config_context(assume_finite=True): + ... with sklearn.config_context(assume_finite=False): + ... assert_all_finite([float('nan')]) + ... # doctest: +ELLIPSIS + Traceback (most recent call last): + ... + ValueError: Input contains NaN, ... 
+ """ + old_config = get_config().copy() + set_config(**new_config) + + try: + yield + finally: + set_config(**old_config) From a56e00299cb51c9f4b80e9f66e51a27eea4be389 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 10 Dec 2017 21:31:47 +1100 Subject: [PATCH 57/96] Remove obsolete TODOs --- sklearn/metrics/pairwise.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 3fc91e4a30b52..323e8cb154869 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1171,7 +1171,6 @@ def _generate_pairwise_distances_chunked(X, Y, metric, reduce_func, if reduce_func is not None: D_chunk = reduce_func(D_chunk, start) _check_chunk_size(D_chunk, stop - start) - # TODO: check that len(flexible_vstack([D_chunk])) == stop - start yield D_chunk @@ -1270,7 +1269,6 @@ def pairwise_distances_chunked(X, Y=None, reduce_func=None, if reduce_func is None: return gen else: - # TODO: check that the shapes of objects in reduce_func are correct return flexible_vstack(gen) From cb35271899486893585205bf19d66502590917f3 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 10 Dec 2017 21:35:43 +1100 Subject: [PATCH 58/96] Pass final_len to flexible_vstack --- sklearn/metrics/pairwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 323e8cb154869..90ba5fe51fffe 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1269,7 +1269,7 @@ def pairwise_distances_chunked(X, Y=None, reduce_func=None, if reduce_func is None: return gen else: - return flexible_vstack(gen) + return flexible_vstack(gen, final_len=_num_samples(X)) def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): From dc1f544be16427ac5058041789f81c76f6154d3d Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 10 Dec 2017 21:38:07 +1100 Subject: [PATCH 59/96] Renaming and removing obsolete code --- sklearn/metrics/pairwise.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 90ba5fe51fffe..411fb502c7ee0 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -259,15 +259,12 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, return distances if squared else np.sqrt(distances, out=distances) -def _argmin_min_reduce_min(dist, start): +def _argmin_min_reduce_func(dist, start): indices = dist.argmin(axis=1) values = dist[np.arange(dist.shape[0]), indices] return indices, values -BYTES_PER_FLOAT = 8 - - def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", metric_kwargs=None, batch_size=None): """Compute minimum distances between one point and a set of points. 
@@ -356,7 +353,7 @@ def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", X, Y = Y, X indices, values = pairwise_distances_chunked( - X, Y, reduce_func=_argmin_min_reduce_min, metric=metric, + X, Y, reduce_func=_argmin_min_reduce_func, metric=metric, **metric_kwargs) if metric == "euclidean" and not metric_kwargs.get("squared", False): From da5a6c7e75cca5d0fa879cc57bc26dc9e389709a Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 10 Dec 2017 22:21:02 +1100 Subject: [PATCH 60/96] Update test_config --- sklearn/tests/test_config.py | 42 +++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/sklearn/tests/test_config.py b/sklearn/tests/test_config.py index b968e7b7917ea..500b0c4459ac0 100644 --- a/sklearn/tests/test_config.py +++ b/sklearn/tests/test_config.py @@ -3,38 +3,40 @@ def test_config_context(): - assert_equal(get_config(), {'assume_finite': False}) + assert_equal(get_config(), {'assume_finite': False, 'working_memory': 64}) # Not using as a context manager affects nothing config_context(assume_finite=True) - assert_equal(get_config(), {'assume_finite': False}) + assert_equal(get_config()['assume_finite'], False) with config_context(assume_finite=True): - assert_equal(get_config(), {'assume_finite': True}) - assert_equal(get_config(), {'assume_finite': False}) + assert_equal(get_config(), {'assume_finite': True, + 'working_memory': 64}) + assert_equal(get_config()['assume_finite'], False) with config_context(assume_finite=True): with config_context(assume_finite=None): - assert_equal(get_config(), {'assume_finite': True}) + assert_equal(get_config()['assume_finite'], True) - assert_equal(get_config(), {'assume_finite': True}) + assert_equal(get_config()['assume_finite'], True) with config_context(assume_finite=False): - assert_equal(get_config(), {'assume_finite': False}) + assert_equal(get_config()['assume_finite'], False) with config_context(assume_finite=None): - assert_equal(get_config(), {'assume_finite': False}) + assert_equal(get_config()['assume_finite'], False) # global setting will not be retained outside of context that # did not modify this setting set_config(assume_finite=True) - assert_equal(get_config(), {'assume_finite': True}) + assert_equal(get_config()['assume_finite'], True) - assert_equal(get_config(), {'assume_finite': False}) + assert_equal(get_config()['assume_finite'], False) - assert_equal(get_config(), {'assume_finite': True}) + assert_equal(get_config()['assume_finite'], True) - assert_equal(get_config(), {'assume_finite': False}) + assert_equal(get_config(), {'assume_finite': False, + 'working_memory': 64}) # No positional arguments assert_raises(TypeError, config_context, True) @@ -43,26 +45,26 @@ def test_config_context(): def test_config_context_exception(): - assert_equal(get_config(), {'assume_finite': False}) + assert_equal(get_config()['assume_finite'], False) try: with config_context(assume_finite=True): - assert_equal(get_config(), {'assume_finite': True}) + assert_equal(get_config()['assume_finite'], True) raise ValueError() except ValueError: pass - assert_equal(get_config(), {'assume_finite': False}) + assert_equal(get_config()['assume_finite'], False) def test_set_config(): - assert_equal(get_config(), {'assume_finite': False}) + assert_equal(get_config()['assume_finite'], False) set_config(assume_finite=None) - assert_equal(get_config(), {'assume_finite': False}) + assert_equal(get_config()['assume_finite'], False) set_config(assume_finite=True) - assert_equal(get_config(), 
{'assume_finite': True}) + assert_equal(get_config()['assume_finite'], True) set_config(assume_finite=None) - assert_equal(get_config(), {'assume_finite': True}) + assert_equal(get_config()['assume_finite'], True) set_config(assume_finite=False) - assert_equal(get_config(), {'assume_finite': False}) + assert_equal(get_config()['assume_finite'], False) # No unknown arguments assert_raises(TypeError, set_config, do_something_else=True) From 70730327802b377dc3ab2cc9cbfc9695634b24ea Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 10 Dec 2017 23:33:31 +1100 Subject: [PATCH 61/96] Fix test import --- sklearn/metrics/pairwise.py | 10 ++++------ sklearn/metrics/tests/test_pairwise.py | 8 +++----- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 411fb502c7ee0..1636d42b62b23 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -259,7 +259,7 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, return distances if squared else np.sqrt(distances, out=distances) -def _argmin_min_reduce_func(dist, start): +def _argmin_min_reduce(dist, start): indices = dist.argmin(axis=1) values = dist[np.arange(dist.shape[0]), indices] return indices, values @@ -353,7 +353,7 @@ def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", X, Y = Y, X indices, values = pairwise_distances_chunked( - X, Y, reduce_func=_argmin_min_reduce_func, metric=metric, + X, Y, reduce_func=_argmin_min_reduce, metric=metric, **metric_kwargs) if metric == "euclidean" and not metric_kwargs.get("squared", False): @@ -431,13 +431,11 @@ def pairwise_distances_argmin(X, Y, axis=1, metric="euclidean", sklearn.metrics.pairwise_distances sklearn.metrics.pairwise_distances_argmin_min """ - if batch_size is not None: - warnings.warn("'batch_size' was deprecated in version 0.20 and will " - "be removed in version 0.22.", DeprecationWarning) if metric_kwargs is None: metric_kwargs = {} - return pairwise_distances_argmin_min(X, Y, axis, metric, metric_kwargs)[0] + return pairwise_distances_argmin_min(X, Y, axis, metric, metric_kwargs, + batch_size=batch_size)[0] def manhattan_distances(X, Y=None, sum_over_features=True, diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 8eb15a8f5152c..1c6c0f5d988b1 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -395,16 +395,14 @@ def test_pairwise_distances_chunked_reduce(): def check_pairwise_distances_chunked(X, Y, working_memory, metric='euclidean'): - from sklearn.metrics.pairwise import BYTES_PER_FLOAT gen = pairwise_distances_chunked(X, Y, working_memory=working_memory, metric=metric) blockwise_distances = list(gen) - min_block_mib = X.shape[0] * BYTES_PER_FLOAT * 2 ** -20 - if working_memory < min_block_mib: - working_memory = min_block_mib + min_block_mib = X.shape[0] * 8 * 2 ** -20 + working_memory = min(working_memory, min_block_mib) for block in blockwise_distances: - memory_used = len(block) * BYTES_PER_FLOAT + memory_used = len(block) * 8 assert_true(memory_used <= working_memory * 2 ** 20) blockwise_distances = np.vstack(blockwise_distances) From b29ff762ce5b9142154fae57bda21bfbf7741dc0 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 10 Dec 2017 23:38:44 +1100 Subject: [PATCH 62/96] Tweaks --- sklearn/metrics/pairwise.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 
1636d42b62b23..024d466c341b5 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1156,7 +1156,7 @@ def _generate_pairwise_distances_chunked(X, Y, metric, reduce_func, n_samples = X.shape[0] for start in range(0, n_samples, block_n_rows): # get distances from block to every other sample - stop = min(start + block_n_rows, X.shape[0]) + stop = min(start + block_n_rows, n_samples) if start == 0 and stop >= n_samples: X_chunk = X # allow fast paths in pairwise_distances else: @@ -1245,11 +1245,7 @@ def pairwise_distances_chunked(X, Y=None, reduce_func=None, Returns ------- - D : array-like or sparse matrix or tuple - A distance matrix D such that D_{i, j} is the distance between the - ith and jth vectors of the given matrix X, if Y is None. - If Y is not None, then D_{i, j} is the distance between the ith array - from X and the jth array from Y. + TODO """ block_n_rows = get_block_n_rows(row_bytes=_num_samples(Y if Y is not None From e71a4616bf9f2f604547b04bccba5a043b34d617 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 10 Dec 2017 23:43:11 +1100 Subject: [PATCH 63/96] Remove debug print --- sklearn/metrics/cluster/unsupervised.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py index 7f00470540a82..f2b07324f8014 100644 --- a/sklearn/metrics/cluster/unsupervised.py +++ b/sklearn/metrics/cluster/unsupervised.py @@ -225,7 +225,6 @@ def silhouette_samples(X, labels, metric='euclidean', **kwds): labels=labels, label_freqs=label_freqs, add_at=add_at) results = pairwise_distances_chunked(X, reduce_func=reduce_func, **kwds) - print(len(results)) intra_clust_dists, inter_clust_dists = results if len(intra_clust_dists) == 1: From 055a9ef0d77dc38da70d9b597c0b974460515a62 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 11 Dec 2017 00:04:16 +1100 Subject: [PATCH 64/96] Remove unused import --- sklearn/metrics/tests/test_pairwise.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 1c6c0f5d988b1..7e8a8d9e9bd10 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -10,7 +10,6 @@ from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_raises -from sklearn.utils.testing import assert_raise_message from sklearn.utils.testing import assert_raises_regexp from sklearn.utils.testing import assert_true from sklearn.utils.testing import assert_warns From d3915050fbb193c2541f58bb37a7ade0fcf0fa2f Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 11 Dec 2017 09:54:22 +1100 Subject: [PATCH 65/96] Remove flexible_vstack to reduce magic --- sklearn/metrics/cluster/unsupervised.py | 5 +- sklearn/metrics/pairwise.py | 96 ++++++--------------- sklearn/metrics/tests/test_pairwise.py | 6 +- sklearn/neighbors/base.py | 87 ++++++++++--------- sklearn/utils/__init__.py | 110 +----------------------- sklearn/utils/tests/test_utils.py | 43 --------- 6 files changed, 77 insertions(+), 270 deletions(-) diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py index f2b07324f8014..f191fcb85ee3a 100644 --- a/sklearn/metrics/cluster/unsupervised.py +++ b/sklearn/metrics/cluster/unsupervised.py @@ -224,8 +224,11 @@ def silhouette_samples(X, labels, metric='euclidean', **kwds): reduce_func = functools.partial(_silhouette_reduce, labels=labels, 
label_freqs=label_freqs, add_at=add_at) - results = pairwise_distances_chunked(X, reduce_func=reduce_func, **kwds) + results = zip(*pairwise_distances_chunked(X, reduce_func=reduce_func, + **kwds)) intra_clust_dists, inter_clust_dists = results + intra_clust_dists = np.concatenate(intra_clust_dists) + inter_clust_dists = np.concatenate(inter_clust_dists) if len(intra_clust_dists) == 1: intra_clust_dists = intra_clust_dists[0] diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 024d466c341b5..b606b79477ba1 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -22,7 +22,6 @@ from ..utils import gen_even_slices from ..utils import get_block_n_rows from ..utils.extmath import row_norms, safe_sparse_dot -from ..utils import flexible_vstack from ..utils.validation import _num_samples from ..preprocessing import normalize from ..externals.joblib import Parallel @@ -352,9 +351,11 @@ def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", if axis == 0: X, Y = Y, X - indices, values = pairwise_distances_chunked( + indices, values = zip(*pairwise_distances_chunked( X, Y, reduce_func=_argmin_min_reduce, metric=metric, - **metric_kwargs) + **metric_kwargs)) + indices = np.concatenate(indices) + values = np.concatenate(values) if metric == "euclidean" and not metric_kwargs.get("squared", False): np.sqrt(values, values) @@ -1110,65 +1111,6 @@ def _pairwise_callable(X, Y, metric, **kwds): 'sokalsneath', 'sqeuclidean', 'yule', "wminkowski"] -def _generate_pairwise_distances_chunked(X, Y, metric, reduce_func, - n_jobs, block_n_rows, **kwds): - """Generates blocks of the distance matrix from X and optional Y. - - Parameters - ---------- - X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, - [n_samples_a, n_features] otherwise - Array of pairwise distances between samples, or a feature array. - - Y : array [n_samples_b, n_features], optional - An optional second feature array. Only allowed if - metric != "precomputed". - - reduce_func : callable, optional - The function which is applied on each chunk of the distance matrix, - reducing it to needed values. ``reduce_func`` receives the distances - chunk, an array or sparse matrix of shape - ``(X_chunk_n_samples, Y_n_samples)``, - and also the index of the first row in X. It should return an array, - a list, or a sparse matrix of length ``X_chunk_n_samples``, or a tuple - of such objects. - - metric : string, or callable - The metric to use when calculating distance between instances in a - feature array. - - n_jobs : int - The number of jobs to use for the computation. - - block_n_rows : int - Number of rows to be computed for each block. - - `**kwds` : optional keyword parameters - Any further parameters are passed directly to the distance function. - - Returns - ------- - D : generator - Yields (distance_matrix_chunk, start_idx) matrices. 
- """ - if metric != 'precomputed' and Y is None: - Y = X - n_samples = X.shape[0] - for start in range(0, n_samples, block_n_rows): - # get distances from block to every other sample - stop = min(start + block_n_rows, n_samples) - if start == 0 and stop >= n_samples: - X_chunk = X # allow fast paths in pairwise_distances - else: - X_chunk = X[start:stop] - D_chunk = pairwise_distances(X_chunk, Y, metric=metric, - n_jobs=n_jobs, **kwds) - if reduce_func is not None: - D_chunk = reduce_func(D_chunk, start) - _check_chunk_size(D_chunk, stop - start) - yield D_chunk - - def _check_chunk_size(reduced, chunk_size): is_tuple = isinstance(reduced, tuple) if not is_tuple: @@ -1245,22 +1187,32 @@ def pairwise_distances_chunked(X, Y=None, reduce_func=None, Returns ------- - TODO + D_chunks : generator + Each element in the generator is either a slice of distance matrix or a + reduced distance matrix. """ block_n_rows = get_block_n_rows(row_bytes=_num_samples(Y if Y is not None else X) * 8, max_n_rows=_num_samples(X), working_memory=working_memory) - gen = _generate_pairwise_distances_chunked(X, Y, metric=metric, - n_jobs=n_jobs, - reduce_func=reduce_func, - block_n_rows=block_n_rows, - **kwds) - if reduce_func is None: - return gen - else: - return flexible_vstack(gen, final_len=_num_samples(X)) + + if metric != 'precomputed' and Y is None: + Y = X + n_samples = X.shape[0] + for start in range(0, n_samples, block_n_rows): + # get distances from block to every other sample + stop = min(start + block_n_rows, n_samples) + if start == 0 and stop >= n_samples: + X_chunk = X # allow fast paths in pairwise_distances + else: + X_chunk = X[start:stop] + D_chunk = pairwise_distances(X_chunk, Y, metric=metric, + n_jobs=n_jobs, **kwds) + if reduce_func is not None: + D_chunk = reduce_func(D_chunk, start) + _check_chunk_size(D_chunk, stop - start) + yield D_chunk def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 7e8a8d9e9bd10..b2de8e01dcf59 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -388,9 +388,9 @@ def test_pairwise_distances_chunked_reduce(): X = rng.random_sample((400, 4)) # Reduced Euclidean distance S = pairwise_distances(X)[:, :100] - S2 = pairwise_distances_chunked(X, None, reduce_func=_reduce_func, - working_memory=1) - assert_array_almost_equal(S, S2) + S_chunks = pairwise_distances_chunked(X, None, reduce_func=_reduce_func, + working_memory=1) + assert_array_almost_equal(S, np.vstack(S_chunks)) def check_pairwise_distances_chunked(X, Y, working_memory, metric='euclidean'): diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index ed1518167989f..f1209c6b52f8f 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -377,15 +377,13 @@ class from an array representing our data set and ask who's return_distance=return_distance) # for efficiency, use squared euclidean distances - if self.effective_metric_ == 'euclidean': - result = pairwise_distances_chunked( - X, self._fit_X, reduce_func=reduce_func, - metric='euclidean', n_jobs=n_jobs, squared=True) - else: - result = pairwise_distances_chunked( - X, self._fit_X, reduce_func=reduce_func, - metric=self.effective_metric_, n_jobs=n_jobs, - **self.effective_metric_params_) + kwds = ({'squared': True} if self.effective_metric_ == 'euclidean' + else self.effective_metric_params_) + + result = pairwise_distances_chunked( + X, self._fit_X, 
reduce_func=reduce_func, + metric=self.effective_metric_, n_jobs=n_jobs, + **kwds) elif self._fit_method in ['ball_tree', 'kd_tree']: if issparse(X): @@ -397,14 +395,15 @@ class from an array representing our data set and ask who's X[s], n_neighbors, return_distance) for s in gen_even_slices(X.shape[0], n_jobs) ) - if return_distance: - dist, neigh_ind = tuple(zip(*result)) - result = np.vstack(dist), np.vstack(neigh_ind) - else: - result = np.vstack(result) else: raise ValueError("internal: _fit_method not recognized") + if return_distance: + dist, neigh_ind = zip(*result) + result = np.vstack(dist), np.vstack(neigh_ind) + else: + result = np.vstack(result) + if not query_is_train: return result else: @@ -518,24 +517,18 @@ class RadiusNeighborsMixin(object): def _radius_neighbors_reduce_func(self, dist, start, radius, return_distance): - neigh_ind_list = [np.where(d <= radius)[0] for d in dist] - - # See https://github.com/numpy/numpy/issues/5456 - # if you want to understand why this is initialized this way. - neigh_ind = np.empty(dist.shape[0], dtype='object') - neigh_ind[:] = neigh_ind_list + neigh_ind = [np.where(d <= radius)[0] for d in dist] if return_distance: dist_array = np.empty(dist.shape[0], dtype='object') if self.effective_metric_ == 'euclidean': - dist_list = [np.sqrt(d[neigh_ind[i]]) - for i, d in enumerate(dist)] + dist = [np.sqrt(d[neigh_ind[i]]) + for i, d in enumerate(dist)] else: - dist_list = [d[neigh_ind[i]] - for i, d in enumerate(dist)] - dist_array[:] = dist_list + dist = [d[neigh_ind[i]] + for i, d in enumerate(dist)] - results = dist_array, neigh_ind + results = dist, neigh_ind else: results = neigh_ind return results @@ -623,23 +616,33 @@ class from an array representing our data set and ask who's # for efficiency, use squared euclidean distances if self.effective_metric_ == 'euclidean': radius *= radius - reduce_func = partial(self._radius_neighbors_reduce_func, - radius=radius, - return_distance=return_distance) - - results = pairwise_distances_chunked( - X, self._fit_X, reduce_func=reduce_func, - metric='euclidean', n_jobs=self.n_jobs, - squared=True) + kwds = {'squared': True} + else: + kwds = self.effective_metric_params_ + + reduce_func = partial(self._radius_neighbors_reduce_func, + radius=radius, + return_distance=return_distance) + + results = pairwise_distances_chunked( + X, self._fit_X, reduce_func=reduce_func, + metric=self.effective_metric_, n_jobs=self.n_jobs, + **kwds) + if return_distance: + dist_chunks, neigh_ind_chunks = zip(*results) + dist_list = sum(dist_chunks, []) + neigh_ind_list = sum(neigh_ind_chunks, []) + # See https://github.com/numpy/numpy/issues/5456 + # if you want to understand why this is initialized this way. 
+ dist = np.empty(len(dist_list), dtype='object') + dist[:] = dist_list + neigh_ind = np.empty(len(neigh_ind_list), dtype='object') + neigh_ind[:] = neigh_ind_list + results = dist, neigh_ind else: - reduce_func = partial(self._radius_neighbors_reduce_func, - radius=radius, - return_distance=return_distance) - - results = pairwise_distances_chunked( - X, self._fit_X, reduce_func=reduce_func, - metric=self.effective_metric_, n_jobs=self.n_jobs, - **self.effective_metric_params_) + neigh_ind_list = sum(results, []) + results = np.empty(len(neigh_ind_list), dtype='object') + results[:] = neigh_ind_list elif self._fit_method in ['ball_tree', 'kd_tree']: if issparse(X): diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 88d7a3062beff..632916cddfefc 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -26,8 +26,7 @@ "compute_class_weight", "compute_sample_weight", "column_or_1d", "safe_indexing", "check_consistent_length", "check_X_y", 'indexable', - "check_symmetric", "indices_to_mask", "deprecated", - "flexible_vstack"] + "check_symmetric", "indices_to_mask", "deprecated"] class Bunch(dict): @@ -510,113 +509,6 @@ def indices_to_mask(indices, mask_length): return mask -def flexible_vstack(it, final_len=None): - """Helper that concatenates the elements of an iterable. - - Supports iterables of arrays, lists, sparse matrices or tuples. - - Parameters - ---------- - it : iterable - Iterable whose elements are to be concatenated. - - final_len : int, default = None - If passed, specifies the expected length of 'stacked_results'. - - Returns - ------- - stacked_results : array-like or sparse-matrix or tuple - The result of concatenating the elements of 'it'. - - Examples - -------- - >>> import numpy as np - >>> from scipy import sparse - >>> - >>> def make_example(typ): - ... yield typ([1, 2]) - ... yield typ([3]) - ... yield typ([4, 5, 6]) - ... - >>> flexible_vstack(make_example(list)) - [1, 2, 3, 4, 5, 6] - >>> flexible_vstack(make_example(np.array)) - array([1, 2, 3, 4, 5, 6]) - >>> flexible_vstack(zip(make_example(list), make_example(np.array))) - ([1, 2, 3, 4, 5, 6], array([1, 2, 3, 4, 5, 6])) - >>> flexible_vstack(make_example(np.array)) - array([1, 2, 3, 4, 5, 6]) - >>> flexible_vstack(make_example(np.array), final_len=6) - array([1, 2, 3, 4, 5, 6]) - >>> flexible_vstack(make_example( - ... lambda x: np.array(x).reshape(-1, 1))) - ... 
# doctest: +NORMALIZE_WHITESPACE - array([[1], [2], [3], [4], [5], [6]]) - """ - - def make_accumulator(prototype): - if isinstance(prototype, tuple): - return tuple(make_accumulator(y_proto) for y_proto in prototype) - if isinstance(prototype, np.ndarray) and final_len is not None: - return np.empty((final_len,) + prototype.shape[1:], - dtype=prototype.dtype) - else: - return [] - - def accumulate(x, accumulator, prototype): - if isinstance(prototype, tuple): - for y, y_acc, y_prototype in zip(x, accumulator, prototype): - n_rows = accumulate(y, y_acc, y_prototype) - # XXX: could assert all n_rows are identical - return n_rows - elif isinstance(prototype, np.ndarray) and final_len is not None: - accumulator[offset:offset + len(x)] = x - return len(x) - elif isinstance(prototype, list): - accumulator.extend(x) - return len(x) - else: - accumulator.append(x) - if hasattr(x, 'shape'): - return x.shape[0] - return len(x) - - def finalize(accumulator, prototype): - if isinstance(prototype, tuple): - return tuple(finalize(y_acc, y_prototype) - for y_acc, y_prototype in zip(accumulator, prototype)) - elif isinstance(prototype, list): - return accumulator - elif isinstance(prototype, np.ndarray) and final_len is not None: - return accumulator - elif isinstance(prototype, np.ndarray): - return np.concatenate(accumulator, axis=0) - elif sparse.isspmatrix(prototype): - return sparse.vstack(accumulator).asformat(prototype.format) - else: - raise NotImplementedError('No finalizing for accumulation of %s' - % type(prototype)) - - it = iter(it) - try: - # prototype - first = next(it) - except StopIteration: - raise ValueError('Require at least one output from the iterator') - - accumulator = make_accumulator(first) - offset = 0 - offset = accumulate(first, accumulator, first) - for x in it: - offset += accumulate(x, accumulator, first) - - if final_len is not None: - assert offset == final_len, 'Expected %d, got %d' % (final_len, offset) - - stacked_results = finalize(accumulator, first) - return stacked_results - - def get_block_n_rows(row_bytes, max_n_rows=None, working_memory=None): """Calculates the number of rows that fit in working_memory diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 47b646d0993af..fa93bf34fe6bc 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -18,7 +18,6 @@ from sklearn.utils import safe_indexing from sklearn.utils import shuffle from sklearn.utils import gen_even_slices -from sklearn.utils import flexible_vstack from sklearn.utils.extmath import pinvh from sklearn.utils.arpack import eigsh from sklearn.utils.mocking import MockDataFrame @@ -41,48 +40,6 @@ def test_make_rng(): assert_raises(ValueError, check_random_state, "some invalid seed") -def test_flexible_vstack(): - from scipy import sparse - - def make_example(typ): - yield typ([1, 2]) - yield typ([3]) - yield typ([4, 5, 6]) - - results = flexible_vstack(make_example(list)) - expected_results = [1, 2, 3, 4, 5, 6] - assert_equal(results, expected_results) - - results = flexible_vstack(make_example(np.array)) - expected_results = np.array([1, 2, 3, 4, 5, 6]) - assert_array_equal(results, expected_results) - - results = flexible_vstack(zip(make_example(list), make_example(np.array))) - expected_results = ([1, 2, 3, 4, 5, 6], np.array([1, 2, 3, 4, 5, 6])) - assert_array_equal(results, expected_results) - - results = flexible_vstack(make_example(np.array), final_len=6) - expected_results = np.array([1, 2, 3, 4, 5, 6]) - 
assert_array_equal(results, expected_results) - - results = flexible_vstack( - make_example(lambda x: np.array(x).reshape(-1, 1))) - expected_results = np.array([[1], [2], [3], [4], [5], [6]]) - assert_array_equal(results, expected_results) - - results = flexible_vstack( - make_example(lambda x: sparse.csr_matrix(np.array(x).reshape(-1, 1)))) - expected_results = np.array([[1], [2], [3], [4], [5], [6]], dtype=np.int64) - assert_equal(results.format, 'csr') - assert_array_equal(results.A, expected_results) - - results = flexible_vstack( - make_example(lambda x: sparse.csc_matrix(np.array(x).reshape(-1, 1)))) - expected_results = np.array([[1], [2], [3], [4], [5], [6]], dtype=np.int64) - assert_equal(results.format, 'csc') - assert_array_equal(results.A, expected_results) - - def test_deprecated(): # Test whether the deprecated decorator issues appropriate warnings # Copied almost verbatim from http://docs.python.org/library/warnings.html From 86f0321ad2dfae44ee97214e44e033688a203f69 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 11 Dec 2017 09:55:08 +1100 Subject: [PATCH 66/96] Improve reduce_func description --- sklearn/metrics/pairwise.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index b606b79477ba1..4634c594ae2c4 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1145,12 +1145,11 @@ def pairwise_distances_chunked(X, Y=None, reduce_func=None, reduce_func : callable, optional The function which is applied on each chunk of the distance matrix, - reducing it to needed values. ``reduce_func`` receives the distances - chunk, an array or sparse matrix of shape - ``(X_chunk_n_samples, Y_n_samples)``, - and also the index of the first row in X. It should return an array, - a list, or a sparse matrix of length ``X_chunk_n_samples``, or a tuple - of such objects. + reducing it to needed values. ``reduce_func(D_chunk, start)`` + is called repeatedly, where ``D_chunk`` is a contiguous vertical + slice of the pairwise distance matrix, starting at row ``start``. + It should return an array, a list, or a sparse matrix of length + ``D_chunk.shape[0]``, or a tuple of such objects. If None, pairwise_distances_chunked returns a generator of vertical chunks of the distance matrix. 
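To illustrate the contract just documented — reduce_func(D_chunk, start) called on each contiguous vertical slice of the distance matrix, returning one value per row of the chunk — here is a small usage sketch. pairwise_distances_chunked and working_memory are the API introduced in this series; the radius, data shape and function name are assumptions for illustration only.

    import numpy as np
    from sklearn.metrics import pairwise_distances_chunked

    def count_within_radius(D_chunk, start):
        # One reduced value per chunk row, so each D_chunk can be
        # discarded as soon as it has been processed.
        return (D_chunk <= 0.5).sum(axis=1)

    X = np.random.RandomState(0).rand(1000, 3)
    gen = pairwise_distances_chunked(X, reduce_func=count_within_radius,
                                     working_memory=1)  # ~1 MiB per chunk
    counts = np.concatenate(list(gen))
    assert counts.shape == (1000,)

Because the reducer keeps only a vector per chunk, peak memory stays near the working_memory budget instead of the full n_samples x n_samples matrix.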
From 149eb8f5bc3ee09c31c13ea01b66f00dc5a4ec15 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 11 Dec 2017 12:50:48 +1100 Subject: [PATCH 67/96] fix flake --- sklearn/neighbors/base.py | 2 -- sklearn/utils/__init__.py | 1 - 2 files changed, 3 deletions(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index f1209c6b52f8f..80f98c9cd4224 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -520,14 +520,12 @@ def _radius_neighbors_reduce_func(self, dist, start, neigh_ind = [np.where(d <= radius)[0] for d in dist] if return_distance: - dist_array = np.empty(dist.shape[0], dtype='object') if self.effective_metric_ == 'euclidean': dist = [np.sqrt(d[neigh_ind[i]]) for i, d in enumerate(dist)] else: dist = [d[neigh_ind[i]] for i, d in enumerate(dist)] - results = dist, neigh_ind else: results = neigh_ind diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 632916cddfefc..3ffc0800e3d0c 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -4,7 +4,6 @@ from collections import Sequence import numpy as np -from scipy import sparse from scipy.sparse import issparse import warnings From c8afdb83825d21136f38b1e6d8f5e5abd6ac9e08 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 11 Dec 2017 12:51:35 +1100 Subject: [PATCH 68/96] Block -> chunks; generate_chunks helper And some cleaning --- sklearn/metrics/cluster/unsupervised.py | 33 ++++++------------------- sklearn/metrics/pairwise.py | 29 +++++++++------------- sklearn/utils/__init__.py | 27 +++++++++++++++----- 3 files changed, 40 insertions(+), 49 deletions(-) diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py index f191fcb85ee3a..25f03bd9d3094 100644 --- a/sklearn/metrics/cluster/unsupervised.py +++ b/sklearn/metrics/cluster/unsupervised.py @@ -13,7 +13,6 @@ from ...utils import check_random_state from ...utils import check_X_y -from ...utils import get_block_n_rows from ..pairwise import pairwise_distances_chunked from ...preprocessing import LabelEncoder @@ -106,27 +105,24 @@ def silhouette_score(X, labels, metric='euclidean', sample_size=None, return np.mean(silhouette_samples(X, labels, metric=metric, **kwds)) -def _silhouette_reduce(D_chunk, start, labels, label_freqs, add_at): - """Accumulate silhouette statistics for X[start:start+block_n_rows] +def _silhouette_reduce(D_chunk, start, labels, label_freqs): + """Accumulate silhouette statistics for vertical chunk of X Parameters ---------- D_chunk : shape (n_chunk_samples, n_samples) precomputed distances for a chunk start : int - first index in block + first index in chunk labels : array, shape (n_samples,) corresponding cluster labels, encoded as {0, ..., n_clusters-1} label_freqs : array distribution of cluster labels in ``labels`` - add_at : array, shape (block_n_rows * n_clusters,) - indices into a flattened array of shape (block_n_rows, n_clusters) - where distances from block points to each cluster are accumulated """ # accumulate distances from each sample to each cluster - clust_dists = np.bincount(add_at[:D_chunk.size], - D_chunk.ravel()) - clust_dists = clust_dists.reshape(-1, len(label_freqs)) + clust_dists = np.zeros((len(D_chunk), len(label_freqs)), + dtype=D_chunk.dtype) + np.add.at(clust_dists.T, labels, D_chunk.T) # intra_index selects intra-cluster distances within clust_dists intra_index = (np.arange(len(D_chunk)), labels[start:start + len(D_chunk)]) @@ -206,24 +202,9 @@ def silhouette_samples(X, labels, metric='euclidean', **kwds): label_freqs 
= np.bincount(labels) check_number_of_labels(len(le.classes_), n_samples) - block_n_rows = get_block_n_rows(row_bytes=n_samples * 8, - max_n_rows=n_samples) - intra_clust_dists = [] - inter_clust_dists = [] - - # We use these indices as bins to accumulate distances from each sample in - # a block to each cluster. - # NB: we currently use np.bincount but could use np.add.at when Numpy >=1.8 - # is minimum dependency, which would avoid materialising this index. - block_range = np.arange(block_n_rows) - add_at = np.ravel_multi_index((np.repeat(block_range, n_samples), - np.tile(labels, block_n_rows)), - dims=(block_n_rows, len(label_freqs))) - kwds['metric'] = metric reduce_func = functools.partial(_silhouette_reduce, - labels=labels, label_freqs=label_freqs, - add_at=add_at) + labels=labels, label_freqs=label_freqs) results = zip(*pairwise_distances_chunked(X, reduce_func=reduce_func, **kwds)) intra_clust_dists, inter_clust_dists = results diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 4634c594ae2c4..12cba107ab258 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -18,11 +18,11 @@ from scipy.sparse import csr_matrix from scipy.sparse import issparse +from ..utils.validation import _num_samples from ..utils import check_array from ..utils import gen_even_slices -from ..utils import get_block_n_rows +from ..utils import generate_chunks from ..utils.extmath import row_norms, safe_sparse_dot -from ..utils.validation import _num_samples from ..preprocessing import normalize from ..externals.joblib import Parallel from ..externals.joblib import delayed @@ -1191,26 +1191,21 @@ def pairwise_distances_chunked(X, Y=None, reduce_func=None, reduced distance matrix. """ - block_n_rows = get_block_n_rows(row_bytes=_num_samples(Y if Y is not None - else X) * 8, - max_n_rows=_num_samples(X), - working_memory=working_memory) + if metric == 'precomputed': + it = ((X, 0),) + else: + if Y is None: + Y = X + row_bytes = 8 * _num_samples(Y) + it = generate_chunks(X, row_bytes, working_memory) - if metric != 'precomputed' and Y is None: - Y = X - n_samples = X.shape[0] - for start in range(0, n_samples, block_n_rows): - # get distances from block to every other sample - stop = min(start + block_n_rows, n_samples) - if start == 0 and stop >= n_samples: - X_chunk = X # allow fast paths in pairwise_distances - else: - X_chunk = X[start:stop] + for X_chunk, start in it: D_chunk = pairwise_distances(X_chunk, Y, metric=metric, n_jobs=n_jobs, **kwds) if reduce_func is not None: + chunk_size = D_chunk.shape[0] D_chunk = reduce_func(D_chunk, start) - _check_chunk_size(D_chunk, stop - start) + _check_chunk_size(D_chunk, chunk_size) yield D_chunk diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 3ffc0800e3d0c..871a7fbff67d3 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -17,6 +17,7 @@ from ..externals.joblib import cpu_count from ..exceptions import DataConversionWarning from .deprecation import deprecated +from .validation import _num_samples from .. 
import get_config __all__ = ["murmurhash3_32", "as_float_array", @@ -508,7 +509,7 @@ def indices_to_mask(indices, mask_length): return mask -def get_block_n_rows(row_bytes, max_n_rows=None, +def get_chunk_n_rows(row_bytes, max_n_rows=None, working_memory=None): """Calculates the number of rows that fit in working_memory @@ -533,12 +534,26 @@ def get_block_n_rows(row_bytes, max_n_rows=None, if working_memory is None: return max_n_rows - block_n_rows = working_memory * (2 ** 20) // row_bytes + chunk_n_rows = working_memory * (2 ** 20) // row_bytes if max_n_rows is not None: - block_n_rows = min(block_n_rows, max_n_rows) - if block_n_rows < 1: + chunk_n_rows = min(chunk_n_rows, max_n_rows) + if chunk_n_rows < 1: warnings.warn('Could not adhere to working_memory config. ' 'Currently %dMiB, %.0fMiB required.' % (working_memory, np.ceil(row_bytes * 2 ** -20))) - block_n_rows = 1 - return block_n_rows + chunk_n_rows = 1 + return chunk_n_rows + + +def generate_chunks(X, row_bytes, working_memory=None): + n_samples = _num_samples(X) + chunk_n_rows = get_chunk_n_rows(row_bytes=row_bytes, + max_n_rows=n_samples, + working_memory=working_memory) + for start in range(0, n_samples, chunk_n_rows): + stop = min(start + chunk_n_rows, n_samples) + if start == 0 and stop >= n_samples: + X_chunk = X # potential for fast paths + else: + X_chunk = X[start:stop] + yield X_chunk, start From a1743137ef44350ade620f8efa49e706b6620c99 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 11 Dec 2017 13:00:58 +1100 Subject: [PATCH 69/96] DOC --- sklearn/metrics/pairwise.py | 11 +++++------ sklearn/utils/__init__.py | 22 ++++++++++++++++++++-- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 12cba107ab258..15f529c14358d 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1184,12 +1184,11 @@ def pairwise_distances_chunked(X, Y=None, reduce_func=None, If using a scipy.spatial.distance metric, the parameters are still metric dependent. See the scipy docs for usage examples. - Returns - ------- - D_chunks : generator - Each element in the generator is either a slice of distance matrix or a - reduced distance matrix. - + Yields + ------ + D_chunk : array or sparse matrix + A contiguous slice of distance matrix, optionally processed by + ``reduce_func``. """ if metric == 'precomputed': it = ((X, 0),) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 871a7fbff67d3..e520fe24ce4c8 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -516,7 +516,8 @@ def get_chunk_n_rows(row_bytes, max_n_rows=None, Parameters ---------- row_bytes : int - The number of bytes consumed by each row + The number of bytes consumed by each row of expected output + from some function being applied to each chunk. max_n_rows : int, optional The maximum return value. working_memory : int, optional @@ -525,7 +526,7 @@ def get_chunk_n_rows(row_bytes, max_n_rows=None, Returns ------- - int or n_samples + int or the value of n_samples """ if working_memory is None: @@ -546,6 +547,23 @@ def get_chunk_n_rows(row_bytes, max_n_rows=None, def generate_chunks(X, row_bytes, working_memory=None): + """Generates vertical chunks of X to process within constant memory + + Parameters + ---------- + X : array-like + row_bytes : int + The number of bytes consumed by each row of expected output + from some function being applied to each chunk. 
+ working_memory : int, optional + The number of rows to fit inside this number of MiB will be returned. + Defaults to ``sklearn.get_config()['working_memory']``. + + Yields + ------ + X_chunk : array-like + start : int + """ n_samples = _num_samples(X) chunk_n_rows = get_chunk_n_rows(row_bytes=row_bytes, max_n_rows=n_samples, From 16aabd55307511d1482afd4ef45dc075e28ed71c Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 11 Dec 2017 17:39:21 +1100 Subject: [PATCH 70/96] Remove redundant code --- sklearn/metrics/cluster/tests/test_unsupervised.py | 14 -------------- sklearn/metrics/cluster/unsupervised.py | 7 ------- sklearn/utils/__init__.py | 3 --- 3 files changed, 24 deletions(-) diff --git a/sklearn/metrics/cluster/tests/test_unsupervised.py b/sklearn/metrics/cluster/tests/test_unsupervised.py index 9d6e37843d438..07f73cd1c0012 100644 --- a/sklearn/metrics/cluster/tests/test_unsupervised.py +++ b/sklearn/metrics/cluster/tests/test_unsupervised.py @@ -81,20 +81,6 @@ def test_cluster_size_1(): assert_array_equal(ss, [0, .5, .5, 0, 1, 1]) -def test_no_nan(): - # Assert Silhouette Coefficient != nan when there is 1 sample in a class. - # This tests for the condition that caused issue #960. - # Note that there is only one sample in cluster 0. This used to cause the - # silhouette_score to return nan. - labels = np.array([1, 0, 1, 1, 1]) - # The distance matrix doesn't actually matter. - D = np.random.RandomState(0).rand(len(labels), len(labels)) - silhouette = silhouette_score(D, labels, metric='precomputed') - assert_false(np.isnan(silhouette)) - ss = silhouette_samples(D, labels, metric='precomputed') - assert_false(np.isnan(ss).any()) - - def test_silhouette_paper_example(): # Explicitly check per-sample results against Rousseeuw (1987) lower = [5.58, diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py index 25f03bd9d3094..4ef240b17607d 100644 --- a/sklearn/metrics/cluster/unsupervised.py +++ b/sklearn/metrics/cluster/unsupervised.py @@ -211,13 +211,6 @@ def silhouette_samples(X, labels, metric='euclidean', **kwds): intra_clust_dists = np.concatenate(intra_clust_dists) inter_clust_dists = np.concatenate(inter_clust_dists) - if len(intra_clust_dists) == 1: - intra_clust_dists = intra_clust_dists[0] - inter_clust_dists = inter_clust_dists[0] - else: - intra_clust_dists = np.hstack(intra_clust_dists) - inter_clust_dists = np.hstack(inter_clust_dists) - denom = (label_freqs - 1).take(labels, mode='clip') with np.errstate(divide="ignore", invalid="ignore"): intra_clust_dists /= denom diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index e520fe24ce4c8..3f137a460d06e 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -532,9 +532,6 @@ def get_chunk_n_rows(row_bytes, max_n_rows=None, if working_memory is None: working_memory = get_config()['working_memory'] - if working_memory is None: - return max_n_rows - chunk_n_rows = working_memory * (2 ** 20) // row_bytes if max_n_rows is not None: chunk_n_rows = min(chunk_n_rows, max_n_rows) From 31430181cde88dc18a64f589ec330dcbb8ee8dc8 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 11 Dec 2017 18:51:16 +1100 Subject: [PATCH 71/96] Document working_memory config --- doc/modules/computational_performance.rst | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/doc/modules/computational_performance.rst b/doc/modules/computational_performance.rst index d66cba212a2dd..bf411ee2d9004 100644 --- a/doc/modules/computational_performance.rst +++ 
b/doc/modules/computational_performance.rst
@@ -308,6 +308,26 @@ Debian / Ubuntu.
 or upgrade to Python 3.4 which has a new version of ``multiprocessing``
 that should be immune to this problem.
 
+Limiting Working Memory
+-----------------------
+
+Some calculations when implemented using standard numpy vectorized
+operations involve using a large amount of temporary memory.
+As well as potentially exhausting system memory, memory management
+can overwhelm computation time. Where computations can be performed
+in fixed-memory chunks, we attempt to do so, and allow the user to
+hint at the maximum size of this working memory (defaulting to 64 MiB)
+using :func:`sklearn.set_config` or :func:`config_context`.
+The following suggests to limit temporary working memory to 128 MiB::
+
+    >>> import sklearn
+    >>> with sklearn.config_context(working_memory=128):
+    ...     pass  # do chunked work here
+
+An example of a chunked operation adhering to this setting is
+:func:`metrics.pairwise_distances_chunked`, which facilitates computing
+row-wise reductions of a pairwise distance matrix.
+
 Model Compression
 -----------------

From a21794d1299043756374221e5a3d2e9f9f5425d3 Mon Sep 17 00:00:00 2001
From: Joel Nothman
Date: Mon, 11 Dec 2017 20:24:34 +1100
Subject: [PATCH 72/96] Use existing gen_batches

---
 sklearn/metrics/pairwise.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
index 15f529c14358d..43fdff9ece405 100644
--- a/sklearn/metrics/pairwise.py
+++ b/sklearn/metrics/pairwise.py
@@ -1190,20 +1190,26 @@ def pairwise_distances_chunked(X, Y=None, reduce_func=None,
         A contiguous slice of distance matrix, optionally processed by
         ``reduce_func``.
""" + n_samples = _num_samples(X) if metric == 'precomputed': - it = ((X, 0),) + slices = (slice(0, n_samples),) else: if Y is None: Y = X - row_bytes = 8 * _num_samples(Y) - it = generate_chunks(X, row_bytes, working_memory) - - for X_chunk, start in it: + chunk_n_rows = get_chunk_n_rows(row_bytes=8 * _num_samples(Y), + working_memory=working_memory) + slices = gen_batches(n_samples, chunk_n_rows) + + for sl in slices: + if sl.start == 0 and sl.stop == n_samples: + X_chunk = X # enable optimised paths for X is Y + else: + X_chunk = X[sl] D_chunk = pairwise_distances(X_chunk, Y, metric=metric, n_jobs=n_jobs, **kwds) if reduce_func is not None: chunk_size = D_chunk.shape[0] - D_chunk = reduce_func(D_chunk, start) + D_chunk = reduce_func(D_chunk, sl.start) _check_chunk_size(D_chunk, chunk_size) yield D_chunk From 60e01e3dd3bc3efccd0570c6802321f39baed6b2 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 11 Dec 2017 21:18:24 +1100 Subject: [PATCH 73/96] TST test_get_chunk_n_rows --- sklearn/metrics/pairwise.py | 7 +++--- sklearn/utils/__init__.py | 41 ++++++------------------------ sklearn/utils/tests/test_utils.py | 42 ++++++++++++++++++++++++++++++- 3 files changed, 52 insertions(+), 38 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 43fdff9ece405..2c04a8f28104d 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1190,18 +1190,19 @@ def pairwise_distances_chunked(X, Y=None, reduce_func=None, A contiguous slice of distance matrix, optionally processed by ``reduce_func``. """ - n_samples = _num_samples(X) + n_samples_X = _num_samples(X) if metric == 'precomputed': slices = (slice(0, n_samples),) else: if Y is None: Y = X chunk_n_rows = get_chunk_n_rows(row_bytes=8 * _num_samples(Y), + max_n_rows=n_samples_X, working_memory=working_memory) - slices = gen_batches(n_samples, chunk_n_rows) + slices = gen_batches(n_samples_X, chunk_n_rows) for sl in slices: - if sl.start == 0 and sl.stop == n_samples: + if sl.start == 0 and sl.stop == n_samples_X: X_chunk = X # enable optimised paths for X is Y else: X_chunk = X[sl] diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 3f137a460d06e..8e17b5670330e 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -520,55 +520,28 @@ def get_chunk_n_rows(row_bytes, max_n_rows=None, from some function being applied to each chunk. max_n_rows : int, optional The maximum return value. - working_memory : int, optional + working_memory : int or float, optional The number of rows to fit inside this number of MiB will be returned. Defaults to ``sklearn.get_config()['working_memory']``. Returns ------- int or the value of n_samples + + Warns + ----- + Issues a UserWarning if ``row_bytes`` exceeds ``working_memory`` MiB. """ if working_memory is None: working_memory = get_config()['working_memory'] - chunk_n_rows = working_memory * (2 ** 20) // row_bytes + chunk_n_rows = int(working_memory * (2 ** 20) // row_bytes) if max_n_rows is not None: chunk_n_rows = min(chunk_n_rows, max_n_rows) if chunk_n_rows < 1: warnings.warn('Could not adhere to working_memory config. ' - 'Currently %dMiB, %.0fMiB required.' % + 'Currently %.0fMiB, %.0fMiB required.' 
% (working_memory, np.ceil(row_bytes * 2 ** -20))) chunk_n_rows = 1 return chunk_n_rows - - -def generate_chunks(X, row_bytes, working_memory=None): - """Generates vertical chunks of X to process within constant memory - - Parameters - ---------- - X : array-like - row_bytes : int - The number of bytes consumed by each row of expected output - from some function being applied to each chunk. - working_memory : int, optional - The number of rows to fit inside this number of MiB will be returned. - Defaults to ``sklearn.get_config()['working_memory']``. - - Yields - ------ - X_chunk : array-like - start : int - """ - n_samples = _num_samples(X) - chunk_n_rows = get_chunk_n_rows(row_bytes=row_bytes, - max_n_rows=n_samples, - working_memory=working_memory) - for start in range(0, n_samples, chunk_n_rows): - stop = min(start + chunk_n_rows, n_samples) - if start == 0 and stop >= n_samples: - X_chunk = X # potential for fast paths - else: - X_chunk = X[start:stop] - yield X_chunk, start diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index fa93bf34fe6bc..1f1efed825c80 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -1,6 +1,7 @@ from itertools import chain, product import warnings +import pytest import numpy as np import scipy.sparse as sp from scipy.linalg import pinv2 @@ -9,7 +10,8 @@ from sklearn.utils.testing import (assert_equal, assert_raises, assert_true, assert_almost_equal, assert_array_equal, SkipTest, assert_raises_regex, - assert_greater_equal, ignore_warnings) + assert_greater_equal, ignore_warnings, + assert_warns_message, assert_no_warnings) from sklearn.utils import check_random_state from sklearn.utils import deprecated from sklearn.utils import resample @@ -18,9 +20,11 @@ from sklearn.utils import safe_indexing from sklearn.utils import shuffle from sklearn.utils import gen_even_slices +from sklearn.utils import get_chunk_n_rows from sklearn.utils.extmath import pinvh from sklearn.utils.arpack import eigsh from sklearn.utils.mocking import MockDataFrame +from sklearn import config_context def test_make_rng(): @@ -274,3 +278,39 @@ def test_gen_even_slices(): slices = gen_even_slices(10, -1) assert_raises_regex(ValueError, "gen_even_slices got n_packs=-1, must be" " >=1", next, slices) + + +@pytest.mark.parametrize( + ('row_bytes', 'max_n_rows', 'working_memory', 'expected', 'warning'), + [(1024, None, 1, 1024, None), + (1024, None, 0.99999999, 1023, None), + (1023, None, 1, 1025, None), + (1025, None, 1, 1023, None), + (1024, None, 2, 2048, None), + (1024, 7, 1, 7, None), + (1024 * 1024, None, 1, 1, None), + (1024 * 1024 + 1, None, 1, 1, + 'Could not adhere to working_memory config. 
' + 'Currently 1MiB, 2MiB required.'), + ]) +def test_get_chunk_n_rows(row_bytes, max_n_rows, working_memory, + expected, warning): + if warning is not None: + def check_warning(*args, **kw): + return assert_warns_message(UserWarning, warning, *args, **kw) + else: + check_warning = assert_no_warnings + + actual = check_warning(get_chunk_n_rows, + row_bytes=row_bytes, + max_n_rows=max_n_rows, + working_memory=working_memory) + + assert actual == expected + assert type(actual) is type(expected) + with config_context(working_memory=working_memory): + actual = check_warning(get_chunk_n_rows, + row_bytes=row_bytes, + max_n_rows=max_n_rows) + assert actual == expected + assert type(actual) is type(expected) From 0a79c39d6867655836beb4f247a753e0c2fd2dc6 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 11 Dec 2017 23:24:07 +1100 Subject: [PATCH 74/96] DOC What's new --- doc/whats_new/v0.20.rst | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index b85c11548854a..290209b037481 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -74,6 +74,13 @@ Model evaluation ``'balanced_accuracy'`` scorer for binary classification. :issue:`8066` by :user:`xyguo` and :user:`Aman Dalmia `. +Misc + +- A new configuration parameter, ``working_memory`` was added to control memory + consumption limits in chunked operations, such as the new + :func:`metrics.pairwise_distances_chunked`. See :ref:`working_memory`. + :issue:`10280` by `Joel Nothman`_ and :user:`Aman Dalmia `. + Enhancements ............ @@ -108,6 +115,10 @@ Classifiers and regressors :class:`linear_model.BayesianRidge` for weighted linear regression. :issue:`10111` by :user:`Peter St. John `. +- :mod:`Nearest neighbors ` query methods are now more memory + efficient when ``algorithm='brute'``. :issue:`10280` by `Joel Nothman`_ + and :user:`Aman Dalmia `. + Model evaluation and meta-estimators - A scorer based on :func:`metrics.brier_score_loss` is also available. @@ -118,6 +129,10 @@ Metrics - :func:`metrics.roc_auc_score` now supports binary ``y_true`` other than ``{0, 1}`` or ``{-1, 1}``. :issue:`9828` by :user:`Hanmin Qin `. +- :func:`metrics.cluster.silhouette_score` and + :func:`metrics.cluster.silhouette_samples` are more memory efficient, + causing them to run faster. :issue:`10280` by `Joel Nothman`_. + Linear, kernelized and related models - Deprecate ``random_state`` parameter in :class:`svm.OneClassSVM` as the @@ -255,6 +270,12 @@ Metrics due to floating point error in the input. :issue:`9851` by :user:`Hanmin Qin `. +- The ``batch_size`` parameter to :func:`metrics.pairwise_distances_argmin_min` + and :func:`metrics.pairwise_distances_argmin` is deprecated to be removed in + v0.22. It no longer has any effect, as batch size is determined by global + ``working_memory`` config. See :ref:`working_memory`. :issue:`10280` by `Joel + Nothman`_ and :user:`Aman Dalmia `. 
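A sketch of the migration this deprecation entry implies, using only the
public ``config_context`` API (the arrays and the 64 MiB value here are
illustrative, not taken from the patch)::

    >>> import numpy as np
    >>> from sklearn import config_context
    >>> from sklearn.metrics import pairwise_distances_argmin
    >>> X = np.random.RandomState(0).rand(20, 3)
    >>> Y = np.random.RandomState(1).rand(30, 3)
    >>> # batch_size no longer has any effect; bound memory globally instead
    >>> with config_context(working_memory=64):
    ...     idx = pairwise_distances_argmin(X, Y)
    >>> idx.shape
    (20,)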
+ Cluster - Deprecate ``pooling_func`` unused parameter in From df94645a0069e4bf9f848a11b53cebcf493a3c48 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 11 Dec 2017 23:25:01 +1100 Subject: [PATCH 75/96] TST improve pairwise_distances_chunked testing --- sklearn/metrics/pairwise.py | 14 ++++-- sklearn/metrics/tests/test_pairwise.py | 60 +++++++++++++++++++++++++- 2 files changed, 68 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 2c04a8f28104d..54cf9c1e54389 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1115,11 +1115,17 @@ def _check_chunk_size(reduced, chunk_size): is_tuple = isinstance(reduced, tuple) if not is_tuple: reduced = (reduced,) - if not np.all(len(r) == chunk_size for r in reduced): - actual_size = tuple(map(len, reduced)) if is_tuple else len(reduced) + if any(isinstance(r, tuple) or not hasattr(r, '__iter__') + for r in reduced): + raise TypeError('reduce_func returned %r. ' + 'Expected sequence(s) of length %d.' % + (reduced if is_tuple else reduced[0], chunk_size)) + if any(_num_samples(r) != chunk_size for r in reduced): + actual_size = tuple(map(_num_samples, reduced)) raise ValueError('reduce_func returned object of length %s. ' 'Expected same length as input: %d.' % - (actual_size, chunk_size)) + (actual_size if is_tuple else actual_size[0], + chunk_size)) def pairwise_distances_chunked(X, Y=None, reduce_func=None, @@ -1192,7 +1198,7 @@ def pairwise_distances_chunked(X, Y=None, reduce_func=None, """ n_samples_X = _num_samples(X) if metric == 'precomputed': - slices = (slice(0, n_samples),) + slices = (slice(0, n_samples_X),) else: if Y is None: Y = X diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index b2de8e01dcf59..9791ea1ed25b8 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -1,5 +1,6 @@ import numpy as np from numpy import linalg +import pytest from scipy.sparse import dok_matrix, csr_matrix, issparse from scipy.spatial.distance import cosine, cityblock, minkowski, wminkowski @@ -389,15 +390,56 @@ def test_pairwise_distances_chunked_reduce(): # Reduced Euclidean distance S = pairwise_distances(X)[:, :100] S_chunks = pairwise_distances_chunked(X, None, reduce_func=_reduce_func, - working_memory=1) + working_memory=2 ** -16) + assert hasattr(S_chunks, '__next__') + S_chunks = list(S_chunks) + assert len(S_chunks) > 1 assert_array_almost_equal(S, np.vstack(S_chunks)) +@pytest.mark.parametrize('good_reduce', [ + lambda D, start: list(D), + lambda D, start: np.array(D), + lambda D, start: csr_matrix(D), + lambda D, start: (list(D), list(D)), + lambda D, start: (dok_matrix(D), np.array(D), list(D)), + lambda D, start: 'abcdefghijklmnopqrstuvwxyz'[:len(D)], + ]) +def test_pairwise_distances_chunked_reduce_valid(good_reduce): + X = np.arange(10).reshape(-1, 1) + S_chunks = pairwise_distances_chunked(X, None, reduce_func=good_reduce, + working_memory=64) + next(S_chunks) + + +@pytest.mark.parametrize(('bad_reduce', 'err_type', 'message'), [ + (lambda D, s: np.concatenate([D, D[-1:]]), ValueError, + r'length 11\..* input: 10\.'), + (lambda D, s: (D, np.concatenate([D, D[-1:]])), ValueError, + r'length \(10, 11\)\..* input: 10\.'), + (lambda D, s: (D[:9], D), ValueError, + r'length \(9, 10\)\..* input: 10\.'), + (lambda D, s: 7, TypeError, + r'returned 7\. Expected sequence\(s\) of length 10\.'), + (lambda D, s: (7, 8), TypeError, + r'returned \(7, 8\)\. 
Expected sequence\(s\) of length 10\.'), + (lambda D, s: (np.arange(10), 9), TypeError, + r', 9\)\. Expected sequence\(s\) of length 10\.'), +]) +def test_pairwise_distances_chunked_reduce_invalid(bad_reduce, err_type, + message): + X = np.arange(10).reshape(-1, 1) + S_chunks = pairwise_distances_chunked(X, None, reduce_func=bad_reduce, + working_memory=64) + assert_raises_regexp(err_type, message, next, S_chunks) + + def check_pairwise_distances_chunked(X, Y, working_memory, metric='euclidean'): gen = pairwise_distances_chunked(X, Y, working_memory=working_memory, metric=metric) + assert hasattr(gen, '__next__') blockwise_distances = list(gen) - min_block_mib = X.shape[0] * 8 * 2 ** -20 + min_block_mib = np.array(X).shape[0] * 8 * 2 ** -20 working_memory = min(working_memory, min_block_mib) for block in blockwise_distances: @@ -416,10 +458,15 @@ def test_pairwise_distances_chunked(): X = rng.random_sample((400, 4)) check_pairwise_distances_chunked(X, None, working_memory=1, metric='euclidean') + # X as list + check_pairwise_distances_chunked(X.tolist(), None, working_memory=1, + metric='euclidean') # Euclidean distance, with Y != X. Y = rng.random_sample((200, 4)) check_pairwise_distances_chunked(X, Y, working_memory=1, metric='euclidean') + check_pairwise_distances_chunked(X.tolist(), Y.tolist(), working_memory=1, + metric='euclidean') # absurdly large working_memory check_pairwise_distances_chunked(X, Y, working_memory=10000, metric='euclidean') @@ -431,6 +478,15 @@ def test_pairwise_distances_chunked(): assert_raises(ValueError, next, pairwise_distances_chunked(X, Y, metric="blah")) + # Test precomputed returns all at once + D = pairwise_distances(X) + gen = pairwise_distances_chunked(D, + working_memory=2 ** -16, + metric='precomputed') + assert hasattr(gen, '__next__') + assert next(gen) is D + assert_raises(StopIteration, next, gen) + def test_euclidean_distances(): # Check the pairwise Euclidean distances computation From a2b2b0a144e9d61ad7e975b9fd0fc3c759d50208 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 11 Dec 2017 23:39:09 +1100 Subject: [PATCH 76/96] Remove unused import --- sklearn/utils/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 8e17b5670330e..6265c65d41541 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -17,7 +17,6 @@ from ..externals.joblib import cpu_count from ..exceptions import DataConversionWarning from .deprecation import deprecated -from .validation import _num_samples from .. 
import get_config __all__ = ["murmurhash3_32", "as_float_array", From e97ef8b1d58229ee90c2bb39182749cefa5ab797 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 12 Dec 2017 00:01:00 +1100 Subject: [PATCH 77/96] Try fix tests for Python 2 --- sklearn/metrics/tests/test_pairwise.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 9791ea1ed25b8..2bc74d25be6ad 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -1,3 +1,5 @@ +from types import GeneratorType + import numpy as np from numpy import linalg import pytest @@ -391,7 +393,7 @@ def test_pairwise_distances_chunked_reduce(): S = pairwise_distances(X)[:, :100] S_chunks = pairwise_distances_chunked(X, None, reduce_func=_reduce_func, working_memory=2 ** -16) - assert hasattr(S_chunks, '__next__') + assert isinstance(S_chunks, GeneratorType) S_chunks = list(S_chunks) assert len(S_chunks) > 1 assert_array_almost_equal(S, np.vstack(S_chunks)) @@ -403,7 +405,6 @@ def test_pairwise_distances_chunked_reduce(): lambda D, start: csr_matrix(D), lambda D, start: (list(D), list(D)), lambda D, start: (dok_matrix(D), np.array(D), list(D)), - lambda D, start: 'abcdefghijklmnopqrstuvwxyz'[:len(D)], ]) def test_pairwise_distances_chunked_reduce_valid(good_reduce): X = np.arange(10).reshape(-1, 1) @@ -437,7 +438,7 @@ def test_pairwise_distances_chunked_reduce_invalid(bad_reduce, err_type, def check_pairwise_distances_chunked(X, Y, working_memory, metric='euclidean'): gen = pairwise_distances_chunked(X, Y, working_memory=working_memory, metric=metric) - assert hasattr(gen, '__next__') + assert isinstance(gen, GeneratorType) blockwise_distances = list(gen) min_block_mib = np.array(X).shape[0] * 8 * 2 ** -20 working_memory = min(working_memory, min_block_mib) @@ -483,7 +484,7 @@ def test_pairwise_distances_chunked(): gen = pairwise_distances_chunked(D, working_memory=2 ** -16, metric='precomputed') - assert hasattr(gen, '__next__') + assert isinstance(gen, GeneratorType) assert next(gen) is D assert_raises(StopIteration, next, gen) From 26cf342707998ae9446688340e0963ce60dff430 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 12 Dec 2017 06:53:37 +1100 Subject: [PATCH 78/96] Fix appveyor failure --- sklearn/metrics/pairwise.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 54cf9c1e54389..e1558be3cf2be 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1121,7 +1121,9 @@ def _check_chunk_size(reduced, chunk_size): 'Expected sequence(s) of length %d.' % (reduced if is_tuple else reduced[0], chunk_size)) if any(_num_samples(r) != chunk_size for r in reduced): - actual_size = tuple(map(_num_samples, reduced)) + # XXX: we use int(_num_samples...) because sometimes _num_samples + # returns a long in Python 2, even for small numbers. + actual_size = tuple(int(_num_samples(r)) for r in reduced) raise ValueError('reduce_func returned object of length %s. ' 'Expected same length as input: %d.' 
% (actual_size if is_tuple else actual_size[0],

From 011021d06d92197f98d5729f546ea62edaaae8e0 Mon Sep 17 00:00:00 2001
From: Joel Nothman
Date: Wed, 13 Dec 2017 13:57:36 +1100
Subject: [PATCH 79/96] Respond to Roman

---
 sklearn/metrics/pairwise.py | 9 +++++++++
 sklearn/utils/__init__.py   | 6 +++---
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
index e1558be3cf2be..54351dad0216d 100644
--- a/sklearn/metrics/pairwise.py
+++ b/sklearn/metrics/pairwise.py
@@ -1204,6 +1204,15 @@ def pairwise_distances_chunked(X, Y=None, reduce_func=None,
     else:
         if Y is None:
             Y = X
+        # We get as many rows as possible within our working_memory budget to
+        # store len(Y) distances in each row of output.
+        #
+        # Note:
+        #  - this will get at least 1 row, even if 1 row of distances will
+        #    exceed working_memory.
+        #  - this does not account for any temporary memory usage while
+        #    calculating distances (e.g. difference of vectors in manhattan
+        #    distance).
         chunk_n_rows = get_chunk_n_rows(row_bytes=8 * _num_samples(Y),
                                         max_n_rows=n_samples_X,
                                         working_memory=working_memory)
diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py
index 6265c65d41541..682db2b38cbde 100644
--- a/sklearn/utils/__init__.py
+++ b/sklearn/utils/__init__.py
@@ -510,13 +510,13 @@ def indices_to_mask(indices, mask_length):
 
 def get_chunk_n_rows(row_bytes, max_n_rows=None,
                      working_memory=None):
-    """Calculates the number of rows that fit in working_memory
+    """Calculates how many rows can be processed within working_memory
 
     Parameters
    ----------
     row_bytes : int
-        The number of bytes consumed by each row of expected output
-        from some function being applied to each chunk.
+        The expected number of bytes of memory that will be consumed
+        during the processing of each row.
     max_n_rows : int, optional

From ca536f6e7c59a4ee10670848aa3203ece7f55821 Mon Sep 17 00:00:00 2001
From: Joel Nothman
Date: Wed, 13 Dec 2017 21:47:05 +1100
Subject: [PATCH 80/96] See also

---
 sklearn/metrics/pairwise.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
index 54351dad0216d..e654fe9c12a65 100644
--- a/sklearn/metrics/pairwise.py
+++ b/sklearn/metrics/pairwise.py
@@ -1311,6 +1311,11 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds):
         If Y is not None, then D_{i, j} is the distance between the ith array
         from X and the jth array from Y.
 
+    See also
+    --------
+    pairwise_distances_chunked : performs the same calculation as this function,
+        but returns a generator of chunks of the distance matrix, in order to
+        limit memory usage.
""" if (metric not in _VALID_METRICS and not callable(metric) and metric != "precomputed"): From 57875cf449c8aa934ea3a59fa729998f26547322 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 9 Jan 2018 09:55:26 +1100 Subject: [PATCH 81/96] use dict for global config --- sklearn/_config.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/_config.py b/sklearn/_config.py index 31a5813f39d6a..7c42be3ca0a0f 100644 --- a/sklearn/_config.py +++ b/sklearn/_config.py @@ -3,8 +3,10 @@ import os from contextlib import contextmanager as contextmanager -_ASSUME_FINITE = bool(os.environ.get('SKLEARN_ASSUME_FINITE', False)) -_WORKING_MEMORY = int(os.environ.get('SKLEARN_WORKING_MEMORY', 64)) +_global_config = { + 'assume_finite': bool(os.environ.get('SKLEARN_ASSUME_FINITE', False)), + 'working_memory': int(os.environ.get('SKLEARN_WORKING_MEMORY', 64)) +} def get_config(): @@ -15,8 +17,7 @@ def get_config(): config : dict Keys are parameter names that can be passed to :func:`set_config`. """ - return {'assume_finite': _ASSUME_FINITE, - 'working_memory': _WORKING_MEMORY} + return _global_config.copy() def set_config(assume_finite=None, working_memory=None): @@ -36,11 +37,10 @@ def set_config(assume_finite=None, working_memory=None): computation time and memory on expensive operations that can be performed in chunks. Global default: 64. """ - global _ASSUME_FINITE, _WORKING_MEMORY if assume_finite is not None: - _ASSUME_FINITE = assume_finite + _global_config['assume_finite'] = assume_finite if working_memory is not None: - _WORKING_MEMORY = working_memory + _global_config['working_memory'] = working_memory @contextmanager From 06e841354d6ec7a0c7884b6b7b3d611be468e9f9 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 9 Jan 2018 10:14:39 +1100 Subject: [PATCH 82/96] Minor responses to Roman --- sklearn/metrics/pairwise.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 8068abaabc475..b403c3feb9718 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -265,7 +265,7 @@ def _argmin_min_reduce(dist, start): def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", - metric_kwargs=None, batch_size=None): + batch_size=None, metric_kwargs=None): """Compute minimum distances between one point and a set of points. This function computes for each row in X, the index of the row of Y which @@ -316,14 +316,14 @@ def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", See the documentation for scipy.spatial.distance for details on these metrics. - metric_kwargs : dict, optional - Keyword arguments to pass to specified metric function. - batch_size : integer .. deprecated:: 0.20 Deprecated for removal in 0.22. Use sklearn.set_config(working_memory=...) instead. + metric_kwargs : dict, optional + Keyword arguments to pass to specified metric function. 
+ Returns ------- argmin : numpy.ndarray @@ -1112,6 +1112,8 @@ def _pairwise_callable(X, Y, metric, **kwds): def _check_chunk_size(reduced, chunk_size): + """Checks chunk is a sequence of expected size or a tuple of same + """ is_tuple = isinstance(reduced, tuple) if not is_tuple: reduced = (reduced,) From 7d45f17d63175e5309a1b5861fd72febd6930c87 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 9 Jan 2018 23:55:18 +1100 Subject: [PATCH 83/96] Add pairwise_distances_chunked examples --- sklearn/metrics/pairwise.py | 42 +++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index b403c3feb9718..350443bf786d5 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1199,6 +1199,48 @@ def pairwise_distances_chunked(X, Y=None, reduce_func=None, D_chunk : array or sparse matrix A contiguous slice of distance matrix, optionally processed by ``reduce_func``. + + Example + ------- + + Without reduce_func: + + >>> X = np.random.RandomState(0).rand(5, 3) + >>> D_chunk = next(pairwise_distances_chunked(X)) + >>> D_chunk # doctest: +ELLIPSIS + array([[ 0. ..., 0.29..., 0.41..., 0.19..., 0.57...], + [ 0.29..., 0. ..., 0.57..., 0.41..., 0.76...], + [ 0.41..., 0.57..., 0. ..., 0.44..., 0.90...], + [ 0.19..., 0.41..., 0.44..., 0. ..., 0.51...], + [ 0.57..., 0.76..., 0.90..., 0.51..., 0. ...]]) + + Retrieve all neighbors and average distance within radius r: + + >>> r = .2 + >>> def reduce_func(D_chunk, start): + ... neigh = [np.flatnonzero(d < r) for d in D_chunk] + ... avg_dist = np.ma.masked_array(D_chunk, D_chunk < r).mean(axis=1) + ... return neigh, avg_dist + >>> gen = pairwise_distances_chunked(X, reduce_func=reduce_func) + >>> neigh, avg_dist = next(gen) + >>> neigh + [array([0, 3]), array([1]), array([2]), array([0, 3]), array([4])] + >>> avg_dist.data # doctest: +ELLIPSIS + array([ 0.427..., 0.513..., 0.586..., 0.459... , 0.687...]) + + Where r is defined per sample, we need to make use of ``start``: + + >>> r = [.2, .4, .4, .3, .1] + >>> def reduce_func(D_chunk, start): + ... neigh = [np.flatnonzero(d < r[i]) + ... for i, d in enumerate(D_chunk, start)] + ... return neigh + >>> neigh = next(pairwise_distances_chunked(X, reduce_func=reduce_func)) + >>> neigh + [array([0, 3]), array([0, 1]), array([2]), array([0, 3]), array([4])] + + case where we want to get the indices and average + distance of the neighborhood for each point in X, but the radius """ n_samples_X = _num_samples(X) if metric == 'precomputed': From 214784d0c7e68214638ac188470a1d97cb44a587 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 9 Jan 2018 23:57:14 +1100 Subject: [PATCH 84/96] Remove junk in docs --- sklearn/metrics/pairwise.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 350443bf786d5..c10a25e94868a 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1200,9 +1200,8 @@ def pairwise_distances_chunked(X, Y=None, reduce_func=None, A contiguous slice of distance matrix, optionally processed by ``reduce_func``. 
- Example + Examples ------- - Without reduce_func: >>> X = np.random.RandomState(0).rand(5, 3) @@ -1238,9 +1237,6 @@ def pairwise_distances_chunked(X, Y=None, reduce_func=None, >>> neigh = next(pairwise_distances_chunked(X, reduce_func=reduce_func)) >>> neigh [array([0, 3]), array([0, 1]), array([2]), array([0, 3]), array([4])] - - case where we want to get the indices and average - distance of the neighborhood for each point in X, but the radius """ n_samples_X = _num_samples(X) if metric == 'precomputed': From 75a2eab5438e1ca80c9244c1edbe7bdf96afeac0 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 10 Jan 2018 00:00:38 +1100 Subject: [PATCH 85/96] Illustrate actual chunking --- sklearn/metrics/pairwise.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index c10a25e94868a..ec458773b65d2 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1237,6 +1237,15 @@ def pairwise_distances_chunked(X, Y=None, reduce_func=None, >>> neigh = next(pairwise_distances_chunked(X, reduce_func=reduce_func)) >>> neigh [array([0, 3]), array([0, 1]), array([2]), array([0, 3]), array([4])] + + Force row-by-row generation by reducing ``working_memory``: + + >>> gen = pairwise_distances_chunked(X, reduce_func=reduce_func, + ... working_memory=0) + >>> next(gen) + [array([0, 3])] + >>> next(gen) + [array([0, 1])] """ n_samples_X = _num_samples(X) if metric == 'precomputed': From f8badad95b2be0a731802f1074a073c524448d65 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 10 Jan 2018 07:30:22 +1100 Subject: [PATCH 86/96] Fix up pairwise_distances_argmin arg ordering --- sklearn/metrics/pairwise.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index ec458773b65d2..8bfaefc8c6c30 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -363,7 +363,7 @@ def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", def pairwise_distances_argmin(X, Y, axis=1, metric="euclidean", - metric_kwargs=None, batch_size=None): + batch_size=None, metric_kwargs=None): """Compute minimum distances between one point and a set of points. This function computes for each row in X, the index of the row of Y which @@ -417,7 +417,9 @@ def pairwise_distances_argmin(X, Y, axis=1, metric="euclidean", metrics. batch_size : integer - Deprecated. Use sklearn.set_config(working_memory=...) instead. + .. deprecated:: 0.20 + Deprecated for removal in 0.22. + Use sklearn.set_config(working_memory=...) instead. metric_kwargs : dict keyword arguments to pass to specified metric function. 
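After this reorder, passing ``metric_kwargs`` positionally would land in the
``batch_size`` slot instead; passing it by keyword, as below, is unaffected
(a hypothetical call site, values are illustrative)::

    >>> import numpy as np
    >>> from sklearn.metrics import pairwise_distances_argmin
    >>> X, Y = np.ones((3, 2)), np.zeros((4, 2))
    >>> # pass metric parameters by keyword, robust to signature reordering
    >>> pairwise_distances_argmin(X, Y, metric='minkowski',
    ...                           metric_kwargs={'p': 1})
    array([0, 0, 0])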
@@ -435,7 +437,8 @@ def pairwise_distances_argmin(X, Y, axis=1, metric="euclidean", if metric_kwargs is None: metric_kwargs = {} - return pairwise_distances_argmin_min(X, Y, axis, metric, metric_kwargs, + return pairwise_distances_argmin_min(X, Y, axis, metric, + metric_kwargs=metric_kwargs, batch_size=batch_size)[0] From 29a4c64b971f052139c653ecb098d9025c045fa5 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 10 Jan 2018 10:35:54 +1100 Subject: [PATCH 87/96] More nuanced comment on memory-speed tradeoffs --- doc/modules/computational_performance.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/modules/computational_performance.rst b/doc/modules/computational_performance.rst index bf411ee2d9004..ba811a4865aa4 100644 --- a/doc/modules/computational_performance.rst +++ b/doc/modules/computational_performance.rst @@ -314,7 +314,8 @@ Limiting Working Memory Some calculations when implemented using standard numpy vectorized operations involve using a large amount of temporary memory. As well as potentially exhausting system memory, memory management -can overwhelm computation time. Where computations can be performed +can overwhelm computation time, or at least, using more memory provides +negligible speed benefit. Where computations can be performed in fixed-memory chunks, we attempt to do so, and allow the user to hint at the maximum size of this working memory (defaulting to 64 MiB) using :func:`sklearn.set_config` or :func:`config_context`. From 73e3b33e6b98275d992a5eea893c699ebf5b9c6d Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 15 Jan 2018 09:40:40 +1100 Subject: [PATCH 88/96] Increase default working memory to 1GiB --- doc/modules/computational_performance.rst | 2 +- sklearn/_config.py | 6 +++--- sklearn/tests/test_config.py | 7 ++++--- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/doc/modules/computational_performance.rst b/doc/modules/computational_performance.rst index ba811a4865aa4..b2d54b00ed06b 100644 --- a/doc/modules/computational_performance.rst +++ b/doc/modules/computational_performance.rst @@ -317,7 +317,7 @@ As well as potentially exhausting system memory, memory management can overwhelm computation time, or at least, using more memory provides negligible speed benefit. Where computations can be performed in fixed-memory chunks, we attempt to do so, and allow the user to -hint at the maximum size of this working memory (defaulting to 64 MiB) +hint at the maximum size of this working memory (defaulting to 1GB) using :func:`sklearn.set_config` or :func:`config_context`. The following suggests to limit temporary working memory to 128 MiB:: diff --git a/sklearn/_config.py b/sklearn/_config.py index 7c42be3ca0a0f..2b8a2e795bf86 100644 --- a/sklearn/_config.py +++ b/sklearn/_config.py @@ -5,7 +5,7 @@ _global_config = { 'assume_finite': bool(os.environ.get('SKLEARN_ASSUME_FINITE', False)), - 'working_memory': int(os.environ.get('SKLEARN_WORKING_MEMORY', 64)) + 'working_memory': int(os.environ.get('SKLEARN_WORKING_MEMORY', 1024)) } @@ -35,7 +35,7 @@ def set_config(assume_finite=None, working_memory=None): If set, scikit-learn will attempt to limit the size of temporary arrays to this number of MiB (per job when parallelised), often saving both computation time and memory on expensive operations that can be - performed in chunks. Global default: 64. + performed in chunks. Global default: 1024. 
""" if assume_finite is not None: _global_config['assume_finite'] = assume_finite @@ -59,7 +59,7 @@ def config_context(**new_config): If set, scikit-learn will attempt to limit the size of temporary arrays to this number of MiB (per job when parallelised), often saving both computation time and memory on expensive operations that can be - performed in chunks. Global default: 64. + performed in chunks. Global default: 1024. Notes ----- diff --git a/sklearn/tests/test_config.py b/sklearn/tests/test_config.py index 500b0c4459ac0..6c2cecce0cfd5 100644 --- a/sklearn/tests/test_config.py +++ b/sklearn/tests/test_config.py @@ -3,7 +3,8 @@ def test_config_context(): - assert_equal(get_config(), {'assume_finite': False, 'working_memory': 64}) + assert_equal(get_config(), + {'assume_finite': False, 'working_memory': 1024}) # Not using as a context manager affects nothing config_context(assume_finite=True) @@ -11,7 +12,7 @@ def test_config_context(): with config_context(assume_finite=True): assert_equal(get_config(), {'assume_finite': True, - 'working_memory': 64}) + 'working_memory': 1024}) assert_equal(get_config()['assume_finite'], False) with config_context(assume_finite=True): @@ -36,7 +37,7 @@ def test_config_context(): assert_equal(get_config()['assume_finite'], True) assert_equal(get_config(), {'assume_finite': False, - 'working_memory': 64}) + 'working_memory': 1024}) # No positional arguments assert_raises(TypeError, config_context, True) From b70e087d37eb5dfd812a60fbc2e1f3558ae0c8e0 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 11 Feb 2018 12:28:00 +1100 Subject: [PATCH 89/96] Undo changes to neighbors and silhouette Stashing them for later --- doc/whats_new/v0.20.rst | 11 -- .../cluster/tests/test_unsupervised.py | 10 +- sklearn/metrics/cluster/unsupervised.py | 98 ++++++-------- sklearn/neighbors/base.py | 126 +++++++----------- sklearn/neighbors/tests/test_neighbors.py | 1 + 5 files changed, 99 insertions(+), 147 deletions(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 484b74388592c..6042107bd6bf8 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -54,9 +54,6 @@ Classifiers and regressors Naive Bayes classifier described in Rennie et al. (2003). :issue:`8190` by :user:`Michael A. Alcorn `. -- Added :class:`multioutput.RegressorChain` for multi-target - regression. :issue:`9257` by :user:`Kumar Ashutosh `. - Preprocessing - Added :class:`preprocessing.CategoricalEncoder`, which allows to encode @@ -128,10 +125,6 @@ Classifiers and regressors :class:`linear_model.BayesianRidge` for weighted linear regression. :issue:`10111` by :user:`Peter St. John `. -- :mod:`Nearest neighbors ` query methods are now more memory - efficient when ``algorithm='brute'``. :issue:`10280` by `Joel Nothman`_ - and :user:`Aman Dalmia `. - - :class:`dummy.DummyClassifier` and :class:`dummy.DummyRegresssor` now only require X to be an object with finite length or shape. :issue:`9832` by :user:`Vrishank Bhardwaj `. @@ -155,10 +148,6 @@ Metrics - :func:`metrics.roc_auc_score` now supports binary ``y_true`` other than ``{0, 1}`` or ``{-1, 1}``. :issue:`9828` by :user:`Hanmin Qin `. -- :func:`metrics.cluster.silhouette_score` and - :func:`metrics.cluster.silhouette_samples` are more memory efficient, - causing them to run faster. :issue:`10280` by `Joel Nothman`_. 
- Linear, kernelized and related models - Deprecate ``random_state`` parameter in :class:`svm.OneClassSVM` as the diff --git a/sklearn/metrics/cluster/tests/test_unsupervised.py b/sklearn/metrics/cluster/tests/test_unsupervised.py index 07db1e69913b9..6df7e3c193d52 100644 --- a/sklearn/metrics/cluster/tests/test_unsupervised.py +++ b/sklearn/metrics/cluster/tests/test_unsupervised.py @@ -12,8 +12,8 @@ from sklearn.utils.testing import assert_greater from sklearn.metrics.cluster import silhouette_score from sklearn.metrics.cluster import silhouette_samples -from sklearn.metrics.cluster import calinski_harabaz_score from sklearn.metrics import pairwise_distances +from sklearn.metrics.cluster import calinski_harabaz_score def test_silhouette(): @@ -136,15 +136,15 @@ def test_correct_labelsize(): # n_labels = n_samples y = np.arange(X.shape[0]) assert_raises_regexp(ValueError, - 'Number of labels is %d\. Valid values are 2 ' - 'to n_samples - 1 \(inclusive\)' % len(np.unique(y)), + r'Number of labels is %d\. Valid values are 2 ' + r'to n_samples - 1 \(inclusive\)' % len(np.unique(y)), silhouette_score, X, y) # n_labels = 1 y = np.zeros(X.shape[0]) assert_raises_regexp(ValueError, - 'Number of labels is %d\. Valid values are 2 ' - 'to n_samples - 1 \(inclusive\)' % len(np.unique(y)), + r'Number of labels is %d\. Valid values are 2 ' + r'to n_samples - 1 \(inclusive\)' % len(np.unique(y)), silhouette_score, X, y) diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py index 4ef240b17607d..f4da109f16e2c 100644 --- a/sklearn/metrics/cluster/unsupervised.py +++ b/sklearn/metrics/cluster/unsupervised.py @@ -5,15 +5,11 @@ # Thierry Guillemot # License: BSD 3 clause -from __future__ import division - -import functools - import numpy as np from ...utils import check_random_state from ...utils import check_X_y -from ..pairwise import pairwise_distances_chunked +from ..pairwise import pairwise_distances from ...preprocessing import LabelEncoder @@ -105,36 +101,6 @@ def silhouette_score(X, labels, metric='euclidean', sample_size=None, return np.mean(silhouette_samples(X, labels, metric=metric, **kwds)) -def _silhouette_reduce(D_chunk, start, labels, label_freqs): - """Accumulate silhouette statistics for vertical chunk of X - - Parameters - ---------- - D_chunk : shape (n_chunk_samples, n_samples) - precomputed distances for a chunk - start : int - first index in chunk - labels : array, shape (n_samples,) - corresponding cluster labels, encoded as {0, ..., n_clusters-1} - label_freqs : array - distribution of cluster labels in ``labels`` - """ - # accumulate distances from each sample to each cluster - clust_dists = np.zeros((len(D_chunk), len(label_freqs)), - dtype=D_chunk.dtype) - np.add.at(clust_dists.T, labels, D_chunk.T) - - # intra_index selects intra-cluster distances within clust_dists - intra_index = (np.arange(len(D_chunk)), labels[start:start + len(D_chunk)]) - # intra_clust_dists are averaged over cluster size outside this function - intra_clust_dists = clust_dists[intra_index] - # of the remaining distances we normalise and extract the minimum - clust_dists[intra_index] = np.inf - clust_dists /= label_freqs - inter_clust_dists = clust_dists.min(axis=1) - return intra_clust_dists, inter_clust_dists - - def silhouette_samples(X, labels, metric='euclidean', **kwds): """Compute the Silhouette Coefficient for each sample. 
@@ -173,7 +139,7 @@ def silhouette_samples(X, labels, metric='euclidean', **kwds): allowed by :func:`sklearn.metrics.pairwise.pairwise_distances`. If X is the distance array itself, use "precomputed" as the metric. - `**kwds` : optional keyword parameters + **kwds : optional keyword parameters Any further parameters are passed directly to the distance function. If using a ``scipy.spatial.distance`` metric, the parameters are still metric dependent. See the scipy docs for usage examples. @@ -198,28 +164,48 @@ def silhouette_samples(X, labels, metric='euclidean', **kwds): X, labels = check_X_y(X, labels, accept_sparse=['csc', 'csr']) le = LabelEncoder() labels = le.fit_transform(labels) - n_samples = len(labels) - label_freqs = np.bincount(labels) - check_number_of_labels(len(le.classes_), n_samples) - - kwds['metric'] = metric - reduce_func = functools.partial(_silhouette_reduce, - labels=labels, label_freqs=label_freqs) - results = zip(*pairwise_distances_chunked(X, reduce_func=reduce_func, - **kwds)) - intra_clust_dists, inter_clust_dists = results - intra_clust_dists = np.concatenate(intra_clust_dists) - inter_clust_dists = np.concatenate(inter_clust_dists) - - denom = (label_freqs - 1).take(labels, mode='clip') - with np.errstate(divide="ignore", invalid="ignore"): - intra_clust_dists /= denom + check_number_of_labels(len(le.classes_), X.shape[0]) + + distances = pairwise_distances(X, metric=metric, **kwds) + unique_labels = le.classes_ + n_samples_per_label = np.bincount(labels, minlength=len(unique_labels)) + + # For sample i, store the mean distance of the cluster to which + # it belongs in intra_clust_dists[i] + intra_clust_dists = np.zeros(distances.shape[0], dtype=distances.dtype) + + # For sample i, store the mean distance of the second closest + # cluster in inter_clust_dists[i] + inter_clust_dists = np.inf + intra_clust_dists + + for curr_label in range(len(unique_labels)): + + # Find inter_clust_dist for all samples belonging to the same + # label. + mask = labels == curr_label + current_distances = distances[mask] + + # Leave out current sample. + n_samples_curr_lab = n_samples_per_label[curr_label] - 1 + if n_samples_curr_lab != 0: + intra_clust_dists[mask] = np.sum( + current_distances[:, mask], axis=1) / n_samples_curr_lab + + # Now iterate over all other labels, finding the mean + # cluster distance that is closest to every sample. 
+ for other_label in range(len(unique_labels)): + if other_label != curr_label: + other_mask = labels == other_label + other_distances = np.mean( + current_distances[:, other_mask], axis=1) + inter_clust_dists[mask] = np.minimum( + inter_clust_dists[mask], other_distances) sil_samples = inter_clust_dists - intra_clust_dists - with np.errstate(divide="ignore", invalid="ignore"): - sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists) - # nan values are for clusters of size 1, and should be 0 - return np.nan_to_num(sil_samples) + sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists) + # score 0 for clusters of size 1, according to the paper + sil_samples[n_samples_per_label.take(labels) == 1] = 0 + return sil_samples def calinski_harabaz_score(X, labels): diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 72c33f76e7db6..e390860d13463 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -6,8 +6,6 @@ # Multi-output support by Arnaud Joly # # License: BSD 3 clause (C) INRIA, University of Amsterdam -from functools import partial - import warnings from abc import ABCMeta, abstractmethod @@ -17,7 +15,7 @@ from .ball_tree import BallTree from .kd_tree import KDTree from ..base import BaseEstimator -from ..metrics import pairwise_distances_chunked +from ..metrics import pairwise_distances from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS from ..utils import check_X_y, check_array, _get_n_jobs, gen_even_slices from ..utils.multiclass import check_classification_targets @@ -272,25 +270,9 @@ def _pairwise(self): class KNeighborsMixin(object): """Mixin for k-neighbors searches""" - def _kneighbors_reduce_func(self, dist, start, - n_neighbors, return_distance): - sample_range = np.arange(dist.shape[0])[:, None] - neigh_ind = np.argpartition(dist, n_neighbors - 1, axis=1) - neigh_ind = neigh_ind[:, :n_neighbors] - # argpartition doesn't guarantee sorted order, so we sort again - neigh_ind = neigh_ind[ - sample_range, np.argsort(dist[sample_range, neigh_ind])] - if return_distance: - if self.effective_metric_ == 'euclidean': - result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind - else: - result = dist[sample_range, neigh_ind], neigh_ind - else: - result = neigh_ind - return result - def kneighbors(self, X=None, n_neighbors=None, return_distance=True): """Finds the K-neighbors of a point. + Returns indices of and distances to the neighbors of each point. 
Parameters @@ -368,19 +350,28 @@ class from an array representing our data set and ask who's n_jobs = _get_n_jobs(self.n_jobs) if self._fit_method == 'brute': - - reduce_func = partial(self._kneighbors_reduce_func, - n_neighbors=n_neighbors, - return_distance=return_distance) - # for efficiency, use squared euclidean distances - kwds = ({'squared': True} if self.effective_metric_ == 'euclidean' - else self.effective_metric_params_) + if self.effective_metric_ == 'euclidean': + dist = pairwise_distances(X, self._fit_X, 'euclidean', + n_jobs=n_jobs, squared=True) + else: + dist = pairwise_distances( + X, self._fit_X, self.effective_metric_, n_jobs=n_jobs, + **self.effective_metric_params_) + + neigh_ind = np.argpartition(dist, n_neighbors - 1, axis=1) + neigh_ind = neigh_ind[:, :n_neighbors] + # argpartition doesn't guarantee sorted order, so we sort again + neigh_ind = neigh_ind[ + sample_range, np.argsort(dist[sample_range, neigh_ind])] - result = pairwise_distances_chunked( - X, self._fit_X, reduce_func=reduce_func, - metric=self.effective_metric_, n_jobs=n_jobs, - **kwds) + if return_distance: + if self.effective_metric_ == 'euclidean': + result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind + else: + result = dist[sample_range, neigh_ind], neigh_ind + else: + result = neigh_ind elif self._fit_method in ['ball_tree', 'kd_tree']: if issparse(X): @@ -392,15 +383,14 @@ class from an array representing our data set and ask who's X[s], n_neighbors, return_distance) for s in gen_even_slices(X.shape[0], n_jobs) ) + if return_distance: + dist, neigh_ind = tuple(zip(*result)) + result = np.vstack(dist), np.vstack(neigh_ind) + else: + result = np.vstack(result) else: raise ValueError("internal: _fit_method not recognized") - if return_distance: - dist, neigh_ind = zip(*result) - result = np.vstack(dist), np.vstack(neigh_ind) - else: - result = np.vstack(result) - if not query_is_train: return result else: @@ -512,22 +502,6 @@ def kneighbors_graph(self, X=None, n_neighbors=None, class RadiusNeighborsMixin(object): """Mixin for radius-based neighbors searches""" - def _radius_neighbors_reduce_func(self, dist, start, - radius, return_distance): - neigh_ind = [np.where(d <= radius)[0] for d in dist] - - if return_distance: - if self.effective_metric_ == 'euclidean': - dist = [np.sqrt(d[neigh_ind[i]]) - for i, d in enumerate(dist)] - else: - dist = [d[neigh_ind[i]] - for i, d in enumerate(dist)] - results = dist, neigh_ind - else: - results = neigh_ind - return results - def radius_neighbors(self, X=None, radius=None, return_distance=True): """Finds the neighbors within a given radius of a point or points. @@ -606,37 +580,39 @@ class from an array representing our data set and ask who's if radius is None: radius = self.radius + n_samples = X.shape[0] if self._fit_method == 'brute': # for efficiency, use squared euclidean distances if self.effective_metric_ == 'euclidean': + dist = pairwise_distances(X, self._fit_X, 'euclidean', + n_jobs=self.n_jobs, squared=True) radius *= radius - kwds = {'squared': True} else: - kwds = self.effective_metric_params_ + dist = pairwise_distances(X, self._fit_X, + self.effective_metric_, + n_jobs=self.n_jobs, + **self.effective_metric_params_) - reduce_func = partial(self._radius_neighbors_reduce_func, - radius=radius, - return_distance=return_distance) + neigh_ind_list = [np.where(d <= radius)[0] for d in dist] + + # See https://github.com/numpy/numpy/issues/5456 + # if you want to understand why this is initialized this way. 
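The numpy issue referenced just above concerns building an object array from a list
of arrays: the one-step ``np.array(rows)`` form silently yields a 2-D numeric array
whenever the rows happen to have equal lengths, while the two-step form used below
keeps one ndarray per element regardless. A minimal standalone illustration::

    import numpy as np

    rows = [np.array([0, 3]), np.array([1, 2, 4])]  # ragged neighborhoods
    out = np.empty(len(rows), dtype=object)
    out[:] = rows   # always a 1-D object array, one ndarray per query point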
+ neigh_ind = np.empty(n_samples, dtype='object') + neigh_ind[:] = neigh_ind_list - results = pairwise_distances_chunked( - X, self._fit_X, reduce_func=reduce_func, - metric=self.effective_metric_, n_jobs=self.n_jobs, - **kwds) if return_distance: - dist_chunks, neigh_ind_chunks = zip(*results) - dist_list = sum(dist_chunks, []) - neigh_ind_list = sum(neigh_ind_chunks, []) - # See https://github.com/numpy/numpy/issues/5456 - # if you want to understand why this is initialized this way. - dist = np.empty(len(dist_list), dtype='object') - dist[:] = dist_list - neigh_ind = np.empty(len(neigh_ind_list), dtype='object') - neigh_ind[:] = neigh_ind_list - results = dist, neigh_ind + dist_array = np.empty(n_samples, dtype='object') + if self.effective_metric_ == 'euclidean': + dist_list = [np.sqrt(d[neigh_ind[i]]) + for i, d in enumerate(dist)] + else: + dist_list = [d[neigh_ind[i]] + for i, d in enumerate(dist)] + dist_array[:] = dist_list + + results = dist_array, neigh_ind else: - neigh_ind_list = sum(results, []) - results = np.empty(len(neigh_ind_list), dtype='object') - results[:] = neigh_ind_list + results = neigh_ind elif self._fit_method in ['ball_tree', 'kd_tree']: if issparse(X): diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index d19efd97c63b7..ec4371ce82a98 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -154,6 +154,7 @@ def test_precomputed(random_state=42): neighbors.RadiusNeighborsClassifier, neighbors.KNeighborsRegressor, neighbors.RadiusNeighborsRegressor): + print(Est) est = Est(metric='euclidean') est.radius = est.n_neighbors = 1 pred_X = est.fit(X, target).predict(Y) From ac3b422f5ed59969c640842ea512f99cc0348793 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 27 Feb 2018 20:47:46 +1100 Subject: [PATCH 90/96] In response to Roman's comments --- doc/modules/computational_performance.rst | 16 +++++++--------- sklearn/metrics/pairwise.py | 13 +++++++------ sklearn/metrics/tests/test_pairwise.py | 14 +++++++++----- sklearn/utils/__init__.py | 3 ++- 4 files changed, 25 insertions(+), 21 deletions(-) diff --git a/doc/modules/computational_performance.rst b/doc/modules/computational_performance.rst index b2d54b00ed06b..57f02aebbc903 100644 --- a/doc/modules/computational_performance.rst +++ b/doc/modules/computational_performance.rst @@ -311,15 +311,13 @@ Debian / Ubuntu. Limiting Working Memory ----------------------- -Some calculations when implemented using standard numpy vectorized -operations involve using a large amount of temporary memory. -As well as potentially exhausting system memory, memory management -can overwhelm computation time, or at least, using more memory provides -negligible speed benefit. Where computations can be performed -in fixed-memory chunks, we attempt to do so, and allow the user to -hint at the maximum size of this working memory (defaulting to 1GB) -using :func:`sklearn.set_config` or :func:`config_context`. -The following suggests to limit temporary working memory to 128 MiB:: +Some calculations when implemented using standard numpy vectorized operations +involve using a large amount of temporary memory. This may potentially exhaust +system memory. Where computations can be performed in fixed-memory chunks, we +attempt to do so, and allow the user to hint at the maximum size of this +working memory (defaulting to 1GB) using :func:`sklearn.set_config` or +:func:`config_context`. 
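The fixed-memory chunks mentioned here are sized from that budget: the number of
rows processed at a time is roughly the working memory divided by the bytes one row
of the distance matrix occupies, which is what ``sklearn.utils.get_chunk_n_rows``
(touched later in this series) computes. A rough sketch of the arithmetic, not the
library code::

    def chunk_n_rows(row_bytes, working_memory_mib, max_n_rows=None):
        # How many rows of a distance block fit in the given budget.
        n = int(working_memory_mib * 2 ** 20 // row_bytes)
        if max_n_rows is not None:
            n = min(n, max_n_rows)
        # If even one row exceeds the budget, fall back to one row at a
        # time; the tests in this series allow blocks up to
        # max(working_memory, one row).
        return max(1, n)

    # e.g. float64 distances to 400 samples: one row is 400 * 8 bytes
    rows = chunk_n_rows(row_bytes=400 * 8, working_memory_mib=1)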
The following suggests to limit temporary working +memory to 128 MiB:: >>> import sklearn >>> with sklearn.config_context(working_memory=128): diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 8bfaefc8c6c30..aff29b9a9c3ff 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -339,8 +339,8 @@ def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", sklearn.metrics.pairwise_distances_argmin """ if batch_size is not None: - warnings.warn("'batch_size' was deprecated in version 0.20 and will " - "be removed in version 0.22. " + warnings.warn("'batch_size' is ignored. It was deprecated in version " + "0.20 and will be removed in version 0.22. " "Use sklearn.set_config(working_memory=...) instead.", DeprecationWarning) X, Y = check_pairwise_arrays(X, Y) @@ -1190,7 +1190,8 @@ def pairwise_distances_chunked(X, Y=None, reduce_func=None, working_memory : int, optional The sought maximum memory for temporary distance matrix chunks. - Defaults to sklearn.get_config()['working_memory']. + When None (default), the value of + ``sklearn.get_config()['working_memory']`` is used. `**kwds` : optional keyword parameters Any further parameters are passed directly to the distance function. @@ -1221,14 +1222,14 @@ def pairwise_distances_chunked(X, Y=None, reduce_func=None, >>> r = .2 >>> def reduce_func(D_chunk, start): ... neigh = [np.flatnonzero(d < r) for d in D_chunk] - ... avg_dist = np.ma.masked_array(D_chunk, D_chunk < r).mean(axis=1) + ... avg_dist = (D_chunk * (D_chunk < r)).mean(axis=1) ... return neigh, avg_dist >>> gen = pairwise_distances_chunked(X, reduce_func=reduce_func) >>> neigh, avg_dist = next(gen) >>> neigh [array([0, 3]), array([1]), array([2]), array([0, 3]), array([4])] - >>> avg_dist.data # doctest: +ELLIPSIS - array([ 0.427..., 0.513..., 0.586..., 0.459... , 0.687...]) + >>> avg_dist # doctest: +ELLIPSIS + array([ 0.039..., 0. , 0. , 0.039..., 0. 
]) Where r is defined per sample, we need to make use of ``start``: diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 2bc74d25be6ad..d0d2c2e1386f7 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -9,6 +9,7 @@ from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_array_almost_equal +from sklearn.utils.testing import assert_allclose from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_array_equal @@ -376,8 +377,7 @@ def test_pairwise_distances_argmin_min(): np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7) # Test batch_size deprecation warning - assert_warns_message(DeprecationWarning, "'batch_size' was deprecated in " - "version 0.20 and will be removed in version 0.22.", + assert_warns_message(DeprecationWarning, "version 0.22", pairwise_distances_argmin_min, X, Y, batch_size=500, metric='euclidean') @@ -396,7 +396,8 @@ def test_pairwise_distances_chunked_reduce(): assert isinstance(S_chunks, GeneratorType) S_chunks = list(S_chunks) assert len(S_chunks) > 1 - assert_array_almost_equal(S, np.vstack(S_chunks)) + # atol is for diagonal where S is explcitly zeroed on the diagonal + assert_allclose(np.vstack(S_chunks), S, atol=1e-7) @pytest.mark.parametrize('good_reduce', [ @@ -441,11 +442,10 @@ def check_pairwise_distances_chunked(X, Y, working_memory, metric='euclidean'): assert isinstance(gen, GeneratorType) blockwise_distances = list(gen) min_block_mib = np.array(X).shape[0] * 8 * 2 ** -20 - working_memory = min(working_memory, min_block_mib) for block in blockwise_distances: memory_used = len(block) * 8 - assert_true(memory_used <= working_memory * 2 ** 20) + assert memory_used <= min(working_memory, min_block_mib) * 2 ** 20 blockwise_distances = np.vstack(blockwise_distances) S = pairwise_distances(X, Y, metric=metric) @@ -459,6 +459,10 @@ def test_pairwise_distances_chunked(): X = rng.random_sample((400, 4)) check_pairwise_distances_chunked(X, None, working_memory=1, metric='euclidean') + # Test small amounts of memory + for power in range(-16, 0): + check_pairwise_distances_chunked(X, None, working_memory=2 ** power, + metric='euclidean') # X as list check_pairwise_distances_chunked(X.tolist(), None, working_memory=1, metric='euclidean') diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 682db2b38cbde..8dec45ce3c427 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -521,7 +521,8 @@ def get_chunk_n_rows(row_bytes, max_n_rows=None, The maximum return value. working_memory : int or float, optional The number of rows to fit inside this number of MiB will be returned. - Defaults to ``sklearn.get_config()['working_memory']``. + When None (default), the value of + ``sklearn.get_config()['working_memory']`` is used. Returns ------- From cf284d7bafc2be8d6c985ff8e3ab5089c5e5ad5a Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 23 May 2018 14:11:05 +1000 Subject: [PATCH 91/96] Typo fixes --- sklearn/metrics/pairwise.py | 2 +- sklearn/metrics/tests/test_pairwise.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index dfc970934137b..126a42287f631 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1211,7 +1211,7 @@ def pairwise_distances_chunked(X, Y=None, reduce_func=None, ``reduce_func``. 
Examples - ------- + -------- Without reduce_func: >>> X = np.random.RandomState(0).rand(5, 3) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 2e46b7ebb0b4c..294cf8c5ad5b5 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -398,7 +398,7 @@ def test_pairwise_distances_chunked_reduce(): assert isinstance(S_chunks, GeneratorType) S_chunks = list(S_chunks) assert len(S_chunks) > 1 - # atol is for diagonal where S is explcitly zeroed on the diagonal + # atol is for diagonal where S is explicitly zeroed on the diagonal assert_allclose(np.vstack(S_chunks), S, atol=1e-7) From 85971819f0adb5666eac709cdd4951c040ac8db5 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 23 May 2018 14:11:18 +1000 Subject: [PATCH 92/96] Correct the tests --- sklearn/metrics/tests/test_pairwise.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 294cf8c5ad5b5..0ef089c7a3619 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -443,17 +443,19 @@ def check_pairwise_distances_chunked(X, Y, working_memory, metric='euclidean'): metric=metric) assert isinstance(gen, GeneratorType) blockwise_distances = list(gen) - min_block_mib = np.array(X).shape[0] * 8 * 2 ** -20 + Y = np.array(X if Y is None else Y) + min_block_mib = len(Y) * 8 * 2 ** -20 for block in blockwise_distances: - memory_used = len(block) * 8 - assert memory_used <= min(working_memory, min_block_mib) * 2 ** 20 + memory_used = block.nbytes + assert memory_used <= max(working_memory, min_block_mib) * 2 ** 20 blockwise_distances = np.vstack(blockwise_distances) S = pairwise_distances(X, Y, metric=metric) assert_array_almost_equal(blockwise_distances, S) +@ignore_warnings def test_pairwise_distances_chunked(): # Test the pairwise_distance helper function. rng = np.random.RandomState(0) From 3240e0fa0ee458e55a57e5f8a72e4d5c3dda1384 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 23 May 2018 21:43:16 +1000 Subject: [PATCH 93/96] Add missing ref target --- doc/modules/computational_performance.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/modules/computational_performance.rst b/doc/modules/computational_performance.rst index 57f02aebbc903..ca128c515fb90 100644 --- a/doc/modules/computational_performance.rst +++ b/doc/modules/computational_performance.rst @@ -308,6 +308,8 @@ Debian / Ubuntu. or upgrade to Python 3.4 which has a new version of ``multiprocessing`` that should be immune to this problem. +.. _working_memory: + Limiting Working Memory ----------------------- From d7c04af3b9cf6d97d8bc0194dfa73aa37401114f Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 23 May 2018 21:46:24 +1000 Subject: [PATCH 94/96] Update array formatting in doctest --- sklearn/metrics/pairwise.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 126a42287f631..3b9d4ec6da521 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1217,11 +1217,11 @@ def pairwise_distances_chunked(X, Y=None, reduce_func=None, >>> X = np.random.RandomState(0).rand(5, 3) >>> D_chunk = next(pairwise_distances_chunked(X)) >>> D_chunk # doctest: +ELLIPSIS - array([[ 0. ..., 0.29..., 0.41..., 0.19..., 0.57...], - [ 0.29..., 0. ..., 0.57..., 0.41..., 0.76...], - [ 0.41..., 0.57..., 0. 
..., 0.44..., 0.90...], - [ 0.19..., 0.41..., 0.44..., 0. ..., 0.51...], - [ 0.57..., 0.76..., 0.90..., 0.51..., 0. ...]]) + array([[ 0. ..., 0.29..., 0.41..., 0.19..., 0.57...], + [ 0.29..., 0. ..., 0.57..., 0.41..., 0.76...], + [ 0.41..., 0.57..., 0. ..., 0.44..., 0.90...], + [ 0.19..., 0.41..., 0.44..., 0. ..., 0.51...], + [ 0.57..., 0.76..., 0.90..., 0.51..., 0. ...]]) Retrieve all neighbors and average distance within radius r: From 24d1801318b918c1f5029222247cb5d80dad4434 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Fri, 25 May 2018 07:52:11 +1000 Subject: [PATCH 95/96] Remove more whitespace in doctest --- sklearn/metrics/pairwise.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 3b9d4ec6da521..25fe7876e0dc5 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1217,11 +1217,11 @@ def pairwise_distances_chunked(X, Y=None, reduce_func=None, >>> X = np.random.RandomState(0).rand(5, 3) >>> D_chunk = next(pairwise_distances_chunked(X)) >>> D_chunk # doctest: +ELLIPSIS - array([[ 0. ..., 0.29..., 0.41..., 0.19..., 0.57...], - [ 0.29..., 0. ..., 0.57..., 0.41..., 0.76...], - [ 0.41..., 0.57..., 0. ..., 0.44..., 0.90...], - [ 0.19..., 0.41..., 0.44..., 0. ..., 0.51...], - [ 0.57..., 0.76..., 0.90..., 0.51..., 0. ...]]) + array([[0. ..., 0.29..., 0.41..., 0.19..., 0.57...], + [0.29..., 0. ..., 0.57..., 0.41..., 0.76...], + [0.41..., 0.57..., 0. ..., 0.44..., 0.90...], + [0.19..., 0.41..., 0.44..., 0. ..., 0.51...], + [0.57..., 0.76..., 0.90..., 0.51..., 0. ...]]) Retrieve all neighbors and average distance within radius r: From 6252941ee8c17effc4065faac47bc1168da37e38 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Fri, 25 May 2018 08:39:24 +1000 Subject: [PATCH 96/96] Remove more whitespace in doctest --- sklearn/metrics/pairwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 25fe7876e0dc5..b4928ed7492f3 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1235,7 +1235,7 @@ def pairwise_distances_chunked(X, Y=None, reduce_func=None, >>> neigh [array([0, 3]), array([1]), array([2]), array([0, 3]), array([4])] >>> avg_dist # doctest: +ELLIPSIS - array([ 0.039..., 0. , 0. , 0.039..., 0. ]) + array([0.039..., 0. , 0. , 0.039..., 0. ]) Where r is defined per sample, we need to make use of ``start``:
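The excerpt ends where the docstring would show that example. A sketch of what such
a ``reduce_func`` can look like; the data and radii here are illustrative, not the
verbatim doctest::

    import numpy as np
    from sklearn.metrics import pairwise_distances_chunked

    X = np.random.RandomState(0).rand(5, 3)
    r = [.2, .4, .4, .3, .1]   # one radius per sample in X

    def reduce_func(D_chunk, start):
        # Row i of D_chunk is sample start + i of X, so ``start`` lets us
        # look up the per-sample radius even when X arrives in chunks.
        return [np.flatnonzero(d < r[start + i])
                for i, d in enumerate(D_chunk)]

    neigh = next(pairwise_distances_chunked(X, reduce_func=reduce_func))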