8000 [MRG] ENH: Added block_size parameter for lesser memory consumption by dalmia · Pull Request #7979 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

[MRG] ENH: Added block_size parameter for lesser memory consumption #7979

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 39 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
7cfcd43
ENH: Added template for pairwise_distances_blockwise with docstring c…
dalmia Dec 5, 2016
dfb99fc
ENH: added generator of blocks based on block_size
dalmia Dec 7, 2016
1e687d1
FIX: removed errors and extra value for metric
dalmia Dec 7, 2016
172e7f5
FIX: remove redundant variables
dalmia Dec 7, 2016
686d0d2
FIX: remove flake8 errors
dalmia Dec 8, 2016
c7de820
BUG: added fix for Y=None
dalmia Dec 8, 2016
0fb992f
FIX: remove whitespace
dalmia Dec 8, 2016
9b80491
FIX: fix typo
dalmia Dec 8, 2016
8000
8e900e3
TST: added tests for pairwise_distances_blockwise
dalmia Dec 8, 2016
6be6ea2
FIX: removed errors and modified pairwise_distances_blockwise with tests
dalmia Dec 12, 2016
6d79bdd
FIX: fix typo
dalmia Dec 12, 2016
54ff9f5
WIP: support for nearest neighbors
dalmia Jan 5, 2017
8986965
ENH: passing arguments to reduce_func via partial
dalmia Jan 7, 2017
a072f11
FIX: remove true_distances as parameter
dalmia Jan 7, 2017
e0fb4c5
FIX: revert unintended change
dalmia Jan 7, 2017
508afae
FIX: convert float indices to int
dalmia Jan 7, 2017
b515105
FIX: removed debug lines
dalmia Jan 7, 2017
f1f7348
ENH: added pairwise_distances_reduce for radius_neighbors
dalmia Jan 8, 2017
d54def1
FIX: changed order of reduce_func
dalmia Feb 4, 2017
3e8adfc
FIX: get pairwise_distances_reduce to work correctly
dalmia Feb 6, 2017
4b8c7b2
FIX: remove flake8 errors
dalmia Feb 6, 2017
97486f6
FIX: rename reduce_func
dalmia Feb 7, 2017
722a9eb
FIX: return stacked distances from pairwise_distances_reduce
dalmia Feb 18, 2017
5ae169b
TST: added tests for pairwise_distances_reduce
dalmia Feb 18, 2017
1cd56a8
FIX: correct doctests
dalmia Feb 19, 2017
855ea0a
FIX: remove conflicting doctests for Python2 and Python3
dalmia Feb 19, 2017
6dd1d36
FEAT: add new file for flexible_vstack
dalmia Feb 20, 2017
d3f607d
FIX: resolve conflicts on tests
dalmia Feb 20, 2017
c373cee
FIX: remove block_size placeholders from neighbors
dalmia Feb 20, 2017
21ad2b1
FIX: use generator expressions
dalmia Feb 20, 2017
676c272
FIX: replace error on invalid block_size with warning
dalmia Feb 20, 2017
a3074be
FIX: replace error on invalid block_size with warning
dalmia Feb 20, 2017
3756a58
TST: check each components meets specified memory requirement
dalmia Feb 20, 2017
2ab293b
FIX: remove PEP8 errors
dalmia Feb 20, 2017
84d34e3
ENH: move flexible_vstack to __init__
dalmia Feb 20, 2017
6d49c12
TST: add tests for flexible_vstack
dalmia Feb 20, 2017
b3fb795
DOC: improve docstring for
dalmia Feb 20, 2017
f3d3a1a
FIX: correct X, y for Python3
dalmia Feb 20, 2017
f901d7e
ENH: rewrote pairwise_distances_argmin_min using pairwise_distances_r…
dalmia Feb 24, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions sklearn/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
from .pairwise import pairwise_distances_argmin
from .pairwise import pairwise_distances_argmin_min
from .pairwise import pairwise_kernels
from .pairwise import pairwise_distances_reduce

from .regression import explained_variance_score
from .regression import mean_absolute_error
Expand Down Expand Up @@ -96,6 +97,7 @@
'mutual_info_score',
'normalized_mutual_info_score',
'pairwise_distances',
'pairwise_distances_reduce',
'pairwise_distances_argmin',
'pairwise_distances_argmin_min',
'pairwise_distances_argmin_min',
Expand Down
303 changes: 245 additions & 58 deletions sklearn/metrics/pairwise.py

Large diffs are not rendered by default.

86 changes: 85 additions & 1 deletion sklearn/metrics/tests/test_pairwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,11 @@
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_raise_message
from sklearn.utils.testing import assert_raises_regexp
from sklearn.utils.testing import assert_true
from sklearn.utils.testing import ignore_warnings
from sklearn.utils.testing import assert_warns_message

from sklearn.externals.six import iteritems

Expand All @@ -27,6 +29,8 @@
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import pairwise_distances_blockwise
from sklearn.metrics.pairwise import pairwise_distances_reduce
from sklearn.metrics.pairwise import pairwise_distances_argmin_min
from sklearn.metrics.pairwise import pairwise_distances_argmin
from sklearn.metrics.pairwise import pairwise_kernels
Expand Down Expand Up @@ -365,10 +369,90 @@ def test_pairwise_distances_argmin_min():
dist_orig_val = dist[dist_orig_ind, range(len(dist_orig_ind))]

dist_chunked_ind, dist_chunked_val = pairwise_distances_argmin_min(
X, Y, axis=0, metric="manhattan", batch_size=50)
X, Y, axis=0, metric="manhattan", block_size=50)
np.testing.assert_almost_equal(dist_orig_ind, dist_chunked_ind, decimal=7)
np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7)

# Test batch_size deprecation warning
assert_warns_message(DeprecationWarning, "'batch_size' was deprecated in "
"version 0.19 and will be removed in version 0.21.",
pairwise_distances_argmin_min, X, Y, batch_size=500,
metric='euclidean')


def test_pairwise_distances_reduce_invalid_reduce_func():
X = np.empty((400, 4))
y = np.empty((200, 4))
assert_raise_message(ValueError, 'reduce_func needs to be passed as an '
'argument', pairwise_distances_reduce, X, y,
block_size=0, metric='euclidean')


def _reduce_func(dist):
return dist[:, :100]


def test_pairwise_distances_reduce():
rng = np.random.RandomState(0)
X = rng.random_sample((400, 4))
# Reduced Euclidean distance
S = pairwise_distances(X)[:, :100]
S2 = pairwise_distances_reduce(X, None, reduce_func=_reduce_func,
block_size=1)
assert_array_almost_equal(S, S2)


def check_pairwise_distances_blockwise(X, Y, block_size, metric='euclidean'):
from sklearn.metrics.pairwise import BYTES_PER_FLOAT
gen = pairwise_distances_blockwise(X, Y, block_size=block_size,
metric=metric)
blockwise_distances = list(gen)
min_block_mib = X.shape[0] * BYTES_PER_FLOAT * 2 ** -20
if block_size < min_block_mib:
block_size = min_block_mib

for block in blockwise_distances:
memory_used = len(block) * BYTES_PER_FLOAT
assert_true(memory_used <= block_size * 2 ** 20)

blockwise_distances = np.vstack(blockwise_distances)
S = pairwise_distances(X, Y, metric=metric)
assert_array_almost_equal(blockwise_distances, S)


def test_pairwise_distances_blockwise_invalid_block_size():
rng = np.random.RandomState(0)
X = rng.random_sample((400, 4))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this might as well be empty or zeros

y = rng.random_sample((200, 4))
assert_warns_message(UserWarning, 'block_size should be at least '
'n_samples * 8 bytes = 1 MiB, got 0',
pairwise_distances_blockwise, X, y, block_size=0,
metric='euclidean')
check_pairwise_distances_blockwise(X, y, block_size=0)


def test_pairwise_distances_blockwise():
# Test the pairwise_distance helper function.
rng = np.random.RandomState(0)
# Euclidean distance should be equivalent to calling the function.
X = rng.random_sample((400, 4))
check_pairwise_distances_blockwise(X, None, block_size=1,
metric='euclidean')
# Euclidean distance, with Y != X.
Y = rng.random_sample((200, 4))
check_pairwise_distances_blockwise(X, Y, block_size=1,
metric='euclidean')
# absurdly large block_size
check_pairwise_distances_blockwise(X, Y, block_size=10000,
metric='euclidean')
# "cityblock" uses scikit-learn metric, cityblock (function) is
# scipy.spatial.
check_pairwise_distances_blockwise(X, Y, block_size=1,
metric='cityblock')
# Test that a value error is raised if the metric is unknown
assert_raises(ValueError, pairwise_distances_blockwise, X, Y,
metric="blah")


def test_euclidean_distances():
# Check the pairwise Euclidean distances computation
Expand Down
6D40
115 changes: 68 additions & 47 deletions sklearn/neighbors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
# Multi-output support by Arnaud Joly <a.joly@ulg.ac.be>
#
# License: BSD 3 clause (C) INRIA, University of Amsterdam
from functools import partial

import warnings
from abc import ABCMeta, abstractmethod

Expand All @@ -15,7 +17,7 @@
from .ball_tree import BallTree
from .kd_tree import KDTree
from ..base import BaseEstimator
from ..metrics import pairwise_distances
from ..metrics import pairwise_distances_reduce
from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS
from ..utils import check_X_y, check_array, _get_n_jobs, gen_even_slices
from ..utils.fixes import argpartition
Expand Down Expand Up @@ -266,9 +268,24 @@ def _pairwise(self):
class KNeighborsMixin(object):
"""Mixin for k-neighbors searches"""

def _kneighbors_reduce_func(self, dist, n_neighbors, return_distance):
sample_range = np.arange(dist.shape[0])[:, None]
neigh_ind = argpartition(dist, n_neighbors - 1, axis=1)
neigh_ind = neigh_ind[:, :n_neighbors]
# argpartition doesn't guarantee sorted order, so we sort again
neigh_ind = neigh_ind[
sample_range, np.argsort(dist[sample_range, neigh_ind])]
if return_distance:
if self.effective_metric_ == 'euclidean':
result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind
else:
result = dist[sample_range, neigh_ind], neigh_ind
else:
result = neigh_ind
return result

def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
"""Finds the K-neighbors of a point.

Returns indices of and distances to the neighbors of each point.

Parameters
Expand Down Expand Up @@ -347,29 +364,22 @@ class from an array representing our data set and ask who's

n_jobs = _get_n_jobs(self.n_jobs)
if self._fit_method == 'brute':

reduce_func = partial(self._kneighbors_reduce_func,
n_neighbors=n_neighbors,
return_distance=return_distance)

# for efficiency, use squared euclidean distances
if self.effective_metric_ == 'euclidean':
dist = pairwise_distances(X, self._fit_X, 'euclidean',
n_jobs=n_jobs, squared=True)
result = pairwise_distances_reduce(
X, self._fit_X, reduce_func=reduce_func,
metric='euclidean', n_jobs=n_jobs, squared=True)
else:
dist = pairwise_distances(
X, self._fit_X, self.effective_metric_, n_jobs=n_jobs,
result = pairwise_distances_reduce(
X, self._fit_X, reduce_func=reduce_func,
metric=self.effective_metric_, n_jobs=n_jobs,
**self.effective_metric_params_)

neigh_ind = argpartition(dist, n_neighbors - 1, axis=1)
neigh_ind = neigh_ind[:, :n_neighbors]
# argpartition doesn't guarantee sorted order, so we sort again
neigh_ind = neigh_ind[
sample_range, np.argsort(dist[sample_range, neigh_ind])]

if return_distance:
if self.effective_metric_ == 'euclidean':
result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind
else:
result = dist[sample_range, neigh_ind], neigh_ind
else:
result = neigh_ind

elif self._fit_method in ['ball_tree', 'kd_tree']:
if issparse(X):
raise ValueError(
Expand Down Expand Up @@ -499,6 +509,29 @@ def kneighbors_graph(self, X=None, n_neighbors=None,
class RadiusNeighborsMixin(object):
"""Mixin for radius-based neighbors searches"""

def _radius_neighbors_reduce_func(self, dist, radius, return_distance):
neigh_ind_list = [np.where(d <= radius)[0] for d in dist]

# See https://github.com/numpy/numpy/issues/5456
# if you want to understand why this is initialized this way.
neigh_ind = np.empty(dist.shape[0], dtype='object')
neigh_ind[:] = neigh_ind_list

if return_distance:
dist_array = np.empty(dist.shape[0], dtype='object')
if self.effective_metric_ == 'euclidean':
dist_list = [np.sqrt(d[neigh_ind[i]])
for i, d in enumerate(dist)]
else:
dist_list = [d[neigh_ind[i]]
for i, d in enumerate(dist)]
dist_array[:] = dist_list

results = dist_array, neigh_ind
else:
results = neigh_ind
return results

def radius_neighbors(self, X=None, radius=None, return_distance=True):
"""Finds the neighbors within a given radius of a point or points.

Expand Down Expand Up @@ -578,39 +611,27 @@ class from an array representing our data set and ask who's
if radius is None:
radius = self.radius

n_samples = X.shape[0]
if self._fit_method == 'brute':
# for efficiency, use squared euclidean distances
if self.effective_metric_ == 'euclidean':
dist = pairwise_distances(X, self._fit_X, 'euclidean',
n_jobs=self.n_jobs, squared=True)
radius *= radius
reduce_func = partial(self._radius_neighbors_reduce_func,
radius=radius,
return_distance=return_distance)

results = pairwise_distances_reduce(
X, self._fit_X, reduce_func=reduce_func,
metric='euclidean', n_jobs=self.n_jobs,
squared=True)
else:
dist = pairwise_distances(X, self._fit_X,
self.effective_metric_,
n_jobs=self.n_jobs,
**self.effective_metric_params_)
reduce_func = partial(self._radius_neighbors_reduce_func,
radius=radius,
return_distance=return_distance)

neigh_ind_list = [np.where(d <= radius)[0] for d in dist]

# See https://github.com/numpy/numpy/issues/5456
# if you want to understand why this is initialized this way.
neigh_ind = np.empty(n_samples, dtype='object')
neigh_ind[:] = neigh_ind_list

if return_distance:
dist_array = np.empty(n_samples, dtype='object')
if self.effective_metric_ == 'euclidean':
dist_list = [np.sqrt(d[neigh_ind[i]])
for i, d in enumerate(dist)]
else:
dist_list = [d[neigh_ind[i]]
for i, d in enumerate(dist)]
dist_array[:] = dist_list

results = dist_array, neigh_ind
else:
results = neigh_ind
results = pairwise_distances_reduce(
X, self._fit_X, reduce_func=reduce_func,
metric=self.effective_metric_, n_jobs=self.n_jobs,
**self.effective_metric_params_)

elif self._fit_method in ['ball_tree', 'kd_tree']:
if issparse(X):
Expand Down
1 change: 0 additions & 1 deletion sklearn/neighbors/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,6 @@ def predict(self, X):
X = check_array(X, accept_sparse='csr')

neigh_dist, neigh_ind = self.kneighbors(X)

classes_ = self.classes_
_y = self._y
if not self.outputs_2d_:
Expand Down
1 change: 0 additions & 1 deletion sklearn/neighbors/tests/test_neighbors.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,6 @@ def test_precomputed(random_state=42):
neighbors.RadiusNeighborsClassifier,
neighbors.KNeighborsRegressor,
neighbors.RadiusNeighborsRegressor):
print(Est)
est = Est(metric='euclidean')
est.radius = est.n_neighbors = 1
pred_X = est.fit(X, target).predict(Y)
Expand Down
Loading
0