8000 ENH (0.21) Make OPTICS more memory efficient when calling kneighbors … · scikit-learn/scikit-learn@5d30b2d · GitHub
[go: up one dir, main page]

Skip to content

Commit 5d30b2d

Browse files
TomDLTqinhanmin2014
authored andcommitted
ENH (0.21) Make OPTICS more memory efficient when calling kneighbors (#12103)
1 parent 4035e60 commit 5d30b2d

File tree

1 file changed

+38
-5
lines changed

1 file changed

+38
-5
lines changed

sklearn/cluster/optics_.py

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import numpy as np
1515

1616
from ..utils import check_array
17+
from ..utils import gen_batches, get_chunk_n_rows
1718
from ..utils.validation import check_is_fitted
1819
from ..neighbors import NearestNeighbors
1920
from ..base import BaseEstimator, ClusterMixin
@@ -395,8 +396,6 @@ def fit(self, X, y=None):
395396
# Start all points as 'unprocessed' ##
396397
self.reachability_ = np.empty(n_samples)
397398
self.reachability_.fill(np.inf)
398-
self.core_distances_ = np.empty(n_samples)
399-
self.core_distances_.fill(np.nan)
400399
# Start all points as noise ##
401400
self.labels_ = np.full(n_samples, -1, dtype=int)
402401

@@ -407,9 +406,7 @@ def fit(self, X, y=None):
407406
n_jobs=self.n_jobs)
408407

409408
nbrs.fit(X)
410-
self.core_distances_[:] = nbrs.kneighbors(X,
411-
self.min_samples)[0][:, -1]
412-
409+
self.core_distances_ = self._compute_core_distances_(X, nbrs)
413410
self.ordering_ = self._calculate_optics_order(X, nbrs)
414411

415412
indices_, self.labels_ = _extract_optics(self.ordering_,
@@ -425,6 +422,42 @@ def fit(self, X, y=None):
425422

426423
# OPTICS helper functions
427424

425+
def _compute_core_distances_(self, X, neighbors, working_memory=None):
426+
"""Compute the k-th nearest neighbor of each sample
427+
428+
Equivalent to neighbors.kneighbors(X, self.min_samples)[0][:, -1]
429+
but with more memory efficiency.
430+
431+
Parameters
432+
----------
433+
X : array, shape (n_samples, n_features)
434+
The data.
435+
neighbors : NearestNeighbors instance
436+
The fitted nearest neighbors estimator.
437+
working_memory : int, optional
438+
The sought maximum memory for temporary distance matrix chunks.
439+
When None (default), the value of
440+
``sklearn.get_config()['working_memory']`` is used.
441+
442+
Returns
443+
-------
444+
core_distances : array, shape (n_samples,)
445+
Distance at which each sample becomes a core point.
446+
Points which will never be core have a distance of inf.
447+
"""
448+
n_samples = len(X)
449+
core_distances = np.empty(n_samples)
450+
core_distances.fill(np.nan)
451+
452+
chunk_n_rows = get_chunk_n_rows(row_bytes=16 * self.min_samples,
453+
max_n_rows=n_samples,
454+
working_memory=working_memory)
455+
slices = gen_batches(n_samples, chunk_n_rows)
456+
for sl in slices:
457+
core_distances[sl] = neighbors.kneighbors(
458+
X[sl], self.min_samples)[0][:, -1]
459+
return core_distances
460+
428461
def _calculate_optics_order(self, X, nbrs):
429462
# Main OPTICS loop. Not parallelizable. The order that entries are
430463
# written to the 'ordering_' list is important!

0 commit comments

Comments
 (0)
0