14
14
import numpy as np
15
15
16
16
from ..utils import check_array
17
+ from ..utils import gen_batches , get_chunk_n_rows
17
18
from ..utils .validation import check_is_fitted
18
19
from ..neighbors import NearestNeighbors
19
20
from ..base import BaseEstimator , ClusterMixin
@@ -395,8 +396,6 @@ def fit(self, X, y=None):
395
396
# Start all points as 'unprocessed' ##
396
397
self .reachability_ = np .empty (n_samples )
397
398
self .reachability_ .fill (np .inf )
398
- self .core_distances_ = np .empty (n_samples )
399
- self .core_distances_ .fill (np .nan )
400
399
# Start all points as noise ##
401
400
self .labels_ = np .full (n_samples , - 1 , dtype = int )
402
401
@@ -407,9 +406,7 @@ def fit(self, X, y=None):
407
406
n_jobs = self .n_jobs )
408
407
409
408
nbrs .fit (X )
410
- self .core_distances_ [:] = nbrs .kneighbors (X ,
411
- self .min_samples )[0 ][:, - 1 ]
412
-
409
+ self .core_distances_ = self ._compute_core_distances_ (X , nbrs )
413
410
self .ordering_ = self ._calculate_optics_order (X , nbrs )
414
411
415
412
indices_ , self .labels_ = _extract_optics (self .ordering_ ,
@@ -425,6 +422,42 @@ def fit(self, X, y=None):
425
422
426
423
# OPTICS helper functions
427
424
425
+ def _compute_core_distances_ (self , X , neighbors , working_memory = None ):
426
+ """Compute the k-th nearest neighbor of each sample
427
+
428
+ Equivalent to neighbors.kneighbors(X, self.min_samples)[0][:, -1]
429
+ but with more memory efficiency.
430
+
431
+ Parameters
432
+ ----------
433
+ X : array, shape (n_samples, n_features)
434
+ The data.
435
+ neighbors : NearestNeighbors instance
436
+ The fitted nearest neighbors estimator.
437
+ working_memory : int, optional
438
+ The sought maximum memory for temporary distance matrix chunks.
439
+ When None (default), the value of
440
+ ``sklearn.get_config()['working_memory']`` is used.
441
+
442
+ Returns
443
+ -------
444
+ core_distances : array, shape (n_samples,)
445
+ Distance at which each sample becomes a core point.
446
+ Points which will never be core have a distance of inf.
447
+ """
448
+ n_samples = len (X )
449
+ core_distances = np .empty (n_samples )
450
+ core_distances .fill (np .nan )
451
+
452
+ chunk_n_rows = get_chunk_n_rows (row_bytes = 16 * self .min_samples ,
453
+ max_n_rows = n_samples ,
454
+ working_memory = working_memory )
455
+ slices = gen_batches (n_samples , chunk_n_rows )
456
+ for sl in slices :
457
+ core_distances [sl ] = neighbors .kneighbors (
458
+ X [sl ], self .min_samples )[0 ][:, - 1 ]
459
+ return core_distances
460
+
428
461
def _calculate_optics_order (self , X , nbrs ):
429
462
# Main OPTICS loop. Not parallelizable. The order that entries are
430
463
# written to the 'ordering_' list is important!
0 commit comments