8000 Revert "FIX (0.21) OPTICS processing order (#12357)" · xhluca/scikit-learn@c7629ca · GitHub
[go: up one dir, main page]

Skip to content

Commit c7629ca

Browse files
author
Xing
committed
Revert "FIX (0.21) OPTICS processing order (scikit-learn#12357)"
This reverts commit e7d8b82.
1 parent fa3352e commit c7629ca

File tree

5 files changed

+281
-191
lines changed

5 files changed

+281
-191
lines changed

doc/whats_new/v0.21.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,8 @@ Support for Python 3.4 and below has been officially dropped.
4545

4646
- |MajorFeature| A new clustering algorithm: :class:`cluster.OPTICS`: an
4747
algoritm related to :class:`cluster.DBSCAN`, that has hyperparameters easier
48-
to set and that scales better, by :user:`Shane <espg>`,
49-
:user:`Adrin Jalali <adrinjalali>`, and :user:`Erich Schubert <kno10>`.
48+
to set and that scales better, by :user:`Shane <espg>` and
49+
:user:`Adrin Jalali <adrinjalali>`.
5050

5151
:mod:`sklearn.preprocessing`
5252
............................

sklearn/cluster/_optics_inner.pyx

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
cimport numpy as np
2+
import numpy as np
3+
cimport cython
4+
5+
ctypedef np.float64_t DTYPE_t
6+
ctypedef np.int_t DTYPE
7+
8+
# as defined in PEP485 (python3.5)
9+
cdef inline isclose(double a,
10+
double b,
11+
double rel_tol=1e-09,
12+
double abs_tol=0.0):
13+
return abs(a-b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)
14+
15+
@cython.boundscheck(False)
16+
@cython.wraparound(False)
17+
# Checks for smallest reachability distance
18+
# In case of tie, preserves order and returns first instance
19+
# as sorted by distance
20+
cpdef quick_scan(double[:] rdists, double[:] dists):
21+
cdef Py_ssize_t n
22+
cdef int idx
23+
cdef int i
24+
cdef double rdist
25+
cdef double dist
26+
rdist = np.inf
27+
dist = np.inf
28+
n = len(rdists)
29+
for i from 0 <= i < n:
30+
if rdists[i] < rdist:
31+
rdist = rdists[i]
32+
dist = dists[i]
33+
idx = i
34+
elif isclose(rdists[i], rdist):
35+
if dists[i] < dist:
36+
dist = dists[i]
37+
idx = i
38+
return idx

sklearn/cluster/optics_.py

Lines changed: 6 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from ..neighbors import NearestNeighbors
2121
from ..base import BaseEstimator, ClusterMixin
2222
from ..metrics import pairwise_distances
23+
from ._optics_inner import quick_scan
2324

2425

2526
def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean',
@@ -36,13 +37,6 @@ def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean',
3637
neighborhood radius. Better suited for usage on large point datasets than
3738
the current sklearn implementation of DBSCAN.
3839
39-
This implementation deviates from the original OPTICS by first performing
40-
k-nearest-neighborhood searches on all points to identify core sizes, then
41-
computing only the distances to unprocessed points when constructing the
42-
cluster order. It also does not employ a heap to manage the expansion
43-
candiates, but rather uses numpy masked arrays. This can be potentially
44-
slower with some parameters (at the benefit from using fast numpy code).
45-
4640
Read more in the :ref:`User Guide <optics>`.
4741
4842
Parameters
@@ -196,11 +190,6 @@ class OPTICS(BaseEstimator, ClusterMixin):
196190
neighborhood radius. Better suited for usage on large point datasets than
197191
the current sklearn implementation of DBSCAN.
198192
199-
This implementation deviates from the original OPTICS by first performing
200-
k-nearest-neighborhood searches on all points to identify core sizes, then
201-
computing only the distances to unprocessed points when constructing the
202-
cluster order.
203-
204193
Read more in the :ref:`User Guide <optics>`.
205194
206195
Parameters
@@ -324,15 +313,15 @@ class OPTICS(BaseEstimator, ClusterMixin):
324313
``clust.reachability_[clust.ordering_]`` to access in cluster order.
325314
326315
ordering_ : array, shape (n_samples,)
327-
The cluster ordered list of sample indices.
316+
The cluster ordered list of sample indices
328317
329318
core_distances_ : array, shape (n_samples,)
330319
Distance at which each sample becomes a core point, indexed by object
331320
order. Points which will never be core have a distance of inf. Use
332321
``clust.core_distances_[clust.ordering_]`` to access in cluster order.
333322
334323
predecessor_ : array, shape (n_samples,)
335-
Point that a sample was reached from, indexed by object order.
324+
Point that a sample was reached from.
336325
Seed points have a predecessor of -1.
337326
338327
See also
@@ -527,11 +516,9 @@ def _set_reach_dist(self, point_index, processed, X, nbrs):
527516
self.reachability_[unproc[improved]] = rdists[improved]
528517
self.predecessor_[unproc[improved]] = point_index
529518

530-
# Choose next based on smallest reachability distance
531-
# (And prefer smaller ids on ties).
532-
# All unprocessed points qualify, not just new neighbors ("unproc")
533-
return (np.ma.array(self.reachability_, mask=processed)
534-
.argmin(fill_value=np.inf))
519+
# Define return order based on reachability distance
520+
return (unproc[quick_scan(np.take(self.reachability_, unproc),
521+
dists)])
535522

536523
def extract_dbscan(self, eps):
537524
"""Performs DBSCAN extraction for an arbitrary epsilon.

sklearn/cluster/setup.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ def configuration(parent_package='', top_path=None):
2323
sources=['_dbscan_inner.pyx'],
2424
include_dirs=[numpy.get_include()],
2525
language="c++")
26+
config.add_extension('_optics_inner',
27+
sources=['_optics_inner.pyx'],
28+
include_dirs=[numpy.get_include()],
29+
libraries=libraries)
2630

2731
config.add_extension('_hierarchical',
2832
sources=['_hierarchical.pyx'],

0 commit comments

Comments
 (0)
0