Merge pull request #1 from larsmans/dbscan · NelleV/scikit-learn@0d9a3fc · GitHub

Commit 0d9a3fc

Merge pull request #1 from larsmans/dbscan
Minor stuff for DBSCAN
2 parents 470c94b + 760c154 commit 0d9a3fc

File tree

2 files changed: +72 −93 lines changed

scikits/learn/cluster/dbscan_.py

Lines changed: 66 additions & 86 deletions
@@ -1,9 +1,9 @@
 # -*- coding: utf-8 -*-
-""" Algorithms for clustering : DBSCAN
-
-DBSCAN: (Density-Based Spatial Clustering of Applications with Noise)
 """
-# Author: Robert Layton robertlayton@gmail.com
+DBSCAN: Density-Based Spatial Clustering of Applications with Noise
+"""
+
+# Author: Robert Layton <robertlayton@gmail.com>
 #
 # License: BSD
 
@@ -15,25 +15,24 @@
 
 def dbscan(S, eps=0.5, min_points=5, metric='euclidean',
            index_order=None, verbose=False, is_similarity=None,):
-    """Perform DBSCAN Clustering of data
+    """Perform DBSCAN clustering of data.
 
     Parameters
     ----------
-
     S: array [n_points, n_points] or [n_points, n_features]
-        Matrix of similarities between points, or a feature matrix.
-        If the matrix is square, it is treated as a similarity matrix,
-        otherwise it is treated as a feature matrix. Use is_similarity to
+        Array of similarities between points, or a feature array.
+        If the array is square, it is treated as a similarity array,
+        otherwise it is treated as a feature array. Use is_similarity to
         override this pattern.
     eps: float, optional
         The minimum similarity for two points to be considered
-        in the same neighbourhood.
+        in the same neighborhood.
     min_points: int, optional
-        The number of points in a neighbourhood for a point to be considered
+        The number of points in a neighborhood for a point to be considered
         as a core point.
     metric: string, or callable
         The metric to use when calculating distance between instances in a
-        feature matrix. If metric is a string, it must be one of the options
+        feature array. If metric is a string, it must be one of the options
         allowed by scipy.spatial.distance.pdist for its metric parameter.
         Alternatively, if metric is a callable function, it is called on each
         pair of instances (rows) and the resulting value recorded.
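For reference, a minimal usage sketch of the function as it stands after this change. The import path and the sample data are assumptions made for illustration; only the signature and return values come from the diff above.

import numpy as np

from scikits.learn.cluster.dbscan_ import dbscan  # assumed import path

# Made-up feature array: 6 points in 2 dimensions (not from the test suite).
X = np.array([[1.0, 1.1], [1.1, 1.0], [0.9, 1.0],
              [5.0, 5.1], [5.1, 4.9], [9.0, 9.0]])

# X is not square, so it is treated as a feature array and converted to a
# similarity array internally before clustering.
core_points, labels = dbscan(X, eps=0.85, min_points=2, metric='euclidean')
print(core_points)  # indices of the core points that were found
print(labels)       # one cluster label per point; -1 marks noise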
@@ -44,23 +43,21 @@ def dbscan(S, eps=0.5, min_points=5, metric='euclidean',
     verbose: boolean, optional
         The verbosity level
     is_similarity: boolean, optional (default=None)
-        Overrides the behaviour of the matrix handling of S.
-        If is_similarity is None, any square matrix is handled as a similarity
-        matrix and any non-square matrix is a feature matrix.
-        If is_similarity is True, any matrix is handled as a similarity matrix,
-        and the procedure will raise a ValueError if the matrix is not square.
-        If is_similarity is False, any matrix will be handled as a feature
-        matrix, including square matrices.
+        Overrides the behaviour of the array handling of S.
+        If is_similarity is None, any square array is handled as a similarity
+        array and any non-square array is a feature array.
+        If is_similarity is True, any array is handled as a similarity array,
+        and the procedure will raise a ValueError if the array is not square.
+        If is_similarity is False, any array will be handled as a feature
+        array, including square matrices.
 
     Returns
     -------
-
     core_points: array [n_core_points]
-        index of core points
+        Indices of core points.
 
     labels : array [n_points]
-        cluster labels for each point
-        Noisey points are given the label -1
+        Cluster labels for each point. Noisy points are given the label -1.
 
     Notes
     -----
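To make the is_similarity precedence concrete, a small sketch; the import path and the values are illustrative assumptions, while the behaviour shown is the one documented in the docstring above.

import numpy as np

from scikits.learn.cluster.dbscan_ import dbscan  # assumed import path

# A square feature array: 3 points with 3 features each (made-up values).
X = np.array([[0.0, 1.0, 2.0],
              [0.1, 1.1, 2.1],
              [5.0, 5.0, 5.0]])

# With the default is_similarity=None, a square array like X would be used
# directly as a similarity array. Passing is_similarity=False forces the
# feature-array path, so similarities are derived from pairwise distances.
core_points, labels = dbscan(X, eps=0.9, min_points=2, is_similarity=False)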
@@ -71,9 +68,8 @@ def dbscan(S, eps=0.5, min_points=5, metric='euclidean',
     Algorithm for Discovering Clusters in Large Spatial Databases with Noise”.
     In: Proceedings of the 2nd International Conference on Knowledge Discovery
     and Data Mining, Portland, OR, AAAI Press, pp. 226–231. 1996
-
-
     """
+
     n = S.shape[0]
     # If index order not given, create random order.
     if index_order is None:
@@ -82,19 +78,19 @@ def dbscan(S, eps=0.5, min_points=5, metric='euclidean',
     assert len(index_order) == n, ("Index order must be of length n"
                                    " (%d expected, %d given)"
                                    % (n, len(index_order)))
-    S = calculateSimilarity(S, metric=metric, is_similarity=is_similarity)
-    # Calculate neighbourhood for all points. This leaves the original point
+    S = calculate_similarity(S, metric=metric, is_similarity=is_similarity)
+    # Calculate neighborhood for all points. This leaves the original point
     # in, which needs to be considered later (i.e. point i is the
-    # neighbourhood of point i. While True, its useless information)
-    neighbourhoods = [np.where(x >= eps)[0] for x in S]
+    # neighborhood of point i. While True, its useless information)
+    neighborhoods = [np.where(x >= eps)[0] for x in S]
     # Initially, all points are noise.
-    labels = np.zeros((n,), dtype='int') - 1
+    labels = np.array([-1] * n)
     # A list of all core points found.
     core_points = []
     # Look at all points and determine if they are core.
     # If they are then build a new cluster from them.
     for index in index_order:
-        if labels[index] != -1 or len(neighbourhoods[index]) < min_points:
+        if labels[index] != -1 or len(neighborhoods[index]) < min_points:
             # This point is already classified, or not enough for a core point.
             continue
         core_points.append(index)
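The neighborhood precomputation above is one np.where call per row of the similarity array. A standalone illustration of that expression with toy numbers (not taken from the file):

import numpy as np

eps = 0.85
# Similarities of point 0 to every point, including itself (made-up values).
row = np.array([1.0, 0.9, 0.3, 0.88, 0.1])

# Indices of all points whose similarity to point 0 is at least eps.
# Point 0 itself (similarity 1.0) is included, as the comment in the diff notes.
neighborhood = np.where(row >= eps)[0]
print(neighborhood)  # -> [0 1 3]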
@@ -108,41 +104,36 @@ def dbscan(S, eps=0.5, min_points=5, metric='euclidean',
             # A candidate is a core point in the current cluster that has
             # not yet been used to expand the current cluster.
             for c in candidates:
-                for neighbour in neighbourhoods[c]:
-                    if labels[neighbour] == -1:
-                        # neighbour is part of the current cluster iff
-                        # it is not part of another cluster already.
-                        labels[neighbour] = label_num
-                        # check if its a core point as well
-                        if len(neighbourhoods[neighbour]) >= min_points:
-                            # is new core point
-                            new_candidates.append(neighbour)
-                            core_points.append(neighbour)
+                noise = np.where(labels[neighborhoods[c]] == -1)[0]
+                noise = neighborhoods[c][noise]
+                labels[noise] = label_num
+                for neighbor in noise:
+                    # check if its a core point as well
+                    if len(neighborhoods[neighbor]) >= min_points:
+                        # is new core point
+                        new_candidates.append(neighbor)
+                        core_points.append(neighbor)
             # Update candidates for next round of cluster expansion.
             candidates = new_candidates
     return core_points, labels
 
 
-def calculateSimilarity(S, metric=None, is_similarity=None):
+def calculate_similarity(S, metric=None, is_similarity=None):
     n, d = S.shape
-    # If the matrix looks square, it may be a similarity matrix.
+    # If the array looks square, it may be a similarity array.
     if n == d:
-        if is_similarity is None or is_similarity:
+        if is_similarity in (None, True):
             return S
-    else:
-        # Matrix is not square, so it cannot be a similarity matrix.
-        if is_similarity:
-            raise ValueError("Matrix not square, "
-                             "cannot be a similarity matrix."
-                             " Size: %d x %d." % (n, d))
-    # In all other cases, the matrix is to be considered as a feature matrix.
+    elif is_similarity:
+        # Array is not square, so it cannot be a similarity array.
+        raise ValueError("Array not square, cannot be a similarity array."
+                         " Shape = %s" % repr((n, d)))
+    # In all other cases, the array is to be considered as a feature array.
     D = distance.squareform(distance.pdist(S, metric=metric))
     S = 1. - (D / np.max(D))
     return S
 
 
-###############################################################################
-
 class DBSCAN(BaseEstimator):
     """Perform DBSCAN Clustering of data
 
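The rewritten expansion step replaces the per-neighbour Python loop with NumPy fancy indexing: the still-unlabelled neighbours are located and labelled in a single assignment, and only those points are then checked for core status. A standalone sketch of the indexing pattern with toy values (not taken from the file):

import numpy as np

labels = np.array([-1, -1, 0, -1, -1])  # -1 means "still noise"
neighborhood = np.array([0, 2, 3])      # neighbours of the current candidate
label_num = 1                           # label of the cluster being grown

# Positions *within the neighborhood* that are still unlabelled...
noise = np.where(labels[neighborhood] == -1)[0]
# ...mapped back to point indices, then labelled in one assignment.
noise = neighborhood[noise]
labels[noise] = label_num

print(noise)   # -> [0 3]
print(labels)  # -> [ 1 -1  0  1 -1]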
@@ -152,15 +143,14 @@ class DBSCAN(BaseEstimator):
 
     Parameters
     ----------
-
     eps: float, optional
-        The distance for two points to be considered in the same neighbourhood
+        The distance for two points to be considered in the same neighborhood
     min_points: int, optional
-        The number of points in a neighbourhood for a point to be considered
+        The number of points in a neighborhood for a point to be considered
         as a core point.
     metric: string, or callable
         The metric to use when calculating distance between instances in a
-        feature matrix. If metric is a string, it must be one of the options
+        feature array. If metric is a string, it must be one of the options
         allowed by scipy.spatial.distance.pdist for its metric parameter.
         Alternatively, if metric is a callable function, it is called on each
         pair of instances (rows) and the resulting value recorded.
@@ -171,30 +161,26 @@ class DBSCAN(BaseEstimator):
     verbose: boolean, optional
         The verbosity level
     is_similarity: boolean, optional (default=None)
-        Overrides the behaviour of the matrix handling of S.
-        If is_similarity is None, any square matrix is handled as a similarity
-        matrix and any non-square matrix is a feature matrix.
-        If is_similarity is True, any matrix is handled as a similarity matrix,
-        and the procedure will raise a ValueError if the matrix is not square.
-        If is_similarity is False, any matrix will be handled as a feature
-        matrix, including square matrices.
+        Overrides the behaviour of the array handling of S.
+        If is_similarity is None, any square array is handled as a similarity
+        array and any non-square array is a feature array.
+        If is_similarity is True, any array is handled as a similarity array,
+        and the procedure will raise a ValueError if the array is not square.
+        If is_similarity is False, any array will be handled as a feature
+        array, including square matrices.
 
     Methods
     -------
-
     fit:
         Compute the clustering
 
     Attributes
     ----------
+    core_points: array, shape = [n_core_points]
+        Indices of core points.
 
-    core_points: array [n_core_points]
-        index of core points
-
-    labels : array [n_points]
-        cluster labels for each point
-        Noisey points are given the label -1
-
+    labels : array, shape = [n_points]
+        Cluster labels for each point. Noisy points are given the label -1.
 
     Notes
     -----
@@ -219,25 +205,19 @@ def __init__(self, eps=0.5, min_points=5, metric='euclidean',
         self.is_similarity = is_similarity
 
     def fit(self, S, **params):
-        """Compute DBSCAN labels for points, using similarity matrix S.
+        """Compute DBSCAN labels for points, using similarity array S.
 
         Parameters
         ----------
-
         S: array [n_points, n_points] or [n_points, n_features]
-            Matrix of similarities between points, or a feature matrix.
-            If the matrix is square, it is treated as a similarity matrix,
-            otherwise it is treated as a feature matrix. Use is_similarity to
+            Array of similarities between points, or a feature array.
+            If the array is square, it is treated as a similarity array,
+            otherwise it is treated as a feature array. Use is_similarity to
             override this pattern.
-        params: Overwrite keywords from __init__
-
+        params: dict
+            Overwrite keywords from __init__.
         """
+
         self._set_params(**params)
-        self.core_points_, self.labels_ = dbscan(S, eps=self.eps,
-                                                 min_points=self.min_points,
-                                                 verbose=self.verbose,
-                                                 metric=self.metric,
-                                                 index_order=self.index_order,
-                                                 is_similarity=\
-                                                     self.is_similarity)
+        self.core_points_, self.labels_ = dbscan(S, **self._get_params())
         return self
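With this change, fit simply forwards the estimator's constructor parameters to the dbscan function through BaseEstimator's _set_params/_get_params machinery instead of spelling every keyword out. A usage sketch of the estimator, mirroring the calls in the tests below; the import path and the data are assumptions:

import numpy as np

from scikits.learn.cluster.dbscan_ import DBSCAN  # assumed import path

# Made-up feature array with two dense groups and one stray point.
X = np.array([[0.0, 0.0], [0.1, 0.0], [0.0, 0.1],
              [4.0, 4.0], [4.1, 4.0], [9.0, 9.0]])

db = DBSCAN()
# Keywords passed to fit() overwrite the values given to __init__ before
# being forwarded as dbscan(S, **self._get_params()).
db.fit(X, eps=0.85, min_points=2)
print(db.core_points_)
print(db.labels_)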

scikits/learn/cluster/tests/test_dbscan.py

Lines changed: 6 additions & 7 deletions
@@ -1,6 +1,5 @@
 """
-Testing for Clustering methods
-
+Tests for DBSCAN clustering algorithm
 """
 
 import numpy as np
@@ -17,7 +16,7 @@
 
 def test_dbscan_similarity():
     """
-    Tests the DBSCAN algorithm with a similarity matrix
+    Tests the DBSCAN algorithm with a similarity array
 
     """
     # Compute similarities
@@ -36,13 +35,13 @@ def test_dbscan_similarity():
     db = DBSCAN()
     labels = db.fit(S, eps=0.85, min_points=10).labels_
 
-    n_clusters_2 = len(set(labels)) - (1 if -1 in labels else 0)
+    n_clusters_2 = len(set(labels)) - int(-1 in labels)
     assert_equal(n_clusters, n_clusters_2)
 
 
 def test_dbscan_feature():
     """
-    Tests the DBSCAN algorithm with a features matrix
+    Tests the DBSCAN algorithm with a feature vector array
 
     """
     metric = 'euclidean'
@@ -52,12 +51,12 @@ def test_dbscan_feature():
                                  eps=0.85, min_points=10)
 
     # number of clusters, ignoring noise if present
-    n_clusters_1 = len(set(labels)) - (1 if -1 in labels else 0)
+    n_clusters_1 = len(set(labels)) - int(-1 in labels)
     assert_equal(n_clusters, n_clusters_1)
 
     db = DBSCAN()
     labels = db.fit(X, metric=metric,
                     eps=0.85, min_points=10).labels_
 
-    n_clusters_2 = len(set(labels)) - (1 if -1 in labels else 0)
+    n_clusters_2 = len(set(labels)) - int(-1 in labels)
     assert_equal(n_clusters, n_clusters_2)
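The updated counting expression in both tests relies on int(-1 in labels) being 1 exactly when a noise label is present, so the noise "cluster" is excluded from the count. A standalone check with toy labels (not from the test data):

labels = [0, 0, 1, -1, 1, 2, -1]
print(len(set(labels)) - int(-1 in labels))  # -> 3: clusters 0, 1 and 2

labels = [0, 0, 1]
print(len(set(labels)) - int(-1 in labels))  # -> 2: no noise label to subtract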

0 commit comments
