MechCoder · October 29, 2014 13:49
diff --git a/bench_birch3.py b/bench_birch3.py
 File: sklearn/cluster/birch.py
 Function: insert_cf_subcluster at line 101
 Total time: 143.353 s

 Line #      Hits         Time  Per Hit   % Time  Line Contents
 ==============================================================
   101                                               @profile
   102                                               def insert_cf_subcluster(self, subcluster):
   103                                                   """
   104                                                   Insert a new subcluster into the nide
   105                                                   """
   106    656982       570040      0.9      0.4          if not self.subclusters_:
   107         1            1      1.0      0.0              self.subclusters_.append(subcluster)
   108         1            1      1.0      0.0              return False
   109                                           
   110                                                   # We need to find the closest subcluster among all the
   111                                                   # subclusters so that we can insert our new subcluster.
   112                                           
   113    656981     21742431     33.1     15.2          subcluster_centroids = self.get_centroids()
   114                                                   closest_index, closest_threshold = \
   115    656981      1914890      2.9      1.3              pairwise_distances_argmin_min(subcluster.ls_[np.newaxis, :],
   116    656981       424328      0.6      0.3                                            subcluster_centroids,
   117    656981    103074945    156.9     71.9                                            check_X_y=False)
   118                                           
   119                                                   # Index returned is a numpy array.
   120    656981       832977      1.3      0.6          closest_index = closest_index[0]
   121    656981       667938      1.0      0.5          closest_subcluster = self.subclusters_[closest_index]
   122                                           
   123                                                   # If the subcluster has a child, we need a recursive strategy.
   124    656981       583248      0.9      0.4          if closest_subcluster.child_ is not None:
   125    556982       537756      1.0      0.4              split_child = closest_subcluster.child_.insert_cf_subcluster(
   126    556982      1250540      2.2      0.9                  subcluster)
   127                                           
   128    556982       367271      0.7      0.3              if not split_child:
   129                                                           # If it is determined that the child need not be split, we
   130                                                           # can just update the closest_subcluster<
8000
/td>
        
   131    533138      2860898      5.4      2.0                  self.subclusters_[closest_index].update(subcluster)
   132    533138       336903      0.6      0.2                  return False
   133                                           
   134                                                       # things not too good. we need to redistribute the subclusters in
   135                                                       # our child node, and add a new subcluster in the parent
   136                                                       # subcluster to accomodate the new child.
   137                                                       else:
   138     23844      7185851    301.4      5.0                  self.split_node(closest_subcluster.child_, closest_subcluster)
   139                                           
   140     23844        41120      1.7      0.0                  if len(self.subclusters_) > self.branching_factor:
   141      4621         3242      0.7      0.0                      return True
   142     19223        13858      0.7      0.0                  return False
   143                                           
   144                                                   # good to go!
   145     99999       602150      6.0      0.4          elif self.threshold >= closest_threshold:
   146                                                       closest_subcluster.update(subcluster)
   147                                                       return False
   148                                           
   149                                                   # not close to any other subclusters, and we still have space, so add.
   150     99999       156617      1.6      0.1          elif len(self.subclusters_) < self.branching_factor:
   151     80770        91722      1.1      0.1              self.subclusters_.append(subcluster)
   152     80770        52218      0.6      0.0              return False
   153                                           
   154                                                   # We do not have enough space nor is it closer to an other subcluster.
   155                                                   # We need to split.
   156                                                   else:
   157     19229        29591      1.5      0.0              self.subclusters_.append(subcluster)
   158     19229        12773      0.7      0.0              return True

 File: sklearn/metrics/pairwise.py
 Function: check_pairwise_arrays at line 71
 Total time: 1.21733 s

 Line #      Hits         Time  Per Hit   % Time  Line Contents
 ==============================================================
    71                                           @profile
    72                                           def check_pairwise_arrays(X, Y, dtype=None):
    73                                               """ Set X and Y appropriately and checks inputs
    74                                           
    75                                               If Y is None, it is set as a pointer to X (i.e. not a copy).
    76                                               If Y is given, this does not happen.
    77                                               All distance metrics should use this function first to assert that the
    78                                               given parameters are correct and safe to use.
    79                                           
    80                                               Specifically, this function first ensures that both X and Y are arrays,
    81                                               then checks that they are at least two dimensional while ensuring that
    82                                               their elements are floats. Finally, the function checks that the size
    83                                               of the second dimension of the two arrays is equal.
    84                                           
    85                                               Parameters
    86                                               ----------
    87                                               X : {array-like, sparse matrix}, shape (n_samples_a, n_features)
    88                                           
    89                                               Y : {array-like, sparse matrix}, shape (n_samples_b, n_features)
    90                                           
    91                                               dtype : string, type or None (default=none)
    92                                                   Data type of result. If None, the dtype of the input is preserved.
    93                                           
    94                                               Returns
    95                                               -------
    96                                               safe_X : {array-like, sparse matrix}, shape (n_samples_a, n_features)
    97                                                   An array equal to X, guaranteed to be a numpy array.
    98                                           
    99                                               safe_Y : {array-like, sparse matrix}, shape (n_samples_b, n_features)
   100                                                   An array equal to Y if Y was not None, guaranteed to be a numpy array.
   101                                                   If Y was None, safe_Y will be a pointer to X.
   102                                           
   103                                               """
   104     23851        27401      1.1      2.3      if Y is X or Y is None:
   105     23850      1139043     47.8     93.6          X = Y = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype=dtype)
   106                                               else:
   107         1          422    422.0      0.0          X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype=dtype)
   108         1          373    373.0      0.0          Y = check_array(Y, accept_sparse=['csr', 'csc', 'coo'], dtype=dtype)
   109     23851        35703      1.5      2.9      if X.shape[1] != Y.shape[1]:
   110                                                   raise ValueError("Incompatible dimension for X and Y matrices: "
   111                                                                    "X.shape[1] == %d while Y.shape[1] == %d" % (
   112                                                                        X.shape[1], Y.shape[1]))
   113                                           
   114     23851        14384      0.6      1.2      return X, Y

 File: sklearn/metrics/pairwise.py
 Function: pairwise_distances_argmin_min at line 238
 Total time: 168.961 s

 Line #      Hits         Time  Per Hit   % Time  Line Contents
 ==============================================================
   238                                           @profile
   239                                           def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean",
   240                                                                             batch_size=500, metric_kwargs=None,
   241                                                                             check_X_y=True):
   242                                               """Compute minimum distances between one point and a set of points.
   243                                           
   244                                               This function computes for each row in X, the index of the row of Y which
   245                                               is closest (according to the specified distance). The minimal distances are
   246                                               also returned.
   247                                           
   248                                               This is mostly equivalent to calling:
   249                                           
   250                                                   (pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis),
   251                                                    pairwise_distances(X, Y=Y, metric=metric).min(axis=axis))
   252                                           
   253                                               but uses much less memory, and is faster for large arrays.
   254                                           
   255                                               Parameters
   256                                               ----------
   257                                               X, Y : {array-like, sparse matrix}
   258                                                   Arrays containing points. Respective shapes (n_samples1, n_features)
   259                                                   and (n_samples2, n_features)
   260                                           
   261                                               batch_size : integer
   262                                                   To reduce memory consumption over the naive solution, data are
   263                                                   processed in batches, comprising batch_size rows of X and
   264                                                   batch_size rows of Y. The default value is quite conservative, but
   265                                                   can be changed for fine-tuning. The larger the number, the larger the
   266                                                   memory usage.
   267                                           
   268                                               metric : string or callable
   269                                                   metric to use for distance computation. Any metric from scikit-learn
   270                                                   or scipy.spatial.distance can be used.
   271                                           
   272                                                   If metric is a callable function, it is called on each
   273                                                   pair of instances (rows) and the resulting value recorded. The callable
   274                                                   should take two arrays as input and return one value indicating the
   275                                                   distance between them. This works for Scipy's metrics, but is less
   276                                                   efficient than passing the metric name as a string.
   277                                           
   278                                                   Distance matrices are not supported.
   279                                           
   280                                                   Valid values for metric are:
   281                                           
   282                                                   - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
   283                                                     'manhattan']
   284                                           
   285                                                   - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
   286                                                     'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
   287                                                     'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao',
   288                                                     'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
   289                                           
   290                                                   See the documentation for scipy.spatial.distance for details on these
   291                                                   metrics.
   292                                           
   293                                               metric_kwargs : dict, optional
   294                                                   Keyword arguments to pass to specified metric function.
   295                                           
   296                                               Returns
   297                                               -------
   298                                               argmin : numpy.ndarray
   299                                                   Y[argmin[i], :] is the row in Y that is closest to X[i, :].
   300                                           
   301                                               distances : numpy.ndarray
   302                                                   distances[i] is the distance between the i-th row in X and the
   303                                                   argmin[i]-th row in Y.
   304                                           
   305                                               See also
   306                                               --------
   307                                               sklearn.metrics.pairwise_distances
   308                                               sklearn.metrics.pairwise_distances_argmin
   309                                               """
   310    656982       670566      1.0      0.4      dist_func = None
   311    656982       690610      1.1      0.4      if metric in PAIRWISE_DISTANCE_FUNCTIONS:
   312    656982       686983      1.0      0.4          dist_func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
   313                                               elif not callable(metric) and not isinstance(metric, str):
   314                                                   raise ValueError("'metric' must be a string or a callable")
   315                                           
   316    656982       595715      0.9      0.4      if check_X_y:
   317         1          822    822.0      0.0          X, Y = check_pairwise_arrays(X, Y)
   318                                           
   319    656982       618678      0.9      0.4      if metric_kwargs is None:
   320    656982       657428      1.0      0.4          metric_kwargs = {}
   321                                           
   322    656982       619505      0.9      0.4      if axis == 0:
   323                                                   X, Y = Y, X
   324                                           
   325                                               # Allocate output arrays
   326    656982      2531210      3.9      1.5      indices = np.empty(X.shape[0], dtype=np.intp)
   327    656982      1363314      2.1      0.8      values = np.empty(X.shape[0])
   328    656982      1468895      2.2      0.9      values.fill(np.infty)
   329                                           
   330   1314163      3880532      3.0      2.3      for chunk_x in gen_batches(X.shape[0], batch_size):
   331    657181      1982743      3.0      1.2          X_chunk = X[chunk_x, :]
   332                                           
   333   1354162      3958987      2.9      2.3          for chunk_y in gen_batches(Y.shape[0], batch_size):
   334    696981      1674758      2.4      1.0              Y_chunk = Y[chunk_y, :]
   335                                           
   336    696981       703315      1.0      0.4              if dist_func is not None:
   337    696981       683908      1.0      0.4                  if metric == 'euclidean':  # special case, for speed
   338    696981      1092354      1.6      0.6                      d_chunk = safe_sparse_dot(X_chunk, Y_chunk.T,
   339    696981     36939114     53.0     21.9                                                dense_output=True)
   340    696981     11336271     16.3      6.7                      d_chunk *= -2
   341    696981     24403217     35.0     14.4                      d_chunk += row_norms(X_chunk, squared=True)[:, np.newaxis]
   342    696981     19307949     27.7     11.4                      d_chunk += row_norms(Y_chunk, squared=True)[np.newaxis, :]
   343    696981     15203629     21.8      9.0                      np.maximum(d_chunk, 0, d_chunk)
   344                                                           else:
   345                                                               d_chunk = dist_func(X_chunk, Y_chunk, **metric_kwargs)
   346                                                       else:
   347                                                           d_chunk = pairwise_distances(X_chunk, Y_chunk,
   348                                                                                        metric=metric, **metric_kwargs)
   349                                           
   350                                                       # Update indices and minimum values using chunk
   351    696981     14876105     21.3      8.8              min_indices = d_chunk.argmin(axis=1)
   352    696981      2458249      3.5      1.5              min_values = d_chunk[np.arange(chunk_x.stop - chunk_x.start),
   353    696981      5527021      7.9      3.3                                   min_indices]
   354                                           
   355    696981      3313727      4.8      2.0              flags = values[chunk_x] > min_values
   356    696981      5884946      8.4      3.5              indices[chunk_x][flags] = min_indices[flags] + chunk_y.start
   357    696981      2189400      3.1      1.3              values[chunk_x][flags] = min_values[flags]
   358                                           
   359    656982      1099594      1.7      0.7      if metric == "euclidean" and not metric_kwargs.get("squared", False):
   360    656982      1891684      2.9      1.1          np.sqrt(values, values)
	File: sklearn/cluster/birch.py
	Function: insert_cf_subcluster at line 101
	Total time: 143.353 s

	Line # Hits Time Per Hit % Time Line Contents
	==============================================================
	101 @profile
	102 def insert_cf_subcluster(self, subcluster):
	103 """
	104 Insert a new subcluster into the nide
	105 """
	106 656982 570040 0.9 0.4 if not self.subclusters_:
	107 1 1 1.0 0.0 self.subclusters_.append(subcluster)
	108 1 1 1.0 0.0 return False
	109
	110 # We need to find the closest subcluster among all the
	111 # subclusters so that we can insert our new subcluster.
	112
	113 656981 21742431 33.1 15.2 subcluster_centroids = self.get_centroids()
	114 closest_index, closest_threshold = \
	115 656981 1914890 2.9 1.3 pairwise_distances_argmin_min(subcluster.ls_[np.newaxis, :],
	116 656981 424328 0.6 0.3 subcluster_centroids,
	117 656981 103074945 156.9 71.9 check_X_y=False)
	118
	119 # Index returned is a numpy array.
	120 656981 832977 1.3 0.6 closest_index = closest_index[0]
	121 656981 667938 1.0 0.5 closest_subcluster = self.subclusters_[closest_index]
	122
	123 # If the subcluster has a child, we need a recursive strategy.
	124 656981 583248 0.9 0.4 if closest_subcluster.child_ is not None:
	125 556982 537756 1.0 0.4 split_child = closest_subcluster.child_.insert_cf_subcluster(
	126 556982 1250540 2.2 0.9 subcluster)
	127
	128 556982 367271 0.7 0.3 if not split_child:
	129 # If it is determined that the child need not be split, we
	130 # can just update the closest_subcluster< 8000 /td>
	131 533138 2860898 5.4 2.0 self.subclusters_[closest_index].update(subcluster)
	132 533138 336903 0.6 0.2 return False
	133
	134 # things not too good. we need to redistribute the subclusters in
	135 # our child node, and add a new subcluster in the parent
	136 # subcluster to accomodate the new child.
	137 else:
	138 23844 7185851 301.4 5.0 self.split_node(closest_subcluster.child_, closest_subcluster)
	139
	140 23844 41120 1.7 0.0 if len(self.subclusters_) > self.branching_factor:
	141 4621 3242 0.7 0.0 return True
	142 19223 13858 0.7 0.0 return False
	143
	144 # good to go!
	145 99999 602150 6.0 0.4 elif self.threshold >= closest_threshold:
	146 closest_subcluster.update(subcluster)
	147 return False
	148
	149 # not close to any other subclusters, and we still have space, so add.
	150 99999 156617 1.6 0.1 elif len(self.subclusters_) < self.branching_factor:
	151 80770 91722 1.1 0.1 self.subclusters_.append(subcluster)
	152 80770 52218 0.6 0.0 return False
	153
	154 # We do not have enough space nor is it closer to an other subcluster.
	155 # We need to split.
	156 else:
	157 19229 29591 1.5 0.0 self.subclusters_.append(subcluster)
	158 19229 12773 0.7 0.0 return True

	File: sklearn/metrics/pairwise.py
	Function: check_pairwise_arrays at line 71
	Total time: 1.21733 s

	Line # Hits Time Per Hit % Time Line Contents
	==============================================================
	71 @profile
	72 def check_pairwise_arrays(X, Y, dtype=None):
	73 """ Set X and Y appropriately and checks inputs
	74
	75 If Y is None, it is set as a pointer to X (i.e. not a copy).
	76 If Y is given, this does not happen.
	77 All distance metrics should use this function first to assert that the
	78 given parameters are correct and safe to use.
	79
	80 Specifically, this function first ensures that both X and Y are arrays,
	81 then checks that they are at least two dimensional while ensuring that
	82 their elements are floats. Finally, the function checks that the size
	83 of the second dimension of the two arrays is equal.
	84
	85 Parameters
	86 ----------
	87 X : {array-like, sparse matrix}, shape (n_samples_a, n_features)
	88
	89 Y : {array-like, sparse matrix}, shape (n_samples_b, n_features)
	90
	91 dtype : string, type or None (default=none)
	92 Data type of result. If None, the dtype of the input is preserved.
	93
	94 Returns
	95 -------
	96 safe_X : {array-like, sparse matrix}, shape (n_samples_a, n_features)
	97 An array equal to X, guaranteed to be a numpy array.
	98
	99 safe_Y : {array-like, sparse matrix}, shape (n_samples_b, n_features)
	100 An array equal to Y if Y was not None, guaranteed to be a numpy array.
	101 If Y was None, safe_Y will be a pointer to X.
	102
	103 """
	104 23851 27401 1.1 2.3 if Y is X or Y is None:
	105 23850 1139043 47.8 93.6 X = Y = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype=dtype)
	106 else:
	107 1 422 422.0 0.0 X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype=dtype)
	108 1 373 373.0 0.0 Y = check_array(Y, accept_sparse=['csr', 'csc', 'coo'], dtype=dtype)
	109 23851 35703 1.5 2.9 if X.shape[1] != Y.shape[1]:
	110 raise ValueError("Incompatible dimension for X and Y matrices: "
	111 "X.shape[1] == %d while Y.shape[1] == %d" % (
	112 X.shape[1], Y.shape[1]))
	113
	114 23851 14384 0.6 1.2 return X, Y

	File: sklearn/metrics/pairwise.py
	Function: pairwise_distances_argmin_min at line 238
	Total time: 168.961 s

	Line # Hits Time Per Hit % Time Line Contents
	==============================================================
	238 @profile
	239 def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean",
	240 batch_size=500, metric_kwargs=None,
	241 check_X_y=True):
	242 """Compute minimum distances between one point and a set of points.
	243
	244 This function computes for each row in X, the index of the row of Y which
	245 is closest (according to the specified distance). The minimal distances are
	246 also returned.
	247
	248 This is mostly equivalent to calling:
	249
	250 (pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis),
	251 pairwise_distances(X, Y=Y, metric=metric).min(axis=axis))
	252
	253 but uses much less memory, and is faster for large arrays.
	254
	255 Parameters
	256 ----------
	257 X, Y : {array-like, sparse matrix}
	258 Arrays containing points. Respective shapes (n_samples1, n_features)
	259 and (n_samples2, n_features)
	260
	261 batch_size : integer
	262 To reduce memory consumption over the naive solution, data are
	263 processed in batches, comprising batch_size rows of X and
	264 batch_size rows of Y. The default value is quite conservative, but
	265 can be changed for fine-tuning. The larger the number, the larger the
	266 memory usage.
	267
	268 metric : string or callable
	269 metric to use for distance computation. Any metric from scikit-learn
	270 or scipy.spatial.distance can be used.
	271
	272 If metric is a callable function, it is called on each
	273 pair of instances (rows) and the resulting value recorded. The callable
	274 should take two arrays as input and return one value indicating the
	275 distance between them. This works for Scipy's metrics, but is less
	276 efficient than passing the metric name as a string.
	277
	278 Distance matrices are not supported.
	279
	280 Valid values for metric are:
	281
	282 - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
	283 'manhattan']
	284
	285 - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
	286 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
	287 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao',
	288 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
	289
	290 See the documentation for scipy.spatial.distance for details on these
	291 metrics.
	292
	293 metric_kwargs : dict, optional
	294 Keyword arguments to pass to specified metric function.
	295
	296 Returns
	297 -------
	298 argmin : numpy.ndarray
	299 Y[argmin[i], :] is the row in Y that is closest to X[i, :].
	300
	301 distances : numpy.ndarray
	302 distances[i] is the distance between the i-th row in X and the
	303 argmin[i]-th row in Y.
	304
	305 See also
	306 --------
	307 sklearn.metrics.pairwise_distances
	308 sklearn.metrics.pairwise_distances_argmin
	309 """
	310 656982 670566 1.0 0.4 dist_func = None
	311 656982 690610 1.1 0.4 if metric in PAIRWISE_DISTANCE_FUNCTIONS:
	312 656982 686983 1.0 0.4 dist_func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
	313 elif not callable(metric) and not isinstance(metric, str):
	314 raise ValueError("'metric' must be a string or a callable")
	315
	316 656982 595715 0.9 0.4 if check_X_y:
	317 1 822 822.0 0.0 X, Y = check_pairwise_arrays(X, Y)
	318
	319 656982 618678 0.9 0.4 if metric_kwargs is None:
	320 656982 657428 1.0 0.4 metric_kwargs = {}
	321
	322 656982 619505 0.9 0.4 if axis == 0:
	323 X, Y = Y, X
	324
	325 # Allocate output arrays
	326 656982 2531210 3.9 1.5 indices = np.empty(X.shape[0], dtype=np.intp)
	327 656982 1363314 2.1 0.8 values = np.empty(X.shape[0])
	328 656982 1468895 2.2 0.9 values.fill(np.infty)
	329
	330 1314163 3880532 3.0 2.3 for chunk_x in gen_batches(X.shape[0], batch_size):
	331 657181 1982743 3.0 1.2 X_chunk = X[chunk_x, :]
	332
	333 1354162 3958987 2.9 2.3 for chunk_y in gen_batches(Y.shape[0], batch_size):
	334 696981 1674758 2.4 1.0 Y_chunk = Y[chunk_y, :]
	335
	336 696981 703315 1.0 0.4 if dist_func is not None:
	337 696981 683908 1.0 0.4 if metric == 'euclidean': # special case, for speed
	338 696981 1092354 1.6 0.6 d_chunk = safe_sparse_dot(X_chunk, Y_chunk.T,
	339 696981 36939114 53.0 21.9 dense_output=True)
	340 696981 11336271 16.3 6.7 d_chunk *= -2
	341 696981 24403217 35.0 14.4 d_chunk += row_norms(X_chunk, squared=True)[:, np.newaxis]
	342 696981 19307949 27.7 11.4 d_chunk += row_norms(Y_chunk, squared=True)[np.newaxis, :]
	343 696981 15203629 21.8 9.0 np.maximum(d_chunk, 0, d_chunk)
	344 else:
	345 d_chunk = dist_func(X_chunk, Y_chunk, **metric_kwargs)
	346 else:
	347 d_chunk = pairwise_distances(X_chunk, Y_chunk,
	348 metric=metric, **metric_kwargs)
	349
	350 # Update indices and minimum values using chunk
	351 696981 14876105 21.3 8.8 min_indices = d_chunk.argmin(axis=1)
	352 696981 2458249 3.5 1.5 min_values = d_chunk[np.arange(chunk_x.stop - chunk_x.start),
	353 696981 5527021 7.9 3.3 min_indices]
	354
	355 696981 3313727 4.8 2.0 flags = values[chunk_x] > min_values
	356 696981 5884946 8.4 3.5 indices[chunk_x][flags] = min_indices[flags] + chunk_y.start
	357 696981 2189400 3.1 1.3 values[chunk_x][flags] = min_values[flags]
	358
	359 656982 1099594 1.7 0.7 if metric == "euclidean" and not metric_kwargs.get("squared", False):
	360 656982 1891684 2.9 1.1 np.sqrt(values, values)