8000 Benh birch without repeated check_pairwise (249 s) · GitHub
[go: up one dir, main page]

Skip to content

Instantly share code, notes, and snippets.

@MechCoder
Created October 29, 2014 13:49
Show Gist options
  • Save MechCoder/6c29efb33917c5d97857 to your computer and use it in GitHub Desktop.
Save MechCoder/6c29efb33917c5d97857 to your computer and use it in GitHub Desktop.
Benh birch without repeated check_pairwise (249 s)
3E74
File: sklearn/cluster/birch.py
Function: insert_cf_subcluster at line 101
Total time: 143.353 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
101 @profile
102 def insert_cf_subcluster(self, subcluster):
103 """
104 Insert a new subcluster into the nide
105 """
106 656982 570040 0.9 0.4 if not self.subclusters_:
107 1 1 1.0 0.0 self.subclusters_.append(subcluster)
108 1 1 1.0 0.0 return False
109
110 # We need to find the closest subcluster among all the
111 # subclusters so that we can insert our new subcluster.
112
113 656981 21742431 33.1 15.2 subcluster_centroids = self.get_centroids()
114 closest_index, closest_threshold = \
115 656981 1914890 2.9 1.3 pairwise_distances_argmin_min(subcluster.ls_[np.newaxis, :],
116 656981 424328 0.6 0.3 subcluster_centroids,
117 656981 103074945 156.9 71.9 check_X_y=False)
118
119 # Index returned is a numpy array.
120 656981 832977 1.3 0.6 closest_index = closest_index[0]
121 656981 667938 1.0 0.5 closest_subcluster = self.subclusters_[closest_index]
122
123 # If the subcluster has a child, we need a recursive strategy.
124 656981 583248 0.9 0.4 if closest_subcluster.child_ is not None:
125 556982 537756 1.0 0.4 split_child = closest_subcluster.child_.insert_cf_subcluster(
126 556982 1250540 2.2 0.9 subcluster)
127
128 556982 367271 0.7 0.3 if not split_child:
129 # If it is determined that the child need not be split, we
130 # can just update the closest_subcluster< 8000 /td>
131 533138 2860898 5.4 2.0 self.subclusters_[closest_index].update(subcluster)
132 533138 336903 0.6 0.2 return False
133
134 # things not too good. we need to redistribute the subclusters in
135 # our child node, and add a new subcluster in the parent
136 # subcluster to accomodate the new child.
137 else:
138 23844 7185851 301.4 5.0 self.split_node(closest_subcluster.child_, closest_subcluster)
139
140 23844 41120 1.7 0.0 if len(self.subclusters_) > self.branching_factor:
141 4621 3242 0.7 0.0 return True
142 19223 13858 0.7 0.0 return False
143
144 # good to go!
145 99999 602150 6.0 0.4 elif self.threshold >= closest_threshold:
146 closest_subcluster.update(subcluster)
147 return False
148
149 # not close to any other subclusters, and we still have space, so add.
150 99999 156617 1.6 0.1 elif len(self.subclusters_) < self.branching_factor:
151 80770 91722 1.1 0.1 self.subclusters_.append(subcluster)
152 80770 52218 0.6 0.0 return False
153
154 # We do not have enough space nor is it closer to an other subcluster.
155 # We need to split.
156 else:
157 19229 29591 1.5 0.0 self.subclusters_.append(subcluster)
158 19229 12773 0.7 0.0 return True
File: sklearn/metrics/pairwise.py
Function: check_pairwise_arrays at line 71
Total time: 1.21733 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
71 @profile
72 def check_pairwise_arrays(X, Y, dtype=None):
73 """ Set X and Y appropriately and checks inputs
74
75 If Y is None, it is set as a pointer to X (i.e. not a copy).
76 If Y is given, this does not happen.
77 All distance metrics should use this function first to assert that the
78 given parameters are correct and safe to use.
79
80 Specifically, this function first ensures that both X and Y are arrays,
81 then checks that they are at least two dimensional while ensuring that
82 their elements are floats. Finally, the function checks that the size
83 of the second dimension of the two arrays is equal.
84
85 Parameters
86 ----------
87 X : {array-like, sparse matrix}, shape (n_samples_a, n_features)
88
89 Y : {array-like, sparse matrix}, shape (n_samples_b, n_features)
90
91 dtype : string, type or None (default=none)
92 Data type of result. If None, the dtype of the input is preserved.
93
94 Returns
95 -------
96 safe_X : {array-like, sparse matrix}, shape (n_samples_a, n_features)
97 An array equal to X, guaranteed to be a numpy array.
98
99 safe_Y : {array-like, sparse matrix}, shape (n_samples_b, n_features)
100 An array equal to Y if Y was not None, guaranteed to be a numpy array.
101 If Y was None, safe_Y will be a pointer to X.
102
103 """
104 23851 27401 1.1 2.3 if Y is X or Y is None:
105 23850 1139043 47.8 93.6 X = Y = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype=dtype)
106 else:
107 1 422 422.0 0.0 X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype=dtype)
108 1 373 373.0 0.0 Y = check_array(Y, accept_sparse=['csr', 'csc', 'coo'], dtype=dtype)
109 23851 35703 1.5 2.9 if X.shape[1] != Y.shape[1]:
110 raise ValueError("Incompatible dimension for X and Y matrices: "
111 "X.shape[1] == %d while Y.shape[1] == %d" % (
112 X.shape[1], Y.shape[1]))
113
114 23851 14384 0.6 1.2 return X, Y
File: sklearn/metrics/pairwise.py
Function: pairwise_distances_argmin_min at line 238
Total time: 168.961 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
238 @profile
239 def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean",
240 batch_size=500, metric_kwargs=None,
241 check_X_y=True):
242 """Compute minimum distances between one point and a set of points.
243
244 This function computes for each row in X, the index of the row of Y which
245 is closest (according to the specified distance). The minimal distances are
246 also returned.
247
248 This is mostly equivalent to calling:
249
250 (pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis),
251 pairwise_distances(X, Y=Y, metric=metric).min(axis=axis))
252
253 but uses much less memory, and is faster for large arrays.
254
255 Parameters
256 ----------
257 X, Y : {array-like, sparse matrix}
258 Arrays containing points. Respective shapes (n_samples1, n_features)
259 and (n_samples2, n_features)
260
261 batch_size : integer
262 To reduce memory consumption over the naive solution, data are
263 processed in batches, comprising batch_size rows of X and
264 batch_size rows of Y. The default value is quite conservative, but
265 can be changed for fine-tuning. The larger the number, the larger the
266 memory usage.
267
268 metric : string or callable
269 metric to use for distance computation. Any metric from scikit-learn
270 or scipy.spatial.distance can be used.
271
272 If metric is a callable function, it is called on each
273 pair of instances (rows) and the resulting value recorded. The callable
274 should take two arrays as input and return one value indicating the
275 distance between them. This works for Scipy's metrics, but is less
276 efficient than passing the metric name as a string.
277
278 Distance matrices are not supported.
279
280 Valid values for metric are:
281
282 - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
283 'manhattan']
284
285 - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
286 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
287 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao',
288 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
289
290 See the documentation for scipy.spatial.distance for details on these
291 metrics.
292
293 metric_kwargs : dict, optional
294 Keyword arguments to pass to specified metric function.
295
296 Returns
297 -------
298 argmin : numpy.ndarray
299 Y[argmin[i], :] is the row in Y that is closest to X[i, :].
300
301 distances : numpy.ndarray
302 distances[i] is the distance between the i-th row in X and the
303 argmin[i]-th row in Y.
304
305 See also
306 --------
307 sklearn.metrics.pairwise_distances
308 sklearn.metrics.pairwise_distances_argmin
309 """
310 656982 670566 1.0 0.4 dist_func = None
311 656982 690610 1.1 0.4 if metric in PAIRWISE_DISTANCE_FUNCTIONS:
312 656982 686983 1.0 0.4 dist_func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
313 elif not callable(metric) and not isinstance(metric, str):
314 raise ValueError("'metric' must be a string or a callable")
315
316 656982 595715 0.9 0.4 if check_X_y:
317 1 822 822.0 0.0 X, Y = check_pairwise_arrays(X, Y)
318
319 656982 618678 0.9 0.4 if metric_kwargs is None:
320 656982 657428 1.0 0.4 metric_kwargs = {}
321
322 656982 619505 0.9 0.4 if axis == 0:
323 X, Y = Y, X
324
325 # Allocate output arrays
326 656982 2531210 3.9 1.5 indices = np.empty(X.shape[0], dtype=np.intp)
327 656982 1363314 2.1 0.8 values = np.empty(X.shape[0])
328 656982 1468895 2.2 0.9 values.fill(np.infty)
329
330 1314163 3880532 3.0 2.3 for chunk_x in gen_batches(X.shape[0], batch_size):
331 657181 1982743 3.0 1.2 X_chunk = X[chunk_x, :]
332
333 1354162 3958987 2.9 2.3 for chunk_y in gen_batches(Y.shape[0], batch_size):
334 696981 1674758 2.4 1.0 Y_chunk = Y[chunk_y, :]
335
336 696981 703315 1.0 0.4 if dist_func is not None:
337 696981 683908 1.0 0.4 if metric == 'euclidean': # special case, for speed
338 696981 1092354 1.6 0.6 d_chunk = safe_sparse_dot(X_chunk, Y_chunk.T,
339 696981 36939114 53.0 21.9 dense_output=True)
340 696981 11336271 16.3 6.7 d_chunk *= -2
341 696981 24403217 35.0 14.4 d_chunk += row_norms(X_chunk, squared=True)[:, np.newaxis]
342 696981 19307949 27.7 11.4 d_chunk += row_norms(Y_chunk, squared=True)[np.newaxis, :]
343 696981 15203629 21.8 9.0 np.maximum(d_chunk, 0, d_chunk)
344 else:
345 d_chunk = dist_func(X_chunk, Y_chunk, **metric_kwargs)
346 else:
347 d_chunk = pairwise_distances(X_chunk, Y_chunk,
348 metric=metric, **metric_kwargs)
349
350 # Update indices and minimum values using chunk
351 696981 14876105 21.3 8.8 min_indices = d_chunk.argmin(axis=1)
352 696981 2458249 3.5 1.5 min_values = d_chunk[np.arange(chunk_x.stop - chunk_x.start),
353 696981 5527021 7.9 3.3 min_indices]
354
355 696981 3313727 4.8 2.0 flags = values[chunk_x] > min_values
356 696981 5884946 8.4 3.5 indices[chunk_x][flags] = min_indices[flags] + chunk_y.start
357 696981 2189400 3.1 1.3 values[chunk_x][flags] = min_values[flags]
358
359 656982 1099594 1.7 0.7 if metric == "euclidean" and not metric_kwargs.get("squared", False):
360 656982 1891684 2.9 1.1 np.sqrt(values, values)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
0