Created
October 29, 2014 13:49
-
-
Save MechCoder/6c29efb33917c5d97857 to your computer and use it in GitHub Desktop.
Benh birch without repeated check_pairwise (249 s)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File: sklearn/cluster/birch.py | |
Function: insert_cf_subcluster at line 101 | |
Total time: 143.353 s | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
101 @profile | |
102 def insert_cf_subcluster(self, subcluster): | |
103 """ | |
104 Insert a new subcluster into the nide | |
105 """ | |
106 656982 570040 0.9 0.4 if not self.subclusters_: | |
107 1 1 1.0 0.0 self.subclusters_.append(subcluster) | |
108 1 1 1.0 0.0 return False | |
109 | |
110 # We need to find the closest subcluster among all the | |
111 # subclusters so that we can insert our new subcluster. | |
112 | |
113 656981 21742431 33.1 15.2 subcluster_centroids = self.get_centroids() | |
114 closest_index, closest_threshold = \ | |
115 656981 1914890 2.9 1.3 pairwise_distances_argmin_min(subcluster.ls_[np.newaxis, :], | |
116 656981 424328 0.6 0.3 subcluster_centroids, | |
117 656981 103074945 156.9 71.9 check_X_y=False) | |
118 | |
119 # Index returned is a numpy array. | |
120 656981 832977 1.3 0.6 closest_index = closest_index[0] | |
121 656981 667938 1.0 0.5 closest_subcluster = self.subclusters_[closest_index] | |
122 | |
123 # If the subcluster has a child, we need a recursive strategy. | |
124 656981 583248 0.9 0.4 if closest_subcluster.child_ is not None: | |
125 556982 537756 1.0 0.4 split_child = closest_subcluster.child_.insert_cf_subcluster( | |
126 556982 1250540 2.2 0.9 subcluster) | |
127 | |
128 556982 367271 0.7 0.3 if not split_child: | |
129 # If it is determined that the child need not be split, we | |
130 # can just update the closest_subcluster< 8000 /td> | |
131 533138 2860898 5.4 2.0 self.subclusters_[closest_index].update(subcluster) | |
132 533138 336903 0.6 0.2 return False | |
133 | |
134 # things not too good. we need to redistribute the subclusters in | |
135 # our child node, and add a new subcluster in the parent | |
136 # subcluster to accomodate the new child. | |
137 else: | |
138 23844 7185851 301.4 5.0 self.split_node(closest_subcluster.child_, closest_subcluster) | |
139 | |
140 23844 41120 1.7 0.0 if len(self.subclusters_) > self.branching_factor: | |
141 4621 3242 0.7 0.0 return True | |
142 19223 13858 0.7 0.0 return False | |
143 | |
144 # good to go! | |
145 99999 602150 6.0 0.4 elif self.threshold >= closest_threshold: | |
146 closest_subcluster.update(subcluster) | |
147 return False | |
148 | |
149 # not close to any other subclusters, and we still have space, so add. | |
150 99999 156617 1.6 0.1 elif len(self.subclusters_) < self.branching_factor: | |
151 80770 91722 1.1 0.1 self.subclusters_.append(subcluster) | |
152 80770 52218 0.6 0.0 return False | |
153 | |
154 # We do not have enough space nor is it closer to an other subcluster. | |
155 # We need to split. | |
156 else: | |
157 19229 29591 1.5 0.0 self.subclusters_.append(subcluster) | |
158 19229 12773 0.7 0.0 return True | |
File: sklearn/metrics/pairwise.py | |
Function: check_pairwise_arrays at line 71 | |
Total time: 1.21733 s | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
71 @profile | |
72 def check_pairwise_arrays(X, Y, dtype=None): | |
73 """ Set X and Y appropriately and checks inputs | |
74 | |
75 If Y is None, it is set as a pointer to X (i.e. not a copy). | |
76 If Y is given, this does not happen. | |
77 All distance metrics should use this function first to assert that the | |
78 given parameters are correct and safe to use. | |
79 | |
80 Specifically, this function first ensures that both X and Y are arrays, | |
81 then checks that they are at least two dimensional while ensuring that | |
82 their elements are floats. Finally, the function checks that the size | |
83 of the second dimension of the two arrays is equal. | |
84 | |
85 Parameters | |
86 ---------- | |
87 X : {array-like, sparse matrix}, shape (n_samples_a, n_features) | |
88 | |
89 Y : {array-like, sparse matrix}, shape (n_samples_b, n_features) | |
90 | |
91 dtype : string, type or None (default=none) | |
92 Data type of result. If None, the dtype of the input is preserved. | |
93 | |
94 Returns | |
95 ------- | |
96 safe_X : {array-like, sparse matrix}, shape (n_samples_a, n_features) | |
97 An array equal to X, guaranteed to be a numpy array. | |
98 | |
99 safe_Y : {array-like, sparse matrix}, shape (n_samples_b, n_features) | |
100 An array equal to Y if Y was not None, guaranteed to be a numpy array. | |
101 If Y was None, safe_Y will be a pointer to X. | |
102 | |
103 """ | |
104 23851 27401 1.1 2.3 if Y is X or Y is None: | |
105 23850 1139043 47.8 93.6 X = Y = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype=dtype) | |
106 else: | |
107 1 422 422.0 0.0 X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype=dtype) | |
108 1 373 373.0 0.0 Y = check_array(Y, accept_sparse=['csr', 'csc', 'coo'], dtype=dtype) | |
109 23851 35703 1.5 2.9 if X.shape[1] != Y.shape[1]: | |
110 raise ValueError("Incompatible dimension for X and Y matrices: " | |
111 "X.shape[1] == %d while Y.shape[1] == %d" % ( | |
112 X.shape[1], Y.shape[1])) | |
113 | |
114 23851 14384 0.6 1.2 return X, Y | |
File: sklearn/metrics/pairwise.py | |
Function: pairwise_distances_argmin_min at line 238 | |
Total time: 168.961 s | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
238 @profile | |
239 def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", | |
240 batch_size=500, metric_kwargs=None, | |
241 check_X_y=True): | |
242 """Compute minimum distances between one point and a set of points. | |
243 | |
244 This function computes for each row in X, the index of the row of Y which | |
245 is closest (according to the specified distance). The minimal distances are | |
246 also returned. | |
247 | |
248 This is mostly equivalent to calling: | |
249 | |
250 (pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis), | |
251 pairwise_distances(X, Y=Y, metric=metric).min(axis=axis)) | |
252 | |
253 but uses much less memory, and is faster for large arrays. | |
254 | |
255 Parameters | |
256 ---------- | |
257 X, Y : {array-like, sparse matrix} | |
258 Arrays containing points. Respective shapes (n_samples1, n_features) | |
259 and (n_samples2, n_features) | |
260 | |
261 batch_size : integer | |
262 To reduce memory consumption over the naive solution, data are | |
263 processed in batches, comprising batch_size rows of X and | |
264 batch_size rows of Y. The default value is quite conservative, but | |
265 can be changed for fine-tuning. The larger the number, the larger the | |
266 memory usage. | |
267 | |
268 metric : string or callable | |
269 metric to use for distance computation. Any metric from scikit-learn | |
270 or scipy.spatial.distance can be used. | |
271 | |
272 If metric is a callable function, it is called on each | |
273 pair of instances (rows) and the resulting value recorded. The callable | |
274 should take two arrays as input and return one value indicating the | |
275 distance between them. This works for Scipy's metrics, but is less | |
276 efficient than passing the metric name as a string. | |
277 | |
278 Distance matrices are not supported. | |
279 | |
280 Valid values for metric are: | |
281 | |
282 - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', | |
283 'manhattan'] | |
284 | |
285 - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', | |
286 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', | |
287 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao', | |
288 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'] | |
289 | |
290 See the documentation for scipy.spatial.distance for details on these | |
291 metrics. | |
292 | |
293 metric_kwargs : dict, optional | |
294 Keyword arguments to pass to specified metric function. | |
295 | |
296 Returns | |
297 ------- | |
298 argmin : numpy.ndarray | |
299 Y[argmin[i], :] is the row in Y that is closest to X[i, :]. | |
300 | |
301 distances : numpy.ndarray | |
302 distances[i] is the distance between the i-th row in X and the | |
303 argmin[i]-th row in Y. | |
304 | |
305 See also | |
306 -------- | |
307 sklearn.metrics.pairwise_distances | |
308 sklearn.metrics.pairwise_distances_argmin | |
309 """ | |
310 656982 670566 1.0 0.4 dist_func = None | |
311 656982 690610 1.1 0.4 if metric in PAIRWISE_DISTANCE_FUNCTIONS: | |
312 656982 686983 1.0 0.4 dist_func = PAIRWISE_DISTANCE_FUNCTIONS[metric] | |
313 elif not callable(metric) and not isinstance(metric, str): | |
314 raise ValueError("'metric' must be a string or a callable") | |
315 | |
316 656982 595715 0.9 0.4 if check_X_y: | |
317 1 822 822.0 0.0 X, Y = check_pairwise_arrays(X, Y) | |
318 | |
319 656982 618678 0.9 0.4 if metric_kwargs is None: | |
320 656982 657428 1.0 0.4 metric_kwargs = {} | |
321 | |
322 656982 619505 0.9 0.4 if axis == 0: | |
323 X, Y = Y, X | |
324 | |
325 # Allocate output arrays | |
326 656982 2531210 3.9 1.5 indices = np.empty(X.shape[0], dtype=np.intp) | |
327 656982 1363314 2.1 0.8 values = np.empty(X.shape[0]) | |
328 656982 1468895 2.2 0.9 values.fill(np.infty) | |
329 | |
330 1314163 3880532 3.0 2.3 for chunk_x in gen_batches(X.shape[0], batch_size): | |
331 657181 1982743 3.0 1.2 X_chunk = X[chunk_x, :] | |
332 | |
333 1354162 3958987 2.9 2.3 for chunk_y in gen_batches(Y.shape[0], batch_size): | |
334 696981 1674758 2.4 1.0 Y_chunk = Y[chunk_y, :] | |
335 | |
336 696981 703315 1.0 0.4 if dist_func is not None: | |
337 696981 683908 1.0 0.4 if metric == 'euclidean': # special case, for speed | |
338 696981 1092354 1.6 0.6 d_chunk = safe_sparse_dot(X_chunk, Y_chunk.T, | |
339 696981 36939114 53.0 21.9 dense_output=True) | |
340 696981 11336271 16.3 6.7 d_chunk *= -2 | |
341 696981 24403217 35.0 14.4 d_chunk += row_norms(X_chunk, squared=True)[:, np.newaxis] | |
342 696981 19307949 27.7 11.4 d_chunk += row_norms(Y_chunk, squared=True)[np.newaxis, :] | |
343 696981 15203629 21.8 9.0 np.maximum(d_chunk, 0, d_chunk) | |
344 else: | |
345 d_chunk = dist_func(X_chunk, Y_chunk, **metric_kwargs) | |
346 else: | |
347 d_chunk = pairwise_distances(X_chunk, Y_chunk, | |
348 metric=metric, **metric_kwargs) | |
349 | |
350 # Update indices and minimum values using chunk | |
351 696981 14876105 21.3 8.8 min_indices = d_chunk.argmin(axis=1) | |
352 696981 2458249 3.5 1.5 min_values = d_chunk[np.arange(chunk_x.stop - chunk_x.start), | |
353 696981 5527021 7.9 3.3 min_indices] | |
354 | |
355 696981 3313727 4.8 2.0 flags = values[chunk_x] > min_values | |
356 696981 5884946 8.4 3.5 indices[chunk_x][flags] = min_indices[flags] + chunk_y.start | |
357 696981 2189400 3.1 1.3 values[chunk_x][flags] = min_values[flags] | |
358 | |
359 656982 1099594 1.7 0.7 if metric == "euclidean" and not metric_kwargs.get("squared", False): | |
360 656982 1891684 2.9 1.1 np.sqrt(values, values) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment