From 15ba44cad55ca504c37840badfe23c5d153d3a41 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Mon, 13 Mar 2023 18:21:39 +0100 Subject: [PATCH 01/14] wip --- sklearn/cluster/_dbscan_inner.pyx | 1 + sklearn/cluster/_k_means_common.pxd | 2 +- sklearn/cluster/_k_means_common.pyx | 16 ++++++----- sklearn/cluster/_k_means_elkan.pyx | 29 ++++++++++++-------- sklearn/cluster/_k_means_lloyd.pyx | 9 ++++--- sklearn/linear_model/_cd_fast.pyx | 42 ++++++++++++++++++----------- 6 files changed, 62 insertions(+), 37 deletions(-) diff --git a/sklearn/cluster/_dbscan_inner.pyx b/sklearn/cluster/_dbscan_inner.pyx index 8fb494af69e11..49e190cb9585e 100644 --- a/sklearn/cluster/_dbscan_inner.pyx +++ b/sklearn/cluster/_dbscan_inner.pyx @@ -7,6 +7,7 @@ cimport numpy as cnp cnp.import_array() + def dbscan_inner(const cnp.uint8_t[::1] is_core, object[:] neighborhoods, cnp.npy_intp[::1] labels): diff --git a/sklearn/cluster/_k_means_common.pxd b/sklearn/cluster/_k_means_common.pxd index 0ef38dbcf2b7c..9a41ea68d1baf 100644 --- a/sklearn/cluster/_k_means_common.pxd +++ b/sklearn/cluster/_k_means_common.pxd @@ -12,7 +12,7 @@ cdef floating _euclidean_sparse_dense( const floating[::1], const int[::1], const floating[::1], - floating, + floating, bint ) noexcept nogil diff --git a/sklearn/cluster/_k_means_common.pyx b/sklearn/cluster/_k_means_common.pyx index ba78e038feb98..192a4bdec1088 100644 --- a/sklearn/cluster/_k_means_common.pyx +++ b/sklearn/cluster/_k_means_common.pyx @@ -35,11 +35,14 @@ cdef floating _euclidean_dense_dense( # We manually unroll the loop for better cache optimization. for i in range(n): - result += ((a[0] - b[0]) * (a[0] - b[0]) - +(a[1] - b[1]) * (a[1] - b[1]) - +(a[2] - b[2]) * (a[2] - b[2]) - +(a[3] - b[3]) * (a[3] - b[3])) - a += 4; b += 4 + result += ( + (a[0] - b[0]) * (a[0] - b[0]) + + (a[1] - b[1]) * (a[1] - b[1]) + + (a[2] - b[2]) * (a[2] - b[2]) + + (a[3] - b[3]) * (a[3] - b[3]) + ) + a += 4 + b += 4 for i in range(rem): result += (a[i] - b[i]) * (a[i] - b[i]) @@ -77,7 +80,8 @@ cdef floating _euclidean_sparse_dense( result += b_squared_norm - if result < 0: result = 0.0 + if result < 0: + result = 0.0 return result if squared else sqrt(result) diff --git a/sklearn/cluster/_k_means_elkan.pyx b/sklearn/cluster/_k_means_elkan.pyx index c2919ac1d0012..60b2d080793db 100644 --- a/sklearn/cluster/_k_means_elkan.pyx +++ b/sklearn/cluster/_k_means_elkan.pyx @@ -154,7 +154,6 @@ def init_bounds_sparse( cdef: int n_samples = X.shape[0] int n_clusters = centers.shape[0] - int n_features = X.shape[1] floating[::1] X_data = X.data int[::1] X_indices = X.indices @@ -269,7 +268,7 @@ def elkan_iter_chunked_dense( int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples int n_chunks = n_samples // n_samples_chunk int n_samples_rem = n_samples % n_samples_chunk - int chunk_idx, n_samples_chunk_eff + int chunk_idx int start, end int i, j, k @@ -386,9 +385,11 @@ cdef void _update_chunk_dense( # If this holds, then center_index is a good candidate for the # sample to be relabelled, and we need to confirm this by # recomputing the upper and lower bounds. - if (j != label + if ( + j != label and (upper_bound > lower_bounds[i, j]) - and (upper_bound > center_half_distances[label, j])): + and (upper_bound > center_half_distances[label, j]) + ): # Recompute upper bound by calculating the actual distance # between the sample and its current assigned center. @@ -401,8 +402,10 @@ cdef void _update_chunk_dense( # If the condition still holds, then compute the actual # distance between the sample and center. If this is less # than the previous distance, reassign label. - if (upper_bound > lower_bounds[i, j] - or (upper_bound > center_half_distances[label, j])): + if ( + upper_bound > lower_bounds[i, j] + or (upper_bound > center_half_distances[label, j]) + ): distance = _euclidean_dense_dense( &X[i, 0], ¢ers_old[j, 0], n_features, False) @@ -504,7 +507,7 @@ def elkan_iter_chunked_sparse( int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples int n_chunks = n_samples // n_samples_chunk int n_samples_rem = n_samples % n_samples_chunk - int chunk_idx, n_samples_chunk_eff + int chunk_idx int start, end int i, j, k @@ -631,9 +634,11 @@ cdef void _update_chunk_sparse( # If this holds, then center_index is a good candidate for the # sample to be relabelled, and we need to confirm this by # recomputing the upper and lower bounds. - if (j != label + if ( + j != label and (upper_bound > lower_bounds[i, j]) - and (upper_bound > center_half_distances[label, j])): + and (upper_bound > center_half_distances[label, j]) + ): # Recompute upper bound by calculating the actual distance # between the sample and its current assigned center. @@ -648,8 +653,10 @@ cdef void _update_chunk_sparse( # If the condition still holds, then compute the actual # distance between the sample and center. If this is less # than the previous distance, reassign label. - if (upper_bound > lower_bounds[i, j] - or (upper_bound > center_half_distances[label, j])): + if ( + upper_bound > lower_bounds[i, j] + or (upper_bound > center_half_distances[label, j]) + ): distance = _euclidean_sparse_dense( X_data[X_indptr[i] - s: X_indptr[i + 1] - s], X_indices[X_indptr[i] - s: X_indptr[i + 1] - s], diff --git a/sklearn/cluster/_k_means_lloyd.pyx b/sklearn/cluster/_k_means_lloyd.pyx index 6ca50b2a1d0d9..664ec0da2cea2 100644 --- a/sklearn/cluster/_k_means_lloyd.pyx +++ b/sklearn/cluster/_k_means_lloyd.pyx @@ -87,7 +87,7 @@ def lloyd_iter_chunked_dense( int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples int n_chunks = n_samples // n_samples_chunk int n_samples_rem = n_samples % n_samples_chunk - int chunk_idx, n_samples_chunk_eff + int chunk_idx int start, end int j, k @@ -153,8 +153,9 @@ def lloyd_iter_chunked_dense( if update_centers: omp_destroy_lock(&lock) - _relocate_empty_clusters_dense(X, sample_weight, centers_old, - centers_new, weight_in_clusters, labels) + _relocate_empty_clusters_dense( + X, sample_weight, centers_old, centers_new, weight_in_clusters, labels + ) _average_centers(centers_new, weight_in_clusters) _center_shift(centers_old, centers_new, center_shift) @@ -278,7 +279,7 @@ def lloyd_iter_chunked_sparse( int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples int n_chunks = n_samples // n_samples_chunk int n_samples_rem = n_samples % n_samples_chunk - int chunk_idx, n_samples_chunk_eff = 0 + int chunk_idx int start = 0, end = 0 int j, k diff --git a/sklearn/linear_model/_cd_fast.pyx b/sklearn/linear_model/_cd_fast.pyx index 27396acbdc5e0..3b0b2251abf69 100644 --- a/sklearn/linear_model/_cd_fast.pyx +++ b/sklearn/linear_model/_cd_fast.pyx @@ -90,6 +90,7 @@ cdef floating diff_abs_max(int n, const floating* a, floating* b) noexcept nogil m = d return m + def enet_coordinate_descent( floating[::1] w, floating alpha, @@ -151,7 +152,6 @@ def enet_coordinate_descent( cdef floating const cdef floating A_norm2 cdef unsigned int ii - cdef unsigned int i cdef unsigned int n_iter = 0 cdef unsigned int f_iter cdef UINT32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX) @@ -207,9 +207,11 @@ def enet_coordinate_descent( w_max = fmax(w_max, fabs(w[ii])) - if (w_max == 0.0 or - d_w_max / w_max < d_w_tol or - n_iter == max_iter - 1): + if ( + w_max == 0.0 + or d_w_max / w_max < d_w_tol + or n_iter == max_iter - 1 + ): # the biggest coordinate update of this iteration was smaller # than the tolerance: check the duality gap as ultimate # stopping criterion @@ -605,7 +607,6 @@ def enet_coordinate_descent_gram( dtype = np.float64 # get the data information into easy vars - cdef unsigned int n_samples = y.shape[0] cdef unsigned int n_features = Q.shape[0] # initial value "Q w" which will be kept of up to date in the iterations @@ -637,9 +638,11 @@ def enet_coordinate_descent_gram( tol = tol * y_norm2 if alpha == 0: - warnings.warn("Coordinate descent without L1 regularization may " + warnings.warn( + "Coordinate descent without L1 regularization may " "lead to unexpected results and is discouraged. " - "Set l1_ratio > 0 to add L1 regularization.") + "Set l1_ratio > 0 to add L1 regularization." + ) with nogil: for n_iter in range(max_iter): @@ -715,9 +718,12 @@ def enet_coordinate_descent_gram( gap = R_norm2 # The call to asum is equivalent to the L1 norm of w - gap += (alpha * _asum(n_features, &w[0], 1) - - const * y_norm2 + const * q_dot_w + - 0.5 * beta * (1 + const ** 2) * w_norm2) + gap += ( + alpha * _asum(n_features, &w[0], 1) + - const * y_norm2 + + const * q_dot_w + + 0.5 * beta * (1 + const ** 2) * w_norm2 + ) if gap < tol: # return if we reached desired tolerance @@ -733,6 +739,7 @@ def enet_coordinate_descent_gram( return np.asarray(w), gap, tol, n_iter + 1 + def enet_coordinate_descent_multi_task( const floating[::1, :] W, floating l1_reg, @@ -806,8 +813,10 @@ def enet_coordinate_descent_multi_task( cdef const floating* Y_ptr = &Y[0, 0] if l1_reg == 0: - warnings.warn("Coordinate descent with l1_reg=0 may lead to unexpected" - " results and is discouraged.") + warnings.warn( + "Coordinate descent with l1_reg=0 may lead to unexpected" + " results and is discouraged." + ) with nogil: # norm_cols_X = (np.asarray(X) ** 2).sum(axis=0) @@ -920,7 +929,7 @@ def enet_coordinate_descent_multi_task( R_norm = _nrm2(n_samples * n_tasks, &R[0, 0], 1) w_norm = _nrm2(n_features * n_tasks, &W[0, 0], 1) if (dual_norm_XtA > l1_reg): - const = l1_reg / dual_norm_XtA + const = l1_reg / dual_norm_XtA A_norm = R_norm * const gap = 0.5 * (R_norm ** 2 + A_norm ** 2) else: @@ -935,8 +944,11 @@ def enet_coordinate_descent_multi_task( for ii in range(n_features): l21_norm += _nrm2(n_tasks, &W[0, ii], 1) - gap += l1_reg * l21_norm - const * ry_sum + \ - 0.5 * l2_reg * (1 + const ** 2) * (w_norm ** 2) + gap += ( + l1_reg * l21_norm + - const * ry_sum + + 0.5 * l2_reg * (1 + const ** 2) * (w_norm ** 2) + ) if gap < tol: # return if we reached desired tolerance From 95266a6985a1f8addcccbd8bb368ee1910d4aa0e Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Mon, 13 Mar 2023 20:57:58 +0100 Subject: [PATCH 02/14] done all but templates --- sklearn/_isotonic.pyx | 4 +- sklearn/_loss/_loss.pxd | 4 +- sklearn/datasets/_svmlight_format_fast.pyx | 17 ++-- sklearn/decomposition/_cdnmf_fast.pyx | 2 +- sklearn/ensemble/_gradient_boosting.pyx | 51 +++++------ .../_hist_gradient_boosting/_binning.pyx | 24 ++--- .../_hist_gradient_boosting/_bitset.pyx | 2 +- .../_hist_gradient_boosting/_predictor.pyx | 6 +- .../_hist_gradient_boosting/histogram.pyx | 5 +- .../_hist_gradient_boosting/splitting.pyx | 27 +++--- sklearn/manifold/_barnes_hut_tsne.pyx | 6 +- sklearn/manifold/_utils.pyx | 2 +- sklearn/metrics/_pairwise_fast.pyx | 2 +- .../cluster/_expected_mutual_info_fast.pyx | 9 +- sklearn/neighbors/_ball_tree.pyx | 2 +- sklearn/neighbors/_binary_tree.pxi | 89 +++++++++---------- sklearn/neighbors/_kd_tree.pyx | 2 +- sklearn/neighbors/_quad_tree.pyx | 39 ++++---- .../_csr_polynomial_expansion.pyx | 8 +- sklearn/svm/_liblinear.pyx | 5 +- sklearn/svm/_libsvm.pxi | 42 ++++----- sklearn/svm/_libsvm.pyx | 21 ++--- sklearn/svm/_libsvm_sparse.pyx | 55 ++++++------ sklearn/svm/_newrand.pyx | 2 + sklearn/tree/_criterion.pxd | 24 ++--- sklearn/tree/_splitter.pyx | 18 ++-- sklearn/tree/_tree.pyx | 45 +++++----- sklearn/tree/_utils.pxd | 4 +- sklearn/utils/_cython_blas.pyx | 2 +- sklearn/utils/_fast_dict.pyx | 20 ++--- sklearn/utils/_random.pxd | 3 +- sklearn/utils/_random.pyx | 12 ++- sklearn/utils/_typedefs.pxd | 2 +- sklearn/utils/_typedefs.pyx | 16 ++-- sklearn/utils/_vector_sentinel.pyx | 2 +- sklearn/utils/arrayfuncs.pyx | 52 +++++------ sklearn/utils/murmurhash.pxd | 14 +-- sklearn/utils/sparsefuncs_fast.pyx | 5 -- 38 files changed, 315 insertions(+), 330 deletions(-) diff --git a/sklearn/_isotonic.pyx b/sklearn/_isotonic.pyx index c3a6685eb8f90..e977df248f26f 100644 --- a/sklearn/_isotonic.pyx +++ b/sklearn/_isotonic.pyx @@ -75,15 +75,13 @@ def _make_unique(cnp.ndarray[dtype=floating] X, """ unique_values = len(np.unique(X)) - cdef cnp.ndarray[dtype=floating] y_out = np.empty(unique_values, - dtype=X.dtype) + cdef cnp.ndarray[dtype=floating] y_out = np.empty(unique_values, dtype=X.dtype) cdef cnp.ndarray[dtype=floating] x_out = np.empty_like(y_out) cdef cnp.ndarray[dtype=floating] weights_out = np.empty_like(y_out) cdef floating current_x = X[0] cdef floating current_y = 0 cdef floating current_weight = 0 - cdef floating y_old = 0 cdef int i = 0 cdef int j cdef floating x diff --git a/sklearn/_loss/_loss.pxd b/sklearn/_loss/_loss.pxd index 3aad078c0f3a1..7a1b610ea0980 100644 --- a/sklearn/_loss/_loss.pxd +++ b/sklearn/_loss/_loss.pxd @@ -14,8 +14,8 @@ ctypedef fused G_DTYPE_C: # Struct to return 2 doubles ctypedef struct double_pair: - double val1 - double val2 + double val1 + double val2 # C base class for loss functions diff --git a/sklearn/datasets/_svmlight_format_fast.pyx b/sklearn/datasets/_svmlight_format_fast.pyx index b578584e5ac47..31530ed55d251 100644 --- a/sklearn/datasets/_svmlight_format_fast.pyx +++ b/sklearn/datasets/_svmlight_format_fast.pyx @@ -113,6 +113,7 @@ def _load_svmlight_file(f, dtype, bint multilabel, bint zero_based, return (dtype, data, indices, indptr, labels, query) + # Two fused types are defined to be able to # use all possible combinations of parameters. ctypedef fused int_or_float: @@ -128,8 +129,9 @@ ctypedef fused int_or_longlong: cython.integral signed long long + def get_dense_row_string( - int_or_float[:,:] X, + int_or_float[:, :] X, Py_ssize_t[:] x_inds, double_or_longlong[:] x_vals, Py_ssize_t row, @@ -143,7 +145,7 @@ def get_dense_row_string( int_or_float val for k in range(row_length): - val = X[row,k] + val = X[row, k] if val == 0: continue x_inds[x_nz_used] = k @@ -157,6 +159,7 @@ def get_dense_row_string( return " ".join(reprs) + def get_sparse_row_string( int_or_float[:] X_data, int[:] X_indptr, @@ -176,6 +179,7 @@ def get_sparse_row_string( return " ".join(reprs) + def _dump_svmlight_file( X, y, @@ -211,8 +215,6 @@ def _dump_svmlight_file( Py_ssize_t j Py_ssize_t col_start Py_ssize_t col_end - bint first - Py_ssize_t x_nz_used Py_ssize_t[:] x_inds = np.empty(row_length, dtype=np.intp) signed long long[:] x_vals_int double[:] x_vals_float @@ -224,8 +226,6 @@ def _dump_svmlight_file( x_vals_float = np.zeros(row_length, dtype=np.float64) for i in range(x_len): - x_nz_used = 0 - if not X_is_sp: if X_is_integral: s = get_dense_row_string(X, x_inds, x_vals_int, i, value_pattern, one_based) @@ -234,18 +234,17 @@ def _dump_svmlight_file( else: s = get_sparse_row_string(X.data, X.indptr, X.indices, i, value_pattern, one_based) if multilabel: - first = True if y_is_sp: col_start = y.indptr[i] col_end = y.indptr[i+1] labels_str = ','.join(tuple(label_pattern % y.indices[j] for j in range(col_start, col_end) if y.data[j] != 0)) else: - labels_str = ','.join(label_pattern % j for j in range(num_labels) if y[i,j] != 0) + labels_str = ','.join(label_pattern % j for j in range(num_labels) if y[i, j] != 0) else: if y_is_sp: labels_str = label_pattern % y.data[i] else: - labels_str = label_pattern % y[i,0] + labels_str = label_pattern % y[i, 0] if query_id_is_not_empty: feat = (labels_str, query_id[i], s) diff --git a/sklearn/decomposition/_cdnmf_fast.pyx b/sklearn/decomposition/_cdnmf_fast.pyx index c50e09e1632c7..65db92171c75d 100644 --- a/sklearn/decomposition/_cdnmf_fast.pyx +++ b/sklearn/decomposition/_cdnmf_fast.pyx @@ -34,5 +34,5 @@ def _update_cdnmf_fast(floating[:, ::1] W, floating[:, :] HHt, if hess != 0: W[i, t] = max(W[i, t] - grad / hess, 0.) - + return violation diff --git a/sklearn/ensemble/_gradient_boosting.pyx b/sklearn/ensemble/_gradient_boosting.pyx index c738966e0332a..f310f2f25f7b1 100644 --- a/sklearn/ensemble/_gradient_boosting.pyx +++ b/sklearn/ensemble/_gradient_boosting.pyx @@ -82,6 +82,7 @@ cdef void _predict_regression_tree_inplace_fast_dense( node = root_node + node.right_child out[i, k] += scale * value[node - root_node] + def _predict_regression_tree_stages_sparse( object[:, :] estimators, object X, @@ -215,7 +216,7 @@ def predict_stages( n_features=X.shape[1], out=out ) - ## out[:, k] += scale * tree.predict(X).ravel() + # out[:, k] += scale * tree.predict(X).ravel() def predict_stage( @@ -240,34 +241,34 @@ def _random_sample_mask( cnp.npy_intp n_total_in_bag, random_state ): - """Create a random sample mask where ``n_total_in_bag`` elements are set. + """Create a random sample mask where ``n_total_in_bag`` elements are set. - Parameters - ---------- - n_total_samples : int - The length of the resulting mask. + Parameters + ---------- + n_total_samples : int + The length of the resulting mask. - n_total_in_bag : int - The number of elements in the sample mask which are set to 1. + n_total_in_bag : int + The number of elements in the sample mask which are set to 1. - random_state : RandomState - A numpy ``RandomState`` object. + random_state : RandomState + A numpy ``RandomState`` object. - Returns - ------- - sample_mask : np.ndarray, shape=[n_total_samples] - An ndarray where ``n_total_in_bag`` elements are set to ``True`` - the others are ``False``. - """ - cdef cnp.float64_t[::1] rand = random_state.uniform(size=n_total_samples) - cdef cnp.uint8_t[::1] sample_mask = np_zeros((n_total_samples,), dtype=bool) + Returns + ------- + sample_mask : np.ndarray, shape=[n_total_samples] + An ndarray where ``n_total_in_bag`` elements are set to ``True`` + the others are ``False``. + """ + cdef cnp.float64_t[::1] rand = random_state.uniform(size=n_total_samples) + cdef cnp.uint8_t[::1] sample_mask = np_zeros((n_total_samples,), dtype=bool) - cdef cnp.npy_intp n_bagged = 0 - cdef cnp.npy_intp i = 0 + cdef cnp.npy_intp n_bagged = 0 + cdef cnp.npy_intp i = 0 - for i in range(n_total_samples): - if rand[i] * (n_total_samples - i) < (n_total_in_bag - n_bagged): - sample_mask[i] = 1 - n_bagged += 1 + for i in range(n_total_samples): + if rand[i] * (n_total_samples - i) < (n_total_in_bag - n_bagged): + sample_mask[i] = 1 + n_bagged += 1 - return sample_mask.base + return sample_mask.base diff --git a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx index 5ba1527378d87..755a3cd69164b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx @@ -32,18 +32,22 @@ def _map_to_bins(const X_DTYPE_C [:, :] data, int feature_idx for feature_idx in range(data.shape[1]): - _map_col_to_bins(data[:, feature_idx], - binning_thresholds[feature_idx], - missing_values_bin_idx, - n_threads, - binned[:, feature_idx]) + _map_col_to_bins( + data[:, feature_idx], + binning_thresholds[feature_idx], + missing_values_bin_idx, + n_threads, + binned[:, feature_idx] + ) -cdef void _map_col_to_bins(const X_DTYPE_C [:] data, - const X_DTYPE_C [:] binning_thresholds, - const unsigned char missing_values_bin_idx, - int n_threads, - X_BINNED_DTYPE_C [:] binned): +cdef void _map_col_to_bins( + const X_DTYPE_C [:] data, + const X_DTYPE_C [:] binning_thresholds, + const unsigned char missing_values_bin_idx, + int n_threads, + X_BINNED_DTYPE_C [:] binned +): """Binary search to find the bin index for each value in the data.""" cdef: int i diff --git a/sklearn/ensemble/_hist_gradient_boosting/_bitset.pyx b/sklearn/ensemble/_hist_gradient_boosting/_bitset.pyx index e92d137366433..f658220c9f025 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_bitset.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_bitset.pyx @@ -12,7 +12,7 @@ from .common cimport X_BINNED_DTYPE_C # https://en.wikipedia.org/wiki/Bitwise_operation -cdef inline void init_bitset(BITSET_DTYPE_C bitset) noexcept nogil: # OUT +cdef inline void init_bitset(BITSET_DTYPE_C bitset) noexcept nogil: # OUT cdef: unsigned int i diff --git a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx index e9043909de4f5..d80d558f03be8 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx @@ -148,7 +148,8 @@ def _compute_partial_dependence( node_struct [:] nodes, const X_DTYPE_C [:, ::1] X, int [:] target_features, - Y_DTYPE_C [:] out): + Y_DTYPE_C [:] out +): """Partial dependence of the response on the ``target_features`` set. For each sample in ``X`` a tree traversal is performed. @@ -250,5 +251,4 @@ def _compute_partial_dependence( # Sanity check. Should never happen. if not (0.999 < total_weight < 1.001): - raise ValueError("Total weight should be 1.0 but was %.9f" % - total_weight) + raise ValueError("Total weight should be 1.0 but was %.9f" %total_weight) diff --git a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx index 1dbe64a4a5dfb..336ba372cb53a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx @@ -129,9 +129,7 @@ cdef class HistogramBuilder: int f_idx int i # need local views to avoid python interactions - unsigned char hessians_are_constant = \ - self.hessians_are_constant - int n_features = self.n_features + unsigned char hessians_are_constant = self.hessians_are_constant int n_allowed_features = self.n_features G_H_DTYPE_C [::1] ordered_gradients = self.ordered_gradients G_H_DTYPE_C [::1] gradients = self.gradients @@ -259,7 +257,6 @@ cdef class HistogramBuilder: cdef: int feature_idx int f_idx - int n_features = self.n_features int n_allowed_features = self.n_features hist_struct [:, ::1] histograms = np.empty( shape=(self.n_features, self.n_bins), diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index cdeb373350ed4..c1a3038467bc5 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -961,7 +961,7 @@ cdef class Splitter: for i in range(middle): sorted_cat_idx = i if direction == 1 else n_used_bins - 1 - i - bin_idx = cat_infos[sorted_cat_idx].bin_idx; + bin_idx = cat_infos[sorted_cat_idx].bin_idx n_samples_left += feature_hist[bin_idx].count n_samples_right = n_samples - n_samples_left @@ -975,18 +975,22 @@ cdef class Splitter: sum_gradient_left += feature_hist[bin_idx].sum_gradients sum_gradient_right = sum_gradients - sum_gradient_left - if (n_samples_left < self.min_samples_leaf or - sum_hessian_left < self.min_hessian_to_split): + if ( + n_samples_left < self.min_samples_leaf or + sum_hessian_left < self.min_hessian_to_split + ): continue - if (n_samples_right < self.min_samples_leaf or - sum_hessian_right < self.min_hessian_to_split): + if ( + n_samples_right < self.min_samples_leaf or + sum_hessian_right < self.min_hessian_to_split + ): break gain = _split_gain(sum_gradient_left, sum_hessian_left, - sum_gradient_right, sum_hessian_right, - loss_current_node, monotonic_cst, - lower_bound, upper_bound, - self.l2_regularization) + sum_gradient_right, sum_hessian_right, + loss_current_node, monotonic_cst, + lower_bound, upper_bound, + self.l2_regularization) if gain > best_gain and gain > self.min_gain_to_split: found_better_split = True best_gain = gain @@ -996,7 +1000,6 @@ cdef class Splitter: best_n_samples_left = n_samples_left best_direction = direction - if found_better_split: split_info.gain = best_gain @@ -1070,8 +1073,8 @@ cdef inline Y_DTYPE_C _split_gain( lower_bound, upper_bound, l2_regularization) value_right = compute_node_value(sum_gradient_right, sum_hessian_right, - lower_bound, upper_bound, - l2_regularization) + lower_bound, upper_bound, + l2_regularization) if ((monotonic_cst == MonotonicConstraint.POS and value_left > value_right) or (monotonic_cst == MonotonicConstraint.NEG and value_left < value_right)): diff --git a/sklearn/manifold/_barnes_hut_tsne.pyx b/sklearn/manifold/_barnes_hut_tsne.pyx index 4abf9f1c28805..5c8d0ded24ede 100644 --- a/sklearn/manifold/_barnes_hut_tsne.pyx +++ b/sklearn/manifold/_barnes_hut_tsne.pyx @@ -69,7 +69,7 @@ cdef float compute_gradient(float[:] val_P, if qt.verbose > 11: printf("[t-SNE] Allocating %li elements in force arrays\n", - n_samples * n_dimensions * 2) + n_samples * n_dimensions * 2) cdef float* neg_f = malloc(sizeof(float) * n_samples * n_dimensions) cdef float* pos_f = malloc(sizeof(float) * n_samples * n_dimensions) @@ -157,8 +157,7 @@ cdef float compute_gradient_positive(float[:] val_P, # only compute the error when needed if compute_error: qij = qij / sum_Q - C += pij * log(max(pij, FLOAT32_TINY) \ - / max(qij, FLOAT32_TINY)) + C += pij * log(max(pij, FLOAT32_TINY) / max(qij, FLOAT32_TINY)) for ax in range(n_dimensions): pos_f[i * n_dimensions + ax] += dij * buff[ax] @@ -198,7 +197,6 @@ cdef double compute_gradient_negative(float[:, :] pos_reference, clock_t t1 = 0, t2 = 0, t3 = 0 int take_timing = 1 if qt.verbose > 20 else 0 - with nogil, parallel(num_threads=num_threads): # Define thread-local buffers summary = malloc(sizeof(float) * n * offset) diff --git a/sklearn/manifold/_utils.pyx b/sklearn/manifold/_utils.pyx index d8a472ad4c5b5..c5a43db305640 100644 --- a/sklearn/manifold/_utils.pyx +++ b/sklearn/manifold/_utils.pyx @@ -3,7 +3,6 @@ import numpy as np cimport numpy as cnp - cdef extern from "numpy/npy_math.h": float NPY_INFINITY @@ -11,6 +10,7 @@ cdef extern from "numpy/npy_math.h": cdef float EPSILON_DBL = 1e-8 cdef float PERPLEXITY_TOLERANCE = 1e-5 + # TODO: have this function support float32 and float64 and preserve inputs' dtypes. def _binary_search_perplexity( const cnp.float32_t[:, :] sqdistances, diff --git a/sklearn/metrics/_pairwise_fast.pyx b/sklearn/metrics/_pairwise_fast.pyx index f7ddd68c46c1e..d5290d94679c9 100644 --- a/sklearn/metrics/_pairwise_fast.pyx +++ b/sklearn/metrics/_pairwise_fast.pyx @@ -31,7 +31,7 @@ def _chi2_kernel_fast(floating[:, :] X, denom = (X[i, k] - Y[j, k]) nom = (X[i, k] + Y[j, k]) if nom != 0: - res += denom * denom / nom + res += denom * denom / nom result[i, j] = -res diff --git a/sklearn/metrics/cluster/_expected_mutual_info_fast.pyx b/sklearn/metrics/cluster/_expected_mutual_info_fast.pyx index 633ace4b613c2..22c5e9773176c 100644 --- a/sklearn/metrics/cluster/_expected_mutual_info_fast.pyx +++ b/sklearn/metrics/cluster/_expected_mutual_info_fast.pyx @@ -15,7 +15,7 @@ def expected_mutual_information(contingency, cnp.int64_t n_samples): cnp.int64_t n_rows, n_cols cnp.float64_t term2, term3, gln cnp.int64_t[::1] a_view, b_view - cnp.float64_t[::1] nijs_view, term1 + cnp.float64_t[::1] term1 cnp.float64_t[::1] gln_a, gln_b, gln_Na, gln_Nb, gln_Nnij, log_Nnij cnp.float64_t[::1] log_a, log_b Py_ssize_t i, j, nij @@ -36,7 +36,6 @@ def expected_mutual_information(contingency, cnp.int64_t n_samples): # While nijs[0] will never be used, having it simplifies the indexing. nijs = np.arange(0, max(np.max(a), np.max(b)) + 1, dtype='float') nijs[0] = 1 # Stops divide by zero warnings. As its not used, no issue. - nijs_view = nijs # term1 is nij / N term1 = nijs / n_samples # term2 is log((N*nij) / (a * b)) == log(N * nij) - log(a * b) @@ -61,9 +60,9 @@ def expected_mutual_information(contingency, cnp.int64_t n_samples): term2 = log_Nnij[nij] - log_a[i] - log_b[j] # Numerators are positive, denominators are negative. gln = (gln_a[i] + gln_b[j] + gln_Na[i] + gln_Nb[j] - - gln_Nnij[nij] - lgamma(a_view[i] - nij + 1) - - lgamma(b_view[j] - nij + 1) - - lgamma(n_samples - a_view[i] - b_view[j] + nij + 1)) + - gln_Nnij[nij] - lgamma(a_view[i] - nij + 1) + - lgamma(b_view[j] - nij + 1) + - lgamma(n_samples - a_view[i] - b_view[j] + nij + 1)) term3 = exp(gln) emi += (term1[nij] * term2 * term3) return emi diff --git a/sklearn/neighbors/_ball_tree.pyx b/sklearn/neighbors/_ball_tree.pyx index 30b8376be9146..09891cb516527 100644 --- a/sklearn/neighbors/_ball_tree.pyx +++ b/sklearn/neighbors/_ball_tree.pyx @@ -25,7 +25,7 @@ cdef class BallTree(BinaryTree): pass -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # The functions below specialized the Binary Tree as a Ball Tree # # Note that these functions use the concept of "reduced distance". diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index 99ed4341ad155..15699b2450812 100644 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -205,8 +205,7 @@ NodeData = np.asarray((&nd_tmp)).dtype ###################################################################### # Define doc strings, substituting the appropriate class name using # the DOC_DICT variable defined in the pyx files. -CLASS_DOC = \ -""" +CLASS_DOC = """ {BinaryTree}(X, leaf_size=40, metric='minkowski', **kwargs) {BinaryTree} for fast generalized N-point problems @@ -411,7 +410,7 @@ cdef inline DTYPE_t compute_log_kernel(DTYPE_t dist, DTYPE_t h, return log_cosine_kernel(dist, h) -#------------------------------------------------------------ +# ------------------------------------------------------------ # Kernel norms are defined via the volume element V_n # and surface element S_(n-1) of an n-sphere. cdef DTYPE_t logVn(ITYPE_t n): @@ -567,7 +566,7 @@ cdef class NeighborsHeap: ) return 0 -#------------------------------------------------------------ +# ------------------------------------------------------------ # find_node_split_dim: # this computes the equivalent of # j_max = np.argmax(np.max(data, 0) - np.min(data, 0)) @@ -872,7 +871,6 @@ cdef class BinaryTree: self.sample_weight = None self.sum_weight = n_samples - def __reduce__(self): """ reduce method used for pickling @@ -1063,7 +1061,7 @@ cdef class BinaryTree: n_features, n_points) partition_node_indices(data, idx_array, i_max, n_mid, n_features, n_points) - self._recursive_build(node_data,2 * i_node + 1, + self._recursive_build(node_data, 2 * i_node + 1, idx_start, idx_start + n_mid) self._recursive_build(node_data, 2 * i_node + 2, idx_start + n_mid, idx_end) @@ -1362,7 +1360,7 @@ cdef class BinaryTree: # deflatten results return indices_npy.reshape(X.shape[:X.ndim - 1]) - except: + except MemoryError: # free any buffer that is not owned by a numpy array for i in range(Xarr.shape[0]): free(indices[i]) @@ -1373,7 +1371,6 @@ cdef class BinaryTree: free(indices) free(distances) - def kernel_density(self, X, h, kernel='gaussian', atol=0, rtol=1E-8, breadth_first=True, return_log=False): @@ -1596,13 +1593,13 @@ cdef class BinaryTree: cdef DTYPE_t* data = &self.data[0, 0] - #------------------------------------------------------------ + # ------------------------------------------------------------ # Case 1: query point is outside node radius: # trim it from the query if reduced_dist_LB > heap.largest(i_pt): self.n_trims += 1 - #------------------------------------------------------------ + # ------------------------------------------------------------ # Case 2: this is a leaf node. Update set of nearby points elif node_info.is_leaf: self.n_leaves += 1 @@ -1612,7 +1609,7 @@ cdef class BinaryTree: self.data.shape[1]) heap._push(i_pt, dist_pt, self.idx_array[i]) - #------------------------------------------------------------ + # ------------------------------------------------------------ # Case 3: Node is not a leaf. Recursively query subnodes # starting with the closest else: @@ -1657,13 +1654,13 @@ cdef class BinaryTree: i_node = nodeheap_item.i1 node_info = node_data[i_node] - #------------------------------------------------------------ + # ------------------------------------------------------------ # Case 1: query point is outside node radius: # trim it from the query if reduced_dist_LB > heap.largest(i_pt): self.n_trims += 1 - #------------------------------------------------------------ + # ------------------------------------------------------------ # Case 2: this is a leaf node. Update set of nearby points elif node_data[i_node].is_leaf: self.n_leaves += 1 @@ -1674,7 +1671,7 @@ cdef class BinaryTree: self.data.shape[1]) heap._push(i_pt, dist_pt, self.idx_array[i]) - #------------------------------------------------------------ + # ------------------------------------------------------------ # Case 3: Node is not a leaf. Add subnodes to the node heap else: self.n_splits += 1 @@ -1703,13 +1700,13 @@ cdef class BinaryTree: cdef DTYPE_t bound_max, dist_pt, reduced_dist_LB1, reduced_dist_LB2 cdef ITYPE_t i1, i2, i_pt, i_parent - #------------------------------------------------------------ + # ------------------------------------------------------------ # Case 1: nodes are further apart than the current bound: # trim both from the query if reduced_dist_LB > bounds[i_node2]: pass - #------------------------------------------------------------ + # ------------------------------------------------------------ # Case 2: both nodes are leaves: # do a brute-force search comparing all pairs elif node_info1.is_leaf and node_info2.is_leaf: @@ -1743,7 +1740,7 @@ cdef class BinaryTree: else: break - #------------------------------------------------------------ + # ------------------------------------------------------------ # Case 3a: node 1 is a leaf or is smaller: split node 2 and # recursively query, starting with the nearest subnode elif node_info1.is_leaf or (not node_info2.is_leaf @@ -1764,7 +1761,7 @@ cdef class BinaryTree: self._query_dual_depthfirst(i_node1, other, 2 * i_node2 + 1, bounds, heap, reduced_dist_LB1) - #------------------------------------------------------------ + # ------------------------------------------------------------ # Case 3b: node 2 is a leaf or is smaller: split node 1 and # recursively query, starting with the nearest subnode else: @@ -1815,13 +1812,13 @@ cdef class BinaryTree: node_info1 = node_data1[i_node1] node_info2 = node_data2[i_node2] - #------------------------------------------------------------ + # ------------------------------------------------------------ # Case 1: nodes are further apart than the current bound: # trim both from the query if reduced_dist_LB > bounds[i_node2]: pass - #------------------------------------------------------------ + # ------------------------------------------------------------ # Case 2: both nodes are leaves: # do a brute-force search comparing all pairs elif node_info1.is_leaf and node_info2.is_leaf: @@ -1844,7 +1841,7 @@ cdef class BinaryTree: bounds[i_node2] = fmax(bounds[i_node2], heap.largest(i_pt)) - #------------------------------------------------------------ + # ------------------------------------------------------------ # Case 3a: node 1 is a leaf or is smaller: split node 2 and # recursively query, starting with the nearest subnode elif node_info1.is_leaf or (not node_info2.is_leaf @@ -1857,7 +1854,7 @@ cdef class BinaryTree: other, i2) nodeheap.push(nodeheap_item) - #------------------------------------------------------------ + # ------------------------------------------------------------ # Case 3b: node 2 is a leaf or is smaller: split node 1 and # recursively query, starting with the nearest subnode else: @@ -1889,13 +1886,13 @@ cdef class BinaryTree: cdef DTYPE_t dist_pt, dist_LB = 0, dist_UB = 0 min_max_dist(self, i_node, pt, &dist_LB, &dist_UB) - #------------------------------------------------------------ + # ------------------------------------------------------------ # Case 1: all node points are outside distance r. # prune this branch. if dist_LB > r: pass - #------------------------------------------------------------ + # ------------------------------------------------------------ # Case 2: all node points are within distance r # add all points to neighbors elif dist_UB <= r: @@ -1912,7 +1909,7 @@ cdef class BinaryTree: n_features) count += 1 - #------------------------------------------------------------ + # ------------------------------------------------------------ # Case 3: this is a leaf node. Go through all points to # determine if they fall within radius elif node_info.is_leaf: @@ -1933,7 +1930,7 @@ cdef class BinaryTree: self.dist_metric._rdist_to_dist(dist_pt) count += 1 - #------------------------------------------------------------ + # ------------------------------------------------------------ # Case 4: Node is not a leaf. Recursively query subnodes else: count = self._query_radius_single(2 * i_node + 1, pt, r, @@ -2015,21 +2012,21 @@ cdef class BinaryTree: else: N1 = node_info.idx_end - node_info.idx_start - #------------------------------------------------------------ + # ------------------------------------------------------------ # Case 1: local bounds are equal to within per-point tolerance. if (log_knorm + node_log_bound_spreads[i_node] - log(N1) + log(N) <= logaddexp(log_atol, (log_rtol + log_knorm + node_log_min_bounds[i_node]))): pass - #------------------------------------------------------------ + # ------------------------------------------------------------ # Case 2: global bounds are within rtol & atol. elif (log_knorm + global_log_bound_spread <= logaddexp(log_atol, log_rtol + log_knorm + global_log_min_bound)): break - #------------------------------------------------------------ + # ------------------------------------------------------------ # Case 3: node is a leaf. Count contributions from all points elif node_info.is_leaf: global_log_min_bound =\ @@ -2049,7 +2046,7 @@ cdef class BinaryTree: global_log_min_bound = logaddexp(global_log_min_bound, log_density + log_weight) - #------------------------------------------------------------ + # ------------------------------------------------------------ # Case 4: split node and query subnodes else: i1 = 2 * i_node + 1 @@ -2145,28 +2142,30 @@ cdef class BinaryTree: cdef DTYPE_t dist_UB = 0, dist_LB = 0 if with_sample_weight: - N1 = _total_node_weight(node_data, sample_weight, - idx_array, i_node) + N1 = _total_node_weight(node_data, sample_weight, + idx_array, i_node) N2 = self.sum_weight else: N1 = (node_info.idx_end - node_info.idx_start) N2 = self.data.shape[0] - #------------------------------------------------------------ + # ------------------------------------------------------------ # Case 1: local bounds are equal to within errors. Return - if (log_knorm + local_log_bound_spread - log(N1) + log(N2) - <= logaddexp(log_atol, (log_rtol + log_knorm - + local_log_min_bound))): + if ( + log_knorm + local_log_bound_spread - log(N1) + log(N2) + <= logaddexp(log_atol, (log_rtol + log_knorm + local_log_min_bound)) + ): pass - #------------------------------------------------------------ + # ------------------------------------------------------------ # Case 2: global bounds are within rtol & atol. Return - elif (log_knorm + global_log_bound_spread[0] - <= logaddexp(log_atol, (log_rtol + log_knorm - + global_log_min_bound[0]))): + elif ( + log_knorm + global_log_bound_spread[0] + <= logaddexp(log_atol, (log_rtol + log_knorm + global_log_min_bound[0])) + ): pass - #------------------------------------------------------------ + # ------------------------------------------------------------ # Case 3: node is a leaf. Count contributions from all points elif node_info.is_leaf: global_log_min_bound[0] = logsubexp(global_log_min_bound[0], @@ -2185,7 +2184,7 @@ cdef class BinaryTree: (log_dens_contribution + log_weight)) - #------------------------------------------------------------ + # ------------------------------------------------------------ # Case 4: split node and query subnodes else: i1 = 2 * i_node + 1 @@ -2259,7 +2258,7 @@ cdef class BinaryTree: cdef DTYPE_t dist_pt, dist_LB = 0, dist_UB = 0 min_max_dist(self, i_node, pt, &dist_LB, &dist_UB) - #------------------------------------------------------------ + # ------------------------------------------------------------ # Go through bounds and check for cuts while i_min < i_max: if dist_LB > r[i_min]: @@ -2314,7 +2313,7 @@ cdef class BinaryTree: dist_LB = min_dist_dual(self, i_node1, other, i_node2) dist_UB = max_dist_dual(self, i_node1, other, i_node2) - #------------------------------------------------------------ + # ------------------------------------------------------------ # Go through bounds and check for cuts while i_min < i_max: if dist_LB > r[i_min]: @@ -2358,7 +2357,7 @@ cdef class BinaryTree: r, count, i_min, i_max) else: - # neither is a leaf: split & query both + # neither is a leaf: split & query both for i1 in range(2 * i_node1 + 1, 2 * i_node1 + 3): for i2 in range(2 * i_node2 + 1, 2 * i_node2 + 3): self._two_point_dual(i1, other, i2, diff --git a/sklearn/neighbors/_kd_tree.pyx b/sklearn/neighbors/_kd_tree.pyx index a5db18b4ad772..d9843c2f42da0 100644 --- a/sklearn/neighbors/_kd_tree.pyx +++ b/sklearn/neighbors/_kd_tree.pyx @@ -18,7 +18,7 @@ cdef class KDTree(BinaryTree): pass -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # The functions below specialized the Binary Tree as a KD Tree # # Note that these functions use the concept of "reduced distance". diff --git a/sklearn/neighbors/_quad_tree.pyx b/sklearn/neighbors/_quad_tree.pyx index cce804b1724a5..05498a650a39b 100644 --- a/sklearn/neighbors/_quad_tree.pyx +++ b/sklearn/neighbors/_quad_tree.pyx @@ -29,7 +29,7 @@ cdef extern from "numpy/arrayobject.h": # This works by casting `dummy` to an array of Cell of length 1, which numpy # can construct a `dtype`-object for. See https://stackoverflow.com/q/62448946 # for a more detailed explanation. -cdef Cell dummy; +cdef Cell dummy CELL_DTYPE = np.asarray((&dummy)).dtype assert CELL_DTYPE.itemsize == sizeof(Cell) @@ -177,9 +177,9 @@ cdef class _QuadTree: return self.insert_point(point, point_index, cell_id) # XXX: This operation is not Thread safe - cdef SIZE_t _insert_point_in_new_child(self, DTYPE_t[3] point, Cell* cell, - SIZE_t point_index, SIZE_t size=1 - ) noexcept nogil: + cdef SIZE_t _insert_point_in_new_child( + self, DTYPE_t[3] point, Cell* cell, SIZE_t point_index, SIZE_t size=1 + ) noexcept nogil: """Create a child of cell which will contain point.""" # Local variable definition @@ -204,7 +204,7 @@ cdef class _QuadTree: # Get an empty cell and initialize it cell_id = self.cell_count self.cell_count += 1 - child = &self.cells[cell_id] + child = &self.cells[cell_id] self._init_cell(child, cell.cell_id, cell.depth + 1) child.cell_id = cell_id @@ -247,7 +247,6 @@ cdef class _QuadTree: return cell_id - cdef bint _is_duplicate(self, DTYPE_t[3] point1, DTYPE_t[3] point2) noexcept nogil: """Check if the two given points are equals.""" cdef int i @@ -257,7 +256,6 @@ cdef class _QuadTree: res &= fabsf(point1[i] - point2[i]) <= EPSILON return res - cdef SIZE_t _select_child(self, DTYPE_t[3] point, Cell* cell) noexcept nogil: """Select the child of cell which contains the given query point.""" cdef: @@ -308,17 +306,17 @@ cdef class _QuadTree: if self.verbose >= 50: if self.n_dimensions == 3: printf("[QuadTree] Checking point (%f, %f, %f) in cell %li " - "([%f/%f, %f/%f, %f/%f], size %li)\n", - point[0], point[1], point[2], cell.cell_id, - cell.min_bounds[0], cell.max_bounds[0], cell.min_bounds[1], - cell.max_bounds[1], cell.min_bounds[2], cell.max_bounds[2], - cell.cumulative_size) + "([%f/%f, %f/%f, %f/%f], size %li)\n", + point[0], point[1], point[2], cell.cell_id, + cell.min_bounds[0], cell.max_bounds[0], cell.min_bounds[1], + cell.max_bounds[1], cell.min_bounds[2], cell.max_bounds[2], + cell.cumulative_size) else: printf("[QuadTree] Checking point (%f, %f) in cell %li " - "([%f/%f, %f/%f], size %li)\n", - point[0], point[1],cell.cell_id, cell.min_bounds[0], - cell.max_bounds[0], cell.min_bounds[1], - cell.max_bounds[1], cell.cumulative_size) + "([%f/%f, %f/%f], size %li)\n", + point[0], point[1], cell.cell_id, cell.min_bounds[0], + cell.max_bounds[0], cell.min_bounds[1], + cell.max_bounds[1], cell.cumulative_size) for i in range(self.n_dimensions): if (cell.min_bounds[i] > point[i] or @@ -491,8 +489,7 @@ cdef class _QuadTree: def __reduce__(self): """Reduce re-implementation, for pickling.""" - return (_QuadTree, (self.n_dimensions, self.verbose), - self.__getstate__()) + return (_QuadTree, (self.n_dimensions, self.verbose), self.__getstate__()) def __getstate__(self): """Getstate re-implementation, for pickling.""" @@ -528,13 +525,12 @@ cdef class _QuadTree: raise MemoryError("resizing tree to %d" % self.capacity) cdef Cell[:] cell_mem_view = cell_ndarray - cells = memcpy( + memcpy( pto=self.cells, pfrom=&cell_mem_view[0], size=self.capacity * sizeof(Cell), ) - # Array manipulation methods, to convert it to numpy or to resize # self.cells array @@ -606,10 +602,9 @@ cdef class _QuadTree: # Used for testing summarize cdef: DTYPE_t[:] summary - int n_samples, n_dimensions + int n_samples n_samples = X.shape[0] - n_dimensions = X.shape[1] summary = np.empty(4 * n_samples, dtype=np.float32) idx = self.summarize(&query_pt[0], &summary[0], angle * angle) diff --git a/sklearn/preprocessing/_csr_polynomial_expansion.pyx b/sklearn/preprocessing/_csr_polynomial_expansion.pyx index 784968c463953..e2cff65f07972 100644 --- a/sklearn/preprocessing/_csr_polynomial_expansion.pyx +++ b/sklearn/preprocessing/_csr_polynomial_expansion.pyx @@ -131,9 +131,8 @@ def _csr_polynomial_expansion( shape=num_rows + 1, dtype=np.int32 ) - cnp.int32_t expanded_index = 0, row_starts, row_ends, i, j, k, \ - i_ptr, j_ptr, k_ptr, num_cols_in_row, \ - expanded_column + cnp.int32_t expanded_index = 0, row_starts, row_ends + cnp.int32_t i, j, k, i_ptr, j_ptr, k_ptr, num_cols_in_row with nogil: expanded_indptr[0] = indptr[0] @@ -154,8 +153,7 @@ def _csr_polynomial_expansion( num_cols_in_row += 1 else: # degree == 3 - for k_ptr in range(j_ptr + interaction_only, - row_ends): + for k_ptr in range(j_ptr + interaction_only, row_ends): k = indices[k_ptr] col = _deg3_column(d, i, j, k, interaction_only) expanded_indices[expanded_index] = col diff --git a/sklearn/svm/_liblinear.pyx b/sklearn/svm/_liblinear.pyx index 3bd1fcc3e6c94..900439b65cea4 100644 --- a/sklearn/svm/_liblinear.pyx +++ b/sklearn/svm/_liblinear.pyx @@ -111,7 +111,7 @@ def train_wrap( with nogil: model = train(problem, param, &blas_functions) - ### FREE + # FREE free_problem(problem) free_parameter(param) # destroy_param(param) don't call this or it will destroy class_weight_label and class_weight @@ -127,7 +127,8 @@ def train_wrap( get_n_iter(model, &n_iter[0]) cdef int nr_feature = get_nr_feature(model) - if bias > 0: nr_feature = nr_feature + 1 + if bias > 0: + nr_feature = nr_feature + 1 if nr_class == 2 and solver_type != 4: # solver is not Crammer-Singer w = np.empty((1, nr_feature), order='F') copy_w(&w[0, 0], model, nr_feature) diff --git a/sklearn/svm/_libsvm.pxi b/sklearn/svm/_libsvm.pxi index 8f1250d884687..efe138f1cfd8f 100644 --- a/sklearn/svm/_libsvm.pxi +++ b/sklearn/svm/_libsvm.pxi @@ -12,29 +12,29 @@ cdef extern from "svm.h": cdef struct svm_parameter: int svm_type int kernel_type - int degree # for poly - double gamma # for poly/rbf/sigmoid - double coef0 # for poly/sigmoid + int degree # for poly + double gamma # for poly/rbf/sigmoid + double coef0 # for poly/sigmoid # these are for training only - double cache_size # in MB - double eps # stopping criteria - double C # for C_SVC, EPSILON_SVR and NU_SVR - int nr_weight # for C_SVC - int *weight_label # for C_SVC - double* weight # for C_SVC - double nu # for NU_SVC, ONE_CLASS, and NU_SVR - double p # for EPSILON_SVR - int shrinking # use the shrinking heuristics - int probability # do probability estimates - int max_iter # ceiling on Solver runtime - int random_seed # seed for random generator in probability estimation + double cache_size # in MB + double eps # stopping criteria + double C # for C_SVC, EPSILON_SVR and NU_SVR + int nr_weight # for C_SVC + int *weight_label # for C_SVC + double* weight # for C_SVC + double nu # for NU_SVC, ONE_CLASS, and NU_SVR + double p # for EPSILON_SVR + int shrinking # use the shrinking heuristics + int probability # do probability estimates + int max_iter # ceiling on Solver runtime + int random_seed # seed for random generator in probability estimation cdef struct svm_problem: int l double *y svm_node *x - double *W # instance weights + double *W # instance weights char *svm_check_parameter(svm_problem *, svm_parameter *) svm_model *svm_train(svm_problem *, svm_parameter *, int *, BlasFunctions *) nogil @@ -46,14 +46,14 @@ cdef extern from "libsvm_helper.c": # this file contains methods for accessing libsvm 'hidden' fields svm_node **dense_to_sparse (char *, cnp.npy_intp *) void set_parameter (svm_parameter *, int , int , int , double, double , - double , double , double , double, - double, int, int, int, char *, char *, int, - int) + double , double , double , double, + double, int, int, int, char *, char *, int, + int) void set_problem (svm_problem *, char *, char *, char *, cnp.npy_intp *, int) svm_model *set_model (svm_parameter *, int, char *, cnp.npy_intp *, - char *, cnp.npy_intp *, cnp.npy_intp *, char *, - char *, char *, char *, char *) + char *, cnp.npy_intp *, cnp.npy_intp *, char *, + char *, char *, char *, char *) void copy_sv_coef (char *, svm_model *) void copy_n_iter (char *, svm_model *) diff --git a/sklearn/svm/_libsvm.pyx b/sklearn/svm/_libsvm.pyx index 9bda1d14331d7..45c746164b4da 100644 --- a/sklearn/svm/_libsvm.pyx +++ b/sklearn/svm/_libsvm.pyx @@ -167,14 +167,11 @@ def fit( cdef svm_model *model cdef const char *error_msg cdef cnp.npy_intp SV_len - cdef cnp.npy_intp nr - if len(sample_weight) == 0: sample_weight = np.ones(X.shape[0], dtype=np.float64) else: - assert ( - sample_weight.shape[0] == X.shape[0], + assert sample_weight.shape[0] == X.shape[0], ( f"sample_weight and X have incompatible shapes: sample_weight has " f"{sample_weight.shape[0]} samples while X has {X.shape[0]}" ) @@ -228,7 +225,7 @@ def fit( # from here until the end, we just copy the data returned by # svm_train - SV_len = get_l(model) + SV_len = get_l(model) n_class = get_nr(model) cdef int[::1] n_iter = np.empty(max(1, n_class * (n_class - 1) // 2), dtype=np.intc) @@ -270,7 +267,7 @@ def fit( cdef cnp.float64_t[::1] probA cdef cnp.float64_t[::1] probB if probability != 0: - if svm_type < 2: # SVC and NuSVC + if svm_type < 2: # SVC and NuSVC probA = np.empty(int(n_class*(n_class-1)/2), dtype=np.float64) probB = np.empty(int(n_class*(n_class-1)/2), dtype=np.float64) copy_probB( &probB[0], model, probB.shape) @@ -451,7 +448,7 @@ def predict( ) cdef BlasFunctions blas_functions blas_functions.dot = _dot[double] - #TODO: use check_model + # TODO: use check_model try: dec_values = np.empty(X.shape[0]) with nogil: @@ -605,7 +602,7 @@ def predict_proba( def decision_function( - const cnp.float64_t[:,::1] X, + const cnp.float64_t[:, ::1] X, const cnp.int32_t[::1] support, const cnp.float64_t[:, ::1] SV, const cnp.int32_t[::1] nSV, @@ -841,16 +838,12 @@ def cross_validation( cdef svm_parameter param cdef svm_problem problem - cdef svm_model *model cdef const char *error_msg - cdef cnp.npy_intp SV_len - cdef cnp.npy_intp nr if len(sample_weight) == 0: sample_weight = np.ones(X.shape[0], dtype=np.float64) else: - assert ( - sample_weight.shape[0] == X.shape[0], + assert sample_weight.shape[0] == X.shape[0], ( f"sample_weight and X have incompatible shapes: sample_weight has " f"{sample_weight.shape[0]} samples while X has {X.shape[0]}" ) @@ -896,7 +889,7 @@ def cross_validation( random_seed, ) - error_msg = svm_check_parameter(&problem, ¶m); + error_msg = svm_check_parameter(&problem, ¶m) if error_msg: raise ValueError(error_msg) diff --git a/sklearn/svm/_libsvm_sparse.pyx b/sklearn/svm/_libsvm_sparse.pyx index 37619f399988c..330b71d32bb55 100644 --- a/sklearn/svm/_libsvm_sparse.pyx +++ b/sklearn/svm/_libsvm_sparse.pyx @@ -26,14 +26,14 @@ cdef extern from "svm.h": cdef extern from "libsvm_sparse_helper.c": # this file contains methods for accessing libsvm 'hidden' fields - svm_csr_problem * csr_set_problem (char *, cnp.npy_intp *, - char *, cnp.npy_intp *, char *, char *, char *, int ) + svm_csr_problem * csr_set_problem ( + char *, cnp.npy_intp *, char *, cnp.npy_intp *, char *, char *, char *, int) svm_csr_model *csr_set_model(svm_parameter *param, int nr_class, - char *SV_data, cnp.npy_intp *SV_indices_dims, - char *SV_indices, cnp.npy_intp *SV_intptr_dims, - char *SV_intptr, - char *sv_coef, char *rho, char *nSV, - char *probA, char *probB) + char *SV_data, cnp.npy_intp *SV_indices_dims, + char *SV_indices, cnp.npy_intp *SV_intptr_dims, + char *SV_intptr, + char *sv_coef, char *rho, char *nSV, + char *probA, char *probB) svm_parameter *set_parameter (int , int , int , double, double , double , double , double , double, double, int, int, int, char *, char *, int, @@ -44,20 +44,20 @@ cdef extern from "libsvm_sparse_helper.c": void copy_intercept (char *, svm_csr_model *, cnp.npy_intp *) int copy_predict (char *, svm_csr_model *, cnp.npy_intp *, char *, BlasFunctions *) int csr_copy_predict_values (cnp.npy_intp *data_size, char *data, cnp.npy_intp *index_size, - char *index, cnp.npy_intp *intptr_size, char *size, - svm_csr_model *model, char *dec_values, int nr_class, BlasFunctions *) + char *index, cnp.npy_intp *intptr_size, char *size, + svm_csr_model *model, char *dec_values, int nr_class, BlasFunctions *) int csr_copy_predict (cnp.npy_intp *data_size, char *data, cnp.npy_intp *index_size, - char *index, cnp.npy_intp *intptr_size, char *size, - svm_csr_model *model, char *dec_values, BlasFunctions *) nogil + char *index, cnp.npy_intp *intptr_size, char *size, + svm_csr_model *model, char *dec_values, BlasFunctions *) nogil int csr_copy_predict_proba (cnp.npy_intp *data_size, char *data, cnp.npy_intp *index_size, - char *index, cnp.npy_intp *intptr_size, char *size, - svm_csr_model *model, char *dec_values, BlasFunctions *) nogil + char *index, cnp.npy_intp *intptr_size, char *size, + svm_csr_model *model, char *dec_values, BlasFunctions *) nogil int copy_predict_values(char *, svm_csr_model *, cnp.npy_intp *, char *, int, BlasFunctions *) int csr_copy_SV (char *values, cnp.npy_intp *n_indices, - char *indices, cnp.npy_intp *n_indptr, char *indptr, - svm_csr_model *model, int n_features) - cnp.npy_intp get_nonzero_SV ( svm_csr_model *) + char *indices, cnp.npy_intp *n_indptr, char *indptr, + svm_csr_model *model, int n_features) + cnp.npy_intp get_nonzero_SV (svm_csr_model *) void copy_nSV (char *, svm_csr_model *) void copy_probA (char *, svm_csr_model *, cnp.npy_intp *) void copy_probB (char *, svm_csr_model *, cnp.npy_intp *) @@ -160,7 +160,7 @@ def libsvm_sparse_train (int n_features, # check parameters if (param == NULL or problem == NULL): raise MemoryError("Seems we've run out of memory") - error_msg = svm_csr_check_parameter(problem, param); + error_msg = svm_csr_check_parameter(problem, param) if error_msg: free_problem(problem) free_param(param) @@ -217,7 +217,8 @@ def libsvm_sparse_train (int n_features, n_features, ) support_vectors_ = sparse.csr_matrix( - (SV_data, SV_indices, SV_indptr), (SV_len, n_features)) + (SV_data, SV_indices, SV_indptr), (SV_len, n_features) + ) # copy model.nSV # TODO: do only in classification @@ -228,7 +229,7 @@ def libsvm_sparse_train (int n_features, # # copy probabilities cdef cnp.float64_t[::1] probA, probB if probability != 0: - if svm_type < 2: # SVC and NuSVC + if svm_type < 2: # SVC and NuSVC probA = np.empty(n_class*(n_class-1)//2, dtype=np.float64) probB = np.empty(n_class*(n_class-1)//2, dtype=np.float64) copy_probB( &probB[0], model, probB.shape) @@ -309,7 +310,7 @@ def libsvm_sparse_predict (const cnp.float64_t[::1] T_data, gamma, coef0, nu, - 100.0, # cache size has no effect on predict + 100.0, # cache size has no effect on predict C, eps, p, @@ -319,7 +320,7 @@ def libsvm_sparse_predict (const cnp.float64_t[::1] T_data, &class_weight_label[0] if class_weight_label.size > 0 else NULL, &class_weight[0] if class_weight.size > 0 else NULL, -1, - -1, # random seed has no effect on predict either + -1, # random seed has no effect on predict either ) model = csr_set_model( @@ -335,7 +336,7 @@ def libsvm_sparse_predict (const cnp.float64_t[::1] T_data, &probA[0] if probA.size > 0 else NULL, &probB[0] if probB.size > 0 else NULL, ) - #TODO: use check_model + # TODO: use check_model dec_values = np.empty(T_indptr.shape[0]-1) cdef BlasFunctions blas_functions blas_functions.dot = _dot[double] @@ -393,7 +394,7 @@ def libsvm_sparse_predict_proba( gamma, coef0, nu, - 100.0, # cache size has no effect on predict + 100.0, # cache size has no effect on predict C, eps, p, @@ -403,7 +404,7 @@ def libsvm_sparse_predict_proba( &class_weight_label[0] if class_weight_label.size > 0 else NULL, &class_weight[0] if class_weight.size > 0 else NULL, -1, - -1, # random seed has no effect on predict either + -1, # random seed has no effect on predict either ) model = csr_set_model( @@ -420,7 +421,7 @@ def libsvm_sparse_predict_proba( &probA[0] if probA.size > 0 else NULL, &probB[0] if probB.size > 0 else NULL, ) - #TODO: use check_model + # TODO: use check_model cdef cnp.npy_intp n_class = get_nr(model) cdef int rv dec_values = np.empty((T_indptr.shape[0]-1, n_class), dtype=np.float64) @@ -447,8 +448,6 @@ def libsvm_sparse_predict_proba( return dec_values.base - - def libsvm_sparse_decision_function( const cnp.float64_t[::1] T_data, const cnp.int32_t[::1] T_indices, @@ -487,7 +486,7 @@ def libsvm_sparse_decision_function( gamma, coef0, nu, - 100.0, # cache size has no effect on predict + 100.0, # cache size has no effect on predict C, eps, p, diff --git a/sklearn/svm/_newrand.pyx b/sklearn/svm/_newrand.pyx index 2b1523079279b..af543ed73286a 100644 --- a/sklearn/svm/_newrand.pyx +++ b/sklearn/svm/_newrand.pyx @@ -4,8 +4,10 @@ cdef extern from "newrand.h": void set_seed(unsigned int) unsigned int bounded_rand_int(unsigned int) + def set_seed_wrap(unsigned int custom_seed): set_seed(custom_seed) + def bounded_rand_int_wrap(unsigned int range_): return bounded_rand_int(range_) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 47f616c6bad50..1addca40f239b 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -21,21 +21,21 @@ cdef class Criterion: # such as the mean in regression and class probabilities in classification. # Internal structures - cdef const DOUBLE_t[:, ::1] y # Values of y - cdef const DOUBLE_t[:] sample_weight # Sample weights + cdef const DOUBLE_t[:, ::1] y # Values of y + cdef const DOUBLE_t[:] sample_weight # Sample weights - cdef const SIZE_t[:] sample_indices # Sample indices in X, y - cdef SIZE_t start # samples[start:pos] are the samples in the left node - cdef SIZE_t pos # samples[pos:end] are the samples in the right node + cdef const SIZE_t[:] sample_indices # Sample indices in X, y + cdef SIZE_t start # samples[start:pos] are the samples in the left node + cdef SIZE_t pos # samples[pos:end] are the samples in the right node cdef SIZE_t end - cdef SIZE_t n_outputs # Number of outputs - cdef SIZE_t n_samples # Number of samples - cdef SIZE_t n_node_samples # Number of samples in the node (end-start) - cdef double weighted_n_samples # Weighted number of samples (in total) - cdef double weighted_n_node_samples # Weighted number of samples in the node - cdef double weighted_n_left # Weighted number of samples in the left node - cdef double weighted_n_right # Weighted number of samples in the right node + cdef SIZE_t n_outputs # Number of outputs + cdef SIZE_t n_samples # Number of samples + cdef SIZE_t n_node_samples # Number of samples in the node (end-start) + cdef double weighted_n_samples # Weighted number of samples (in total) + cdef double weighted_n_node_samples # Weighted number of samples in the node + cdef double weighted_n_left # Weighted number of samples in the left node + cdef double weighted_n_right # Weighted number of samples in the right node # The criterion object is maintained such that left and right collected # statistics correspond to samples[start:pos] and samples[pos:end]. diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 83a80d90cc1b9..c7bfb21a24c3c 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -417,13 +417,13 @@ cdef inline int node_split_best( # by the values in feature_values. Algorithm: Introsort (Musser, SP&E, 1997). cdef inline void sort(DTYPE_t* feature_values, SIZE_t* samples, SIZE_t n) noexcept nogil: if n == 0: - return + return cdef int maxd = 2 * log(n) introsort(feature_values, samples, n, maxd) cdef inline void swap(DTYPE_t* feature_values, SIZE_t* samples, - SIZE_t i, SIZE_t j) noexcept nogil: + SIZE_t i, SIZE_t j) noexcept nogil: # Helper for sort feature_values[i], feature_values[j] = feature_values[j], feature_values[i] samples[i], samples[j] = samples[j], samples[i] @@ -831,6 +831,7 @@ cdef class DensePartitioner: partition_end -= 1 samples[p], samples[partition_end] = samples[partition_end], samples[p] + @final cdef class SparsePartitioner: """Partitioner specialized for sparse CSC data. @@ -903,8 +904,11 @@ cdef class SparsePartitioner: # Sort the positive and negative parts of `feature_values` sort(&feature_values[self.start], &samples[self.start], self.end_negative - self.start) if self.start_positive < self.end: - sort(&feature_values[self.start_positive], &samples[self.start_positive], - self.end - self.start_positive) + sort( + &feature_values[self.start_positive], + &samples[self.start_positive], + self.end - self.start_positive + ) # Update index_to_samples to take into account the sort for p in range(self.start, self.end_negative): @@ -1146,7 +1150,6 @@ cdef inline void extract_nnz_index_to_samples(const INT32_t[::1] X_indices, index = index_to_samples[X_indices[k]] sparse_swap(index_to_samples, samples, index, start_positive_) - elif X_data[k] < 0: feature_values[end_negative_] = X_data[k] index = index_to_samples[X_indices[k]] @@ -1209,7 +1212,7 @@ cdef inline void extract_nnz_binary_search(const INT32_t[::1] X_indices, sorted_samples[p], &k, &indptr_start) if k != -1: - # If k != -1, we have found a non zero value + # If k != -1, we have found a non zero value if X_data[k] > 0: start_positive_ -= 1 @@ -1217,7 +1220,6 @@ cdef inline void extract_nnz_binary_search(const INT32_t[::1] X_indices, index = index_to_samples[X_indices[k]] sparse_swap(index_to_samples, samples, index, start_positive_) - elif X_data[k] < 0: feature_values[end_negative_] = X_data[k] index = index_to_samples[X_indices[k]] @@ -1233,7 +1235,7 @@ cdef inline void extract_nnz_binary_search(const INT32_t[::1] X_indices, cdef inline void sparse_swap(SIZE_t[::1] index_to_samples, SIZE_t[::1] samples, SIZE_t pos_1, SIZE_t pos_2) noexcept nogil: """Swap sample pos_1 and pos_2 preserving sparse invariant.""" - samples[pos_1], samples[pos_2] = samples[pos_2], samples[pos_1] + samples[pos_1], samples[pos_2] = samples[pos_2], samples[pos_1] index_to_samples[samples[pos_1]] = pos_1 index_to_samples[samples[pos_2]] = pos_2 diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 75eed058bfd4e..46b6816a0fe54 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -76,7 +76,7 @@ cdef SIZE_t _TREE_UNDEFINED = TREE_UNDEFINED # This works by casting `dummy` to an array of Node of length 1, which numpy # can construct a `dtype`-object for. See https://stackoverflow.com/q/62448946 # for a more detailed explanation. -cdef Node dummy; +cdef Node dummy NODE_DTYPE = np.asarray((&dummy)).dtype # ============================================================================= @@ -124,11 +124,14 @@ cdef class TreeBuilder: if y.base.dtype != DOUBLE or not y.base.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) - if (sample_weight is not None and - (sample_weight.base.dtype != DOUBLE or - not sample_weight.base.flags.contiguous)): - sample_weight = np.asarray(sample_weight, dtype=DOUBLE, - order="C") + if ( + sample_weight is not None and + ( + sample_weight.base.dtype != DOUBLE or + not sample_weight.base.flags.contiguous + ) + ): + sample_weight = np.asarray(sample_weight, dtype=DOUBLE, order="C") return X, y, sample_weight @@ -708,10 +711,10 @@ cdef class Tree: if self._resize_c(self.capacity) != 0: raise MemoryError("resizing tree to %d" % self.capacity) - nodes = memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), - self.capacity * sizeof(Node)) - value = memcpy(self.value, cnp.PyArray_DATA(value_ndarray), - self.capacity * self.value_stride * sizeof(double)) + memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), + self.capacity * sizeof(Node)) + memcpy(self.value, cnp.PyArray_DATA(value_ndarray), + self.capacity * self.value_stride * sizeof(double)) cdef int _resize(self, SIZE_t capacity) except -1 nogil: """Resize all inner arrays to `capacity`, if `capacity` == -1, then @@ -863,8 +866,8 @@ cdef class Tree: # Extract input cdef const DTYPE_t[:] X_data = X.data - cdef const INT32_t[:] X_indices = X.indices - cdef const INT32_t[:] X_indptr = X.indptr + cdef const INT32_t[:] X_indices = X.indices + cdef const INT32_t[:] X_indptr = X.indptr cdef SIZE_t n_samples = X.shape[0] cdef SIZE_t n_features = X.shape[1] @@ -991,8 +994,8 @@ cdef class Tree: # Extract input cdef const DTYPE_t[:] X_data = X.data - cdef const INT32_t[:] X_indices = X.indices - cdef const INT32_t[:] X_indptr = X.indptr + cdef const INT32_t[:] X_indices = X.indices + cdef const INT32_t[:] X_indptr = X.indptr cdef SIZE_t n_samples = X.shape[0] cdef SIZE_t n_features = X.shape[1] @@ -1468,7 +1471,7 @@ cdef struct CostComplexityPruningRecord: SIZE_t node_idx SIZE_t parent -cdef _cost_complexity_prune(unsigned char[:] leaves_in_subtree, # OUT +cdef _cost_complexity_prune(unsigned char[:] leaves_in_subtree, # OUT Tree orig_tree, _CCPPruneController controller): """Perform cost complexity pruning. @@ -1595,7 +1598,7 @@ cdef _cost_complexity_prune(unsigned char[:] leaves_in_subtree, # OUT node_indices_stack.pop() if not in_subtree[node_idx]: - continue # branch has already been marked for pruning + continue # branch has already been marked for pruning candidate_nodes[node_idx] = 0 leaves_in_subtree[node_idx] = 0 in_subtree[node_idx] = 0 @@ -1628,9 +1631,10 @@ cdef _cost_complexity_prune(unsigned char[:] leaves_in_subtree, # OUT def _build_pruned_tree_ccp( - Tree tree, # OUT + Tree tree, # OUT Tree orig_tree, - DOUBLE_t ccp_alpha): + DOUBLE_t ccp_alpha +): """Build a pruned tree from the original tree using cost complexity pruning. @@ -1714,10 +1718,11 @@ cdef struct BuildPrunedRecord: bint is_left cdef _build_pruned_tree( - Tree tree, # OUT + Tree tree, # OUT Tree orig_tree, const unsigned char[:] leaves_in_subtree, - SIZE_t capacity): + SIZE_t capacity +): """Build a pruned tree. Build a pruned tree from the original tree by transforming the nodes in diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index b8f2b71513bd2..4938d3030245f 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -93,9 +93,7 @@ cdef class WeightedMedianCalculator: cdef WeightedPQueue samples cdef DOUBLE_t total_weight cdef SIZE_t k - cdef DOUBLE_t sum_w_0_k # represents sum(weights[0:k]) - # = w[0] + w[1] + ... + w[k-1] - + cdef DOUBLE_t sum_w_0_k # represents sum(weights[0:k]) = w[0] + w[1] + ... + w[k-1] cdef SIZE_t size(self) noexcept nogil cdef int push(self, DOUBLE_t data, DOUBLE_t weight) except -1 nogil cdef int reset(self) except -1 nogil diff --git a/sklearn/utils/_cython_blas.pyx b/sklearn/utils/_cython_blas.pyx index 8969dafb3c0aa..c242e59e1b9de 100644 --- a/sklearn/utils/_cython_blas.pyx +++ b/sklearn/utils/_cython_blas.pyx @@ -165,7 +165,7 @@ cdef void _ger(BLAS_Order order, int m, int n, floating alpha, dger(&n, &m, &alpha, y, &incy, x, &incx, A, &lda) else: if floating is float: - sger(&m, &n, &alpha, x, &incx, y, &incy, A, &lda) + sger(&m, &n, &alpha, x, &incx, y, &incy, A, &lda) else: dger(&m, &n, &alpha, x, &incx, y, &incy, A, &lda) diff --git a/sklearn/utils/_fast_dict.pyx b/sklearn/utils/_fast_dict.pyx index 5fe642b14c626..a1190dc13db02 100644 --- a/sklearn/utils/_fast_dict.pyx +++ b/sklearn/utils/_fast_dict.pyx @@ -12,11 +12,11 @@ from libcpp.map cimport map as cpp_map import numpy as np -#DTYPE = np.float64 -#ctypedef cnp.float64_t DTYPE_t +# DTYPE = np.float64 +# ctypedef cnp.float64_t DTYPE_t -#ITYPE = np.intp -#ctypedef cnp.intp_t ITYPE_t +# ITYPE = np.intp +# ctypedef cnp.intp_t ITYPE_t ############################################################################### # An object to be used in Python @@ -55,12 +55,12 @@ cdef class IntFloatDict: # Cython 0.20 generates buggy code below. Commenting this out for now # and relying on the to_arrays method - #def __iter__(self): - # cdef cpp_map[ITYPE_t, DTYPE_t].iterator it = self.my_map.begin() - # cdef cpp_map[ITYPE_t, DTYPE_t].iterator end = self.my_map.end() - # while it != end: - # yield deref(it).first, deref(it).second - # inc(it) + # def __iter__(self): + # cdef cpp_map[ITYPE_t, DTYPE_t].iterator it = self.my_map.begin() + # cdef cpp_map[ITYPE_t, DTYPE_t].iterator end = self.my_map.end() + # while it != end: + # yield deref(it).first, deref(it).second + # inc(it) def __iter__(self): cdef int size = self.my_map.size() diff --git a/sklearn/utils/_random.pxd b/sklearn/utils/_random.pxd index 9bd7b9666f490..89741ea38179c 100644 --- a/sklearn/utils/_random.pxd +++ b/sklearn/utils/_random.pxd @@ -26,7 +26,8 @@ cpdef sample_without_replacement(cnp.int_t n_population, cdef inline UINT32_t our_rand_r(UINT32_t* seed) nogil: """Generate a pseudo-random np.uint32 from a np.uint32 seed""" # seed shouldn't ever be 0. - if (seed[0] == 0): seed[0] = DEFAULT_SEED + if (seed[0] == 0): + seed[0] = DEFAULT_SEED seed[0] ^= (seed[0] << 13) seed[0] ^= (seed[0] >> 17) diff --git a/sklearn/utils/_random.pyx b/sklearn/utils/_random.pyx index 3589ffd2fdc4c..deb17639ed5da 100644 --- a/sklearn/utils/_random.pyx +++ b/sklearn/utils/_random.pyx @@ -133,10 +133,8 @@ cpdef _sample_without_replacement_with_pool(cnp.int_t n_population, cdef cnp.int_t i cdef cnp.int_t j - cdef cnp.int_t[:] out = np.empty((n_samples, ), dtype=int) - - cdef cnp.int_t[:] pool = np.empty((n_population, ), - dtype=int) + cdef cnp.int_t[:] out = np.empty((n_samples,), dtype=int) + cdef cnp.int_t[:] pool = np.empty((n_population,), dtype=int) rng = check_random_state(random_state) rng_randint = rng.randint @@ -150,8 +148,7 @@ cpdef _sample_without_replacement_with_pool(cnp.int_t n_population, for i in range(n_samples): j = rng_randint(n_population - i) # invariant: non-selected at [0,n-i) out[i] = pool[j] - pool[j] = pool[n_population - i - 1] # move non-selected item into - # vacancy + pool[j] = pool[n_population - i - 1] # move non-selected item into vacancy return np.asarray(out) @@ -159,7 +156,8 @@ cpdef _sample_without_replacement_with_pool(cnp.int_t n_population, cpdef _sample_without_replacement_with_reservoir_sampling( cnp.int_t n_population, cnp.int_t n_samples, - random_state=None): + random_state=None +): """Sample integers without replacement. Select n_samples integers from the set [0, n_population) without diff --git a/sklearn/utils/_typedefs.pxd b/sklearn/utils/_typedefs.pxd index 9298fad89a762..41bdac61198d9 100644 --- a/sklearn/utils/_typedefs.pxd +++ b/sklearn/utils/_typedefs.pxd @@ -15,7 +15,7 @@ cimport numpy as cnp # receive values from a numpy array of dtype np.float64, the type float64_t must be # used. # -# TODO: Stop defining custom types locally or globally like DTYPE_t and friends and +# TODO: Stop defining custom types locally or globally like DTYPE_t and friends and # use these consistently throughout the codebase. # NOTE: Extend this list as needed when converting more cython extensions. ctypedef unsigned char bool_t diff --git a/sklearn/utils/_typedefs.pyx b/sklearn/utils/_typedefs.pyx index 49d0e46101b4f..3038124ad12aa 100644 --- a/sklearn/utils/_typedefs.pyx +++ b/sklearn/utils/_typedefs.pyx @@ -7,16 +7,16 @@ from libc.math cimport sqrt # use a hack to determine the associated numpy data types # NOTE: the following requires the buffer interface, only available in # numpy 1.5+. We'll choose the DTYPE by hand instead. -#cdef ITYPE_t idummy -#cdef ITYPE_t[:] idummy_view = &idummy -#ITYPE = np.asarray(idummy_view).dtype +# cdef ITYPE_t idummy +# cdef ITYPE_t[:] idummy_view = &idummy +# ITYPE = np.asarray(idummy_view).dtype ITYPE = np.intp # WARNING: this should match ITYPE_t in typedefs.pxd -INT32TYPE = np.int32 # WARNING: should match INT32TYPE_t in typedefs.pyx -INT64TYPE = np.int64 # WARNING: this should match INT64TYPE_t in typedefs.pxd +INT32TYPE = np.int32 # WARNING: should match INT32TYPE_t in typedefs.pyx +INT64TYPE = np.int64 # WARNING: this should match INT64TYPE_t in typedefs.pxd -#cdef DTYPE_t ddummy -#cdef DTYPE_t[:] ddummy_view = &ddummy -#DTYPE = np.asarray(ddummy_view).dtype +# cdef DTYPE_t ddummy +# cdef DTYPE_t[:] ddummy_view = &ddummy +# DTYPE = np.asarray(ddummy_view).dtype DTYPE = np.float64 # WARNING: this should match DTYPE_t in typedefs.pxd # WARNING: this must match SPARSE_INDEX_TYPE_t in typedefs.pxd diff --git a/sklearn/utils/_vector_sentinel.pyx b/sklearn/utils/_vector_sentinel.pyx index 45c48de9dac68..4ec99a293e013 100644 --- a/sklearn/utils/_vector_sentinel.pyx +++ b/sklearn/utils/_vector_sentinel.pyx @@ -107,7 +107,7 @@ cdef class StdVectorSentinelInt64(StdVectorSentinel): cdef cnp.ndarray vector_to_nd_array(vector_typed * vect_ptr): cdef: cnp.npy_intp size = deref(vect_ptr).size() - StdVectorSentinel sentinel = _create_sentinel(vect_ptr) + StdVectorSentinel sentinel = _create_sentinel(vect_ptr) cnp.ndarray arr = cnp.PyArray_SimpleNewFromData( 1, &size, sentinel.get_typenum(), sentinel.get_data()) diff --git a/sklearn/utils/arrayfuncs.pyx b/sklearn/utils/arrayfuncs.pyx index 4cdb6d6145e38..251f69e3ee3f0 100644 --- a/sklearn/utils/arrayfuncs.pyx +++ b/sklearn/utils/arrayfuncs.pyx @@ -31,34 +31,34 @@ def min_pos(const floating[:] X): # # TODO: put transpose as an option def cholesky_delete(const floating[:, :] L, int go_out): - cdef: - int n = L.shape[0] - int m = L.strides[0] - floating c, s - floating *L1 - int i + cdef: + int n = L.shape[0] + int m = L.strides[0] + floating c, s + floating *L1 + int i - if floating is float: - m /= sizeof(float) - else: - m /= sizeof(double) + if floating is float: + m /= sizeof(float) + else: + m /= sizeof(double) - # delete row go_out - L1 = &L[0, 0] + (go_out * m) - for i in range(go_out, n-1): - _copy(i + 2, L1 + m, 1, L1, 1) - L1 += m + # delete row go_out + L1 = &L[0, 0] + (go_out * m) + for i in range(go_out, n-1): + _copy(i + 2, L1 + m, 1, L1, 1) + L1 += m - L1 = &L[0, 0] + (go_out * m) - for i in range(go_out, n-1): - _rotg(L1 + i, L1 + i + 1, &c, &s) - if L1[i] < 0: - # Diagonals cannot be negative - L1[i] = fabs(L1[i]) - c = -c - s = -s + L1 = &L[0, 0] + (go_out * m) + for i in range(go_out, n-1): + _rotg(L1 + i, L1 + i + 1, &c, &s) + if L1[i] < 0: + # Diagonals cannot be negative + L1[i] = fabs(L1[i]) + c = -c + s = -s - L1[i + 1] = 0. # just for cleanup - L1 += m + L1[i + 1] = 0. # just for cleanup + L1 += m - _rot(n - i - 2, L1 + i, m, L1 + i + 1, m, c, s) + _rot(n - i - 2, L1 + i, m, L1 + i + 1, m, c, s) diff --git a/sklearn/utils/murmurhash.pxd b/sklearn/utils/murmurhash.pxd index 2e40e925b0e06..1844be154b39d 100644 --- a/sklearn/utils/murmurhash.pxd +++ b/sklearn/utils/murmurhash.pxd @@ -4,15 +4,15 @@ cimport numpy as cnp # The C API is disabled for now, since it requires -I flags to get # compilation to work even when these functions are not used. -#cdef extern from "MurmurHash3.h": -# void MurmurHash3_x86_32(void* key, int len, unsigned int seed, -# void* out) -# -# void MurmurHash3_x86_128(void* key, int len, unsigned int seed, +# cdef extern from "MurmurHash3.h": +# void MurmurHash3_x86_32(void* key, int len, unsigned int seed, # void* out) # -# void MurmurHash3_x64_128(void* key, int len, unsigned int seed, -# void* out) +# void MurmurHash3_x86_128(void* key, int len, unsigned int seed, +# void* out) +# +# void MurmurHash3_x64_128(void* key, int len, unsigned int seed, +# void* out) cpdef cnp.uint32_t murmurhash3_int_u32(int key, unsigned int seed) diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index a2e0089250a6d..7f093fc5c10c7 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -6,8 +6,6 @@ # # License: BSD 3 clause -#!python - from libc.math cimport fabs, sqrt cimport numpy as cnp import numpy as np @@ -42,7 +40,6 @@ def _sqeuclidean_row_norms_sparse( cdef: integral n_samples = X_indptr.shape[0] - 1 integral i, j - double sum_ dtype = np.float32 if floating is float else np.float64 @@ -501,7 +498,6 @@ def _inplace_csr_row_normalize_l1( ): cdef: unsigned long long n_samples = shape[0] - unsigned long long n_features = shape[1] # the column indices for row i are stored in: # indices[indptr[i]:indices[i+1]] @@ -539,7 +535,6 @@ def _inplace_csr_row_normalize_l2( ): cdef: unsigned long long n_samples = shape[0] - unsigned long long n_features = shape[1] unsigned long long i integral j double sum_ From 9becad866fccc6a10f77517a159180c0ec6e2259 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Mon, 13 Mar 2023 21:05:58 +0100 Subject: [PATCH 03/14] done this time --- sklearn/utils/murmurhash.pyx | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/murmurhash.pyx b/sklearn/utils/murmurhash.pyx index e6613883b88f8..9e8c9891f23a9 100644 --- a/sklearn/utils/murmurhash.pyx +++ b/sklearn/utils/murmurhash.pyx @@ -124,11 +124,9 @@ def murmurhash3_32(key, seed=0, positive=False): raise TypeError( "key.dtype should be int32, got %s" % key.dtype) if positive: - return _murmurhash3_bytes_array_u32(key.ravel(), - seed).reshape(key.shape) + return _murmurhash3_bytes_array_u32(key.ravel(), seed).reshape(key.shape) else: - return _murmurhash3_bytes_array_s32(key.ravel(), - seed).reshape(key.shape) + return _murmurhash3_bytes_array_s32(key.ravel(), seed).reshape(key.shape) else: raise TypeError( "key %r with type %s is not supported. " From 273d513a31ef6a9c9684072c6be4197ac6d1712d Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Tue, 14 Mar 2023 12:49:41 +0100 Subject: [PATCH 04/14] last ones --- sklearn/neighbors/_quad_tree.pxd | 4 ++-- sklearn/tree/_splitter.pxd | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/neighbors/_quad_tree.pxd b/sklearn/neighbors/_quad_tree.pxd index 59008968280f6..71c4c3071344c 100644 --- a/sklearn/neighbors/_quad_tree.pxd +++ b/sklearn/neighbors/_quad_tree.pxd @@ -31,12 +31,12 @@ cdef struct Cell: # Cell description SIZE_t cell_id # Id of the cell in the cells array in the Tree SIZE_t point_index # Index of the point at this cell (only defined - # in non empty leaf) + # # in non empty leaf) bint is_leaf # Does this cell have children? DTYPE_t squared_max_width # Squared value of the maximum width w SIZE_t depth # Depth of the cell in the tree SIZE_t cumulative_size # Number of points included in the subtree with - # this cell as a root. + # # this cell as a root. # Internal constants DTYPE_t[3] center # Store the center for quick split of cells diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 13fec5974c3c5..4758731bdfce8 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -21,8 +21,8 @@ cdef struct SplitRecord: # Data to track sample split SIZE_t feature # Which feature to split on. SIZE_t pos # Split samples array at the given position, - # i.e. count of samples below threshold for feature. - # pos is >= end if the node is a leaf. + # # i.e. count of samples below threshold for feature. + # # pos is >= end if the node is a leaf. double threshold # Threshold to split at. double improvement # Impurity improvement given parent node. double impurity_left # Impurity of the left split. From d002019af433839b525330859a1bae69e76233f0 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Wed, 15 Mar 2023 13:24:49 +0100 Subject: [PATCH 05/14] remove outdated ignored flake8 errors --- setup.cfg | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/setup.cfg b/setup.cfg index 081e78c92d480..90c278490bf06 100644 --- a/setup.cfg +++ b/setup.cfg @@ -33,18 +33,8 @@ target-version = ['py37'] ignore= # check ignored by default in flake8. Meaning unclear. E24, - # continuation line under-indented - E121, - # closing bracket does not match indentation - E123, - # continuation line over-indented for hanging indent - E126, # space before : (needed for how black formats slicing) E203, - # missing whitespace around arithmetic operator - E226, - # multiple statements on one line (def) - E704, # do not assign a lambda expression, use a def E731, # do not use variables named 'l', 'O', or 'I' From ba3979f300001e174b0e3afa19b388f3ff6342d3 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Wed, 15 Mar 2023 13:32:20 +0100 Subject: [PATCH 06/14] cython-lint in CI --- .pre-commit-config.yaml | 4 ++++ azure-pipelines.yml | 2 +- build_tools/linting.sh | 3 +++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e5a6018df4473..f8cf13fceb052 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -20,3 +20,7 @@ repos: - id: mypy files: sklearn/ additional_dependencies: [pytest==6.2.4] +- repo: https://github.com/MarcoGorelli/cython-lint + rev: v0.14.0 + hooks: + - id: cython-lint diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 353a893f43ef3..3d983c11e3c94 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -35,7 +35,7 @@ jobs: - bash: | source build_tools/shared.sh # Include pytest compatibility with mypy - pip install pytest flake8 $(get_dep mypy min) $(get_dep black min) + pip install pytest flake8 $(get_dep mypy min) $(get_dep black min) cython-lint displayName: Install linters - bash: | ./build_tools/linting.sh diff --git a/build_tools/linting.sh b/build_tools/linting.sh index 84d0414190300..d538dd4fdeadd 100755 --- a/build_tools/linting.sh +++ b/build_tools/linting.sh @@ -13,6 +13,9 @@ echo -e "No problem detected by flake8\n" mypy sklearn/ echo -e "No problem detected by mypy\n" +cython-lint --max-line-length=999 --ignore=E24,E203,E731,E741,W503,W504 . +echo -e "No problem detected by cython-lint\n" + # For docstrings and warnings of deprecated attributes to be rendered # properly, the property decorator must come before the deprecated decorator # (else they are treated as functions) From 920df1b7deefc1119eac0c0dce054d0fef940af0 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Wed, 15 Mar 2023 14:46:03 +0100 Subject: [PATCH 07/14] args in pre-commit --- .pre-commit-config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f8cf13fceb052..da9673a8190b8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -24,3 +24,4 @@ repos: rev: v0.14.0 hooks: - id: cython-lint + args: [--max-line-length=999, --ignore=E24,E203,E731,E741,W503,W504] From c59f028c35ffbe3b308cd21f50b6e7e73fed5cad Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Wed, 15 Mar 2023 15:34:49 +0100 Subject: [PATCH 08/14] fix circle ci --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index e2f54c0665c78..ea22d8b28832f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -11,7 +11,7 @@ jobs: command: | source build_tools/shared.sh # Include pytest compatibility with mypy - pip install pytest flake8 $(get_dep mypy min) $(get_dep black min) + pip install pytest flake8 $(get_dep mypy min) $(get_dep black min) cython-lint - run: name: linting command: ./build_tools/linting.sh From 182b965d6038675df92bc8aca9292471f209c342 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Tue, 21 Mar 2023 12:33:07 +0100 Subject: [PATCH 09/14] address review comments --- .pre-commit-config.yaml | 7 ++++++- build_tools/linting.sh | 5 ++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index da9673a8190b8..8fd6037884c4d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -23,5 +23,10 @@ repos: - repo: https://github.com/MarcoGorelli/cython-lint rev: v0.14.0 hooks: + # TODO: add the double-quote-cython-strings hook when it's usability has improved: + # possibility to pass a directory and use it as a check instead of auto-formatter. - id: cython-lint - args: [--max-line-length=999, --ignore=E24,E203,E731,E741,W503,W504] + # The list for `--ignore` has to be maintained with the one in setup.cfg except + # for E501 (line too long) because keeping it < 88 in cython often makes code + # less readable + args: [--ignore=E24,E203,E501,E731,E741,W503,W504] diff --git a/build_tools/linting.sh b/build_tools/linting.sh index d538dd4fdeadd..cdf66b115138c 100755 --- a/build_tools/linting.sh +++ b/build_tools/linting.sh @@ -13,7 +13,10 @@ echo -e "No problem detected by flake8\n" mypy sklearn/ echo -e "No problem detected by mypy\n" -cython-lint --max-line-length=999 --ignore=E24,E203,E731,E741,W503,W504 . +# The list for `--ignore` has to be maintained with the one in setup.cfg except +# for E501 (line too long) because keeping it < 88 in cython often makes code +# less readable +cython-lint --ignore=E24,E203,E501,E731,E741,W503,W504 . echo -e "No problem detected by cython-lint\n" # For docstrings and warnings of deprecated attributes to be rendered From d14ed546268fbc0c5d1958f7aa922a643bbeeccf Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Mon, 3 Apr 2023 16:22:38 +0200 Subject: [PATCH 10/14] config pyproject.toml --- pyproject.toml | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 298eded146149..9c396d885cd7c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,3 +38,44 @@ exclude = ''' | asv_benchmarks/env )/ ''' + +[tool.cython-lint] +ignore = [ + # check ignored by default in flake8. Meaning unclear. + 'E24', + # space before : (needed for how black formats slicing) + 'E203', + # line too long + 'E501', + # do not assign a lambda expression, use a def + 'E731', + # do not use variables named 'l', 'O', or 'I' + 'E741', + # line break before binary operator + 'W503', + # line break after binary operator + 'W504', +] +exclude= ''' +( + sklearn/_loss/_loss.pyx + | sklearn/linear_model/_sag_fast.pyx + | sklearn/utils/_seq_dataset.pyx + | sklearn/utils/_seq_dataset.pxd + | sklearn/utils/_weight_vector.pyx + | sklearn/utils/_weight_vector.pxd + | sklearn/metrics/_dist_metrics.pyx + | sklearn/metrics/_dist_metrics.pxd + | sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd + | sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx + | sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx + | sklearn/metrics/_pairwise_distances_reduction/_base.pxd + | sklearn/metrics/_pairwise_distances_reduction/_base.pyx + | sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd + | sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx + | sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd + | sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx + | sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd + | sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx +) +''' From 213b9917ac1676e36163bad8cc5c970b50d5b84d Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Mon, 3 Apr 2023 16:48:50 +0200 Subject: [PATCH 11/14] update --- .pre-commit-config.yaml | 6 +----- build_tools/linting.sh | 5 +---- pyproject.toml | 2 ++ 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8fd6037884c4d..6811277b5e5c0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,12 +21,8 @@ repos: files: sklearn/ additional_dependencies: [pytest==6.2.4] - repo: https://github.com/MarcoGorelli/cython-lint - rev: v0.14.0 + rev: v0.15.0 hooks: # TODO: add the double-quote-cython-strings hook when it's usability has improved: # possibility to pass a directory and use it as a check instead of auto-formatter. - id: cython-lint - # The list for `--ignore` has to be maintained with the one in setup.cfg except - # for E501 (line too long) because keeping it < 88 in cython often makes code - # less readable - args: [--ignore=E24,E203,E501,E731,E741,W503,W504] diff --git a/build_tools/linting.sh b/build_tools/linting.sh index cdf66b115138c..bf1d7145b6848 100755 --- a/build_tools/linting.sh +++ b/build_tools/linting.sh @@ -13,10 +13,7 @@ echo -e "No problem detected by flake8\n" mypy sklearn/ echo -e "No problem detected by mypy\n" -# The list for `--ignore` has to be maintained with the one in setup.cfg except -# for E501 (line too long) because keeping it < 88 in cython often makes code -# less readable -cython-lint --ignore=E24,E203,E501,E731,E741,W503,W504 . +cython-lint sklearn/ echo -e "No problem detected by cython-lint\n" # For docstrings and warnings of deprecated attributes to be rendered diff --git a/pyproject.toml b/pyproject.toml index 9c396d885cd7c..8f7b1161239d4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ exclude = ''' ''' [tool.cython-lint] +# Ignore the same error codes as flake8 ignore = [ # check ignored by default in flake8. Meaning unclear. 'E24', @@ -56,6 +57,7 @@ ignore = [ # line break after binary operator 'W504', ] +# Exclude files are generated from tempita templates exclude= ''' ( sklearn/_loss/_loss.pyx From 2f8842640bd72e862a429c4b33ced6defaf89fe8 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Mon, 3 Apr 2023 16:53:38 +0200 Subject: [PATCH 12/14] exclude newly templated sgd_fast --- pyproject.toml | 1 + setup.cfg | 1 + 2 files changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index c646048205f2b..9d512da53e163 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,6 +62,7 @@ exclude= ''' ( sklearn/_loss/_loss.pyx | sklearn/linear_model/_sag_fast.pyx + | sklearn/linear_model/_sgd_fast.pyx | sklearn/utils/_seq_dataset.pyx | sklearn/utils/_seq_dataset.pxd | sklearn/utils/_weight_vector.pyx diff --git a/setup.cfg b/setup.cfg index cc3d51774405a..19f2bebeb7280 100644 --- a/setup.cfg +++ b/setup.cfg @@ -72,6 +72,7 @@ allow_redefinition = True ignore = sklearn/_loss/_loss.pyx sklearn/linear_model/_sag_fast.pyx + sklearn/linear_model/_sgd_fast.pyx sklearn/utils/_seq_dataset.pyx sklearn/utils/_seq_dataset.pxd sklearn/utils/_weight_vector.pyx From e25e24d40b52e0eb239aba6c6464452875fd4316 Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Mon, 3 Apr 2023 16:55:42 +0200 Subject: [PATCH 13/14] comment --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 9d512da53e163..9e44401b3a485 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,8 @@ exclude = ''' [tool.cython-lint] # Ignore the same error codes as flake8 +# + E501 (line too long) because keeping it < 88 in cython +# often makes code less readable. ignore = [ # check ignored by default in flake8. Meaning unclear. 'E24', From 7a63d584fb268e4a69228511b149e5a9e586b45c Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Fri, 7 Apr 2023 12:13:00 +0200 Subject: [PATCH 14/14] lint --- sklearn/_isotonic.pyx | 3 +- .../_hist_gradient_boosting/_binning.pyx | 28 +++++++++++-------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/sklearn/_isotonic.pyx b/sklearn/_isotonic.pyx index 4f3dba7c74940..31489f1107645 100644 --- a/sklearn/_isotonic.pyx +++ b/sklearn/_isotonic.pyx @@ -8,7 +8,6 @@ import numpy as np from cython cimport floating - def _inplace_contiguous_isotonic_regression(floating[::1] y, floating[::1] w): cdef: Py_ssize_t n = y.shape[0], i, k @@ -113,4 +112,4 @@ def _make_unique(const floating[::1] X, np.asarray(x_out[:i+1]), np.asarray(y_out[:i+1]), np.asarray(weights_out[:i+1]), -) + ) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx index 90241509cf96b..3819ef2c0ab6f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx @@ -35,20 +35,24 @@ def _map_to_bins(const X_DTYPE_C [:, :] data, int feature_idx for feature_idx in range(data.shape[1]): - _map_col_to_bins(data[:, feature_idx], - binning_thresholds[feature_idx], - is_categorical[feature_idx], - missing_values_bin_idx, - n_threads, - binned[:, feature_idx]) + _map_col_to_bins( + data[:, feature_idx], + binning_thresholds[feature_idx], + is_categorical[feature_idx], + missing_values_bin_idx, + n_threads, + binned[:, feature_idx] + ) -cdef void _map_col_to_bins(const X_DTYPE_C [:] data, - const X_DTYPE_C [:] binning_thresholds, - const unsigned char is_categorical, - const unsigned char missing_values_bin_idx, - int n_threads, - X_BINNED_DTYPE_C [:] binned): +cdef void _map_col_to_bins( + const X_DTYPE_C [:] data, + const X_DTYPE_C [:] binning_thresholds, + const unsigned char is_categorical, + const unsigned char missing_values_bin_idx, + int n_threads, + X_BINNED_DTYPE_C [:] binned +): """Binary search to find the bin index for each value in the data.""" cdef: int i