From 930e27d115489363b4cb6e5b9324ca9474fb0149 Mon Sep 17 00:00:00 2001
From: Joan Massich
Date: Fri, 1 Mar 2019 16:10:27 +0100
Subject: [PATCH 01/39] [skip ci] Trigger PR

From f40dcca834dbc5fdc65f53f08efc665aa3a1b683 Mon Sep 17 00:00:00 2001
From: Joan Massich
Date: Fri, 1 Mar 2019 16:52:32 +0100
Subject: [PATCH 02/39] Move weight_vector to a template

---
 sklearn/utils/setup.py                                    | 4 +++-
 sklearn/utils/{weight_vector.pxd => weight_vector.pxd.tp} | 0
 2 files changed, 3 insertions(+), 1 deletion(-)
 rename sklearn/utils/{weight_vector.pxd => weight_vector.pxd.tp} (100%)

diff --git a/sklearn/utils/setup.py b/sklearn/utils/setup.py
index f3002ed3ffed9..4b5246fd7b5b1 100644
--- a/sklearn/utils/setup.py
+++ b/sklearn/utils/setup.py
@@ -48,7 +48,9 @@ def configuration(parent_package='', top_path=None):

     # generate files from a template
     pyx_templates = ['sklearn/utils/seq_dataset.pyx.tp',
-                     'sklearn/utils/seq_dataset.pxd.tp']
+                     'sklearn/utils/seq_dataset.pxd.tp',
+                     'sklearn/utils/weight_vector.pxd.tp',
+                     ]

     for pyxfiles in pyx_templates:
         outfile = pyxfiles.replace('.tp', '')
diff --git a/sklearn/utils/weight_vector.pxd b/sklearn/utils/weight_vector.pxd.tp
similarity index 100%
rename from sklearn/utils/weight_vector.pxd
rename to sklearn/utils/weight_vector.pxd.tp

From c158a791b6cd56c3dec8bdd338e042d19c9b9a05 Mon Sep 17 00:00:00 2001
From: Joan Massich
Date: Fri, 1 Mar 2019 16:53:25 +0100
Subject: [PATCH 03/39] Transform weight_vector.pxd.tp into a template

---
 sklearn/utils/weight_vector.pxd.tp | 58 +++++++++++++++++++++---------
 1 file changed, 42 insertions(+), 16 deletions(-)

diff --git a/sklearn/utils/weight_vector.pxd.tp b/sklearn/utils/weight_vector.pxd.tp
index 4ba4374c05e6c..144f84669afbd 100644
--- a/sklearn/utils/weight_vector.pxd.tp
+++ b/sklearn/utils/weight_vector.pxd.tp
@@ -1,30 +1,56 @@
+{{py:
+
+"""
+Efficient (dense) parameter vector implementation for linear models.
+
+Template file to easily generate fused-type-consistent code using Tempita
+(https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py).
+
+Generated file: weight_vector.pxd
+
+Each class is duplicated for all dtypes (float and double). The keywords
+between double braces are substituted in setup.py.
+"""
+
+# name, c_type
+dtypes = [('64', 'double'),
+          ('32', 'float')]
+
+def get_dispatch(dtypes):
+    for name, c_type in dtypes:
+        yield name, c_type
+
+}}
+
 # cython: language_level=3
-"""Efficient (dense) parameter vector implementation for linear models.
""" cimport numpy as np +{{for name, c_type in get_dispatch(dtypes)}} cdef extern from "math.h": - cdef extern double sqrt(double x) + cdef extern {{c_type}} sqrt({{c_type}} x) -cdef class WeightVector(object): +cdef class WeightVector{{name}}(object): cdef np.ndarray w cdef np.ndarray aw - cdef double *w_data_ptr - cdef double *aw_data_ptr - cdef double wscale - cdef double average_a - cdef double average_b + cdef {{c_type}} *w_data_ptr + cdef {{c_type}} *aw_data_ptr + cdef {{c_type}} wscale + cdef {{c_type}} average_a + cdef {{c_type}} average_b cdef int n_features - cdef double sq_norm + cdef {{c_type}} sq_norm - cdef void add(self, double *x_data_ptr, int *x_ind_ptr, - int xnnz, double c) nogil - cdef void add_average(self, double *x_data_ptr, int *x_ind_ptr, - int xnnz, double c, double num_iter) nogil - cdef double dot(self, double *x_data_ptr, int *x_ind_ptr, + cdef void add(self, {{c_type}} *x_data_ptr, int *x_ind_ptr, + int xnnz, {{c_type}} c) nogil + cdef void add_average(self, {{c_type}} *x_data_ptr, int *x_ind_ptr, + int xnnz, {{c_type}} c, {{c_type}} num_iter) nogil + cdef {{c_type}} dot(self, {{c_type}} *x_data_ptr, int *x_ind_ptr, int xnnz) nogil - cdef void scale(self, double c) nogil + cdef void scale(self, {{c_type}} c) nogil cdef void reset_wscale(self) nogil - cdef double norm(self) nogil + cdef {{c_type}} norm(self) nogil + +{{endfor}} \ No newline at end of file From d1139a10b544fc60a7e67ebee095968d64819c85 Mon Sep 17 00:00:00 2001 From: Joan Massich Date: Fri, 1 Mar 2019 16:56:39 +0100 Subject: [PATCH 04/39] move pyx into pyx.tp --- sklearn/utils/{weight_vector.pyx => weight_vector.pyx.tp} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename sklearn/utils/{weight_vector.pyx => weight_vector.pyx.tp} (100%) diff --git a/sklearn/utils/weight_vector.pyx b/sklearn/utils/weight_vector.pyx.tp similarity index 100% rename from sklearn/utils/weight_vector.pyx rename to sklearn/utils/weight_vector.pyx.tp From ef6ca46850ed48c14afc33c01775ec68a4ea7c76 Mon Sep 17 00:00:00 2001 From: Joan Massich Date: Fri, 1 Mar 2019 17:39:20 +0100 Subject: [PATCH 05/39] first pass in changing doubles --- sklearn/utils/setup.py | 1 + sklearn/utils/weight_vector.pyx.tp | 100 ++++++++++++++++++----------- 2 files changed, 65 insertions(+), 36 deletions(-) diff --git a/sklearn/utils/setup.py b/sklearn/utils/setup.py index 4b5246fd7b5b1..8c13ffec21ecf 100644 --- a/sklearn/utils/setup.py +++ b/sklearn/utils/setup.py @@ -49,6 +49,7 @@ def configuration(parent_package='', top_path=None): # generate files from a template pyx_templates = ['sklearn/utils/seq_dataset.pyx.tp', 'sklearn/utils/seq_dataset.pxd.tp', + 'sklearn/utils/weight_vector.pyx.tp', 'sklearn/utils/weight_vector.pxd.tp', ] diff --git a/sklearn/utils/weight_vector.pyx.tp b/sklearn/utils/weight_vector.pyx.tp index 91c5273d210e4..4cb9a2059b21c 100644 --- a/sklearn/utils/weight_vector.pyx.tp +++ b/sklearn/utils/weight_vector.pyx.tp @@ -1,3 +1,27 @@ +{{py: + +""" +Efficient (dense) parameter vector implementation for linear models. + +Template file for easily generate fused types consistent code using Tempita +(https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py). + +Generated file: weight_vector.pxd + +Each class is duplicated for all dtypes (float and double). The keywords +between double braces are substituted in setup.py. 
+""" + +# name, c_type +dtypes = [('64', 'double'), + ('32', 'float')] + +def get_dispatch(dtypes): + for name, c_type in dtypes: + yield name, c_type + +}} + # cython: cdivision=True # cython: boundscheck=False # cython: wraparound=False @@ -21,7 +45,9 @@ from ._cython_blas cimport _dot, _scal, _axpy np.import_array() -cdef class WeightVector(object): +{{for name, c_type in get_dispatch(dtypes)}} + +cdef class WeightVector{{name}}(object): """Dense vector represented by a scalar and a numpy array. The class provides methods to ``add`` a sparse vector @@ -31,24 +57,24 @@ cdef class WeightVector(object): Attributes ---------- - w : ndarray, dtype=double, order='C' + w : ndarray, dtype={{c_type}}, order='C' The numpy array which backs the weight vector. - aw : ndarray, dtype=double, order='C' + aw : ndarray, dtype={{c_type}}, order='C' The numpy array which backs the average_weight vector. - w_data_ptr : double* + w_data_ptr : {{c_type}}* A pointer to the data of the numpy array. - wscale : double + wscale : {{c_type}} The scale of the vector. n_features : int The number of features (= dimensionality of ``w``). - sq_norm : double + sq_norm : {{c_type}} The squared norm of ``w``. """ def __cinit__(self, - np.ndarray[double, ndim=1, mode='c'] w, - np.ndarray[double, ndim=1, mode='c'] aw): - cdef double *wdata = w.data + np.ndarray[{{c_type}}, ndim=1, mode='c'] w, + np.ndarray[{{c_type}}, ndim=1, mode='c'] aw): + cdef {{c_type}} *wdata = <{{c_type}} *>w.data if w.shape[0] > INT_MAX: raise ValueError("More than %d features not supported; got %d." @@ -57,7 +83,7 @@ cdef class WeightVector(object): self.w_data_ptr = wdata self.wscale = 1.0 self.n_features = w.shape[0] - self.sq_norm = _dot(w.shape[0], wdata, 1, wdata, 1) + self.sq_norm = _dot(w.shape[0], wdata, 1, wdata, 1) self.aw = aw if self.aw is not None: @@ -65,32 +91,32 @@ cdef class WeightVector(object): self.average_a = 0.0 self.average_b = 1.0 - cdef void add(self, double *x_data_ptr, int *x_ind_ptr, int xnnz, - double c) nogil: + cdef void add(self, {{c_type}} *x_data_ptr, int *x_ind_ptr, int xnnz, + {{c_type}} c) nogil: """Scales sample x by constant c and adds it to the weight vector. This operation updates ``sq_norm``. Parameters ---------- - x_data_ptr : double* + x_data_ptr : {{c_type}}* The array which holds the feature values of ``x``. x_ind_ptr : np.intc* The array which holds the feature indices of ``x``. xnnz : int The number of non-zero features of ``x``. - c : double + c : {{c_type}} The scaling constant for the example. """ cdef int j cdef int idx - cdef double val - cdef double innerprod = 0.0 - cdef double xsqnorm = 0.0 + cdef {{c_type}} val + cdef {{c_type}} innerprod = 0.0 + cdef {{c_type}} xsqnorm = 0.0 # the next two lines save a factor of 2! - cdef double wscale = self.wscale - cdef double* w_data_ptr = self.w_data_ptr + cdef {{c_type}} wscale = self.wscale + cdef {{c_type}}* w_data_ptr = self.w_data_ptr for j in range(xnnz): idx = x_ind_ptr[j] @@ -104,30 +130,30 @@ cdef class WeightVector(object): # Update the average weights according to the sparse trick defined # here: https://research.microsoft.com/pubs/192769/tricks-2012.pdf # by Leon Bottou - cdef void add_average(self, double *x_data_ptr, int *x_ind_ptr, int xnnz, - double c, double num_iter) nogil: + cdef void add_average(self, {{c_type}} *x_data_ptr, int *x_ind_ptr, int xnnz, + {{c_type}} c, {{c_type}} num_iter) nogil: """Updates the average weight vector. 
        Parameters
         ----------
-        x_data_ptr : double*
+        x_data_ptr : {{c_type}}*
             The array which holds the feature values of ``x``.
         x_ind_ptr : np.intc*
             The array which holds the feature indices of ``x``.
         xnnz : int
             The number of non-zero features of ``x``.
-        c : double
+        c : {{c_type}}
             The scaling constant for the example.
-        num_iter : double
+        num_iter : {{c_type}}
             The total number of iterations.
         """
         cdef int j
         cdef int idx
-        cdef double val
-        cdef double mu = 1.0 / num_iter
-        cdef double average_a = self.average_a
-        cdef double wscale = self.wscale
-        cdef double* aw_data_ptr = self.aw_data_ptr
+        cdef {{c_type}} val
+        cdef {{c_type}} mu = 1.0 / num_iter
+        cdef {{c_type}} average_a = self.average_a
+        cdef {{c_type}} wscale = self.wscale
+        cdef {{c_type}}* aw_data_ptr = self.aw_data_ptr

         for j in range(xnnz):
             idx = x_ind_ptr[j]
@@ -140,13 +166,13 @@ cdef class WeightVector(object):
         self.average_b /= (1.0 - mu)
         self.average_a += mu * self.average_b * wscale

-    cdef double dot(self, double *x_data_ptr, int *x_ind_ptr,
+    cdef {{c_type}} dot(self, {{c_type}} *x_data_ptr, int *x_ind_ptr,
                     int xnnz) nogil:
         """Computes the dot product of a sample x and the weight vector.

         Parameters
         ----------
-        x_data_ptr : double*
+        x_data_ptr : {{c_type}}*
             The array which holds the feature values of ``x``.
         x_ind_ptr : np.intc*
             The array which holds the feature indices of ``x``.
@@ -155,20 +181,20 @@

         Returns
         -------
-        innerprod : double
+        innerprod : {{c_type}}
             The inner product of ``x`` and ``w``.
         """
         cdef int j
         cdef int idx
-        cdef double innerprod = 0.0
-        cdef double* w_data_ptr = self.w_data_ptr
+        cdef {{c_type}} innerprod = 0.0
+        cdef {{c_type}}* w_data_ptr = self.w_data_ptr
         for j in range(xnnz):
             idx = x_ind_ptr[j]
             innerprod += w_data_ptr[idx] * x_data_ptr[j]
         innerprod *= self.wscale
         return innerprod

-    cdef void scale(self, double c) nogil:
+    cdef void scale(self, {{c_type}} c) nogil:
         """Scales the weight vector by a constant ``c``.

         It updates ``wscale`` and ``sq_norm``. If ``wscale`` gets too
@@ -191,6 +217,8 @@ cdef class WeightVector(object):
         _scal(self.w.shape[0], self.wscale, self.w.data, 1)
         self.wscale = 1.0

-    cdef double norm(self) nogil:
+    cdef {{c_type}} norm(self) nogil:
         """The L2 norm of the weight vector.
         """
         return sqrt(self.sq_norm)
+
+{{endfor}}
\ No newline at end of file

From 04ce5c9ac5ac63dd559a7e937950a39e326dac3e Mon Sep 17 00:00:00 2001
From: Joan Massich
Date: Fri, 1 Mar 2019 17:40:55 +0100
Subject: [PATCH 06/39] Change all doubles that were missed

---
 sklearn/utils/weight_vector.pyx.tp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/utils/weight_vector.pyx.tp b/sklearn/utils/weight_vector.pyx.tp
index 4cb9a2059b21c..0dbee605e9a3d 100644
--- a/sklearn/utils/weight_vector.pyx.tp
+++ b/sklearn/utils/weight_vector.pyx.tp
@@ -87,7 +87,7 @@ cdef class WeightVector{{name}}(object):

         self.aw = aw
         if self.aw is not None:
-            self.aw_data_ptr = aw.data
+            self.aw_data_ptr = <{{c_type}} *>aw.data
             self.average_a = 0.0
             self.average_b = 1.0

@@ -208,13 +208,13 @@ cdef class WeightVector{{name}}(object):
         """Scales each coef of ``w`` by ``wscale`` and resets it to 1.
""" if self.aw is not None: _axpy(self.aw.shape[0], self.average_a, - self.w.data, 1, self.aw.data, 1) + <{{c_type}} *>self.w.data, 1, <{{c_type}} *>self.aw.data, 1) _scal(self.aw.shape[0], 1.0 / self.average_b, - self.aw.data, 1) + <{{c_type}} *>self.aw.data, 1) self.average_a = 0.0 self.average_b = 1.0 - _scal(self.w.shape[0], self.wscale, self.w.data, 1) + _scal(self.w.shape[0], self.wscale, <{{c_type}} *>self.w.data, 1) self.wscale = 1.0 cdef {{c_type}} norm(self) nogil: From 50de2663befcbd755db0fa62535dfdb71ed27c4a Mon Sep 17 00:00:00 2001 From: Joan Massich Date: Fri, 1 Mar 2019 17:45:11 +0100 Subject: [PATCH 07/39] fix compilation --- sklearn/linear_model/sgd_fast.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/sgd_fast.pyx b/sklearn/linear_model/sgd_fast.pyx index ddea4b9710501..1b98e8c907f2b 100644 --- a/sklearn/linear_model/sgd_fast.pyx +++ b/sklearn/linear_model/sgd_fast.pyx @@ -22,7 +22,7 @@ from numpy.math cimport INFINITY cdef extern from "sgd_fast_helpers.h": bint skl_isfinite(double) nogil -from sklearn.utils.weight_vector cimport WeightVector +from sklearn.utils.weight_vector cimport WeightVector64 as WeightVector from sklearn.utils.seq_dataset cimport SequentialDataset64 as SequentialDataset np.import_array() From 3a05e422833c742c978dc7bcbeffde186d1c5aae Mon Sep 17 00:00:00 2001 From: mbatoul Date: Wed, 7 Jul 2021 12:39:48 +0200 Subject: [PATCH 08/39] Add newlines to template files. --- sklearn/utils/_weight_vector.pxd.tp | 2 +- sklearn/utils/_weight_vector.pyx.tp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/_weight_vector.pxd.tp b/sklearn/utils/_weight_vector.pxd.tp index 144f84669afbd..cba9c18520722 100644 --- a/sklearn/utils/_weight_vector.pxd.tp +++ b/sklearn/utils/_weight_vector.pxd.tp @@ -53,4 +53,4 @@ cdef class WeightVector{{name}}(object): cdef void reset_wscale(self) nogil cdef {{c_type}} norm(self) nogil -{{endfor}} \ No newline at end of file +{{endfor}} diff --git a/sklearn/utils/_weight_vector.pyx.tp b/sklearn/utils/_weight_vector.pyx.tp index 0dbee605e9a3d..19ad52b24b107 100644 --- a/sklearn/utils/_weight_vector.pyx.tp +++ b/sklearn/utils/_weight_vector.pyx.tp @@ -221,4 +221,4 @@ cdef class WeightVector{{name}}(object): """The L2 norm of the weight vector. 
""" return sqrt(self.sq_norm) -{{endfor}} \ No newline at end of file +{{endfor}} From f8bd689cda74584e30d6ef5bd612211287519eb9 Mon Sep 17 00:00:00 2001 From: mbatoul Date: Wed, 7 Jul 2021 14:18:42 +0200 Subject: [PATCH 09/39] Git-ignore `.pyx` and `.pxd` files generated by Tempita from `weight_vector` templates --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 3ebd8e2bb1699..2c3dd0c4794c1 100644 --- a/.gitignore +++ b/.gitignore @@ -78,4 +78,6 @@ _configtest.o.d # files generated from a template sklearn/utils/_seq_dataset.pyx sklearn/utils/_seq_dataset.pxd +sklearn/utils/_weight_vector.pyx +sklearn/utils/_weight_vector.pxd sklearn/linear_model/_sag_fast.pyx From 41d9a0dc8ea27fd64494793c436da35874b3dda7 Mon Sep 17 00:00:00 2001 From: mbatoul Date: Wed, 7 Jul 2021 14:21:39 +0200 Subject: [PATCH 10/39] Remove `get_dispatch` unnecessary function and iterate directly over the`dtypes` list in Tempita `.tp` files --- sklearn/linear_model/_sag_fast.pyx.tp | 22 +++++++++------------- sklearn/utils/_seq_dataset.pxd.tp | 6 +----- sklearn/utils/_seq_dataset.pyx.tp | 6 +----- sklearn/utils/_weight_vector.pxd.tp | 6 +----- sklearn/utils/_weight_vector.pyx.tp | 7 +------ 5 files changed, 13 insertions(+), 34 deletions(-) diff --git a/sklearn/linear_model/_sag_fast.pyx.tp b/sklearn/linear_model/_sag_fast.pyx.tp index 8508340e3b329..d9989f60aa23a 100644 --- a/sklearn/linear_model/_sag_fast.pyx.tp +++ b/sklearn/linear_model/_sag_fast.pyx.tp @@ -23,10 +23,6 @@ License: BSD 3 clause dtypes = [('64', 'double', 'np.float64'), ('32', 'float', 'np.float32')] -def get_dispatch(dtypes): - for name, c_type, np_type in dtypes: - yield name, c_type, np_type - }} #------------------------------------------------------------------------------ @@ -61,7 +57,7 @@ from libc.stdio cimport printf np.import_array() -{{for name, c_type, np_type in get_dispatch(dtypes)}} +{{for name, c_type, np_type in dtypes}} cdef extern from "_sgd_fast_helpers.h": bint skl_isfinite{{name}}({{c_type}}) nogil @@ -69,7 +65,7 @@ cdef extern from "_sgd_fast_helpers.h": {{endfor}} -{{for name, c_type, np_type in get_dispatch(dtypes)}} +{{for name, c_type, np_type in dtypes}} cdef inline {{c_type}} fmax{{name}}({{c_type}} x, {{c_type}} y) nogil: if x > y: @@ -79,7 +75,7 @@ cdef inline {{c_type}} fmax{{name}}({{c_type}} x, {{c_type}} y) nogil: {{endfor}} -{{for name, c_type, np_type in get_dispatch(dtypes)}} +{{for name, c_type, np_type in dtypes}} cdef {{c_type}} _logsumexp{{name}}({{c_type}}* arr, int n_classes) nogil: """Computes the sum of arr assuming arr is in the log domain. 
@@ -105,7 +101,7 @@ cdef {{c_type}} _logsumexp{{name}}({{c_type}}* arr, int n_classes) nogil: {{endfor}} -{{for name, c_type, np_type in get_dispatch(dtypes)}} +{{for name, c_type, np_type in dtypes}} cdef class MultinomialLogLoss{{name}}: cdef {{c_type}} _loss(self, {{c_type}}* prediction, {{c_type}} y, int n_classes, @@ -209,7 +205,7 @@ cdef class MultinomialLogLoss{{name}}: {{endfor}} -{{for name, c_type, np_type in get_dispatch(dtypes)}} +{{for name, c_type, np_type in dtypes}} cdef inline {{c_type}} _soft_thresholding{{name}}({{c_type}} x, {{c_type}} shrinkage) nogil: return fmax{{name}}(x - shrinkage, 0) - fmax{{name}}(- x - shrinkage, 0) @@ -217,7 +213,7 @@ cdef inline {{c_type}} _soft_thresholding{{name}}({{c_type}} x, {{c_type}} shrin {{endfor}} -{{for name, c_type, np_type in get_dispatch(dtypes)}} +{{for name, c_type, np_type in dtypes}} def sag{{name}}(SequentialDataset{{name}} dataset, np.ndarray[{{c_type}}, ndim=2, mode='c'] weights_array, @@ -553,7 +549,7 @@ def sag{{name}}(SequentialDataset{{name}} dataset, {{endfor}} -{{for name, c_type, np_type in get_dispatch(dtypes)}} +{{for name, c_type, np_type in dtypes}} cdef int scale_weights{{name}}({{c_type}}* weights, {{c_type}}* wscale, int n_features, @@ -592,7 +588,7 @@ cdef int scale_weights{{name}}({{c_type}}* weights, {{c_type}}* wscale, {{endfor}} -{{for name, c_type, np_type in get_dispatch(dtypes)}} +{{for name, c_type, np_type in dtypes}} cdef int lagged_update{{name}}({{c_type}}* weights, {{c_type}} wscale, int xnnz, int n_samples, int n_classes, int sample_itr, @@ -683,7 +679,7 @@ cdef int lagged_update{{name}}({{c_type}}* weights, {{c_type}} wscale, int xnnz, {{endfor}} -{{for name, c_type, np_type in get_dispatch(dtypes)}} +{{for name, c_type, np_type in dtypes}} cdef void predict_sample{{name}}({{c_type}}* x_data_ptr, int* x_ind_ptr, int xnnz, {{c_type}}* w_data_ptr, {{c_type}} wscale, diff --git a/sklearn/utils/_seq_dataset.pxd.tp b/sklearn/utils/_seq_dataset.pxd.tp index be2d94a05b015..9e4237fc0f51a 100644 --- a/sklearn/utils/_seq_dataset.pxd.tp +++ b/sklearn/utils/_seq_dataset.pxd.tp @@ -16,12 +16,8 @@ between double braces are substituted in setup.py. dtypes = [('64', 'double'), ('32', 'float')] -def get_dispatch(dtypes): - for name, c_type in dtypes: - yield name, c_type - }} -{{for name, c_type in get_dispatch(dtypes)}} +{{for name, c_type in dtypes}} #------------------------------------------------------------------------------ diff --git a/sklearn/utils/_seq_dataset.pyx.tp b/sklearn/utils/_seq_dataset.pyx.tp index 44edb9216dc62..a5173fb60c0da 100644 --- a/sklearn/utils/_seq_dataset.pyx.tp +++ b/sklearn/utils/_seq_dataset.pyx.tp @@ -24,12 +24,8 @@ License: BSD 3 clause dtypes = [('64', 'double', 'np.float64'), ('32', 'float', 'np.float32')] -def get_dispatch(dtypes): - for name, c_type, np_type in dtypes: - yield name, c_type, np_type - }} -{{for name, c_type, np_type in get_dispatch(dtypes)}} +{{for name, c_type, np_type in dtypes}} #------------------------------------------------------------------------------ diff --git a/sklearn/utils/_weight_vector.pxd.tp b/sklearn/utils/_weight_vector.pxd.tp index cba9c18520722..7ef0fd017bd69 100644 --- a/sklearn/utils/_weight_vector.pxd.tp +++ b/sklearn/utils/_weight_vector.pxd.tp @@ -16,17 +16,13 @@ between double braces are substituted in setup.py. 
dtypes = [('64', 'double'), ('32', 'float')] -def get_dispatch(dtypes): - for name, c_type in dtypes: - yield name, c_type - }} # cython: language_level=3 cimport numpy as np -{{for name, c_type in get_dispatch(dtypes)}} +{{for name, c_type in dtypes}} cdef extern from "math.h": cdef extern {{c_type}} sqrt({{c_type}} x) diff --git a/sklearn/utils/_weight_vector.pyx.tp b/sklearn/utils/_weight_vector.pyx.tp index 19ad52b24b107..b36f475183c2a 100644 --- a/sklearn/utils/_weight_vector.pyx.tp +++ b/sklearn/utils/_weight_vector.pyx.tp @@ -16,10 +16,6 @@ between double braces are substituted in setup.py. dtypes = [('64', 'double'), ('32', 'float')] -def get_dispatch(dtypes): - for name, c_type in dtypes: - yield name, c_type - }} # cython: cdivision=True @@ -44,8 +40,7 @@ from ._cython_blas cimport _dot, _scal, _axpy np.import_array() - -{{for name, c_type in get_dispatch(dtypes)}} +{{for name, c_type in dtypes}} cdef class WeightVector{{name}}(object): """Dense vector represented by a scalar and a numpy array. From c383712440bc7ce040031d5256fe140accbfcc3e Mon Sep 17 00:00:00 2001 From: mbatoul Date: Tue, 13 Jul 2021 09:15:25 +0200 Subject: [PATCH 11/39] Test that files generated by templates are git-ignored. --- sklearn/utils/tests/test_cython_templating.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 sklearn/utils/tests/test_cython_templating.py diff --git a/sklearn/utils/tests/test_cython_templating.py b/sklearn/utils/tests/test_cython_templating.py new file mode 100644 index 0000000000000..79d71e3993f68 --- /dev/null +++ b/sklearn/utils/tests/test_cython_templating.py @@ -0,0 +1,15 @@ +import pathlib +import pytest +import sklearn + +TEMPITA_EXTENSION = "tp" + + +def test_files_generated_by_templates_are_git_ignored(): + gitignore_file = pathlib.Path(sklearn.__file__).parent.parent / ".gitignore" + if not gitignore_file.exists(): + pytest.skip("Tests are not run from the source folder") + base_dir = pathlib.Path(sklearn.__file__).parent + for filename in base_dir.glob(f"**/*.{TEMPITA_EXTENSION}"): + filename_wo_tempita_suffix = filename.with_suffix("") + assert not filename_wo_tempita_suffix.exists() From c6625c0eee909135d6f6a690f4cf940fbdae8275 Mon Sep 17 00:00:00 2001 From: mbatoul Date: Tue, 13 Jul 2021 19:14:35 +0200 Subject: [PATCH 12/39] Add `_sag_fast` templates to `setup.py` + git-ignore the generated files. --- .gitignore | 1 + sklearn/utils/setup.py | 1 + 2 files changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 2c3dd0c4794c1..95a87a876d8a1 100644 --- a/.gitignore +++ b/.gitignore @@ -81,3 +81,4 @@ sklearn/utils/_seq_dataset.pxd sklearn/utils/_weight_vector.pyx sklearn/utils/_weight_vector.pxd sklearn/linear_model/_sag_fast.pyx +sklearn/linear_model/_sag_fast.pxd diff --git a/sklearn/utils/setup.py b/sklearn/utils/setup.py index eba6daf26d5fd..7069478d513c0 100644 --- a/sklearn/utils/setup.py +++ b/sklearn/utils/setup.py @@ -55,6 +55,7 @@ def configuration(parent_package="", top_path=None): "sklearn/utils/_seq_dataset.pxd.tp", "sklearn/utils/_weight_vector.pyx.tp", "sklearn/utils/_weight_vector.pxd.tp", + "sklearn/linear_model/_sag_fast.pyx.tp", ] gen_from_templates(templates, top_path) From 1bc6be834ccd3e14bee17cf98f60509cf4149158 Mon Sep 17 00:00:00 2001 From: mbatoul Date: Mon, 26 Jul 2021 11:10:13 +0200 Subject: [PATCH 13/39] Read `.gitignore` file content and check files generated by Tempita templates are included. 
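The test now asserts against the parsed `.gitignore` entries rather than
against the filesystem, so it passes whether or not the generated files have
been built yet. Roughly, the check added below boils down to (illustrative
sketch, not part of the diff):

    import pathlib

    def is_git_ignored(template, ignored_files):
        # "sklearn/utils/_weight_vector.pyx.tp" -> ".../_weight_vector.pyx"
        generated = str(template)[: -len(".tp")]
        return generated in ignored_files

Note that the `filename.strip(".tp")` spelling used here removes a set of
characters from both ends rather than the suffix; a later commit in this
series switches to the suffix-safe `filename.with_suffix("")`.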
--- sklearn/utils/tests/test_cython_templating.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/tests/test_cython_templating.py b/sklearn/utils/tests/test_cython_templating.py index 79d71e3993f68..124045d84b08f 100644 --- a/sklearn/utils/tests/test_cython_templating.py +++ b/sklearn/utils/tests/test_cython_templating.py @@ -9,7 +9,12 @@ def test_files_generated_by_templates_are_git_ignored(): gitignore_file = pathlib.Path(sklearn.__file__).parent.parent / ".gitignore" if not gitignore_file.exists(): pytest.skip("Tests are not run from the source folder") + base_dir = pathlib.Path(sklearn.__file__).parent + ignored_files = open(gitignore_file, "r").readlines() + ignored_files = list(map(lambda line: line.strip("\n"), ignored_files)) + for filename in base_dir.glob(f"**/*.{TEMPITA_EXTENSION}"): - filename_wo_tempita_suffix = filename.with_suffix("") - assert not filename_wo_tempita_suffix.exists() + filename = str(filename).split("scikit-learn/")[-1] + filename_wo_tempita_suffix = filename.strip(f".{TEMPITA_EXTENSION}") + assert filename_wo_tempita_suffix in ignored_files From c05767b6328409376b4dff0c4d582d833ebb7629 Mon Sep 17 00:00:00 2001 From: mbatoul Date: Mon, 26 Jul 2021 11:16:56 +0200 Subject: [PATCH 14/39] Set `w` and `aw` attributes as `readonly`. --- sklearn/utils/_weight_vector.pxd.tp | 4 ++-- sklearn/utils/_weight_vector.pyx.tp | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/_weight_vector.pxd.tp b/sklearn/utils/_weight_vector.pxd.tp index 7ef0fd017bd69..b70c8e8eccd52 100644 --- a/sklearn/utils/_weight_vector.pxd.tp +++ b/sklearn/utils/_weight_vector.pxd.tp @@ -29,8 +29,8 @@ cdef extern from "math.h": cdef class WeightVector{{name}}(object): - cdef np.ndarray w - cdef np.ndarray aw + cdef readonly np.ndarray w + cdef readonly np.ndarray aw cdef {{c_type}} *w_data_ptr cdef {{c_type}} *aw_data_ptr cdef {{c_type}} wscale diff --git a/sklearn/utils/_weight_vector.pyx.tp b/sklearn/utils/_weight_vector.pyx.tp index b36f475183c2a..b77488b73a725 100644 --- a/sklearn/utils/_weight_vector.pyx.tp +++ b/sklearn/utils/_weight_vector.pyx.tp @@ -66,6 +66,9 @@ cdef class WeightVector{{name}}(object): The squared norm of ``w``. """ + cdef readonly np.ndarray w + cdef readonly np.ndarray aw + def __cinit__(self, np.ndarray[{{c_type}}, ndim=1, mode='c'] w, np.ndarray[{{c_type}}, ndim=1, mode='c'] aw): From bedf4cfb6589b1b5cf23b09eb470ef8a1d36dbaf Mon Sep 17 00:00:00 2001 From: mbatoul Date: Mon, 26 Jul 2021 13:01:38 +0200 Subject: [PATCH 15/39] Add test for type invariance for `WeightVector` interface. 
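The dtype-suffixed classes generated from the template should preserve the
precision of the arrays they are given; the `readonly` `w` and `aw`
attributes exposed in the previous commit make this observable from Python.
Roughly, the invariant under test is:

    import numpy as np
    from sklearn.utils._weight_vector import WeightVector32

    w = np.random.rand(100).astype(np.float32)
    aw = np.random.rand(100).astype(np.float32)
    wv = WeightVector32(w, aw)
    assert wv.w.dtype == np.dtype(np.float32)  # no silent upcast to float64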
--- sklearn/utils/tests/test_weight_vector.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 sklearn/utils/tests/test_weight_vector.py diff --git a/sklearn/utils/tests/test_weight_vector.py b/sklearn/utils/tests/test_weight_vector.py new file mode 100644 index 0000000000000..21fe69a9e2e82 --- /dev/null +++ b/sklearn/utils/tests/test_weight_vector.py @@ -0,0 +1,23 @@ +# Author: Mathis Batoul +# +# License: BSD 3 clause + +import numpy as np +from sklearn.utils._weight_vector import ( + WeightVector32, + WeightVector64, +) + + +def test_type_invariance(): + weights32 = np.random.rand(100).astype(np.float32) + average_weights32 = np.random.rand(100).astype(np.float32) + weight_vector32 = WeightVector32(weights32, average_weights32) + assert weight_vector32.w.dtype is np.dtype(np.float32) + assert weight_vector32.aw.dtype is np.dtype(np.float32) + + weights64 = np.random.rand(100).astype(np.float64) + average_weights64 = np.random.rand(100).astype(np.float64) + weight_vector64 = WeightVector64(weights64, average_weights64) + assert weight_vector64.w.dtype is np.dtype(np.float64) + assert weight_vector64.aw.dtype is np.dtype(np.float64) From bf766571124e24d97090670f381c027e028cd059 Mon Sep 17 00:00:00 2001 From: mbatoul Date: Mon, 26 Jul 2021 13:24:26 +0200 Subject: [PATCH 16/39] Remove unnecessary file from `gitignore`. --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index 95a87a876d8a1..2c3dd0c4794c1 100644 --- a/.gitignore +++ b/.gitignore @@ -81,4 +81,3 @@ sklearn/utils/_seq_dataset.pxd sklearn/utils/_weight_vector.pyx sklearn/utils/_weight_vector.pxd sklearn/linear_model/_sag_fast.pyx -sklearn/linear_model/_sag_fast.pxd From df6e5490c25a3d92c262f2861501e0ef60278d2f Mon Sep 17 00:00:00 2001 From: mbatoul Date: Mon, 26 Jul 2021 13:24:59 +0200 Subject: [PATCH 17/39] Move lines of code to reduce diff. --- sklearn/utils/setup.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/setup.py b/sklearn/utils/setup.py index 7069478d513c0..2b54918ac12ca 100644 --- a/sklearn/utils/setup.py +++ b/sklearn/utils/setup.py @@ -49,6 +49,10 @@ def configuration(parent_package="", top_path=None): libraries=libraries, ) + config.add_extension( + "_openmp_helpers", sources=["_openmp_helpers.pyx"], libraries=libraries + ) + # generate files from a template templates = [ "sklearn/utils/_seq_dataset.pyx.tp", @@ -60,10 +64,6 @@ def configuration(parent_package="", top_path=None): gen_from_templates(templates, top_path) - config.add_extension( - "_openmp_helpers", sources=["_openmp_helpers.pyx"], libraries=libraries - ) - config.add_extension( "_seq_dataset", sources=["_seq_dataset.pyx"], include_dirs=[numpy.get_include()] ) From 77fa38299b380337944f87209ae28092556dc588 Mon Sep 17 00:00:00 2001 From: mbatoul Date: Mon, 26 Jul 2021 14:14:54 +0200 Subject: [PATCH 18/39] Remove outdated argument of `gen_from_templates` function. 
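`gen_from_templates` ignored its `top_path` argument, so it is dropped from
the signature and from both call sites. For context, the helper is
essentially the following (simplified; the real function also skips
regeneration when the output is newer than the template):

    from Cython import Tempita

    def gen_from_templates(templates):
        for template in templates:
            outfile = template.replace(".tp", "")  # foo.pyx.tp -> foo.pyx
            with open(template, "r") as f:
                tmpl = Tempita.Template(f.read())
            with open(outfile, "w") as f:
                f.write(tmpl.substitute())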
--- sklearn/_build_utils/__init__.py | 2 +- sklearn/linear_model/setup.py | 2 +- sklearn/utils/setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/_build_utils/__init__.py b/sklearn/_build_utils/__init__.py index 670297dab3d22..be83b4c4d8baf 100644 --- a/sklearn/_build_utils/__init__.py +++ b/sklearn/_build_utils/__init__.py @@ -80,7 +80,7 @@ def cythonize_extensions(top_path, config): ) -def gen_from_templates(templates, top_path): +def gen_from_templates(templates): """Generate cython files from a list of templates""" # Lazy import because cython is not a runtime dependency. from Cython import Tempita diff --git a/sklearn/linear_model/setup.py b/sklearn/linear_model/setup.py index cc5d277e13502..74d7d9e2b05ea 100644 --- a/sklearn/linear_model/setup.py +++ b/sklearn/linear_model/setup.py @@ -29,7 +29,7 @@ def configuration(parent_package="", top_path=None): # generate sag_fast from template templates = ["sklearn/linear_model/_sag_fast.pyx.tp"] - gen_from_templates(templates, top_path) + gen_from_templates(templates) config.add_extension( "_sag_fast", sources=["_sag_fast.pyx"], include_dirs=numpy.get_include() diff --git a/sklearn/utils/setup.py b/sklearn/utils/setup.py index 2b54918ac12ca..4df5ac39a6bfe 100644 --- a/sklearn/utils/setup.py +++ b/sklearn/utils/setup.py @@ -62,7 +62,7 @@ def configuration(parent_package="", top_path=None): "sklearn/linear_model/_sag_fast.pyx.tp", ] - gen_from_templates(templates, top_path) + gen_from_templates(templates) config.add_extension( "_seq_dataset", sources=["_seq_dataset.pyx"], include_dirs=[numpy.get_include()] From d772d43389352ecf45019508c08ef4a8f6dc7b7e Mon Sep 17 00:00:00 2001 From: mbatoul Date: Mon, 26 Jul 2021 14:24:31 +0200 Subject: [PATCH 19/39] Parametrize `test_type_invariance`. 
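With `@pytest.mark.parametrize`, each (dtype, class) pair becomes its own
test item, so a regression reports which precision broke, e.g. something
like:

    test_weight_vector.py::test_type_invariance[float32-WeightVector32] PASSED
    test_weight_vector.py::test_type_invariance[float64-WeightVector64] PASSED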
--- sklearn/utils/tests/test_weight_vector.py | 26 ++++++++++++----------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/sklearn/utils/tests/test_weight_vector.py b/sklearn/utils/tests/test_weight_vector.py index 21fe69a9e2e82..93ca628b8d520 100644 --- a/sklearn/utils/tests/test_weight_vector.py +++ b/sklearn/utils/tests/test_weight_vector.py @@ -3,21 +3,23 @@ # License: BSD 3 clause import numpy as np +import pytest from sklearn.utils._weight_vector import ( WeightVector32, WeightVector64, ) -def test_type_invariance(): - weights32 = np.random.rand(100).astype(np.float32) - average_weights32 = np.random.rand(100).astype(np.float32) - weight_vector32 = WeightVector32(weights32, average_weights32) - assert weight_vector32.w.dtype is np.dtype(np.float32) - assert weight_vector32.aw.dtype is np.dtype(np.float32) - - weights64 = np.random.rand(100).astype(np.float64) - average_weights64 = np.random.rand(100).astype(np.float64) - weight_vector64 = WeightVector64(weights64, average_weights64) - assert weight_vector64.w.dtype is np.dtype(np.float64) - assert weight_vector64.aw.dtype is np.dtype(np.float64) +@pytest.mark.parametrize( + "dtype, weight_vector_class", + [ + (np.float32, WeightVector32), + (np.float64, WeightVector64), + ], +) +def test_type_invariance(dtype, weight_vector_class): + weights = np.random.rand(100).astype(dtype) + average_weights = np.random.rand(100).astype(dtype) + weight_vector = weight_vector_class(weights, average_weights) + assert weight_vector.w.dtype is np.dtype(dtype) + assert weight_vector.aw.dtype is np.dtype(dtype) From 9456ba37980141c029a997468ab552c25a609f21 Mon Sep 17 00:00:00 2001 From: mbatoul Date: Mon, 26 Jul 2021 14:46:11 +0200 Subject: [PATCH 20/39] Add test for source files generation from templates. --- sklearn/utils/tests/test_utils.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 76d12abf10f6d..244de855c00bd 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -5,6 +5,9 @@ import timeit import pytest +import os +import sklearn +import pathlib import numpy as np import scipy.sparse as sp @@ -31,6 +34,7 @@ from sklearn.utils import _to_object_array from sklearn.utils._mocking import MockDataFrame from sklearn import config_context +from sklearn._build_utils import gen_from_templates # toy array X_toy = np.arange(9).reshape((3, 3)) @@ -685,3 +689,22 @@ def test_to_object_array(sequence): assert isinstance(out, np.ndarray) assert out.dtype.kind == "O" assert out.ndim == 1 + + +@pytest.mark.parametrize("file_extension", ["pyx", "pxd"]) +def test_files_generation_from_templates(file_extension): + base_dir = base_dir = pathlib.Path(sklearn.__file__).parent.parent + template_filename = f"dummy.{file_extension}.tp" + template_file = base_dir / template_filename + + f = open(template_file, "a") + f.close() + + templates = [template_filename] + gen_from_templates(templates) + + generated_file = base_dir / f"dummy.{file_extension}" + assert generated_file.exists() + + template_file.unlink() + generated_file.unlink() From 9a64435921f16f777d55b7ca0016e3264ea8944c Mon Sep 17 00:00:00 2001 From: mbatoul Date: Mon, 26 Jul 2021 14:57:48 +0200 Subject: [PATCH 21/39] Remove `sklearn/linear_model/_sag_fast.pyx.tp` from `sklearn/utils/setup.py`. 
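`sklearn/linear_model/setup.py` already generates `_sag_fast.pyx` from its
template (see the call site updated a few commits back), so listing it in
`sklearn/utils/setup.py` as well made the file be generated twice. After
this commit the split is, roughly:

    # sklearn/utils/setup.py
    templates = [
        "sklearn/utils/_seq_dataset.pyx.tp",
        "sklearn/utils/_seq_dataset.pxd.tp",
        "sklearn/utils/_weight_vector.pyx.tp",
        "sklearn/utils/_weight_vector.pxd.tp",
    ]

    # sklearn/linear_model/setup.py
    templates = ["sklearn/linear_model/_sag_fast.pyx.tp"]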
--- sklearn/utils/setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/utils/setup.py b/sklearn/utils/setup.py index 4df5ac39a6bfe..cdb7b77ac0844 100644 --- a/sklearn/utils/setup.py +++ b/sklearn/utils/setup.py @@ -59,7 +59,6 @@ def configuration(parent_package="", top_path=None): "sklearn/utils/_seq_dataset.pxd.tp", "sklearn/utils/_weight_vector.pyx.tp", "sklearn/utils/_weight_vector.pxd.tp", - "sklearn/linear_model/_sag_fast.pyx.tp", ] gen_from_templates(templates) From d39edd1d8546ee82416cd51ffb0416784e71f57e Mon Sep 17 00:00:00 2001 From: mbatoul Date: Mon, 26 Jul 2021 15:17:48 +0200 Subject: [PATCH 22/39] Remove duplicated `cdef` declaration in `WeightVector` class. --- sklearn/utils/_weight_vector.pyx.tp | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearn/utils/_weight_vector.pyx.tp b/sklearn/utils/_weight_vector.pyx.tp index b77488b73a725..b36f475183c2a 100644 --- a/sklearn/utils/_weight_vector.pyx.tp +++ b/sklearn/utils/_weight_vector.pyx.tp @@ -66,9 +66,6 @@ cdef class WeightVector{{name}}(object): The squared norm of ``w``. """ - cdef readonly np.ndarray w - cdef readonly np.ndarray aw - def __cinit__(self, np.ndarray[{{c_type}}, ndim=1, mode='c'] w, np.ndarray[{{c_type}}, ndim=1, mode='c'] aw): From da946c6ad437ac4d17fb267af4a73b5206b8e2a2 Mon Sep 17 00:00:00 2001 From: mbatoul Date: Mon, 26 Jul 2021 15:24:30 +0200 Subject: [PATCH 23/39] Remove `cython: language_level=3` from `weight_vector` template files. --- sklearn/utils/_weight_vector.pxd.tp | 2 -- sklearn/utils/_weight_vector.pyx.tp | 1 - 2 files changed, 3 deletions(-) diff --git a/sklearn/utils/_weight_vector.pxd.tp b/sklearn/utils/_weight_vector.pxd.tp index b70c8e8eccd52..d9a4bebf74fa5 100644 --- a/sklearn/utils/_weight_vector.pxd.tp +++ b/sklearn/utils/_weight_vector.pxd.tp @@ -18,8 +18,6 @@ dtypes = [('64', 'double'), }} -# cython: language_level=3 - cimport numpy as np {{for name, c_type in dtypes}} diff --git a/sklearn/utils/_weight_vector.pyx.tp b/sklearn/utils/_weight_vector.pyx.tp index b36f475183c2a..b9d80799081e1 100644 --- a/sklearn/utils/_weight_vector.pyx.tp +++ b/sklearn/utils/_weight_vector.pyx.tp @@ -21,7 +21,6 @@ dtypes = [('64', 'double'), # cython: cdivision=True # cython: boundscheck=False # cython: wraparound=False -# cython: language_level=3 # # Author: Peter Prettenhofer # Lars Buitinck From ea2470be1c6cf08a837744b14088b71c57f7d535 Mon Sep 17 00:00:00 2001 From: mbatoul Date: Mon, 26 Jul 2021 15:24:49 +0200 Subject: [PATCH 24/39] Remove useless constant variable in `test_cython_templating`. 
--- sklearn/utils/tests/test_cython_templating.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/tests/test_cython_templating.py b/sklearn/utils/tests/test_cython_templating.py index 124045d84b08f..4cc5e6dae1fee 100644 --- a/sklearn/utils/tests/test_cython_templating.py +++ b/sklearn/utils/tests/test_cython_templating.py @@ -2,8 +2,6 @@ import pytest import sklearn -TEMPITA_EXTENSION = "tp" - def test_files_generated_by_templates_are_git_ignored(): gitignore_file = pathlib.Path(sklearn.__file__).parent.parent / ".gitignore" @@ -14,7 +12,7 @@ def test_files_generated_by_templates_are_git_ignored(): ignored_files = open(gitignore_file, "r").readlines() ignored_files = list(map(lambda line: line.strip("\n"), ignored_files)) - for filename in base_dir.glob(f"**/*.{TEMPITA_EXTENSION}"): + for filename in base_dir.glob("**/*.tp"): filename = str(filename).split("scikit-learn/")[-1] - filename_wo_tempita_suffix = filename.strip(f".{TEMPITA_EXTENSION}") + filename_wo_tempita_suffix = filename.strip(".tp") assert filename_wo_tempita_suffix in ignored_files From fc8a32d299a8f11ce03861f5c7b2eea8c5441c4f Mon Sep 17 00:00:00 2001 From: mbatoul Date: Mon, 2 Aug 2021 16:13:21 +0200 Subject: [PATCH 25/39] Remove unused import. --- sklearn/utils/tests/test_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 244de855c00bd..edb1deb6d2031 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -5,7 +5,6 @@ import timeit import pytest -import os import sklearn import pathlib import numpy as np From 2794c77a20aea53d79d5f631167321fa5cbf6369 Mon Sep 17 00:00:00 2001 From: mbatoul Date: Mon, 2 Aug 2021 16:14:37 +0200 Subject: [PATCH 26/39] Fix `test_files_generated_by_templates_are_git_ignored`. --- sklearn/utils/tests/test_cython_templating.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/tests/test_cython_templating.py b/sklearn/utils/tests/test_cython_templating.py index 4cc5e6dae1fee..47d054562b4cd 100644 --- a/sklearn/utils/tests/test_cython_templating.py +++ b/sklearn/utils/tests/test_cython_templating.py @@ -13,6 +13,6 @@ def test_files_generated_by_templates_are_git_ignored(): ignored_files = list(map(lambda line: line.strip("\n"), ignored_files)) for filename in base_dir.glob("**/*.tp"): - filename = str(filename).split("scikit-learn/")[-1] - filename_wo_tempita_suffix = filename.strip(".tp") - assert filename_wo_tempita_suffix in ignored_files + filename = filename.relative_to(base_dir.parent) + filename_wo_tempita_suffix = filename.with_suffix("") + assert str(filename_wo_tempita_suffix) in ignored_files From 9452515501c1fb80e640ad9e609aad7544986a87 Mon Sep 17 00:00:00 2001 From: mbatoul Date: Mon, 2 Aug 2021 16:42:19 +0200 Subject: [PATCH 27/39] Revert "Add test for source files generation from templates." This reverts commit 9456ba37980141c029a997468ab552c25a609f21. 
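The reverted test wrote throw-away `dummy.pyx.tp` files into the source tree
and called `gen_from_templates` at test time, which pollutes the repository
and pulls Cython into the test run (the helper lazily imports it precisely
because Cython is not a runtime dependency). Template generation stays
covered by the build itself, e.g.:

    python setup.py build_ext --inplace  # runs gen_from_templates via setup.py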
--- sklearn/utils/tests/test_utils.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index edb1deb6d2031..76d12abf10f6d 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -5,8 +5,6 @@ import timeit import pytest -import sklearn -import pathlib import numpy as np import scipy.sparse as sp @@ -33,7 +31,6 @@ from sklearn.utils import _to_object_array from sklearn.utils._mocking import MockDataFrame from sklearn import config_context -from sklearn._build_utils import gen_from_templates # toy array X_toy = np.arange(9).reshape((3, 3)) @@ -688,22 +685,3 @@ def test_to_object_array(sequence): assert isinstance(out, np.ndarray) assert out.dtype.kind == "O" assert out.ndim == 1 - - -@pytest.mark.parametrize("file_extension", ["pyx", "pxd"]) -def test_files_generation_from_templates(file_extension): - base_dir = base_dir = pathlib.Path(sklearn.__file__).parent.parent - template_filename = f"dummy.{file_extension}.tp" - template_file = base_dir / template_filename - - f = open(template_file, "a") - f.close() - - templates = [template_filename] - gen_from_templates(templates) - - generated_file = base_dir / f"dummy.{file_extension}" - assert generated_file.exists() - - template_file.unlink() - generated_file.unlink() From 0d2af891080d1e23c7b0a54683dce3dcefd36bed Mon Sep 17 00:00:00 2001 From: mbatoul Date: Mon, 2 Aug 2021 16:43:56 +0200 Subject: [PATCH 28/39] Remove top file author. --- sklearn/utils/tests/test_weight_vector.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sklearn/utils/tests/test_weight_vector.py b/sklearn/utils/tests/test_weight_vector.py index 93ca628b8d520..18308f086d003 100644 --- a/sklearn/utils/tests/test_weight_vector.py +++ b/sklearn/utils/tests/test_weight_vector.py @@ -1,7 +1,3 @@ -# Author: Mathis Batoul -# -# License: BSD 3 clause - import numpy as np import pytest from sklearn.utils._weight_vector import ( From 68282bef81181c59578b0c025f31461c1a18ee94 Mon Sep 17 00:00:00 2001 From: mbatoul Date: Mon, 2 Aug 2021 17:23:53 +0200 Subject: [PATCH 29/39] Use typed-memoryview for `WeightVector` interface. --- sklearn/utils/_weight_vector.pxd.tp | 4 ++-- sklearn/utils/_weight_vector.pyx.tp | 22 ++++++++++------------ sklearn/utils/tests/test_weight_vector.py | 4 ++-- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/sklearn/utils/_weight_vector.pxd.tp b/sklearn/utils/_weight_vector.pxd.tp index d9a4bebf74fa5..c947872bee026 100644 --- a/sklearn/utils/_weight_vector.pxd.tp +++ b/sklearn/utils/_weight_vector.pxd.tp @@ -27,8 +27,8 @@ cdef extern from "math.h": cdef class WeightVector{{name}}(object): - cdef readonly np.ndarray w - cdef readonly np.ndarray aw + cdef readonly {{c_type}}[::1] w + cdef readonly {{c_type}}[::1] aw cdef {{c_type}} *w_data_ptr cdef {{c_type}} *aw_data_ptr cdef {{c_type}} wscale diff --git a/sklearn/utils/_weight_vector.pyx.tp b/sklearn/utils/_weight_vector.pyx.tp index b9d80799081e1..a57ec6e9cd145 100644 --- a/sklearn/utils/_weight_vector.pyx.tp +++ b/sklearn/utils/_weight_vector.pyx.tp @@ -66,22 +66,21 @@ cdef class WeightVector{{name}}(object): """ def __cinit__(self, - np.ndarray[{{c_type}}, ndim=1, mode='c'] w, - np.ndarray[{{c_type}}, ndim=1, mode='c'] aw): - cdef {{c_type}} *wdata = <{{c_type}} *>w.data + {{c_type}}[::1] w, + {{c_type}}[::1] aw): if w.shape[0] > INT_MAX: raise ValueError("More than %d features not supported; got %d." 
                             % (INT_MAX, w.shape[0]))
         self.w = w
-        self.w_data_ptr = wdata
+        self.w_data_ptr = &w[0]
         self.wscale = 1.0
         self.n_features = w.shape[0]
-        self.sq_norm = _dot(w.shape[0], wdata, 1, wdata, 1)
+        self.sq_norm = _dot(self.n_features, self.w_data_ptr, 1, self.w_data_ptr, 1)

         self.aw = aw
         if self.aw is not None:
-            self.aw_data_ptr = <{{c_type}} *>aw.data
+            self.aw_data_ptr = &aw[0]
             self.average_a = 0.0
             self.average_b = 1.0

@@ -200,15 +199,14 @@ cdef class WeightVector{{name}}(object):
     cdef void reset_wscale(self) nogil:
         """Scales each coef of ``w`` by ``wscale`` and resets it to 1.
         """
-        if self.aw is not None:
-            _axpy(self.aw.shape[0], self.average_a,
-                  <{{c_type}} *>self.w.data, 1, <{{c_type}} *>self.aw.data, 1)
-            _scal(self.aw.shape[0], 1.0 / self.average_b,
-                  <{{c_type}} *>self.aw.data, 1)
+        if self.aw_data_ptr != NULL:
+            _axpy(self.n_features, self.average_a,
+                  self.w_data_ptr, 1, self.aw_data_ptr, 1)
+            _scal(self.n_features, 1.0 / self.average_b, self.aw_data_ptr, 1)
             self.average_a = 0.0
             self.average_b = 1.0

-        _scal(self.w.shape[0], self.wscale, <{{c_type}} *>self.w.data, 1)
+        _scal(self.n_features, self.wscale, self.w_data_ptr, 1)
         self.wscale = 1.0

     cdef {{c_type}} norm(self) nogil:
diff --git a/sklearn/utils/tests/test_weight_vector.py b/sklearn/utils/tests/test_weight_vector.py
index 18308f086d003..fa83648ea3b72 100644
--- a/sklearn/utils/tests/test_weight_vector.py
+++ b/sklearn/utils/tests/test_weight_vector.py
@@ -17,5 +17,5 @@ def test_type_invariance(dtype, weight_vector_class):
     weights = np.random.rand(100).astype(dtype)
     average_weights = np.random.rand(100).astype(dtype)
     weight_vector = weight_vector_class(weights, average_weights)
-    assert weight_vector.w.dtype is np.dtype(dtype)
-    assert weight_vector.aw.dtype is np.dtype(dtype)
+    assert np.asarray(weight_vector.w).dtype is np.dtype(dtype)
+    assert np.asarray(weight_vector.aw).dtype is np.dtype(dtype)

From 1043fa091db0bf169218af8c43f8c4d913cee015 Mon Sep 17 00:00:00 2001
From: Mathis Batoul
Date: Wed, 4 Aug 2021 11:00:16 +0200
Subject: [PATCH 30/39] Pass class name argument to parametrized
 `test_weight_vector`.

Co-authored-by: Julien Jerphanion
---
 sklearn/utils/tests/test_weight_vector.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/sklearn/utils/tests/test_weight_vector.py b/sklearn/utils/tests/test_weight_vector.py
index fa83648ea3b72..3177c8bcaf2ee 100644
--- a/sklearn/utils/tests/test_weight_vector.py
+++ b/sklearn/utils/tests/test_weight_vector.py
@@ -7,15 +7,17 @@

 @pytest.mark.parametrize(
-    "dtype, weight_vector_class",
+    "dtype, WeightVector",
     [
         (np.float32, WeightVector32),
         (np.float64, WeightVector64),
     ],
 )
-def test_type_invariance(dtype, weight_vector_class):
+def test_type_invariance(dtype, WeightVector):
     weights = np.random.rand(100).astype(dtype)
     average_weights = np.random.rand(100).astype(dtype)
-    weight_vector = weight_vector_class(weights, average_weights)
+
+    weight_vector = WeightVector(weights, average_weights)
+
     assert np.asarray(weight_vector.w).dtype is np.dtype(dtype)
     assert np.asarray(weight_vector.aw).dtype is np.dtype(dtype)

From 1ad9ae3ef0324af0a5919f6fd47dcf625a3b5cca Mon Sep 17 00:00:00 2001
From: Mathis Batoul
Date: Wed, 4 Aug 2021 11:01:04 +0200
Subject: [PATCH 31/39] Add comment to `test_cython_templating`.
Co-authored-by: Julien Jerphanion --- sklearn/utils/tests/test_cython_templating.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/utils/tests/test_cython_templating.py b/sklearn/utils/tests/test_cython_templating.py index 47d054562b4cd..f39b3ab161fcc 100644 --- a/sklearn/utils/tests/test_cython_templating.py +++ b/sklearn/utils/tests/test_cython_templating.py @@ -14,5 +14,6 @@ def test_files_generated_by_templates_are_git_ignored(): for filename in base_dir.glob("**/*.tp"): filename = filename.relative_to(base_dir.parent) + # From "path/to/template.p??.tp" to "path/to/template.p??" filename_wo_tempita_suffix = filename.with_suffix("") assert str(filename_wo_tempita_suffix) in ignored_files From a8d7cf96ef6da4c5c7946dc74c502d90b4fabf2f Mon Sep 17 00:00:00 2001 From: mbatoul Date: Wed, 4 Aug 2021 12:12:58 +0200 Subject: [PATCH 32/39] Add type to epsilon for comparison in `WeightVector#scale`. --- sklearn/utils/_weight_vector.pyx.tp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/_weight_vector.pyx.tp b/sklearn/utils/_weight_vector.pyx.tp index a57ec6e9cd145..4c28d19dad017 100644 --- a/sklearn/utils/_weight_vector.pyx.tp +++ b/sklearn/utils/_weight_vector.pyx.tp @@ -194,7 +194,8 @@ cdef class WeightVector{{name}}(object): small we call ``reset_swcale``.""" self.wscale *= c self.sq_norm *= (c * c) - if self.wscale < 1e-9: + cdef {{c_type}} epsilon = 1e-9 + if self.wscale < epsilon: self.reset_wscale() cdef void reset_wscale(self) nogil: From ac353026032a07ca05f188e87890a259e7789971 Mon Sep 17 00:00:00 2001 From: mbatoul Date: Wed, 4 Aug 2021 12:18:24 +0200 Subject: [PATCH 33/39] Remove whitespaces. --- sklearn/utils/tests/test_cython_templating.py | 2 +- sklearn/utils/tests/test_weight_vector.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/tests/test_cython_templating.py b/sklearn/utils/tests/test_cython_templating.py index f39b3ab161fcc..06ad89135fb87 100644 --- a/sklearn/utils/tests/test_cython_templating.py +++ b/sklearn/utils/tests/test_cython_templating.py @@ -14,6 +14,6 @@ def test_files_generated_by_templates_are_git_ignored(): for filename in base_dir.glob("**/*.tp"): filename = filename.relative_to(base_dir.parent) - # From "path/to/template.p??.tp" to "path/to/template.p??" + # From "path/to/template.p??.tp" to "path/to/template.p??" filename_wo_tempita_suffix = filename.with_suffix("") assert str(filename_wo_tempita_suffix) in ignored_files diff --git a/sklearn/utils/tests/test_weight_vector.py b/sklearn/utils/tests/test_weight_vector.py index 3177c8bcaf2ee..350520e02fab8 100644 --- a/sklearn/utils/tests/test_weight_vector.py +++ b/sklearn/utils/tests/test_weight_vector.py @@ -16,8 +16,8 @@ def test_type_invariance(dtype, WeightVector): weights = np.random.rand(100).astype(dtype) average_weights = np.random.rand(100).astype(dtype) - + weight_vector = WeightVector(weights, average_weights) - + assert np.asarray(weight_vector.w).dtype is np.dtype(dtype) assert np.asarray(weight_vector.aw).dtype is np.dtype(dtype) From 0f4b057999ecbb67b35aeee88679d64c93d15b36 Mon Sep 17 00:00:00 2001 From: mbatoul Date: Thu, 5 Aug 2021 15:27:25 +0200 Subject: [PATCH 34/39] Rename Tempita variable `name` as `name_suffix` in `.tp` files. 
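`name` read like a full class name when it is really just the "64"/"32"
suffix appended to identifiers; `name_suffix` says what it is. The Tempita
expansion pattern itself is unchanged, e.g.:

    {{for name_suffix, c_type in dtypes}}
    cdef class WeightVector{{name_suffix}}(object):
        ...
    {{endfor}}

    # expands to WeightVector64 (double) and WeightVector32 (float)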
--- sklearn/linear_model/_sag_fast.pyx.tp | 74 +++++++++++++-------------- sklearn/utils/_seq_dataset.pxd.tp | 10 ++-- sklearn/utils/_seq_dataset.pyx.tp | 10 ++-- sklearn/utils/_weight_vector.pxd.tp | 6 +-- sklearn/utils/_weight_vector.pyx.tp | 6 +-- 5 files changed, 53 insertions(+), 53 deletions(-) diff --git a/sklearn/linear_model/_sag_fast.pyx.tp b/sklearn/linear_model/_sag_fast.pyx.tp index d9989f60aa23a..1a4f188a91fbc 100644 --- a/sklearn/linear_model/_sag_fast.pyx.tp +++ b/sklearn/linear_model/_sag_fast.pyx.tp @@ -19,7 +19,7 @@ Authors: Danny Sullivan License: BSD 3 clause """ -# name, c_type +# name_suffix, c_type dtypes = [('64', 'double', 'np.float64'), ('32', 'float', 'np.float32')] @@ -57,17 +57,17 @@ from libc.stdio cimport printf np.import_array() -{{for name, c_type, np_type in dtypes}} +{{for name_suffix, c_type, np_type in dtypes}} cdef extern from "_sgd_fast_helpers.h": - bint skl_isfinite{{name}}({{c_type}}) nogil + bint skl_isfinite{{name_suffix}}({{c_type}}) nogil {{endfor}} -{{for name, c_type, np_type in dtypes}} +{{for name_suffix, c_type, np_type in dtypes}} -cdef inline {{c_type}} fmax{{name}}({{c_type}} x, {{c_type}} y) nogil: +cdef inline {{c_type}} fmax{{name_suffix}}({{c_type}} x, {{c_type}} y) nogil: if x > y: return x return y @@ -75,9 +75,9 @@ cdef inline {{c_type}} fmax{{name}}({{c_type}} x, {{c_type}} y) nogil: {{endfor}} -{{for name, c_type, np_type in dtypes}} +{{for name_suffix, c_type, np_type in dtypes}} -cdef {{c_type}} _logsumexp{{name}}({{c_type}}* arr, int n_classes) nogil: +cdef {{c_type}} _logsumexp{{name_suffix}}({{c_type}}* arr, int n_classes) nogil: """Computes the sum of arr assuming arr is in the log domain. Returns log(sum(exp(arr))) while minimizing the possibility of @@ -101,9 +101,9 @@ cdef {{c_type}} _logsumexp{{name}}({{c_type}}* arr, int n_classes) nogil: {{endfor}} -{{for name, c_type, np_type in dtypes}} +{{for name_suffix, c_type, np_type in dtypes}} -cdef class MultinomialLogLoss{{name}}: +cdef class MultinomialLogLoss{{name_suffix}}: cdef {{c_type}} _loss(self, {{c_type}}* prediction, {{c_type}} y, int n_classes, {{c_type}} sample_weight) nogil: r"""Multinomial Logistic regression loss. @@ -141,7 +141,7 @@ cdef class MultinomialLogLoss{{name}}: Bishop, C. M. (2006). Pattern recognition and machine learning. Springer. (Chapter 4.3.4) """ - cdef {{c_type}} logsumexp_prediction = _logsumexp{{name}}(prediction, n_classes) + cdef {{c_type}} logsumexp_prediction = _logsumexp{{name_suffix}}(prediction, n_classes) cdef {{c_type}} loss # y is the indice of the correct class of current sample. @@ -187,7 +187,7 @@ cdef class MultinomialLogLoss{{name}}: Bishop, C. M. (2006). Pattern recognition and machine learning. Springer. 
(Chapter 4.3.4) """ - cdef {{c_type}} logsumexp_prediction = _logsumexp{{name}}(prediction, n_classes) + cdef {{c_type}} logsumexp_prediction = _logsumexp{{name_suffix}}(prediction, n_classes) cdef int class_ind for class_ind in range(n_classes): @@ -201,21 +201,21 @@ cdef class MultinomialLogLoss{{name}}: gradient_ptr[class_ind] *= sample_weight def __reduce__(self): - return MultinomialLogLoss{{name}}, () + return MultinomialLogLoss{{name_suffix}}, () {{endfor}} -{{for name, c_type, np_type in dtypes}} +{{for name_suffix, c_type, np_type in dtypes}} -cdef inline {{c_type}} _soft_thresholding{{name}}({{c_type}} x, {{c_type}} shrinkage) nogil: - return fmax{{name}}(x - shrinkage, 0) - fmax{{name}}(- x - shrinkage, 0) +cdef inline {{c_type}} _soft_thresholding{{name_suffix}}({{c_type}} x, {{c_type}} shrinkage) nogil: + return fmax{{name_suffix}}(x - shrinkage, 0) - fmax{{name_suffix}}(- x - shrinkage, 0) {{endfor}} -{{for name, c_type, np_type in dtypes}} +{{for name_suffix, c_type, np_type in dtypes}} -def sag{{name}}(SequentialDataset{{name}} dataset, +def sag{{name_suffix}}(SequentialDataset{{name_suffix}} dataset, np.ndarray[{{c_type}}, ndim=2, mode='c'] weights_array, np.ndarray[{{c_type}}, ndim=1, mode='c'] intercept_array, int n_samples, @@ -356,11 +356,11 @@ def sag{{name}}(SequentialDataset{{name}} dataset, # Wether the loss function is multinomial cdef bint multinomial = False # Multinomial loss function - cdef MultinomialLogLoss{{name}} multiloss + cdef MultinomialLogLoss{{name_suffix}} multiloss if loss_function == "multinomial": multinomial = True - multiloss = MultinomialLogLoss{{name}}() + multiloss = MultinomialLogLoss{{name_suffix}}() elif loss_function == "log": loss = Log() elif loss_function == "squared": @@ -395,7 +395,7 @@ def sag{{name}}(SequentialDataset{{name}} dataset, # make the weight updates if sample_itr > 0: - status = lagged_update{{name}}(weights, wscale, xnnz, + status = lagged_update{{name_suffix}}(weights, wscale, xnnz, n_samples, n_classes, sample_itr, cumulative_sums, @@ -410,7 +410,7 @@ def sag{{name}}(SequentialDataset{{name}} dataset, break # find the current prediction - predict_sample{{name}}(x_data_ptr, x_ind_ptr, xnnz, weights, wscale, + predict_sample{{name_suffix}}(x_data_ptr, x_ind_ptr, xnnz, weights, wscale, intercept, prediction, n_classes) # compute the gradient for this sample, given the prediction @@ -455,7 +455,7 @@ def sag{{name}}(SequentialDataset{{name}} dataset, num_seen * intercept_decay) # check to see that the intercept is not inf or NaN - if not skl_isfinite{{name}}(intercept[class_ind]): + if not skl_isfinite{{name_suffix}}(intercept[class_ind]): status = -1 break # Break from the n_samples outer loop if an error happened @@ -484,7 +484,7 @@ def sag{{name}}(SequentialDataset{{name}} dataset, if verbose: with gil: print("rescaling...") - status = scale_weights{{name}}( + status = scale_weights{{name_suffix}}( weights, &wscale, n_features, n_samples, n_classes, sample_itr, cumulative_sums, cumulative_sums_prox, @@ -500,7 +500,7 @@ def sag{{name}}(SequentialDataset{{name}} dataset, # we scale the weights every n_samples iterations and reset the # just-in-time update system for numerical stability. 
- status = scale_weights{{name}}(weights, &wscale, n_features, + status = scale_weights{{name_suffix}}(weights, &wscale, n_features, n_samples, n_classes, n_samples - 1, cumulative_sums, @@ -514,8 +514,8 @@ def sag{{name}}(SequentialDataset{{name}} dataset, max_change = 0.0 max_weight = 0.0 for idx in range(n_features * n_classes): - max_weight = fmax{{name}}(max_weight, fabs(weights[idx])) - max_change = fmax{{name}}(max_change, + max_weight = fmax{{name_suffix}}(max_weight, fabs(weights[idx])) + max_change = fmax{{name_suffix}}(max_change, fabs(weights[idx] - previous_weights[idx])) previous_weights[idx] = weights[idx] @@ -549,9 +549,9 @@ def sag{{name}}(SequentialDataset{{name}} dataset, {{endfor}} -{{for name, c_type, np_type in dtypes}} +{{for name_suffix, c_type, np_type in dtypes}} -cdef int scale_weights{{name}}({{c_type}}* weights, {{c_type}}* wscale, +cdef int scale_weights{{name_suffix}}({{c_type}}* weights, {{c_type}}* wscale, int n_features, int n_samples, int n_classes, int sample_itr, {{c_type}}* cumulative_sums, @@ -570,7 +570,7 @@ cdef int scale_weights{{name}}({{c_type}}* weights, {{c_type}}* wscale, """ cdef int status - status = lagged_update{{name}}(weights, wscale[0], n_features, + status = lagged_update{{name_suffix}}(weights, wscale[0], n_features, n_samples, n_classes, sample_itr + 1, cumulative_sums, cumulative_sums_prox, @@ -588,9 +588,9 @@ cdef int scale_weights{{name}}({{c_type}}* weights, {{c_type}}* wscale, {{endfor}} -{{for name, c_type, np_type in dtypes}} +{{for name_suffix, c_type, np_type in dtypes}} -cdef int lagged_update{{name}}({{c_type}}* weights, {{c_type}} wscale, int xnnz, +cdef int lagged_update{{name_suffix}}({{c_type}}* weights, {{c_type}} wscale, int xnnz, int n_samples, int n_classes, int sample_itr, {{c_type}}* cumulative_sums, {{c_type}}* cumulative_sums_prox, @@ -626,7 +626,7 @@ cdef int lagged_update{{name}}({{c_type}}* weights, {{c_type}} wscale, int xnnz, weights[idx] -= cum_sum * sum_gradient[idx] if reset: weights[idx] *= wscale - if not skl_isfinite{{name}}(weights[idx]): + if not skl_isfinite{{name_suffix}}(weights[idx]): # returning here does not require the gil as the return # type is a C integer return -1 @@ -639,7 +639,7 @@ cdef int lagged_update{{name}}({{c_type}}* weights, {{c_type}} wscale, int xnnz, # efficient than unrolling all the lagged updates. # Idea taken from scikit-learn-contrib/lightning. 
weights[idx] -= cum_sum * sum_gradient[idx] - weights[idx] = _soft_thresholding{{name}}(weights[idx], + weights[idx] = _soft_thresholding{{name_suffix}}(weights[idx], cum_sum_prox) else: last_update_ind = feature_hist[feature_ind] @@ -656,13 +656,13 @@ cdef int lagged_update{{name}}({{c_type}}* weights, {{c_type}} wscale, int xnnz, grad_step = cumulative_sums[lagged_ind] prox_step = cumulative_sums_prox[lagged_ind] weights[idx] -= sum_gradient[idx] * grad_step - weights[idx] = _soft_thresholding{{name}}(weights[idx], + weights[idx] = _soft_thresholding{{name_suffix}}(weights[idx], prox_step) if reset: weights[idx] *= wscale # check to see that the weight is not inf or NaN - if not skl_isfinite{{name}}(weights[idx]): + if not skl_isfinite{{name_suffix}}(weights[idx]): return -1 if reset: feature_hist[feature_ind] = sample_itr % n_samples @@ -679,9 +679,9 @@ cdef int lagged_update{{name}}({{c_type}}* weights, {{c_type}} wscale, int xnnz, {{endfor}} -{{for name, c_type, np_type in dtypes}} +{{for name_suffix, c_type, np_type in dtypes}} -cdef void predict_sample{{name}}({{c_type}}* x_data_ptr, int* x_ind_ptr, int xnnz, +cdef void predict_sample{{name_suffix}}({{c_type}}* x_data_ptr, int* x_ind_ptr, int xnnz, {{c_type}}* w_data_ptr, {{c_type}} wscale, {{c_type}}* intercept, {{c_type}}* prediction, int n_classes) nogil: diff --git a/sklearn/utils/_seq_dataset.pxd.tp b/sklearn/utils/_seq_dataset.pxd.tp index 9e4237fc0f51a..428f44a2c0358 100644 --- a/sklearn/utils/_seq_dataset.pxd.tp +++ b/sklearn/utils/_seq_dataset.pxd.tp @@ -12,12 +12,12 @@ Each class is duplicated for all dtypes (float and double). The keywords between double braces are substituted in setup.py. """ -# name, c_type +# name_suffix, c_type dtypes = [('64', 'double'), ('32', 'float')] }} -{{for name, c_type in dtypes}} +{{for name_suffix, c_type in dtypes}} #------------------------------------------------------------------------------ @@ -32,7 +32,7 @@ cimport numpy as np # iterators over the rows of a matrix X and corresponding target values y. 
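
# Quick, self-contained check (not part of the patch) that the formula
# used by _soft_thresholding{{name_suffix}} above,
# fmax(x - s, 0) - fmax(-x - s, 0), is the usual L1 proximal operator
# sign(x) * max(|x| - s, 0):
import numpy as np

def soft_thresholding(x, shrinkage):
    return np.maximum(x - shrinkage, 0) - np.maximum(-x - shrinkage, 0)

x = np.linspace(-2.0, 2.0, 9)
expected = np.sign(x) * np.maximum(np.abs(x) - 0.5, 0)
assert np.allclose(soft_thresholding(x, 0.5), expected)
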
-cdef class SequentialDataset{{name}}: +cdef class SequentialDataset{{name_suffix}}: cdef int current_index cdef np.ndarray index cdef int *index_data_ptr @@ -52,7 +52,7 @@ cdef class SequentialDataset{{name}}: int *nnz, {{c_type}} *y, {{c_type}} *sample_weight) nogil -cdef class ArrayDataset{{name}}(SequentialDataset{{name}}): +cdef class ArrayDataset{{name_suffix}}(SequentialDataset{{name_suffix}}): cdef np.ndarray X cdef np.ndarray Y cdef np.ndarray sample_weights @@ -65,7 +65,7 @@ cdef class ArrayDataset{{name}}(SequentialDataset{{name}}): cdef {{c_type}} *sample_weight_data -cdef class CSRDataset{{name}}(SequentialDataset{{name}}): +cdef class CSRDataset{{name_suffix}}(SequentialDataset{{name_suffix}}): cdef np.ndarray X_data cdef np.ndarray X_indptr cdef np.ndarray X_indices diff --git a/sklearn/utils/_seq_dataset.pyx.tp b/sklearn/utils/_seq_dataset.pyx.tp index a5173fb60c0da..8bc901194a24e 100644 --- a/sklearn/utils/_seq_dataset.pyx.tp +++ b/sklearn/utils/_seq_dataset.pyx.tp @@ -20,12 +20,12 @@ Author: Peter Prettenhofer License: BSD 3 clause """ -# name, c_type, np_type +# name_suffix, c_type, np_type dtypes = [('64', 'double', 'np.float64'), ('32', 'float', 'np.float32')] }} -{{for name, c_type, np_type in dtypes}} +{{for name_suffix, c_type, np_type in dtypes}} #------------------------------------------------------------------------------ @@ -43,7 +43,7 @@ np.import_array() from ._random cimport our_rand_r -cdef class SequentialDataset{{name}}: +cdef class SequentialDataset{{name_suffix}}: """Base class for datasets with sequential data access. SequentialDataset is used to iterate over the rows of a matrix X and @@ -215,7 +215,7 @@ cdef class SequentialDataset{{name}}: return (x_data, x_indices, x_indptr), y, sample_weight, sample_idx -cdef class ArrayDataset{{name}}(SequentialDataset{{name}}): +cdef class ArrayDataset{{name_suffix}}(SequentialDataset{{name_suffix}}): """Dataset backed by a two-dimensional numpy array. The dtype of the numpy array is expected to be ``{{np_type}}`` ({{c_type}}) @@ -284,7 +284,7 @@ cdef class ArrayDataset{{name}}(SequentialDataset{{name}}): sample_weight[0] = self.sample_weight_data[sample_idx] -cdef class CSRDataset{{name}}(SequentialDataset{{name}}): +cdef class CSRDataset{{name_suffix}}(SequentialDataset{{name_suffix}}): """A ``SequentialDataset`` backed by a scipy sparse CSR matrix. """ def __cinit__(self, np.ndarray[{{c_type}}, ndim=1, mode='c'] X_data, diff --git a/sklearn/utils/_weight_vector.pxd.tp b/sklearn/utils/_weight_vector.pxd.tp index c947872bee026..6fd3af5c5482e 100644 --- a/sklearn/utils/_weight_vector.pxd.tp +++ b/sklearn/utils/_weight_vector.pxd.tp @@ -12,7 +12,7 @@ Each class is duplicated for all dtypes (float and double). The keywords between double braces are substituted in setup.py. 
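
# The docstring above says the double-brace keywords are substituted in
# setup.py.  A toy, runnable demo of that expansion (not part of the
# patch), using the Tempita bundled with Cython, which the build already
# requires; the template text here is illustrative, not one of the real
# .tp files:
from Cython import Tempita

toy_template = Tempita.Template("""
{{for name_suffix, c_type in dtypes}}
cdef class SequentialDataset{{name_suffix}}:
    cdef {{c_type}} sample_weight
{{endfor}}
""")
print(toy_template.substitute(dtypes=[('64', 'double'), ('32', 'float')]))
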
""" -# name, c_type +# name_suffix, c_type dtypes = [('64', 'double'), ('32', 'float')] @@ -20,13 +20,13 @@ dtypes = [('64', 'double'), cimport numpy as np -{{for name, c_type in dtypes}} +{{for name_suffix, c_type in dtypes}} cdef extern from "math.h": cdef extern {{c_type}} sqrt({{c_type}} x) -cdef class WeightVector{{name}}(object): +cdef class WeightVector{{name_suffix}}(object): cdef readonly {{c_type}}[::1] w cdef readonly {{c_type}}[::1] aw cdef {{c_type}} *w_data_ptr diff --git a/sklearn/utils/_weight_vector.pyx.tp b/sklearn/utils/_weight_vector.pyx.tp index 4c28d19dad017..aa3db0b44910b 100644 --- a/sklearn/utils/_weight_vector.pyx.tp +++ b/sklearn/utils/_weight_vector.pyx.tp @@ -12,7 +12,7 @@ Each class is duplicated for all dtypes (float and double). The keywords between double braces are substituted in setup.py. """ -# name, c_type +# name_suffix, c_type dtypes = [('64', 'double'), ('32', 'float')] @@ -39,9 +39,9 @@ from ._cython_blas cimport _dot, _scal, _axpy np.import_array() -{{for name, c_type in dtypes}} +{{for name_suffix, c_type in dtypes}} -cdef class WeightVector{{name}}(object): +cdef class WeightVector{{name_suffix}}(object): """Dense vector represented by a scalar and a numpy array. The class provides methods to ``add`` a sparse vector From 393f835030cbb0048f7ee6e0dac796b14c3a6780 Mon Sep 17 00:00:00 2001 From: mbatoul Date: Thu, 5 Aug 2021 15:46:11 +0200 Subject: [PATCH 35/39] Set `reset_wscale_threshold` at `1e-6` for float32 and `1e-09` for float64. --- sklearn/utils/_weight_vector.pyx.tp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/sklearn/utils/_weight_vector.pyx.tp b/sklearn/utils/_weight_vector.pyx.tp index aa3db0b44910b..b942cc45e1c99 100644 --- a/sklearn/utils/_weight_vector.pyx.tp +++ b/sklearn/utils/_weight_vector.pyx.tp @@ -12,9 +12,9 @@ Each class is duplicated for all dtypes (float and double). The keywords between double braces are substituted in setup.py. """ -# name_suffix, c_type -dtypes = [('64', 'double'), - ('32', 'float')] +# name_suffix, c_type, reset_wscale_threshold +dtypes = [('64', 'double', 1e-9), + ('32', 'float', 1e-6)] }} @@ -39,7 +39,7 @@ from ._cython_blas cimport _dot, _scal, _axpy np.import_array() -{{for name_suffix, c_type in dtypes}} +{{for name_suffix, c_type, reset_wscale_threshold in dtypes}} cdef class WeightVector{{name_suffix}}(object): """Dense vector represented by a scalar and a numpy array. @@ -194,8 +194,10 @@ cdef class WeightVector{{name_suffix}}(object): small we call ``reset_swcale``.""" self.wscale *= c self.sq_norm *= (c * c) - cdef {{c_type}} epsilon = 1e-9 - if self.wscale < epsilon: + + cdef {{c_type}} threshold = {{reset_wscale_threshold}} + + if self.wscale < threshold: self.reset_wscale() cdef void reset_wscale(self) nogil: From 3c56e7ca699769f191680511a42b171598ac940c Mon Sep 17 00:00:00 2001 From: mbatoul Date: Fri, 6 Aug 2021 12:19:21 +0200 Subject: [PATCH 36/39] Update `setup.cfg` file. 
--- setup.cfg | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 8ee90da7436c0..34d462f506f7f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -65,6 +65,8 @@ allow_redefinition = True [check-manifest] # ignore files missing in VCS ignore = - sklearn/linear_model/_sag_fast.pyx - sklearn/utils/_seq_dataset.pxd sklearn/utils/_seq_dataset.pyx + sklearn/utils/_seq_dataset.pxd + sklearn/utils/_weight_vector.pyx + sklearn/utils/_weight_vector.pxd + sklearn/linear_model/_sag_fast.pyx From d52c6767fdb3b2626c829ec3666b47587222c77c Mon Sep 17 00:00:00 2001 From: Mathis Batoul Date: Fri, 6 Aug 2021 14:47:25 +0200 Subject: [PATCH 37/39] Keep alphanumerical order of files in `setup.cfg`. Co-authored-by: Julien Jerphanion --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 34d462f506f7f..7d1ca5c07cc33 100644 --- a/setup.cfg +++ b/setup.cfg @@ -65,8 +65,8 @@ allow_redefinition = True [check-manifest] # ignore files missing in VCS ignore = + sklearn/linear_model/_sag_fast.pyx sklearn/utils/_seq_dataset.pyx sklearn/utils/_seq_dataset.pxd sklearn/utils/_weight_vector.pyx sklearn/utils/_weight_vector.pxd - sklearn/linear_model/_sag_fast.pyx From 033f3f4b5ebda3fd117604c5a2ae283f3912cf90 Mon Sep 17 00:00:00 2001 From: Mathis Batoul Date: Mon, 9 Aug 2021 21:08:39 +0200 Subject: [PATCH 38/39] Stop storing `reset_wscale_threshold` in a variable. Co-authored-by: Olivier Grisel --- sklearn/utils/_weight_vector.pyx.tp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/utils/_weight_vector.pyx.tp b/sklearn/utils/_weight_vector.pyx.tp index b942cc45e1c99..a5bda689ae247 100644 --- a/sklearn/utils/_weight_vector.pyx.tp +++ b/sklearn/utils/_weight_vector.pyx.tp @@ -195,9 +195,7 @@ cdef class WeightVector{{name_suffix}}(object): self.wscale *= c self.sq_norm *= (c * c) - cdef {{c_type}} threshold = {{reset_wscale_threshold}} - - if self.wscale < threshold: + if self.wscale < {{reset_wscale_threshold}}: self.reset_wscale() cdef void reset_wscale(self) nogil: From 6abf315074d0b030d87c88d7793d3b39aecab2b8 Mon Sep 17 00:00:00 2001 From: Mathis Batoul Date: Mon, 9 Aug 2021 21:11:47 +0200 Subject: [PATCH 39/39] Improve docstrings. Co-authored-by: Guillaume Lemaitre --- sklearn/linear_model/_sag_fast.pyx.tp | 2 +- sklearn/utils/_weight_vector.pxd.tp | 6 +++--- sklearn/utils/_weight_vector.pyx.tp | 2 +- sklearn/utils/tests/test_cython_templating.py | 1 + sklearn/utils/tests/test_weight_vector.py | 1 + 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/_sag_fast.pyx.tp b/sklearn/linear_model/_sag_fast.pyx.tp index 1a4f188a91fbc..6ca141fe99305 100644 --- a/sklearn/linear_model/_sag_fast.pyx.tp +++ b/sklearn/linear_model/_sag_fast.pyx.tp @@ -19,7 +19,7 @@ Authors: Danny Sullivan License: BSD 3 clause """ -# name_suffix, c_type +# name_suffix, c_type, np_type dtypes = [('64', 'double', 'np.float64'), ('32', 'float', 'np.float32')] diff --git a/sklearn/utils/_weight_vector.pxd.tp b/sklearn/utils/_weight_vector.pxd.tp index 6fd3af5c5482e..8b0fc234713fb 100644 --- a/sklearn/utils/_weight_vector.pxd.tp +++ b/sklearn/utils/_weight_vector.pxd.tp @@ -1,7 +1,7 @@ {{py: """ -Efficient (dense) parameter vector implementation for linear models. +Efficient (dense) parameter vector implementation for linear models. Template file for easily generate fused types consistent code using Tempita (https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py). 
+Efficient (dense) parameter vector implementation for linear models.
 
 Template file for easily generate fused types consistent code using Tempita
 (https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py).
@@ -37,9 +37,9 @@ cdef class WeightVector{{name_suffix}}(object): cdef int n_features cdef {{c_type}} sq_norm - cdef void add(self, {{c_type}} *x_data_ptr, int *x_ind_ptr, + cdef void add(self, {{c_type}} *x_data_ptr, int *x_ind_ptr, int xnnz, {{c_type}} c) nogil - cdef void add_average(self, {{c_type}} *x_data_ptr, int *x_ind_ptr, + cdef void add_average(self, {{c_type}} *x_data_ptr, int *x_ind_ptr, int xnnz, {{c_type}} c, {{c_type}} num_iter) nogil cdef {{c_type}} dot(self, {{c_type}} *x_data_ptr, int *x_ind_ptr, int xnnz) nogil diff --git a/sklearn/utils/_weight_vector.pyx.tp b/sklearn/utils/_weight_vector.pyx.tp index a5bda689ae247..0e8ec45121438 100644 --- a/sklearn/utils/_weight_vector.pyx.tp +++ b/sklearn/utils/_weight_vector.pyx.tp @@ -1,7 +1,7 @@ {{py: """ -Efficient (dense) parameter vector implementation for linear models. +Efficient (dense) parameter vector implementation for linear models. Template file for easily generate fused types consistent code using Tempita (https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py). diff --git a/sklearn/utils/tests/test_cython_templating.py b/sklearn/utils/tests/test_cython_templating.py index 06ad89135fb87..572d1db523cf8 100644 --- a/sklearn/utils/tests/test_cython_templating.py +++ b/sklearn/utils/tests/test_cython_templating.py @@ -4,6 +4,7 @@ def test_files_generated_by_templates_are_git_ignored(): + """Check the consistence of the files generated from template files.""" gitignore_file = pathlib.Path(sklearn.__file__).parent.parent / ".gitignore" if not gitignore_file.exists(): pytest.skip("Tests are not run from the source folder") diff --git a/sklearn/utils/tests/test_weight_vector.py b/sklearn/utils/tests/test_weight_vector.py index 350520e02fab8..627d46d1fda06 100644 --- a/sklearn/utils/tests/test_weight_vector.py +++ b/sklearn/utils/tests/test_weight_vector.py @@ -14,6 +14,7 @@ ], ) def test_type_invariance(dtype, WeightVector): + """Check the `dtype` consistency of `WeightVector`.""" weights = np.random.rand(100).astype(dtype) average_weights = np.random.rand(100).astype(dtype)
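
# The series ends inside test_type_invariance; as a self-contained
# illustration of the dtype-consistency property it exercises, the sketch
# below checks that an axpy-style update (the pattern behind
# WeightVector.add) stays in the input dtype.  The helper name is
# hypothetical, not sklearn API:
import numpy as np

def axpy_like_update(weights, x, c):
    # w += c * x, performed entirely in the dtype of `weights`
    weights += np.asarray(x, dtype=weights.dtype) * weights.dtype.type(c)
    return weights

for dtype in (np.float32, np.float64):
    w = np.random.rand(100).astype(dtype)
    x = np.random.rand(100).astype(dtype)
    assert axpy_like_update(w, x, 0.5).dtype == dtype
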