From 1b182c6a2e12e90db70085ebb1d5a199078146b2 Mon Sep 17 00:00:00 2001 From: Angela Ambroz Date: Thu, 10 Oct 2019 16:25:49 -0400 Subject: [PATCH 01/19] Adding jitter to LassoLars fit --- doc/modules/linear_model.rst | 2 +- sklearn/linear_model/least_angle.py | 26 +++++++++++++++++++++++--- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index d663cc0ce3dca..194801f6c5b50 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -480,7 +480,7 @@ function of the norm of its coefficients. >>> reg.fit([[0, 0], [1, 1]], [0, 1]) LassoLars(alpha=0.1) >>> reg.coef_ - array([0.717157..., 0. ]) + array([0.717..., 0. ]) .. topic:: Examples: diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py index d6b9609efb383..296f329825293 100644 --- a/sklearn/linear_model/least_angle.py +++ b/sklearn/linear_model/least_angle.py @@ -809,6 +809,11 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): setting ``fit_path`` to ``False`` will lead to a speedup, especially with a small alpha. + jitter : float, default=0.0001 + Uniform noise parameter, added to the y values, to satisfy \ + the model's assumption of one-at-a-time computations \ + (Efron et al. 2004). + Attributes ---------- alphas_ : array-like of shape (n_alphas + 1,) | list of n_targets such \ @@ -855,7 +860,8 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): def __init__(self, fit_intercept=True, verbose=False, normalize=True, precompute='auto', n_nonzero_coefs=500, - eps=np.finfo(np.float).eps, copy_X=True, fit_path=True): + eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, + jitter=10e-5): self.fit_intercept = fit_intercept self.verbose = verbose self.normalize = normalize @@ -864,6 +870,7 @@ def __init__(self, fit_intercept=True, verbose=False, normalize=True, self.eps = eps self.copy_X = copy_X self.fit_path = fit_path + self.jitter = jitter @staticmethod def _get_gram(precompute, X, y): @@ -963,6 +970,13 @@ def fit(self, X, y, Xy=None): else: max_iter = self.max_iter + noise = np.random.uniform(high=self.jitter, size=len(y)) + + if y.ndim == 2: + noise = noise.reshape(-1,1) + + y = y + noise + self._fit(X, y, max_iter=max_iter, alpha=alpha, fit_path=self.fit_path, Xy=Xy) @@ -1030,6 +1044,11 @@ class LassoLars(Lars): setting ``fit_path`` to ``False`` will lead to a speedup, especially with a small alpha. + jitter : float, default=0.0001 + Uniform noise parameter, added to the y values, to satisfy \ + the model's assumption of one-at-a-time computations \ + (Efron et al. 2004). + positive : bool, default=False Restrict coefficients to be >= 0. Be aware that you might want to remove fit_intercept which is set True by default. @@ -1074,7 +1093,7 @@ class LassoLars(Lars): >>> reg.fit([[-1, 1], [0, 0], [1, 1]], [-1, 0, -1]) LassoLars(alpha=0.01) >>> print(reg.coef_) - [ 0. -0.963257...] + [ 0. -0.9632...] 
See also -------- @@ -1092,7 +1111,7 @@ class LassoLars(Lars): def __init__(self, alpha=1.0, fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, - positive=False): + positive=False, jitter=10e-5): self.alpha = alpha self.fit_intercept = fit_intercept self.max_iter = max_iter @@ -1103,6 +1122,7 @@ def __init__(self, alpha=1.0, fit_intercept=True, verbose=False, self.copy_X = copy_X self.eps = eps self.fit_path = fit_path + self.jitter = jitter ############################################################################### From 7db0301213fe5ba38f01114b9d285a1d94d77cbe Mon Sep 17 00:00:00 2001 From: Angela Ambroz Date: Thu, 10 Oct 2019 21:09:53 -0400 Subject: [PATCH 02/19] CircleCI fail --- sklearn/linear_model/least_angle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py index 296f329825293..36ebf271fb04c 100644 --- a/sklearn/linear_model/least_angle.py +++ b/sklearn/linear_model/least_angle.py @@ -973,7 +973,7 @@ def fit(self, X, y, Xy=None): noise = np.random.uniform(high=self.jitter, size=len(y)) if y.ndim == 2: - noise = noise.reshape(-1,1) + noise = noise.reshape(-1, 1) y = y + noise From d075950ec41960c256cdf979d75299e07837f0fc Mon Sep 17 00:00:00 2001 From: Angela Ambroz Date: Mon, 14 Oct 2019 20:24:58 -0400 Subject: [PATCH 03/19] MR comments --- sklearn/linear_model/least_angle.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py index 36ebf271fb04c..e98817d98adc6 100644 --- a/sklearn/linear_model/least_angle.py +++ b/sklearn/linear_model/least_angle.py @@ -24,6 +24,7 @@ from ..exceptions import ConvergenceWarning SOLVE_TRIANGULAR_ARGS = {'check_finite': False} +JITTER = 10e-4 def lars_path(X, y, Xy=None, Gram=None, max_iter=500, alpha_min=0, @@ -809,9 +810,9 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): setting ``fit_path`` to ``False`` will lead to a speedup, especially with a small alpha. - jitter : float, default=0.0001 - Uniform noise parameter, added to the y values, to satisfy \ - the model's assumption of one-at-a-time computations \ + jitter : float, default=0.001 + Uniform noise parameter, added to the y values, to satisfy + the model's assumption of one-at-a-time computations (Efron et al. 2004). Attributes @@ -861,7 +862,7 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): def __init__(self, fit_intercept=True, verbose=False, normalize=True, precompute='auto', n_nonzero_coefs=500, eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, - jitter=10e-5): + jitter=JITTER): self.fit_intercept = fit_intercept self.verbose = verbose self.normalize = normalize @@ -970,7 +971,7 @@ def fit(self, X, y, Xy=None): else: max_iter = self.max_iter - noise = np.random.uniform(high=self.jitter, size=len(y)) + noise = np.random.RandomState(0).uniform(high=self.jitter, size=len(y)) if y.ndim == 2: noise = noise.reshape(-1, 1) @@ -1044,7 +1045,7 @@ class LassoLars(Lars): setting ``fit_path`` to ``False`` will lead to a speedup, especially with a small alpha. - jitter : float, default=0.0001 + jitter : float, default=0.001 Uniform noise parameter, added to the y values, to satisfy \ the model's assumption of one-at-a-time computations \ (Efron et al. 2004). 
@@ -1111,7 +1112,7 @@ class LassoLars(Lars): def __init__(self, alpha=1.0, fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, - positive=False, jitter=10e-5): + positive=False, jitter=JITTER): self.alpha = alpha self.fit_intercept = fit_intercept self.max_iter = max_iter From 7b43af873e94865b1abfab9c2111ee1d45f3cfe1 Mon Sep 17 00:00:00 2001 From: Angela Ambroz Date: Fri, 25 Oct 2019 10:51:44 -0400 Subject: [PATCH 04/19] Jitter becomes default, added test based on issue description --- sklearn/linear_model/least_angle.py | 17 ++++++------- .../linear_model/tests/test_least_angle.py | 25 +++++++++++++++++++ 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py index e98817d98adc6..384c45df0ffbd 100644 --- a/sklearn/linear_model/least_angle.py +++ b/sklearn/linear_model/least_angle.py @@ -24,7 +24,7 @@ from ..exceptions import ConvergenceWarning SOLVE_TRIANGULAR_ARGS = {'check_finite': False} -JITTER = 10e-4 +DEFAULT_JITTER = None def lars_path(X, y, Xy=None, Gram=None, max_iter=500, alpha_min=0, @@ -862,7 +862,7 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): def __init__(self, fit_intercept=True, verbose=False, normalize=True, precompute='auto', n_nonzero_coefs=500, eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, - jitter=JITTER): + jitter=DEFAULT_JITTER): self.fit_intercept = fit_intercept self.verbose = verbose self.normalize = normalize @@ -971,12 +971,11 @@ def fit(self, X, y, Xy=None): else: max_iter = self.max_iter - noise = np.random.RandomState(0).uniform(high=self.jitter, size=len(y)) - - if y.ndim == 2: - noise = noise.reshape(-1, 1) - - y = y + noise + if self.jitter: + noise = np.random.RandomState(0).uniform(high=self.jitter, size=len(y)) + if y.ndim == 2: + noise = noise.reshape(-1, 1) + y = y + noise self._fit(X, y, max_iter=max_iter, alpha=alpha, fit_path=self.fit_path, Xy=Xy) @@ -1112,7 +1111,7 @@ class LassoLars(Lars): def __init__(self, alpha=1.0, fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, - positive=False, jitter=JITTER): + positive=False, jitter=DEFAULT_JITTER): self.alpha = alpha self.fit_intercept = fit_intercept self.max_iter = max_iter diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index f913f87c2fdbe..c0138d5b93dac 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -730,3 +730,28 @@ def test_lasso_lars_fit_copyX_behaviour(copy_X): y = X[:, 2] lasso_lars.fit(X, y, copy_X=copy_X) assert copy_X == np.array_equal(X, X_copy) + +def test_lars_with_jitter(): + """ + Test that user input of a small amount of jitter, using example provided in issue #2746 + + """ + + A = np.array([[0.0, 0.0, 0.0, -1.0, 0.0], [0.0, -1.0, 0.0, 0.0, 0.0]]) + b = np.array([-2.5, -2.5]) + expected_output = np.array([0, 2.5, 0, 2.5, 0]) + alpha = 0.001 + fit_intercept = False + + lars = linear_model.LassoLars(alpha=alpha, fit_intercept=fit_intercept) + lars_with_jiggle = linear_model.LassoLars(alpha=alpha, fit_intercept=fit_intercept, + jitter=10e-5) + + lars.fit(A, b) + lars_with_jiggle.fit(A, b) + + w_nojiggle = lars.coef_ + w_jiggle = lars_with_jiggle.coef_ + + assert np.array_equal(w_jiggle, w_nojiggle) == False + assert_array_almost_equal(w_jiggle, expected_output, decimal=2) From 
b6c11fa7d91aa5c3addbc5a05a753419338b29c8 Mon Sep 17 00:00:00 2001 From: Angela Ambroz Date: Fri, 25 Oct 2019 12:48:55 -0400 Subject: [PATCH 05/19] flake8 fixes --- sklearn/linear_model/sag_fast.pyx | 1357 +++++++++++++++++ .../linear_model/tests/test_least_angle.py | 9 +- sklearn/utils/seq_dataset.pxd | 116 ++ sklearn/utils/seq_dataset.pyx | 653 ++++++++ 4 files changed, 2132 insertions(+), 3 deletions(-) create mode 100644 sklearn/linear_model/sag_fast.pyx create mode 100644 sklearn/utils/seq_dataset.pxd create mode 100644 sklearn/utils/seq_dataset.pyx diff --git a/sklearn/linear_model/sag_fast.pyx b/sklearn/linear_model/sag_fast.pyx new file mode 100644 index 0000000000000..6d48b65bda560 --- /dev/null +++ b/sklearn/linear_model/sag_fast.pyx @@ -0,0 +1,1357 @@ + +#------------------------------------------------------------------------------ + +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False +# +# Authors: Danny Sullivan +# Tom Dupre la Tour +# Arthur Mensch y: + return x + return y + +cdef inline float fmax32(float x, float y) nogil: + if x > y: + return x + return y + +cdef double _logsumexp64(double* arr, int n_classes) nogil: + """Computes the sum of arr assuming arr is in the log domain. + + Returns log(sum(exp(arr))) while minimizing the possibility of + over/underflow. + """ + # Use the max to normalize, as with the log this is what accumulates + # the less errors + cdef double vmax = arr[0] + cdef double out = 0.0 + cdef int i + + for i in range(1, n_classes): + if vmax < arr[i]: + vmax = arr[i] + + for i in range(n_classes): + out += exp(arr[i] - vmax) + + return log(out) + vmax + +cdef float _logsumexp32(float* arr, int n_classes) nogil: + """Computes the sum of arr assuming arr is in the log domain. + + Returns log(sum(exp(arr))) while minimizing the possibility of + over/underflow. + """ + # Use the max to normalize, as with the log this is what accumulates + # the less errors + cdef float vmax = arr[0] + cdef float out = 0.0 + cdef int i + + for i in range(1, n_classes): + if vmax < arr[i]: + vmax = arr[i] + + for i in range(n_classes): + out += exp(arr[i] - vmax) + + return log(out) + vmax + +cdef class MultinomialLogLoss64: + cdef double _loss(self, double* prediction, double y, int n_classes, + double sample_weight) nogil: + r"""Multinomial Logistic regression loss. + + The multinomial logistic loss for one sample is: + loss = - sw \sum_c \delta_{y,c} (prediction[c] - logsumexp(prediction)) + = sw (logsumexp(prediction) - prediction[y]) + + where: + prediction = dot(x_sample, weights) + intercept + \delta_{y,c} = 1 if (y == c) else 0 + sw = sample_weight + + Parameters + ---------- + prediction : pointer to a np.ndarray[double] of shape (n_classes,) + Prediction of the multinomial classifier, for current sample. + + y : double, between 0 and n_classes - 1 + Indice of the correct class for current sample (i.e. label encoded). + + n_classes : integer + Total number of classes. + + sample_weight : double + Weight of current sample. + + Returns + ------- + loss : double + Multinomial loss for current sample. + + Reference + --------- + Bishop, C. M. (2006). Pattern recognition and machine learning. + Springer. (Chapter 4.3.4) + """ + cdef double logsumexp_prediction = _logsumexp64(prediction, n_classes) + cdef double loss + + # y is the indice of the correct class of current sample. 
+ loss = (logsumexp_prediction - prediction[int(y)]) * sample_weight + return loss + + cdef void _dloss(self, double* prediction, double y, int n_classes, + double sample_weight, double* gradient_ptr) nogil: + r"""Multinomial Logistic regression gradient of the loss. + + The gradient of the multinomial logistic loss with respect to a class c, + and for one sample is: + grad_c = - sw * (p[c] - \delta_{y,c}) + + where: + p[c] = exp(logsumexp(prediction) - prediction[c]) + prediction = dot(sample, weights) + intercept + \delta_{y,c} = 1 if (y == c) else 0 + sw = sample_weight + + Note that to obtain the true gradient, this value has to be multiplied + by the sample vector x. + + Parameters + ---------- + prediction : pointer to a np.ndarray[double] of shape (n_classes,) + Prediction of the multinomial classifier, for current sample. + + y : double, between 0 and n_classes - 1 + Indice of the correct class for current sample (i.e. label encoded) + + n_classes : integer + Total number of classes. + + sample_weight : double + Weight of current sample. + + gradient_ptr : pointer to a np.ndarray[double] of shape (n_classes,) + Gradient vector to be filled. + + Reference + --------- + Bishop, C. M. (2006). Pattern recognition and machine learning. + Springer. (Chapter 4.3.4) + """ + cdef double logsumexp_prediction = _logsumexp64(prediction, n_classes) + cdef int class_ind + + for class_ind in range(n_classes): + gradient_ptr[class_ind] = exp(prediction[class_ind] - + logsumexp_prediction) + + # y is the indice of the correct class of current sample. + if class_ind == y: + gradient_ptr[class_ind] -= 1.0 + + gradient_ptr[class_ind] *= sample_weight + + def __reduce__(self): + return MultinomialLogLoss64, () + +cdef class MultinomialLogLoss32: + cdef float _loss(self, float* prediction, float y, int n_classes, + float sample_weight) nogil: + r"""Multinomial Logistic regression loss. + + The multinomial logistic loss for one sample is: + loss = - sw \sum_c \delta_{y,c} (prediction[c] - logsumexp(prediction)) + = sw (logsumexp(prediction) - prediction[y]) + + where: + prediction = dot(x_sample, weights) + intercept + \delta_{y,c} = 1 if (y == c) else 0 + sw = sample_weight + + Parameters + ---------- + prediction : pointer to a np.ndarray[float] of shape (n_classes,) + Prediction of the multinomial classifier, for current sample. + + y : float, between 0 and n_classes - 1 + Indice of the correct class for current sample (i.e. label encoded). + + n_classes : integer + Total number of classes. + + sample_weight : float + Weight of current sample. + + Returns + ------- + loss : float + Multinomial loss for current sample. + + Reference + --------- + Bishop, C. M. (2006). Pattern recognition and machine learning. + Springer. (Chapter 4.3.4) + """ + cdef float logsumexp_prediction = _logsumexp32(prediction, n_classes) + cdef float loss + + # y is the indice of the correct class of current sample. + loss = (logsumexp_prediction - prediction[int(y)]) * sample_weight + return loss + + cdef void _dloss(self, float* prediction, float y, int n_classes, + float sample_weight, float* gradient_ptr) nogil: + r"""Multinomial Logistic regression gradient of the loss. 
+ + The gradient of the multinomial logistic loss with respect to a class c, + and for one sample is: + grad_c = - sw * (p[c] - \delta_{y,c}) + + where: + p[c] = exp(logsumexp(prediction) - prediction[c]) + prediction = dot(sample, weights) + intercept + \delta_{y,c} = 1 if (y == c) else 0 + sw = sample_weight + + Note that to obtain the true gradient, this value has to be multiplied + by the sample vector x. + + Parameters + ---------- + prediction : pointer to a np.ndarray[float] of shape (n_classes,) + Prediction of the multinomial classifier, for current sample. + + y : float, between 0 and n_classes - 1 + Indice of the correct class for current sample (i.e. label encoded) + + n_classes : integer + Total number of classes. + + sample_weight : float + Weight of current sample. + + gradient_ptr : pointer to a np.ndarray[float] of shape (n_classes,) + Gradient vector to be filled. + + Reference + --------- + Bishop, C. M. (2006). Pattern recognition and machine learning. + Springer. (Chapter 4.3.4) + """ + cdef float logsumexp_prediction = _logsumexp32(prediction, n_classes) + cdef int class_ind + + for class_ind in range(n_classes): + gradient_ptr[class_ind] = exp(prediction[class_ind] - + logsumexp_prediction) + + # y is the indice of the correct class of current sample. + if class_ind == y: + gradient_ptr[class_ind] -= 1.0 + + gradient_ptr[class_ind] *= sample_weight + + def __reduce__(self): + return MultinomialLogLoss32, () + +cdef inline double _soft_thresholding64(double x, double shrinkage) nogil: + return fmax64(x - shrinkage, 0) - fmax64(- x - shrinkage, 0) + +cdef inline float _soft_thresholding32(float x, float shrinkage) nogil: + return fmax32(x - shrinkage, 0) - fmax32(- x - shrinkage, 0) + +def sag64(SequentialDataset64 dataset, + np.ndarray[double, ndim=2, mode='c'] weights_array, + np.ndarray[double, ndim=1, mode='c'] intercept_array, + int n_samples, + int n_features, + int n_classes, + double tol, + int max_iter, + str loss_function, + double step_size, + double alpha, + double beta, + np.ndarray[double, ndim=2, mode='c'] sum_gradient_init, + np.ndarray[double, ndim=2, mode='c'] gradient_memory_init, + np.ndarray[bint, ndim=1, mode='c'] seen_init, + int num_seen, + bint fit_intercept, + np.ndarray[double, ndim=1, mode='c'] intercept_sum_gradient_init, + double intercept_decay, + bint saga, + bint verbose): + """Stochastic Average Gradient (SAG) and SAGA solvers. + + Used in Ridge and LogisticRegression. + + Reference + --------- + Schmidt, M., Roux, N. L., & Bach, F. (2013). + Minimizing finite sums with the stochastic average gradient + https://hal.inria.fr/hal-00860051/document + (section 4.3) + + Defazio, A., Bach, F., Lacoste-Julien, S. 
(2014), + SAGA: A Fast Incremental Gradient Method With Support + for Non-Strongly Convex Composite Objectives + https://arxiv.org/abs/1407.0202 + + """ + # the data pointer for x, the current sample + cdef double *x_data_ptr = NULL + # the index pointer for the column of the data + cdef int *x_ind_ptr = NULL + # the number of non-zero features for current sample + cdef int xnnz = -1 + # the label value for current sample + # the label value for curent sample + cdef double y + # the sample weight + cdef double sample_weight + + # helper variable for indexes + cdef int f_idx, s_idx, feature_ind, class_ind, j + # the number of pass through all samples + cdef int n_iter = 0 + # helper to track iterations through samples + cdef int sample_itr + # the index (row number) of the current sample + cdef int sample_ind + + # the maximum change in weights, used to compute stopping criteria + cdef double max_change + # a holder variable for the max weight, used to compute stopping criteria + cdef double max_weight + + # the start time of the fit + cdef time_t start_time + # the end time of the fit + cdef time_t end_time + + # precomputation since the step size does not change in this implementation + cdef double wscale_update = 1.0 - step_size * alpha + + # vector of booleans indicating whether this sample has been seen + cdef bint* seen = seen_init.data + + # helper for cumulative sum + cdef double cum_sum + + # the pointer to the coef_ or weights + cdef double* weights = weights_array.data + # the pointer to the intercept_array + cdef double* intercept = intercept_array.data + + # the pointer to the intercept_sum_gradient + cdef double* intercept_sum_gradient = \ + intercept_sum_gradient_init.data + + # the sum of gradients for each feature + cdef double* sum_gradient = sum_gradient_init.data + # the previously seen gradient for each sample + cdef double* gradient_memory = gradient_memory_init.data + + # the cumulative sums needed for JIT params + cdef np.ndarray[double, ndim=1] cumulative_sums_array = \ + np.empty(n_samples, dtype=np.float64, order="c") + cdef double* cumulative_sums = cumulative_sums_array.data + + # the index for the last time this feature was updated + cdef np.ndarray[int, ndim=1] feature_hist_array = \ + np.zeros(n_features, dtype=np.int32, order="c") + cdef int* feature_hist = feature_hist_array.data + + # the previous weights to use to compute stopping criteria + cdef np.ndarray[double, ndim=2] previous_weights_array = \ + np.zeros((n_features, n_classes), dtype=np.float64, order="c") + cdef double* previous_weights = previous_weights_array.data + + cdef np.ndarray[double, ndim=1] prediction_array = \ + np.zeros(n_classes, dtype=np.float64, order="c") + cdef double* prediction = prediction_array.data + + cdef np.ndarray[double, ndim=1] gradient_array = \ + np.zeros(n_classes, dtype=np.float64, order="c") + cdef double* gradient = gradient_array.data + + # Intermediate variable that need declaration since cython cannot infer when templating + cdef double val + + # Bias correction term in saga + cdef double gradient_correction + + # the scalar used for multiplying z + cdef double wscale = 1.0 + + # return value (-1 if an error occurred, 0 otherwise) + cdef int status = 0 + + # the cumulative sums for each iteration for the sparse implementation + cumulative_sums[0] = 0.0 + + # the multipliative scale needed for JIT params + cdef np.ndarray[double, ndim=1] cumulative_sums_prox_array + cdef double* cumulative_sums_prox + + cdef bint prox = beta > 0 and saga + + # Loss function to 
optimize + cdef LossFunction loss + # Wether the loss function is multinomial + cdef bint multinomial = False + # Multinomial loss function + cdef MultinomialLogLoss64 multiloss + + if loss_function == "multinomial": + multinomial = True + multiloss = MultinomialLogLoss64() + elif loss_function == "log": + loss = Log() + elif loss_function == "squared": + loss = SquaredLoss() + else: + raise ValueError("Invalid loss parameter: got %s instead of " + "one of ('log', 'squared', 'multinomial')" + % loss_function) + + if prox: + cumulative_sums_prox_array = np.empty(n_samples, + dtype=np.float64, order="c") + cumulative_sums_prox = cumulative_sums_prox_array.data + else: + cumulative_sums_prox = NULL + + with nogil: + start_time = time(NULL) + for n_iter in range(max_iter): + for sample_itr in range(n_samples): + # extract a random sample + sample_ind = dataset.random(&x_data_ptr, &x_ind_ptr, &xnnz, + &y, &sample_weight) + + # cached index for gradient_memory + s_idx = sample_ind * n_classes + + # update the number of samples seen and the seen array + if seen[sample_ind] == 0: + num_seen += 1 + seen[sample_ind] = 1 + + # make the weight updates + if sample_itr > 0: + status = lagged_update64(weights, wscale, xnnz, + n_samples, n_classes, + sample_itr, + cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, + sum_gradient, + x_ind_ptr, + False, + n_iter) + if status == -1: + break + + # find the current prediction + predict_sample64(x_data_ptr, x_ind_ptr, xnnz, weights, wscale, + intercept, prediction, n_classes) + + # compute the gradient for this sample, given the prediction + if multinomial: + multiloss._dloss(prediction, y, n_classes, sample_weight, + gradient) + else: + gradient[0] = loss._dloss(prediction[0], y) * sample_weight + + # L2 regularization by simply rescaling the weights + wscale *= wscale_update + + # make the updates to the sum of gradients + for j in range(xnnz): + feature_ind = x_ind_ptr[j] + val = x_data_ptr[j] + f_idx = feature_ind * n_classes + for class_ind in range(n_classes): + gradient_correction = \ + val * (gradient[class_ind] - + gradient_memory[s_idx + class_ind]) + if saga: + weights[f_idx + class_ind] -= \ + (gradient_correction * step_size + * (1 - 1. / num_seen) / wscale) + sum_gradient[f_idx + class_ind] += gradient_correction + + # fit the intercept + if fit_intercept: + for class_ind in range(n_classes): + gradient_correction = (gradient[class_ind] - + gradient_memory[s_idx + class_ind]) + intercept_sum_gradient[class_ind] += gradient_correction + gradient_correction *= step_size * (1. - 1. 
/ num_seen) + if saga: + intercept[class_ind] -= \ + (step_size * intercept_sum_gradient[class_ind] / + num_seen * intercept_decay) + gradient_correction + else: + intercept[class_ind] -= \ + (step_size * intercept_sum_gradient[class_ind] / + num_seen * intercept_decay) + + # check to see that the intercept is not inf or NaN + if not skl_isfinite64(intercept[class_ind]): + status = -1 + break + # Break from the n_samples outer loop if an error happened + # in the fit_intercept n_classes inner loop + if status == -1: + break + + # update the gradient memory for this sample + for class_ind in range(n_classes): + gradient_memory[s_idx + class_ind] = gradient[class_ind] + + if sample_itr == 0: + cumulative_sums[0] = step_size / (wscale * num_seen) + if prox: + cumulative_sums_prox[0] = step_size * beta / wscale + else: + cumulative_sums[sample_itr] = \ + (cumulative_sums[sample_itr - 1] + + step_size / (wscale * num_seen)) + if prox: + cumulative_sums_prox[sample_itr] = \ + (cumulative_sums_prox[sample_itr - 1] + + step_size * beta / wscale) + # If wscale gets too small, we need to reset the scale. + if wscale < 1e-9: + if verbose: + with gil: + print("rescaling...") + status = scale_weights64( + weights, &wscale, n_features, n_samples, n_classes, + sample_itr, cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, sum_gradient, n_iter) + if status == -1: + break + + # Break from the n_iter outer loop if an error happened in the + # n_samples inner loop + if status == -1: + break + + # we scale the weights every n_samples iterations and reset the + # just-in-time update system for numerical stability. + status = scale_weights64(weights, &wscale, n_features, + n_samples, + n_classes, n_samples - 1, + cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, sum_gradient, n_iter) + + if status == -1: + break + # check if the stopping criteria is reached + max_change = 0.0 + max_weight = 0.0 + for idx in range(n_features * n_classes): + max_weight = fmax64(max_weight, fabs(weights[idx])) + max_change = fmax64(max_change, + fabs(weights[idx] - + previous_weights[idx])) + previous_weights[idx] = weights[idx] + if ((max_weight != 0 and max_change / max_weight <= tol) + or max_weight == 0 and max_change == 0): + if verbose: + end_time = time(NULL) + with gil: + print("convergence after %d epochs took %d seconds" % + (n_iter + 1, end_time - start_time)) + break + elif verbose: + printf('Epoch %d, change: %.8f\n', n_iter + 1, + max_change / max_weight) + n_iter += 1 + # We do the error treatment here based on error code in status to avoid + # re-acquiring the GIL within the cython code, which slows the computation + # when the sag/saga solver is used concurrently in multiple Python threads. + if status == -1: + raise ValueError(("Floating-point under-/overflow occurred at epoch" + " #%d. 
Scaling input data with StandardScaler or" + " MinMaxScaler might help.") % n_iter) + + if verbose and n_iter >= max_iter: + end_time = time(NULL) + print(("max_iter reached after %d seconds") % + (end_time - start_time)) + + return num_seen, n_iter + +def sag32(SequentialDataset32 dataset, + np.ndarray[float, ndim=2, mode='c'] weights_array, + np.ndarray[float, ndim=1, mode='c'] intercept_array, + int n_samples, + int n_features, + int n_classes, + double tol, + int max_iter, + str loss_function, + double step_size, + double alpha, + double beta, + np.ndarray[float, ndim=2, mode='c'] sum_gradient_init, + np.ndarray[float, ndim=2, mode='c'] gradient_memory_init, + np.ndarray[bint, ndim=1, mode='c'] seen_init, + int num_seen, + bint fit_intercept, + np.ndarray[float, ndim=1, mode='c'] intercept_sum_gradient_init, + double intercept_decay, + bint saga, + bint verbose): + """Stochastic Average Gradient (SAG) and SAGA solvers. + + Used in Ridge and LogisticRegression. + + Reference + --------- + Schmidt, M., Roux, N. L., & Bach, F. (2013). + Minimizing finite sums with the stochastic average gradient + https://hal.inria.fr/hal-00860051/document + (section 4.3) + + Defazio, A., Bach, F., Lacoste-Julien, S. (2014), + SAGA: A Fast Incremental Gradient Method With Support + for Non-Strongly Convex Composite Objectives + https://arxiv.org/abs/1407.0202 + + """ + # the data pointer for x, the current sample + cdef float *x_data_ptr = NULL + # the index pointer for the column of the data + cdef int *x_ind_ptr = NULL + # the number of non-zero features for current sample + cdef int xnnz = -1 + # the label value for current sample + # the label value for curent sample + cdef float y + # the sample weight + cdef float sample_weight + + # helper variable for indexes + cdef int f_idx, s_idx, feature_ind, class_ind, j + # the number of pass through all samples + cdef int n_iter = 0 + # helper to track iterations through samples + cdef int sample_itr + # the index (row number) of the current sample + cdef int sample_ind + + # the maximum change in weights, used to compute stopping criteria + cdef float max_change + # a holder variable for the max weight, used to compute stopping criteria + cdef float max_weight + + # the start time of the fit + cdef time_t start_time + # the end time of the fit + cdef time_t end_time + + # precomputation since the step size does not change in this implementation + cdef float wscale_update = 1.0 - step_size * alpha + + # vector of booleans indicating whether this sample has been seen + cdef bint* seen = seen_init.data + + # helper for cumulative sum + cdef float cum_sum + + # the pointer to the coef_ or weights + cdef float* weights = weights_array.data + # the pointer to the intercept_array + cdef float* intercept = intercept_array.data + + # the pointer to the intercept_sum_gradient + cdef float* intercept_sum_gradient = \ + intercept_sum_gradient_init.data + + # the sum of gradients for each feature + cdef float* sum_gradient = sum_gradient_init.data + # the previously seen gradient for each sample + cdef float* gradient_memory = gradient_memory_init.data + + # the cumulative sums needed for JIT params + cdef np.ndarray[float, ndim=1] cumulative_sums_array = \ + np.empty(n_samples, dtype=np.float32, order="c") + cdef float* cumulative_sums = cumulative_sums_array.data + + # the index for the last time this feature was updated + cdef np.ndarray[int, ndim=1] feature_hist_array = \ + np.zeros(n_features, dtype=np.int32, order="c") + cdef int* feature_hist = 
feature_hist_array.data + + # the previous weights to use to compute stopping criteria + cdef np.ndarray[float, ndim=2] previous_weights_array = \ + np.zeros((n_features, n_classes), dtype=np.float32, order="c") + cdef float* previous_weights = previous_weights_array.data + + cdef np.ndarray[float, ndim=1] prediction_array = \ + np.zeros(n_classes, dtype=np.float32, order="c") + cdef float* prediction = prediction_array.data + + cdef np.ndarray[float, ndim=1] gradient_array = \ + np.zeros(n_classes, dtype=np.float32, order="c") + cdef float* gradient = gradient_array.data + + # Intermediate variable that need declaration since cython cannot infer when templating + cdef float val + + # Bias correction term in saga + cdef float gradient_correction + + # the scalar used for multiplying z + cdef float wscale = 1.0 + + # return value (-1 if an error occurred, 0 otherwise) + cdef int status = 0 + + # the cumulative sums for each iteration for the sparse implementation + cumulative_sums[0] = 0.0 + + # the multipliative scale needed for JIT params + cdef np.ndarray[float, ndim=1] cumulative_sums_prox_array + cdef float* cumulative_sums_prox + + cdef bint prox = beta > 0 and saga + + # Loss function to optimize + cdef LossFunction loss + # Wether the loss function is multinomial + cdef bint multinomial = False + # Multinomial loss function + cdef MultinomialLogLoss32 multiloss + + if loss_function == "multinomial": + multinomial = True + multiloss = MultinomialLogLoss32() + elif loss_function == "log": + loss = Log() + elif loss_function == "squared": + loss = SquaredLoss() + else: + raise ValueError("Invalid loss parameter: got %s instead of " + "one of ('log', 'squared', 'multinomial')" + % loss_function) + + if prox: + cumulative_sums_prox_array = np.empty(n_samples, + dtype=np.float32, order="c") + cumulative_sums_prox = cumulative_sums_prox_array.data + else: + cumulative_sums_prox = NULL + + with nogil: + start_time = time(NULL) + for n_iter in range(max_iter): + for sample_itr in range(n_samples): + # extract a random sample + sample_ind = dataset.random(&x_data_ptr, &x_ind_ptr, &xnnz, + &y, &sample_weight) + + # cached index for gradient_memory + s_idx = sample_ind * n_classes + + # update the number of samples seen and the seen array + if seen[sample_ind] == 0: + num_seen += 1 + seen[sample_ind] = 1 + + # make the weight updates + if sample_itr > 0: + status = lagged_update32(weights, wscale, xnnz, + n_samples, n_classes, + sample_itr, + cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, + sum_gradient, + x_ind_ptr, + False, + n_iter) + if status == -1: + break + + # find the current prediction + predict_sample32(x_data_ptr, x_ind_ptr, xnnz, weights, wscale, + intercept, prediction, n_classes) + + # compute the gradient for this sample, given the prediction + if multinomial: + multiloss._dloss(prediction, y, n_classes, sample_weight, + gradient) + else: + gradient[0] = loss._dloss(prediction[0], y) * sample_weight + + # L2 regularization by simply rescaling the weights + wscale *= wscale_update + + # make the updates to the sum of gradients + for j in range(xnnz): + feature_ind = x_ind_ptr[j] + val = x_data_ptr[j] + f_idx = feature_ind * n_classes + for class_ind in range(n_classes): + gradient_correction = \ + val * (gradient[class_ind] - + gradient_memory[s_idx + class_ind]) + if saga: + weights[f_idx + class_ind] -= \ + (gradient_correction * step_size + * (1 - 1. 
/ num_seen) / wscale) + sum_gradient[f_idx + class_ind] += gradient_correction + + # fit the intercept + if fit_intercept: + for class_ind in range(n_classes): + gradient_correction = (gradient[class_ind] - + gradient_memory[s_idx + class_ind]) + intercept_sum_gradient[class_ind] += gradient_correction + gradient_correction *= step_size * (1. - 1. / num_seen) + if saga: + intercept[class_ind] -= \ + (step_size * intercept_sum_gradient[class_ind] / + num_seen * intercept_decay) + gradient_correction + else: + intercept[class_ind] -= \ + (step_size * intercept_sum_gradient[class_ind] / + num_seen * intercept_decay) + + # check to see that the intercept is not inf or NaN + if not skl_isfinite32(intercept[class_ind]): + status = -1 + break + # Break from the n_samples outer loop if an error happened + # in the fit_intercept n_classes inner loop + if status == -1: + break + + # update the gradient memory for this sample + for class_ind in range(n_classes): + gradient_memory[s_idx + class_ind] = gradient[class_ind] + + if sample_itr == 0: + cumulative_sums[0] = step_size / (wscale * num_seen) + if prox: + cumulative_sums_prox[0] = step_size * beta / wscale + else: + cumulative_sums[sample_itr] = \ + (cumulative_sums[sample_itr - 1] + + step_size / (wscale * num_seen)) + if prox: + cumulative_sums_prox[sample_itr] = \ + (cumulative_sums_prox[sample_itr - 1] + + step_size * beta / wscale) + # If wscale gets too small, we need to reset the scale. + if wscale < 1e-9: + if verbose: + with gil: + print("rescaling...") + status = scale_weights32( + weights, &wscale, n_features, n_samples, n_classes, + sample_itr, cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, sum_gradient, n_iter) + if status == -1: + break + + # Break from the n_iter outer loop if an error happened in the + # n_samples inner loop + if status == -1: + break + + # we scale the weights every n_samples iterations and reset the + # just-in-time update system for numerical stability. + status = scale_weights32(weights, &wscale, n_features, + n_samples, + n_classes, n_samples - 1, + cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, sum_gradient, n_iter) + + if status == -1: + break + # check if the stopping criteria is reached + max_change = 0.0 + max_weight = 0.0 + for idx in range(n_features * n_classes): + max_weight = fmax32(max_weight, fabs(weights[idx])) + max_change = fmax32(max_change, + fabs(weights[idx] - + previous_weights[idx])) + previous_weights[idx] = weights[idx] + if ((max_weight != 0 and max_change / max_weight <= tol) + or max_weight == 0 and max_change == 0): + if verbose: + end_time = time(NULL) + with gil: + print("convergence after %d epochs took %d seconds" % + (n_iter + 1, end_time - start_time)) + break + elif verbose: + printf('Epoch %d, change: %.8f\n', n_iter + 1, + max_change / max_weight) + n_iter += 1 + # We do the error treatment here based on error code in status to avoid + # re-acquiring the GIL within the cython code, which slows the computation + # when the sag/saga solver is used concurrently in multiple Python threads. + if status == -1: + raise ValueError(("Floating-point under-/overflow occurred at epoch" + " #%d. 
Scaling input data with StandardScaler or" + " MinMaxScaler might help.") % n_iter) + + if verbose and n_iter >= max_iter: + end_time = time(NULL) + print(("max_iter reached after %d seconds") % + (end_time - start_time)) + + return num_seen, n_iter + +cdef int scale_weights64(double* weights, double* wscale, + int n_features, + int n_samples, int n_classes, int sample_itr, + double* cumulative_sums, + double* cumulative_sums_prox, + int* feature_hist, + bint prox, + double* sum_gradient, + int n_iter) nogil: + """Scale the weights with wscale for numerical stability. + + wscale = (1 - step_size * alpha) ** (n_iter * n_samples + sample_itr) + can become very small, so we reset it every n_samples iterations to 1.0 for + numerical stability. To be able to scale, we first need to update every + coefficients and reset the just-in-time update system. + This also limits the size of `cumulative_sums`. + """ + + cdef int status + status = lagged_update64(weights, wscale[0], n_features, + n_samples, n_classes, sample_itr + 1, + cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, + sum_gradient, + NULL, + True, + n_iter) + # if lagged update succeeded, reset wscale to 1.0 + if status == 0: + wscale[0] = 1.0 + return status + +cdef int scale_weights32(float* weights, float* wscale, + int n_features, + int n_samples, int n_classes, int sample_itr, + float* cumulative_sums, + float* cumulative_sums_prox, + int* feature_hist, + bint prox, + float* sum_gradient, + int n_iter) nogil: + """Scale the weights with wscale for numerical stability. + + wscale = (1 - step_size * alpha) ** (n_iter * n_samples + sample_itr) + can become very small, so we reset it every n_samples iterations to 1.0 for + numerical stability. To be able to scale, we first need to update every + coefficients and reset the just-in-time update system. + This also limits the size of `cumulative_sums`. + """ + + cdef int status + status = lagged_update32(weights, wscale[0], n_features, + n_samples, n_classes, sample_itr + 1, + cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, + sum_gradient, + NULL, + True, + n_iter) + # if lagged update succeeded, reset wscale to 1.0 + if status == 0: + wscale[0] = 1.0 + return status + +cdef int lagged_update64(double* weights, double wscale, int xnnz, + int n_samples, int n_classes, int sample_itr, + double* cumulative_sums, + double* cumulative_sums_prox, + int* feature_hist, + bint prox, + double* sum_gradient, + int* x_ind_ptr, + bint reset, + int n_iter) nogil: + """Hard perform the JIT updates for non-zero features of present sample. + The updates that awaits are kept in memory using cumulative_sums, + cumulative_sums_prox, wscale and feature_hist. See original SAGA paper + (Defazio et al. 2014) for details. If reset=True, we also reset wscale to + 1 (this is done at the end of each epoch). 
+ """ + cdef int feature_ind, class_ind, idx, f_idx, lagged_ind, last_update_ind + cdef double cum_sum, grad_step, prox_step, cum_sum_prox + for feature_ind in range(xnnz): + if not reset: + feature_ind = x_ind_ptr[feature_ind] + f_idx = feature_ind * n_classes + + cum_sum = cumulative_sums[sample_itr - 1] + if prox: + cum_sum_prox = cumulative_sums_prox[sample_itr - 1] + if feature_hist[feature_ind] != 0: + cum_sum -= cumulative_sums[feature_hist[feature_ind] - 1] + if prox: + cum_sum_prox -= cumulative_sums_prox[feature_hist[feature_ind] - 1] + if not prox: + for class_ind in range(n_classes): + idx = f_idx + class_ind + weights[idx] -= cum_sum * sum_gradient[idx] + if reset: + weights[idx] *= wscale + if not skl_isfinite64(weights[idx]): + # returning here does not require the gil as the return + # type is a C integer + return -1 + else: + for class_ind in range(n_classes): + idx = f_idx + class_ind + if fabs(sum_gradient[idx] * cum_sum) < cum_sum_prox: + # In this case, we can perform all the gradient steps and + # all the proximal steps in this order, which is more + # efficient than unrolling all the lagged updates. + # Idea taken from scikit-learn-contrib/lightning. + weights[idx] -= cum_sum * sum_gradient[idx] + weights[idx] = _soft_thresholding64(weights[idx], + cum_sum_prox) + else: + last_update_ind = feature_hist[feature_ind] + if last_update_ind == -1: + last_update_ind = sample_itr - 1 + for lagged_ind in range(sample_itr - 1, + last_update_ind - 1, -1): + if lagged_ind > 0: + grad_step = (cumulative_sums[lagged_ind] + - cumulative_sums[lagged_ind - 1]) + prox_step = (cumulative_sums_prox[lagged_ind] + - cumulative_sums_prox[lagged_ind - 1]) + else: + grad_step = cumulative_sums[lagged_ind] + prox_step = cumulative_sums_prox[lagged_ind] + weights[idx] -= sum_gradient[idx] * grad_step + weights[idx] = _soft_thresholding64(weights[idx], + prox_step) + + if reset: + weights[idx] *= wscale + # check to see that the weight is not inf or NaN + if not skl_isfinite64(weights[idx]): + return -1 + if reset: + feature_hist[feature_ind] = sample_itr % n_samples + else: + feature_hist[feature_ind] = sample_itr + + if reset: + cumulative_sums[sample_itr - 1] = 0.0 + if prox: + cumulative_sums_prox[sample_itr - 1] = 0.0 + + return 0 + +cdef int lagged_update32(float* weights, float wscale, int xnnz, + int n_samples, int n_classes, int sample_itr, + float* cumulative_sums, + float* cumulative_sums_prox, + int* feature_hist, + bint prox, + float* sum_gradient, + int* x_ind_ptr, + bint reset, + int n_iter) nogil: + """Hard perform the JIT updates for non-zero features of present sample. + The updates that awaits are kept in memory using cumulative_sums, + cumulative_sums_prox, wscale and feature_hist. See original SAGA paper + (Defazio et al. 2014) for details. If reset=True, we also reset wscale to + 1 (this is done at the end of each epoch). 
+ """ + cdef int feature_ind, class_ind, idx, f_idx, lagged_ind, last_update_ind + cdef float cum_sum, grad_step, prox_step, cum_sum_prox + for feature_ind in range(xnnz): + if not reset: + feature_ind = x_ind_ptr[feature_ind] + f_idx = feature_ind * n_classes + + cum_sum = cumulative_sums[sample_itr - 1] + if prox: + cum_sum_prox = cumulative_sums_prox[sample_itr - 1] + if feature_hist[feature_ind] != 0: + cum_sum -= cumulative_sums[feature_hist[feature_ind] - 1] + if prox: + cum_sum_prox -= cumulative_sums_prox[feature_hist[feature_ind] - 1] + if not prox: + for class_ind in range(n_classes): + idx = f_idx + class_ind + weights[idx] -= cum_sum * sum_gradient[idx] + if reset: + weights[idx] *= wscale + if not skl_isfinite32(weights[idx]): + # returning here does not require the gil as the return + # type is a C integer + return -1 + else: + for class_ind in range(n_classes): + idx = f_idx + class_ind + if fabs(sum_gradient[idx] * cum_sum) < cum_sum_prox: + # In this case, we can perform all the gradient steps and + # all the proximal steps in this order, which is more + # efficient than unrolling all the lagged updates. + # Idea taken from scikit-learn-contrib/lightning. + weights[idx] -= cum_sum * sum_gradient[idx] + weights[idx] = _soft_thresholding32(weights[idx], + cum_sum_prox) + else: + last_update_ind = feature_hist[feature_ind] + if last_update_ind == -1: + last_update_ind = sample_itr - 1 + for lagged_ind in range(sample_itr - 1, + last_update_ind - 1, -1): + if lagged_ind > 0: + grad_step = (cumulative_sums[lagged_ind] + - cumulative_sums[lagged_ind - 1]) + prox_step = (cumulative_sums_prox[lagged_ind] + - cumulative_sums_prox[lagged_ind - 1]) + else: + grad_step = cumulative_sums[lagged_ind] + prox_step = cumulative_sums_prox[lagged_ind] + weights[idx] -= sum_gradient[idx] * grad_step + weights[idx] = _soft_thresholding32(weights[idx], + prox_step) + + if reset: + weights[idx] *= wscale + # check to see that the weight is not inf or NaN + if not skl_isfinite32(weights[idx]): + return -1 + if reset: + feature_hist[feature_ind] = sample_itr % n_samples + else: + feature_hist[feature_ind] = sample_itr + + if reset: + cumulative_sums[sample_itr - 1] = 0.0 + if prox: + cumulative_sums_prox[sample_itr - 1] = 0.0 + + return 0 + +cdef void predict_sample64(double* x_data_ptr, int* x_ind_ptr, int xnnz, + double* w_data_ptr, double wscale, + double* intercept, double* prediction, + int n_classes) nogil: + """Compute the prediction given sparse sample x and dense weight w. + + Parameters + ---------- + x_data_ptr : pointer + Pointer to the data of the sample x + + x_ind_ptr : pointer + Pointer to the indices of the sample x + + xnnz : int + Number of non-zero element in the sample x + + w_data_ptr : pointer + Pointer to the data of the weights w + + wscale : double + Scale of the weights w + + intercept : pointer + Pointer to the intercept + + prediction : pointer + Pointer to store the resulting prediction + + n_classes : int + Number of classes in multinomial case. Equals 1 in binary case. 
+ + """ + cdef int feature_ind, class_ind, j + cdef double innerprod + + for class_ind in range(n_classes): + innerprod = 0.0 + # Compute the dot product only on non-zero elements of x + for j in range(xnnz): + feature_ind = x_ind_ptr[j] + innerprod += (w_data_ptr[feature_ind * n_classes + class_ind] * + x_data_ptr[j]) + + prediction[class_ind] = wscale * innerprod + intercept[class_ind] + + +cdef void predict_sample32(float* x_data_ptr, int* x_ind_ptr, int xnnz, + float* w_data_ptr, float wscale, + float* intercept, float* prediction, + int n_classes) nogil: + """Compute the prediction given sparse sample x and dense weight w. + + Parameters + ---------- + x_data_ptr : pointer + Pointer to the data of the sample x + + x_ind_ptr : pointer + Pointer to the indices of the sample x + + xnnz : int + Number of non-zero element in the sample x + + w_data_ptr : pointer + Pointer to the data of the weights w + + wscale : float + Scale of the weights w + + intercept : pointer + Pointer to the intercept + + prediction : pointer + Pointer to store the resulting prediction + + n_classes : int + Number of classes in multinomial case. Equals 1 in binary case. + + """ + cdef int feature_ind, class_ind, j + cdef float innerprod + + for class_ind in range(n_classes): + innerprod = 0.0 + # Compute the dot product only on non-zero elements of x + for j in range(xnnz): + feature_ind = x_ind_ptr[j] + innerprod += (w_data_ptr[feature_ind * n_classes + class_ind] * + x_data_ptr[j]) + + prediction[class_ind] = wscale * innerprod + intercept[class_ind] + + + +def _multinomial_grad_loss_all_samples( + SequentialDataset64 dataset, + np.ndarray[double, ndim=2, mode='c'] weights_array, + np.ndarray[double, ndim=1, mode='c'] intercept_array, + int n_samples, int n_features, int n_classes): + """Compute multinomial gradient and loss across all samples. + + Used for testing purpose only. 
+ """ + cdef double* weights = weights_array.data + cdef double* intercept = intercept_array.data + + cdef double *x_data_ptr = NULL + cdef int *x_ind_ptr = NULL + cdef int xnnz = -1 + cdef double y + cdef double sample_weight + + cdef double wscale = 1.0 + cdef int i, j, class_ind, feature_ind + cdef double val + cdef double sum_loss = 0.0 + + cdef MultinomialLogLoss64 multiloss = MultinomialLogLoss64() + + cdef np.ndarray[double, ndim=2] sum_gradient_array = \ + np.zeros((n_features, n_classes), dtype=np.double, order="c") + cdef double* sum_gradient = sum_gradient_array.data + + cdef np.ndarray[double, ndim=1] prediction_array = \ + np.zeros(n_classes, dtype=np.double, order="c") + cdef double* prediction = prediction_array.data + + cdef np.ndarray[double, ndim=1] gradient_array = \ + np.zeros(n_classes, dtype=np.double, order="c") + cdef double* gradient = gradient_array.data + + with nogil: + for i in range(n_samples): + # get next sample on the dataset + dataset.next(&x_data_ptr, &x_ind_ptr, &xnnz, + &y, &sample_weight) + + # prediction of the multinomial classifier for the sample + predict_sample64(x_data_ptr, x_ind_ptr, xnnz, weights, wscale, + intercept, prediction, n_classes) + + # compute the gradient for this sample, given the prediction + multiloss._dloss(prediction, y, n_classes, sample_weight, gradient) + + # compute the loss for this sample, given the prediction + sum_loss += multiloss._loss(prediction, y, n_classes, sample_weight) + + # update the sum of the gradient + for j in range(xnnz): + feature_ind = x_ind_ptr[j] + val = x_data_ptr[j] + for class_ind in range(n_classes): + sum_gradient[feature_ind * n_classes + class_ind] += \ + gradient[class_ind] * val + + return sum_loss, sum_gradient_array diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index 160a334c7f833..d3fb5a31fbe8d 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -731,9 +731,11 @@ def test_lasso_lars_fit_copyX_behaviour(copy_X): lasso_lars.fit(X, y, copy_X=copy_X) assert copy_X == np.array_equal(X, X_copy) + def test_lars_with_jitter(): """ - Test that user input of a small amount of jitter, using example provided in issue #2746 + Test that user input of a small amount of jitter, + using example provided in issue #2746 """ @@ -744,7 +746,8 @@ def test_lars_with_jitter(): fit_intercept = False lars = linear_model.LassoLars(alpha=alpha, fit_intercept=fit_intercept) - lars_with_jiggle = linear_model.LassoLars(alpha=alpha, fit_intercept=fit_intercept, + lars_with_jiggle = linear_model.LassoLars(alpha=alpha, + fit_intercept=fit_intercept, jitter=10e-5) lars.fit(A, b) @@ -753,5 +756,5 @@ def test_lars_with_jitter(): w_nojiggle = lars.coef_ w_jiggle = lars_with_jiggle.coef_ - assert np.array_equal(w_jiggle, w_nojiggle) == False + assert not np.array_equal(w_jiggle, w_nojiggle) assert_array_almost_equal(w_jiggle, expected_output, decimal=2) diff --git a/sklearn/utils/seq_dataset.pxd b/sklearn/utils/seq_dataset.pxd new file mode 100644 index 0000000000000..67ce3b68b4474 --- /dev/null +++ b/sklearn/utils/seq_dataset.pxd @@ -0,0 +1,116 @@ + +#------------------------------------------------------------------------------ + +""" +Dataset abstractions for sequential data access. 
+WARNING: Do not edit .pxd file directly, it is generated from .pxd.tp +""" + +cimport numpy as np + +# SequentialDataset and its two concrete subclasses are (optionally randomized) +# iterators over the rows of a matrix X and corresponding target values y. + + +cdef class SequentialDataset64: + cdef int current_index + cdef np.ndarray index + cdef int *index_data_ptr + cdef Py_ssize_t n_samples + cdef np.uint32_t seed + + cdef void shuffle(self, np.uint32_t seed) nogil + cdef int _get_next_index(self) nogil + cdef int _get_random_index(self) nogil + + cdef void _sample(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight, + int current_index) nogil + cdef void next(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight) nogil + cdef int random(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight) nogil + + +cdef class ArrayDataset64(SequentialDataset64): + cdef np.ndarray X + cdef np.ndarray Y + cdef np.ndarray sample_weights + cdef Py_ssize_t n_features + cdef np.npy_intp X_stride + cdef double *X_data_ptr + cdef double *Y_data_ptr + cdef np.ndarray feature_indices + cdef int *feature_indices_ptr + cdef double *sample_weight_data + + +cdef class CSRDataset64(SequentialDataset64): + cdef np.ndarray X_data + cdef np.ndarray X_indptr + cdef np.ndarray X_indices + cdef np.ndarray Y + cdef np.ndarray sample_weights + cdef double *X_data_ptr + cdef int *X_indptr_ptr + cdef int *X_indices_ptr + cdef double *Y_data_ptr + cdef double *sample_weight_data + +#------------------------------------------------------------------------------ + +""" +Dataset abstractions for sequential data access. +WARNING: Do not edit .pxd file directly, it is generated from .pxd.tp +""" + +cimport numpy as np + +# SequentialDataset and its two concrete subclasses are (optionally randomized) +# iterators over the rows of a matrix X and corresponding target values y. 
+ + +cdef class SequentialDataset32: + cdef int current_index + cdef np.ndarray index + cdef int *index_data_ptr + cdef Py_ssize_t n_samples + cdef np.uint32_t seed + + cdef void shuffle(self, np.uint32_t seed) nogil + cdef int _get_next_index(self) nogil + cdef int _get_random_index(self) nogil + + cdef void _sample(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight, + int current_index) nogil + cdef void next(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight) nogil + cdef int random(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight) nogil + + +cdef class ArrayDataset32(SequentialDataset32): + cdef np.ndarray X + cdef np.ndarray Y + cdef np.ndarray sample_weights + cdef Py_ssize_t n_features + cdef np.npy_intp X_stride + cdef float *X_data_ptr + cdef float *Y_data_ptr + cdef np.ndarray feature_indices + cdef int *feature_indices_ptr + cdef float *sample_weight_data + + +cdef class CSRDataset32(SequentialDataset32): + cdef np.ndarray X_data + cdef np.ndarray X_indptr + cdef np.ndarray X_indices + cdef np.ndarray Y + cdef np.ndarray sample_weights + cdef float *X_data_ptr + cdef int *X_indptr_ptr + cdef int *X_indices_ptr + cdef float *Y_data_ptr + cdef float *sample_weight_data diff --git a/sklearn/utils/seq_dataset.pyx b/sklearn/utils/seq_dataset.pyx new file mode 100644 index 0000000000000..6fa274771defe --- /dev/null +++ b/sklearn/utils/seq_dataset.pyx @@ -0,0 +1,653 @@ +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False + +#------------------------------------------------------------------------------ + +""" +Dataset abstractions for sequential data access. +WARNING: Do not edit .pyx file directly, it is generated from .pyx.tp +""" + +cimport cython +from libc.limits cimport INT_MAX +cimport numpy as np +import numpy as np + +np.import_array() + +from ._random cimport our_rand_r + +cdef class SequentialDataset64: + """Base class for datasets with sequential data access. + + SequentialDataset is used to iterate over the rows of a matrix X and + corresponding target values y, i.e. to iterate over samples. + There are two methods to get the next sample: + - next : Iterate sequentially (optionally randomized) + - random : Iterate randomly (with replacement) + + Attributes + ---------- + index : np.ndarray + Index array for fast shuffling. + + index_data_ptr : int + Pointer to the index array. + + current_index : int + Index of current sample in ``index``. + The index of current sample in the data is given by + index_data_ptr[current_index]. + + n_samples : Py_ssize_t + Number of samples in the dataset. + + seed : np.uint32_t + Seed used for random sampling. + + """ + + cdef void next(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight) nogil: + """Get the next example ``x`` from the dataset. + + This method gets the next sample looping sequentially over all samples. + The order can be shuffled with the method ``shuffle``. + Shuffling once before iterating over all samples corresponds to a + random draw without replacement. It is used for instance in SGD solver. + + Parameters + ---------- + x_data_ptr : double** + A pointer to the double array which holds the feature + values of the next example. + + x_ind_ptr : np.intc** + A pointer to the int array which holds the feature + indices of the next example. + + nnz : int* + A pointer to an int holding the number of non-zero + values of the next example. 
+ + y : double* + The target value of the next example. + + sample_weight : double* + The weight of the next example. + """ + cdef int current_index = self._get_next_index() + self._sample(x_data_ptr, x_ind_ptr, nnz, y, sample_weight, + current_index) + + cdef int random(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight) nogil: + """Get a random example ``x`` from the dataset. + + This method gets next sample chosen randomly over a uniform + distribution. It corresponds to a random draw with replacement. + It is used for instance in SAG solver. + + Parameters + ---------- + x_data_ptr : double** + A pointer to the double array which holds the feature + values of the next example. + + x_ind_ptr : np.intc** + A pointer to the int array which holds the feature + indices of the next example. + + nnz : int* + A pointer to an int holding the number of non-zero + values of the next example. + + y : double* + The target value of the next example. + + sample_weight : double* + The weight of the next example. + + Returns + ------- + current_index : int + Index of current sample. + """ + cdef int current_index = self._get_random_index() + self._sample(x_data_ptr, x_ind_ptr, nnz, y, sample_weight, + current_index) + return current_index + + cdef void shuffle(self, np.uint32_t seed) nogil: + """Permutes the ordering of examples.""" + # Fisher-Yates shuffle + cdef int *ind = self.index_data_ptr + cdef int n = self.n_samples + cdef unsigned i, j + for i in range(n - 1): + j = i + our_rand_r(&seed) % (n - i) + ind[i], ind[j] = ind[j], ind[i] + + cdef int _get_next_index(self) nogil: + cdef int current_index = self.current_index + if current_index >= (self.n_samples - 1): + current_index = -1 + + current_index += 1 + self.current_index = current_index + return self.current_index + + cdef int _get_random_index(self) nogil: + cdef int n = self.n_samples + cdef int current_index = our_rand_r(&self.seed) % n + self.current_index = current_index + return current_index + + cdef void _sample(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight, + int current_index) nogil: + pass + + def _shuffle_py(self, np.uint32_t seed): + """python function used for easy testing""" + self.shuffle(seed) + + def _next_py(self): + """python function used for easy testing""" + cdef int current_index = self._get_next_index() + return self._sample_py(current_index) + + def _random_py(self): + """python function used for easy testing""" + cdef int current_index = self._get_random_index() + return self._sample_py(current_index) + + def _sample_py(self, int current_index): + """python function used for easy testing""" + cdef double* x_data_ptr + cdef int* x_indices_ptr + cdef int nnz, j + cdef double y, sample_weight + + # call _sample in cython + self._sample(&x_data_ptr, &x_indices_ptr, &nnz, &y, &sample_weight, + current_index) + + # transform the pointed data in numpy CSR array + cdef np.ndarray[double, ndim=1] x_data = np.empty(nnz, + dtype=np.float64) + cdef np.ndarray[int, ndim=1] x_indices = np.empty(nnz, dtype=np.int32) + cdef np.ndarray[int, ndim=1] x_indptr = np.asarray([0, nnz], + dtype=np.int32) + + for j in range(nnz): + x_data[j] = x_data_ptr[j] + x_indices[j] = x_indices_ptr[j] + + cdef int sample_idx = self.index_data_ptr[current_index] + + return (x_data, x_indices, x_indptr), y, sample_weight, sample_idx + + +cdef class ArrayDataset64(SequentialDataset64): + """Dataset backed by a two-dimensional numpy array. 
+ + The dtype of the numpy array is expected to be ``np.float64`` (double) + and C-style memory layout. + """ + + def __cinit__(self, np.ndarray[double, ndim=2, mode='c'] X, + np.ndarray[double, ndim=1, mode='c'] Y, + np.ndarray[double, ndim=1, mode='c'] sample_weights, + np.uint32_t seed=1): + """A ``SequentialDataset`` backed by a two-dimensional numpy array. + + Parameters + ---------- + X : ndarray, dtype=double, ndim=2, mode='c' + The sample array, of shape(n_samples, n_features) + + Y : ndarray, dtype=double, ndim=1, mode='c' + The target array, of shape(n_samples, ) + + sample_weights : ndarray, dtype=double, ndim=1, mode='c' + The weight of each sample, of shape(n_samples,) + """ + if X.shape[0] > INT_MAX or X.shape[1] > INT_MAX: + raise ValueError("More than %d samples or features not supported;" + " got (%d, %d)." + % (INT_MAX, X.shape[0], X.shape[1])) + + # keep a reference to the data to prevent garbage collection + self.X = X + self.Y = Y + self.sample_weights = sample_weights + + self.n_samples = X.shape[0] + self.n_features = X.shape[1] + + cdef np.ndarray[int, ndim=1, mode='c'] feature_indices = \ + np.arange(0, self.n_features, dtype=np.intc) + self.feature_indices = feature_indices + self.feature_indices_ptr = feature_indices.data + + self.current_index = -1 + self.X_stride = X.strides[0] / X.itemsize + self.X_data_ptr = X.data + self.Y_data_ptr = Y.data + self.sample_weight_data = sample_weights.data + + # Use index array for fast shuffling + cdef np.ndarray[int, ndim=1, mode='c'] index = \ + np.arange(0, self.n_samples, dtype=np.intc) + self.index = index + self.index_data_ptr = index.data + # seed should not be 0 for our_rand_r + self.seed = max(seed, 1) + + cdef void _sample(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight, + int current_index) nogil: + cdef long long sample_idx = self.index_data_ptr[current_index] + cdef long long offset = sample_idx * self.X_stride + + y[0] = self.Y_data_ptr[sample_idx] + x_data_ptr[0] = self.X_data_ptr + offset + x_ind_ptr[0] = self.feature_indices_ptr + nnz[0] = self.n_features + sample_weight[0] = self.sample_weight_data[sample_idx] + + +cdef class CSRDataset64(SequentialDataset64): + """A ``SequentialDataset`` backed by a scipy sparse CSR matrix. """ + + def __cinit__(self, np.ndarray[double, ndim=1, mode='c'] X_data, + np.ndarray[int, ndim=1, mode='c'] X_indptr, + np.ndarray[int, ndim=1, mode='c'] X_indices, + np.ndarray[double, ndim=1, mode='c'] Y, + np.ndarray[double, ndim=1, mode='c'] sample_weights, + np.uint32_t seed=1): + """Dataset backed by a scipy sparse CSR matrix. + + The feature indices of ``x`` are given by x_ind_ptr[0:nnz]. + The corresponding feature values are given by + x_data_ptr[0:nnz]. + + Parameters + ---------- + X_data : ndarray, dtype=double, ndim=1, mode='c' + The data array of the CSR features matrix. + + X_indptr : ndarray, dtype=np.intc, ndim=1, mode='c' + The index pointer array of the CSR features matrix. + + X_indices : ndarray, dtype=np.intc, ndim=1, mode='c' + The column indices array of the CSR features matrix. + + Y : ndarray, dtype=double, ndim=1, mode='c' + The target values. + + sample_weights : ndarray, dtype=double, ndim=1, mode='c' + The weight of each sample. 
+ """ + # keep a reference to the data to prevent garbage collection + self.X_data = X_data + self.X_indptr = X_indptr + self.X_indices = X_indices + self.Y = Y + self.sample_weights = sample_weights + + self.n_samples = Y.shape[0] + self.current_index = -1 + self.X_data_ptr = X_data.data + self.X_indptr_ptr = X_indptr.data + self.X_indices_ptr = X_indices.data + + self.Y_data_ptr = Y.data + self.sample_weight_data = sample_weights.data + + # Use index array for fast shuffling + cdef np.ndarray[int, ndim=1, mode='c'] idx = np.arange(self.n_samples, + dtype=np.intc) + self.index = idx + self.index_data_ptr = idx.data + # seed should not be 0 for our_rand_r + self.seed = max(seed, 1) + + cdef void _sample(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight, + int current_index) nogil: + cdef long long sample_idx = self.index_data_ptr[current_index] + cdef long long offset = self.X_indptr_ptr[sample_idx] + y[0] = self.Y_data_ptr[sample_idx] + x_data_ptr[0] = self.X_data_ptr + offset + x_ind_ptr[0] = self.X_indices_ptr + offset + nnz[0] = self.X_indptr_ptr[sample_idx + 1] - offset + sample_weight[0] = self.sample_weight_data[sample_idx] + + +#------------------------------------------------------------------------------ + +""" +Dataset abstractions for sequential data access. +WARNING: Do not edit .pyx file directly, it is generated from .pyx.tp +""" + +cimport cython +from libc.limits cimport INT_MAX +cimport numpy as np +import numpy as np + +np.import_array() + +from ._random cimport our_rand_r + +cdef class SequentialDataset32: + """Base class for datasets with sequential data access. + + SequentialDataset is used to iterate over the rows of a matrix X and + corresponding target values y, i.e. to iterate over samples. + There are two methods to get the next sample: + - next : Iterate sequentially (optionally randomized) + - random : Iterate randomly (with replacement) + + Attributes + ---------- + index : np.ndarray + Index array for fast shuffling. + + index_data_ptr : int + Pointer to the index array. + + current_index : int + Index of current sample in ``index``. + The index of current sample in the data is given by + index_data_ptr[current_index]. + + n_samples : Py_ssize_t + Number of samples in the dataset. + + seed : np.uint32_t + Seed used for random sampling. + + """ + + cdef void next(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight) nogil: + """Get the next example ``x`` from the dataset. + + This method gets the next sample looping sequentially over all samples. + The order can be shuffled with the method ``shuffle``. + Shuffling once before iterating over all samples corresponds to a + random draw without replacement. It is used for instance in SGD solver. + + Parameters + ---------- + x_data_ptr : float** + A pointer to the float array which holds the feature + values of the next example. + + x_ind_ptr : np.intc** + A pointer to the int array which holds the feature + indices of the next example. + + nnz : int* + A pointer to an int holding the number of non-zero + values of the next example. + + y : float* + The target value of the next example. + + sample_weight : float* + The weight of the next example. + """ + cdef int current_index = self._get_next_index() + self._sample(x_data_ptr, x_ind_ptr, nnz, y, sample_weight, + current_index) + + cdef int random(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight) nogil: + """Get a random example ``x`` from the dataset. 
+ + This method gets next sample chosen randomly over a uniform + distribution. It corresponds to a random draw with replacement. + It is used for instance in SAG solver. + + Parameters + ---------- + x_data_ptr : float** + A pointer to the float array which holds the feature + values of the next example. + + x_ind_ptr : np.intc** + A pointer to the int array which holds the feature + indices of the next example. + + nnz : int* + A pointer to an int holding the number of non-zero + values of the next example. + + y : float* + The target value of the next example. + + sample_weight : float* + The weight of the next example. + + Returns + ------- + current_index : int + Index of current sample. + """ + cdef int current_index = self._get_random_index() + self._sample(x_data_ptr, x_ind_ptr, nnz, y, sample_weight, + current_index) + return current_index + + cdef void shuffle(self, np.uint32_t seed) nogil: + """Permutes the ordering of examples.""" + # Fisher-Yates shuffle + cdef int *ind = self.index_data_ptr + cdef int n = self.n_samples + cdef unsigned i, j + for i in range(n - 1): + j = i + our_rand_r(&seed) % (n - i) + ind[i], ind[j] = ind[j], ind[i] + + cdef int _get_next_index(self) nogil: + cdef int current_index = self.current_index + if current_index >= (self.n_samples - 1): + current_index = -1 + + current_index += 1 + self.current_index = current_index + return self.current_index + + cdef int _get_random_index(self) nogil: + cdef int n = self.n_samples + cdef int current_index = our_rand_r(&self.seed) % n + self.current_index = current_index + return current_index + + cdef void _sample(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight, + int current_index) nogil: + pass + + def _shuffle_py(self, np.uint32_t seed): + """python function used for easy testing""" + self.shuffle(seed) + + def _next_py(self): + """python function used for easy testing""" + cdef int current_index = self._get_next_index() + return self._sample_py(current_index) + + def _random_py(self): + """python function used for easy testing""" + cdef int current_index = self._get_random_index() + return self._sample_py(current_index) + + def _sample_py(self, int current_index): + """python function used for easy testing""" + cdef float* x_data_ptr + cdef int* x_indices_ptr + cdef int nnz, j + cdef float y, sample_weight + + # call _sample in cython + self._sample(&x_data_ptr, &x_indices_ptr, &nnz, &y, &sample_weight, + current_index) + + # transform the pointed data in numpy CSR array + cdef np.ndarray[float, ndim=1] x_data = np.empty(nnz, + dtype=np.float32) + cdef np.ndarray[int, ndim=1] x_indices = np.empty(nnz, dtype=np.int32) + cdef np.ndarray[int, ndim=1] x_indptr = np.asarray([0, nnz], + dtype=np.int32) + + for j in range(nnz): + x_data[j] = x_data_ptr[j] + x_indices[j] = x_indices_ptr[j] + + cdef int sample_idx = self.index_data_ptr[current_index] + + return (x_data, x_indices, x_indptr), y, sample_weight, sample_idx + + +cdef class ArrayDataset32(SequentialDataset32): + """Dataset backed by a two-dimensional numpy array. + + The dtype of the numpy array is expected to be ``np.float32`` (float) + and C-style memory layout. + """ + + def __cinit__(self, np.ndarray[float, ndim=2, mode='c'] X, + np.ndarray[float, ndim=1, mode='c'] Y, + np.ndarray[float, ndim=1, mode='c'] sample_weights, + np.uint32_t seed=1): + """A ``SequentialDataset`` backed by a two-dimensional numpy array. 
+ + Parameters + ---------- + X : ndarray, dtype=float, ndim=2, mode='c' + The sample array, of shape(n_samples, n_features) + + Y : ndarray, dtype=float, ndim=1, mode='c' + The target array, of shape(n_samples, ) + + sample_weights : ndarray, dtype=float, ndim=1, mode='c' + The weight of each sample, of shape(n_samples,) + """ + if X.shape[0] > INT_MAX or X.shape[1] > INT_MAX: + raise ValueError("More than %d samples or features not supported;" + " got (%d, %d)." + % (INT_MAX, X.shape[0], X.shape[1])) + + # keep a reference to the data to prevent garbage collection + self.X = X + self.Y = Y + self.sample_weights = sample_weights + + self.n_samples = X.shape[0] + self.n_features = X.shape[1] + + cdef np.ndarray[int, ndim=1, mode='c'] feature_indices = \ + np.arange(0, self.n_features, dtype=np.intc) + self.feature_indices = feature_indices + self.feature_indices_ptr = feature_indices.data + + self.current_index = -1 + self.X_stride = X.strides[0] / X.itemsize + self.X_data_ptr = X.data + self.Y_data_ptr = Y.data + self.sample_weight_data = sample_weights.data + + # Use index array for fast shuffling + cdef np.ndarray[int, ndim=1, mode='c'] index = \ + np.arange(0, self.n_samples, dtype=np.intc) + self.index = index + self.index_data_ptr = index.data + # seed should not be 0 for our_rand_r + self.seed = max(seed, 1) + + cdef void _sample(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight, + int current_index) nogil: + cdef long long sample_idx = self.index_data_ptr[current_index] + cdef long long offset = sample_idx * self.X_stride + + y[0] = self.Y_data_ptr[sample_idx] + x_data_ptr[0] = self.X_data_ptr + offset + x_ind_ptr[0] = self.feature_indices_ptr + nnz[0] = self.n_features + sample_weight[0] = self.sample_weight_data[sample_idx] + + +cdef class CSRDataset32(SequentialDataset32): + """A ``SequentialDataset`` backed by a scipy sparse CSR matrix. """ + + def __cinit__(self, np.ndarray[float, ndim=1, mode='c'] X_data, + np.ndarray[int, ndim=1, mode='c'] X_indptr, + np.ndarray[int, ndim=1, mode='c'] X_indices, + np.ndarray[float, ndim=1, mode='c'] Y, + np.ndarray[float, ndim=1, mode='c'] sample_weights, + np.uint32_t seed=1): + """Dataset backed by a scipy sparse CSR matrix. + + The feature indices of ``x`` are given by x_ind_ptr[0:nnz]. + The corresponding feature values are given by + x_data_ptr[0:nnz]. + + Parameters + ---------- + X_data : ndarray, dtype=float, ndim=1, mode='c' + The data array of the CSR features matrix. + + X_indptr : ndarray, dtype=np.intc, ndim=1, mode='c' + The index pointer array of the CSR features matrix. + + X_indices : ndarray, dtype=np.intc, ndim=1, mode='c' + The column indices array of the CSR features matrix. + + Y : ndarray, dtype=float, ndim=1, mode='c' + The target values. + + sample_weights : ndarray, dtype=float, ndim=1, mode='c' + The weight of each sample. 
+ """ + # keep a reference to the data to prevent garbage collection + self.X_data = X_data + self.X_indptr = X_indptr + self.X_indices = X_indices + self.Y = Y + self.sample_weights = sample_weights + + self.n_samples = Y.shape[0] + self.current_index = -1 + self.X_data_ptr = X_data.data + self.X_indptr_ptr = X_indptr.data + self.X_indices_ptr = X_indices.data + + self.Y_data_ptr = Y.data + self.sample_weight_data = sample_weights.data + + # Use index array for fast shuffling + cdef np.ndarray[int, ndim=1, mode='c'] idx = np.arange(self.n_samples, + dtype=np.intc) + self.index = idx + self.index_data_ptr = idx.data + # seed should not be 0 for our_rand_r + self.seed = max(seed, 1) + + cdef void _sample(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight, + int current_index) nogil: + cdef long long sample_idx = self.index_data_ptr[current_index] + cdef long long offset = self.X_indptr_ptr[sample_idx] + y[0] = self.Y_data_ptr[sample_idx] + x_data_ptr[0] = self.X_data_ptr + offset + x_ind_ptr[0] = self.X_indices_ptr + offset + nnz[0] = self.X_indptr_ptr[sample_idx + 1] - offset + sample_weight[0] = self.sample_weight_data[sample_idx] + From 9c77fbea1b021898a9c888587468ff22443cb87b Mon Sep 17 00:00:00 2001 From: Angela Ambroz Date: Fri, 25 Oct 2019 14:07:30 -0400 Subject: [PATCH 06/19] Removing unexpected cython files --- sklearn/linear_model/_least_angle.py | 7 +- sklearn/linear_model/sag_fast.pyx | 1357 -------------------------- sklearn/utils/seq_dataset.pxd | 116 --- sklearn/utils/seq_dataset.pyx | 653 ------------- 4 files changed, 4 insertions(+), 2129 deletions(-) delete mode 100644 sklearn/linear_model/sag_fast.pyx delete mode 100644 sklearn/utils/seq_dataset.pxd delete mode 100644 sklearn/utils/seq_dataset.pyx diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 9012268978a57..ed17b8b7f30c4 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -810,7 +810,7 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): setting ``fit_path`` to ``False`` will lead to a speedup, especially with a small alpha. - jitter : float, default=0.001 + jitter : float, default=None Uniform noise parameter, added to the y values, to satisfy the model's assumption of one-at-a-time computations (Efron et al. 2004). @@ -972,7 +972,8 @@ def fit(self, X, y, Xy=None): max_iter = self.max_iter if self.jitter: - noise = np.random.RandomState(0).uniform(high=self.jitter, size=len(y)) + noise = np.random.RandomState(0).uniform(high=self.jitter, + size=len(y)) if y.ndim == 2: noise = noise.reshape(-1, 1) y = y + noise @@ -1044,7 +1045,7 @@ class LassoLars(Lars): setting ``fit_path`` to ``False`` will lead to a speedup, especially with a small alpha. - jitter : float, default=0.001 + jitter : float, default=None Uniform noise parameter, added to the y values, to satisfy \ the model's assumption of one-at-a-time computations \ (Efron et al. 2004). 
diff --git a/sklearn/linear_model/sag_fast.pyx b/sklearn/linear_model/sag_fast.pyx deleted file mode 100644 index 6d48b65bda560..0000000000000 --- a/sklearn/linear_model/sag_fast.pyx +++ /dev/null @@ -1,1357 +0,0 @@ - -#------------------------------------------------------------------------------ - -# cython: cdivision=True -# cython: boundscheck=False -# cython: wraparound=False -# -# Authors: Danny Sullivan -# Tom Dupre la Tour -# Arthur Mensch y: - return x - return y - -cdef inline float fmax32(float x, float y) nogil: - if x > y: - return x - return y - -cdef double _logsumexp64(double* arr, int n_classes) nogil: - """Computes the sum of arr assuming arr is in the log domain. - - Returns log(sum(exp(arr))) while minimizing the possibility of - over/underflow. - """ - # Use the max to normalize, as with the log this is what accumulates - # the less errors - cdef double vmax = arr[0] - cdef double out = 0.0 - cdef int i - - for i in range(1, n_classes): - if vmax < arr[i]: - vmax = arr[i] - - for i in range(n_classes): - out += exp(arr[i] - vmax) - - return log(out) + vmax - -cdef float _logsumexp32(float* arr, int n_classes) nogil: - """Computes the sum of arr assuming arr is in the log domain. - - Returns log(sum(exp(arr))) while minimizing the possibility of - over/underflow. - """ - # Use the max to normalize, as with the log this is what accumulates - # the less errors - cdef float vmax = arr[0] - cdef float out = 0.0 - cdef int i - - for i in range(1, n_classes): - if vmax < arr[i]: - vmax = arr[i] - - for i in range(n_classes): - out += exp(arr[i] - vmax) - - return log(out) + vmax - -cdef class MultinomialLogLoss64: - cdef double _loss(self, double* prediction, double y, int n_classes, - double sample_weight) nogil: - r"""Multinomial Logistic regression loss. - - The multinomial logistic loss for one sample is: - loss = - sw \sum_c \delta_{y,c} (prediction[c] - logsumexp(prediction)) - = sw (logsumexp(prediction) - prediction[y]) - - where: - prediction = dot(x_sample, weights) + intercept - \delta_{y,c} = 1 if (y == c) else 0 - sw = sample_weight - - Parameters - ---------- - prediction : pointer to a np.ndarray[double] of shape (n_classes,) - Prediction of the multinomial classifier, for current sample. - - y : double, between 0 and n_classes - 1 - Indice of the correct class for current sample (i.e. label encoded). - - n_classes : integer - Total number of classes. - - sample_weight : double - Weight of current sample. - - Returns - ------- - loss : double - Multinomial loss for current sample. - - Reference - --------- - Bishop, C. M. (2006). Pattern recognition and machine learning. - Springer. (Chapter 4.3.4) - """ - cdef double logsumexp_prediction = _logsumexp64(prediction, n_classes) - cdef double loss - - # y is the indice of the correct class of current sample. - loss = (logsumexp_prediction - prediction[int(y)]) * sample_weight - return loss - - cdef void _dloss(self, double* prediction, double y, int n_classes, - double sample_weight, double* gradient_ptr) nogil: - r"""Multinomial Logistic regression gradient of the loss. - - The gradient of the multinomial logistic loss with respect to a class c, - and for one sample is: - grad_c = - sw * (p[c] - \delta_{y,c}) - - where: - p[c] = exp(logsumexp(prediction) - prediction[c]) - prediction = dot(sample, weights) + intercept - \delta_{y,c} = 1 if (y == c) else 0 - sw = sample_weight - - Note that to obtain the true gradient, this value has to be multiplied - by the sample vector x. 
- - Parameters - ---------- - prediction : pointer to a np.ndarray[double] of shape (n_classes,) - Prediction of the multinomial classifier, for current sample. - - y : double, between 0 and n_classes - 1 - Indice of the correct class for current sample (i.e. label encoded) - - n_classes : integer - Total number of classes. - - sample_weight : double - Weight of current sample. - - gradient_ptr : pointer to a np.ndarray[double] of shape (n_classes,) - Gradient vector to be filled. - - Reference - --------- - Bishop, C. M. (2006). Pattern recognition and machine learning. - Springer. (Chapter 4.3.4) - """ - cdef double logsumexp_prediction = _logsumexp64(prediction, n_classes) - cdef int class_ind - - for class_ind in range(n_classes): - gradient_ptr[class_ind] = exp(prediction[class_ind] - - logsumexp_prediction) - - # y is the indice of the correct class of current sample. - if class_ind == y: - gradient_ptr[class_ind] -= 1.0 - - gradient_ptr[class_ind] *= sample_weight - - def __reduce__(self): - return MultinomialLogLoss64, () - -cdef class MultinomialLogLoss32: - cdef float _loss(self, float* prediction, float y, int n_classes, - float sample_weight) nogil: - r"""Multinomial Logistic regression loss. - - The multinomial logistic loss for one sample is: - loss = - sw \sum_c \delta_{y,c} (prediction[c] - logsumexp(prediction)) - = sw (logsumexp(prediction) - prediction[y]) - - where: - prediction = dot(x_sample, weights) + intercept - \delta_{y,c} = 1 if (y == c) else 0 - sw = sample_weight - - Parameters - ---------- - prediction : pointer to a np.ndarray[float] of shape (n_classes,) - Prediction of the multinomial classifier, for current sample. - - y : float, between 0 and n_classes - 1 - Indice of the correct class for current sample (i.e. label encoded). - - n_classes : integer - Total number of classes. - - sample_weight : float - Weight of current sample. - - Returns - ------- - loss : float - Multinomial loss for current sample. - - Reference - --------- - Bishop, C. M. (2006). Pattern recognition and machine learning. - Springer. (Chapter 4.3.4) - """ - cdef float logsumexp_prediction = _logsumexp32(prediction, n_classes) - cdef float loss - - # y is the indice of the correct class of current sample. - loss = (logsumexp_prediction - prediction[int(y)]) * sample_weight - return loss - - cdef void _dloss(self, float* prediction, float y, int n_classes, - float sample_weight, float* gradient_ptr) nogil: - r"""Multinomial Logistic regression gradient of the loss. - - The gradient of the multinomial logistic loss with respect to a class c, - and for one sample is: - grad_c = - sw * (p[c] - \delta_{y,c}) - - where: - p[c] = exp(logsumexp(prediction) - prediction[c]) - prediction = dot(sample, weights) + intercept - \delta_{y,c} = 1 if (y == c) else 0 - sw = sample_weight - - Note that to obtain the true gradient, this value has to be multiplied - by the sample vector x. - - Parameters - ---------- - prediction : pointer to a np.ndarray[float] of shape (n_classes,) - Prediction of the multinomial classifier, for current sample. - - y : float, between 0 and n_classes - 1 - Indice of the correct class for current sample (i.e. label encoded) - - n_classes : integer - Total number of classes. - - sample_weight : float - Weight of current sample. - - gradient_ptr : pointer to a np.ndarray[float] of shape (n_classes,) - Gradient vector to be filled. - - Reference - --------- - Bishop, C. M. (2006). Pattern recognition and machine learning. - Springer. 
(Chapter 4.3.4) - """ - cdef float logsumexp_prediction = _logsumexp32(prediction, n_classes) - cdef int class_ind - - for class_ind in range(n_classes): - gradient_ptr[class_ind] = exp(prediction[class_ind] - - logsumexp_prediction) - - # y is the indice of the correct class of current sample. - if class_ind == y: - gradient_ptr[class_ind] -= 1.0 - - gradient_ptr[class_ind] *= sample_weight - - def __reduce__(self): - return MultinomialLogLoss32, () - -cdef inline double _soft_thresholding64(double x, double shrinkage) nogil: - return fmax64(x - shrinkage, 0) - fmax64(- x - shrinkage, 0) - -cdef inline float _soft_thresholding32(float x, float shrinkage) nogil: - return fmax32(x - shrinkage, 0) - fmax32(- x - shrinkage, 0) - -def sag64(SequentialDataset64 dataset, - np.ndarray[double, ndim=2, mode='c'] weights_array, - np.ndarray[double, ndim=1, mode='c'] intercept_array, - int n_samples, - int n_features, - int n_classes, - double tol, - int max_iter, - str loss_function, - double step_size, - double alpha, - double beta, - np.ndarray[double, ndim=2, mode='c'] sum_gradient_init, - np.ndarray[double, ndim=2, mode='c'] gradient_memory_init, - np.ndarray[bint, ndim=1, mode='c'] seen_init, - int num_seen, - bint fit_intercept, - np.ndarray[double, ndim=1, mode='c'] intercept_sum_gradient_init, - double intercept_decay, - bint saga, - bint verbose): - """Stochastic Average Gradient (SAG) and SAGA solvers. - - Used in Ridge and LogisticRegression. - - Reference - --------- - Schmidt, M., Roux, N. L., & Bach, F. (2013). - Minimizing finite sums with the stochastic average gradient - https://hal.inria.fr/hal-00860051/document - (section 4.3) - - Defazio, A., Bach, F., Lacoste-Julien, S. (2014), - SAGA: A Fast Incremental Gradient Method With Support - for Non-Strongly Convex Composite Objectives - https://arxiv.org/abs/1407.0202 - - """ - # the data pointer for x, the current sample - cdef double *x_data_ptr = NULL - # the index pointer for the column of the data - cdef int *x_ind_ptr = NULL - # the number of non-zero features for current sample - cdef int xnnz = -1 - # the label value for current sample - # the label value for curent sample - cdef double y - # the sample weight - cdef double sample_weight - - # helper variable for indexes - cdef int f_idx, s_idx, feature_ind, class_ind, j - # the number of pass through all samples - cdef int n_iter = 0 - # helper to track iterations through samples - cdef int sample_itr - # the index (row number) of the current sample - cdef int sample_ind - - # the maximum change in weights, used to compute stopping criteria - cdef double max_change - # a holder variable for the max weight, used to compute stopping criteria - cdef double max_weight - - # the start time of the fit - cdef time_t start_time - # the end time of the fit - cdef time_t end_time - - # precomputation since the step size does not change in this implementation - cdef double wscale_update = 1.0 - step_size * alpha - - # vector of booleans indicating whether this sample has been seen - cdef bint* seen = seen_init.data - - # helper for cumulative sum - cdef double cum_sum - - # the pointer to the coef_ or weights - cdef double* weights = weights_array.data - # the pointer to the intercept_array - cdef double* intercept = intercept_array.data - - # the pointer to the intercept_sum_gradient - cdef double* intercept_sum_gradient = \ - intercept_sum_gradient_init.data - - # the sum of gradients for each feature - cdef double* sum_gradient = sum_gradient_init.data - # the previously seen 
gradient for each sample - cdef double* gradient_memory = gradient_memory_init.data - - # the cumulative sums needed for JIT params - cdef np.ndarray[double, ndim=1] cumulative_sums_array = \ - np.empty(n_samples, dtype=np.float64, order="c") - cdef double* cumulative_sums = cumulative_sums_array.data - - # the index for the last time this feature was updated - cdef np.ndarray[int, ndim=1] feature_hist_array = \ - np.zeros(n_features, dtype=np.int32, order="c") - cdef int* feature_hist = feature_hist_array.data - - # the previous weights to use to compute stopping criteria - cdef np.ndarray[double, ndim=2] previous_weights_array = \ - np.zeros((n_features, n_classes), dtype=np.float64, order="c") - cdef double* previous_weights = previous_weights_array.data - - cdef np.ndarray[double, ndim=1] prediction_array = \ - np.zeros(n_classes, dtype=np.float64, order="c") - cdef double* prediction = prediction_array.data - - cdef np.ndarray[double, ndim=1] gradient_array = \ - np.zeros(n_classes, dtype=np.float64, order="c") - cdef double* gradient = gradient_array.data - - # Intermediate variable that need declaration since cython cannot infer when templating - cdef double val - - # Bias correction term in saga - cdef double gradient_correction - - # the scalar used for multiplying z - cdef double wscale = 1.0 - - # return value (-1 if an error occurred, 0 otherwise) - cdef int status = 0 - - # the cumulative sums for each iteration for the sparse implementation - cumulative_sums[0] = 0.0 - - # the multipliative scale needed for JIT params - cdef np.ndarray[double, ndim=1] cumulative_sums_prox_array - cdef double* cumulative_sums_prox - - cdef bint prox = beta > 0 and saga - - # Loss function to optimize - cdef LossFunction loss - # Wether the loss function is multinomial - cdef bint multinomial = False - # Multinomial loss function - cdef MultinomialLogLoss64 multiloss - - if loss_function == "multinomial": - multinomial = True - multiloss = MultinomialLogLoss64() - elif loss_function == "log": - loss = Log() - elif loss_function == "squared": - loss = SquaredLoss() - else: - raise ValueError("Invalid loss parameter: got %s instead of " - "one of ('log', 'squared', 'multinomial')" - % loss_function) - - if prox: - cumulative_sums_prox_array = np.empty(n_samples, - dtype=np.float64, order="c") - cumulative_sums_prox = cumulative_sums_prox_array.data - else: - cumulative_sums_prox = NULL - - with nogil: - start_time = time(NULL) - for n_iter in range(max_iter): - for sample_itr in range(n_samples): - # extract a random sample - sample_ind = dataset.random(&x_data_ptr, &x_ind_ptr, &xnnz, - &y, &sample_weight) - - # cached index for gradient_memory - s_idx = sample_ind * n_classes - - # update the number of samples seen and the seen array - if seen[sample_ind] == 0: - num_seen += 1 - seen[sample_ind] = 1 - - # make the weight updates - if sample_itr > 0: - status = lagged_update64(weights, wscale, xnnz, - n_samples, n_classes, - sample_itr, - cumulative_sums, - cumulative_sums_prox, - feature_hist, - prox, - sum_gradient, - x_ind_ptr, - False, - n_iter) - if status == -1: - break - - # find the current prediction - predict_sample64(x_data_ptr, x_ind_ptr, xnnz, weights, wscale, - intercept, prediction, n_classes) - - # compute the gradient for this sample, given the prediction - if multinomial: - multiloss._dloss(prediction, y, n_classes, sample_weight, - gradient) - else: - gradient[0] = loss._dloss(prediction[0], y) * sample_weight - - # L2 regularization by simply rescaling the weights - wscale *= 
wscale_update - - # make the updates to the sum of gradients - for j in range(xnnz): - feature_ind = x_ind_ptr[j] - val = x_data_ptr[j] - f_idx = feature_ind * n_classes - for class_ind in range(n_classes): - gradient_correction = \ - val * (gradient[class_ind] - - gradient_memory[s_idx + class_ind]) - if saga: - weights[f_idx + class_ind] -= \ - (gradient_correction * step_size - * (1 - 1. / num_seen) / wscale) - sum_gradient[f_idx + class_ind] += gradient_correction - - # fit the intercept - if fit_intercept: - for class_ind in range(n_classes): - gradient_correction = (gradient[class_ind] - - gradient_memory[s_idx + class_ind]) - intercept_sum_gradient[class_ind] += gradient_correction - gradient_correction *= step_size * (1. - 1. / num_seen) - if saga: - intercept[class_ind] -= \ - (step_size * intercept_sum_gradient[class_ind] / - num_seen * intercept_decay) + gradient_correction - else: - intercept[class_ind] -= \ - (step_size * intercept_sum_gradient[class_ind] / - num_seen * intercept_decay) - - # check to see that the intercept is not inf or NaN - if not skl_isfinite64(intercept[class_ind]): - status = -1 - break - # Break from the n_samples outer loop if an error happened - # in the fit_intercept n_classes inner loop - if status == -1: - break - - # update the gradient memory for this sample - for class_ind in range(n_classes): - gradient_memory[s_idx + class_ind] = gradient[class_ind] - - if sample_itr == 0: - cumulative_sums[0] = step_size / (wscale * num_seen) - if prox: - cumulative_sums_prox[0] = step_size * beta / wscale - else: - cumulative_sums[sample_itr] = \ - (cumulative_sums[sample_itr - 1] + - step_size / (wscale * num_seen)) - if prox: - cumulative_sums_prox[sample_itr] = \ - (cumulative_sums_prox[sample_itr - 1] + - step_size * beta / wscale) - # If wscale gets too small, we need to reset the scale. - if wscale < 1e-9: - if verbose: - with gil: - print("rescaling...") - status = scale_weights64( - weights, &wscale, n_features, n_samples, n_classes, - sample_itr, cumulative_sums, - cumulative_sums_prox, - feature_hist, - prox, sum_gradient, n_iter) - if status == -1: - break - - # Break from the n_iter outer loop if an error happened in the - # n_samples inner loop - if status == -1: - break - - # we scale the weights every n_samples iterations and reset the - # just-in-time update system for numerical stability. - status = scale_weights64(weights, &wscale, n_features, - n_samples, - n_classes, n_samples - 1, - cumulative_sums, - cumulative_sums_prox, - feature_hist, - prox, sum_gradient, n_iter) - - if status == -1: - break - # check if the stopping criteria is reached - max_change = 0.0 - max_weight = 0.0 - for idx in range(n_features * n_classes): - max_weight = fmax64(max_weight, fabs(weights[idx])) - max_change = fmax64(max_change, - fabs(weights[idx] - - previous_weights[idx])) - previous_weights[idx] = weights[idx] - if ((max_weight != 0 and max_change / max_weight <= tol) - or max_weight == 0 and max_change == 0): - if verbose: - end_time = time(NULL) - with gil: - print("convergence after %d epochs took %d seconds" % - (n_iter + 1, end_time - start_time)) - break - elif verbose: - printf('Epoch %d, change: %.8f\n', n_iter + 1, - max_change / max_weight) - n_iter += 1 - # We do the error treatment here based on error code in status to avoid - # re-acquiring the GIL within the cython code, which slows the computation - # when the sag/saga solver is used concurrently in multiple Python threads. 
- if status == -1: - raise ValueError(("Floating-point under-/overflow occurred at epoch" - " #%d. Scaling input data with StandardScaler or" - " MinMaxScaler might help.") % n_iter) - - if verbose and n_iter >= max_iter: - end_time = time(NULL) - print(("max_iter reached after %d seconds") % - (end_time - start_time)) - - return num_seen, n_iter - -def sag32(SequentialDataset32 dataset, - np.ndarray[float, ndim=2, mode='c'] weights_array, - np.ndarray[float, ndim=1, mode='c'] intercept_array, - int n_samples, - int n_features, - int n_classes, - double tol, - int max_iter, - str loss_function, - double step_size, - double alpha, - double beta, - np.ndarray[float, ndim=2, mode='c'] sum_gradient_init, - np.ndarray[float, ndim=2, mode='c'] gradient_memory_init, - np.ndarray[bint, ndim=1, mode='c'] seen_init, - int num_seen, - bint fit_intercept, - np.ndarray[float, ndim=1, mode='c'] intercept_sum_gradient_init, - double intercept_decay, - bint saga, - bint verbose): - """Stochastic Average Gradient (SAG) and SAGA solvers. - - Used in Ridge and LogisticRegression. - - Reference - --------- - Schmidt, M., Roux, N. L., & Bach, F. (2013). - Minimizing finite sums with the stochastic average gradient - https://hal.inria.fr/hal-00860051/document - (section 4.3) - - Defazio, A., Bach, F., Lacoste-Julien, S. (2014), - SAGA: A Fast Incremental Gradient Method With Support - for Non-Strongly Convex Composite Objectives - https://arxiv.org/abs/1407.0202 - - """ - # the data pointer for x, the current sample - cdef float *x_data_ptr = NULL - # the index pointer for the column of the data - cdef int *x_ind_ptr = NULL - # the number of non-zero features for current sample - cdef int xnnz = -1 - # the label value for current sample - # the label value for curent sample - cdef float y - # the sample weight - cdef float sample_weight - - # helper variable for indexes - cdef int f_idx, s_idx, feature_ind, class_ind, j - # the number of pass through all samples - cdef int n_iter = 0 - # helper to track iterations through samples - cdef int sample_itr - # the index (row number) of the current sample - cdef int sample_ind - - # the maximum change in weights, used to compute stopping criteria - cdef float max_change - # a holder variable for the max weight, used to compute stopping criteria - cdef float max_weight - - # the start time of the fit - cdef time_t start_time - # the end time of the fit - cdef time_t end_time - - # precomputation since the step size does not change in this implementation - cdef float wscale_update = 1.0 - step_size * alpha - - # vector of booleans indicating whether this sample has been seen - cdef bint* seen = seen_init.data - - # helper for cumulative sum - cdef float cum_sum - - # the pointer to the coef_ or weights - cdef float* weights = weights_array.data - # the pointer to the intercept_array - cdef float* intercept = intercept_array.data - - # the pointer to the intercept_sum_gradient - cdef float* intercept_sum_gradient = \ - intercept_sum_gradient_init.data - - # the sum of gradients for each feature - cdef float* sum_gradient = sum_gradient_init.data - # the previously seen gradient for each sample - cdef float* gradient_memory = gradient_memory_init.data - - # the cumulative sums needed for JIT params - cdef np.ndarray[float, ndim=1] cumulative_sums_array = \ - np.empty(n_samples, dtype=np.float32, order="c") - cdef float* cumulative_sums = cumulative_sums_array.data - - # the index for the last time this feature was updated - cdef np.ndarray[int, ndim=1] feature_hist_array 
= \ - np.zeros(n_features, dtype=np.int32, order="c") - cdef int* feature_hist = feature_hist_array.data - - # the previous weights to use to compute stopping criteria - cdef np.ndarray[float, ndim=2] previous_weights_array = \ - np.zeros((n_features, n_classes), dtype=np.float32, order="c") - cdef float* previous_weights = previous_weights_array.data - - cdef np.ndarray[float, ndim=1] prediction_array = \ - np.zeros(n_classes, dtype=np.float32, order="c") - cdef float* prediction = prediction_array.data - - cdef np.ndarray[float, ndim=1] gradient_array = \ - np.zeros(n_classes, dtype=np.float32, order="c") - cdef float* gradient = gradient_array.data - - # Intermediate variable that need declaration since cython cannot infer when templating - cdef float val - - # Bias correction term in saga - cdef float gradient_correction - - # the scalar used for multiplying z - cdef float wscale = 1.0 - - # return value (-1 if an error occurred, 0 otherwise) - cdef int status = 0 - - # the cumulative sums for each iteration for the sparse implementation - cumulative_sums[0] = 0.0 - - # the multipliative scale needed for JIT params - cdef np.ndarray[float, ndim=1] cumulative_sums_prox_array - cdef float* cumulative_sums_prox - - cdef bint prox = beta > 0 and saga - - # Loss function to optimize - cdef LossFunction loss - # Wether the loss function is multinomial - cdef bint multinomial = False - # Multinomial loss function - cdef MultinomialLogLoss32 multiloss - - if loss_function == "multinomial": - multinomial = True - multiloss = MultinomialLogLoss32() - elif loss_function == "log": - loss = Log() - elif loss_function == "squared": - loss = SquaredLoss() - else: - raise ValueError("Invalid loss parameter: got %s instead of " - "one of ('log', 'squared', 'multinomial')" - % loss_function) - - if prox: - cumulative_sums_prox_array = np.empty(n_samples, - dtype=np.float32, order="c") - cumulative_sums_prox = cumulative_sums_prox_array.data - else: - cumulative_sums_prox = NULL - - with nogil: - start_time = time(NULL) - for n_iter in range(max_iter): - for sample_itr in range(n_samples): - # extract a random sample - sample_ind = dataset.random(&x_data_ptr, &x_ind_ptr, &xnnz, - &y, &sample_weight) - - # cached index for gradient_memory - s_idx = sample_ind * n_classes - - # update the number of samples seen and the seen array - if seen[sample_ind] == 0: - num_seen += 1 - seen[sample_ind] = 1 - - # make the weight updates - if sample_itr > 0: - status = lagged_update32(weights, wscale, xnnz, - n_samples, n_classes, - sample_itr, - cumulative_sums, - cumulative_sums_prox, - feature_hist, - prox, - sum_gradient, - x_ind_ptr, - False, - n_iter) - if status == -1: - break - - # find the current prediction - predict_sample32(x_data_ptr, x_ind_ptr, xnnz, weights, wscale, - intercept, prediction, n_classes) - - # compute the gradient for this sample, given the prediction - if multinomial: - multiloss._dloss(prediction, y, n_classes, sample_weight, - gradient) - else: - gradient[0] = loss._dloss(prediction[0], y) * sample_weight - - # L2 regularization by simply rescaling the weights - wscale *= wscale_update - - # make the updates to the sum of gradients - for j in range(xnnz): - feature_ind = x_ind_ptr[j] - val = x_data_ptr[j] - f_idx = feature_ind * n_classes - for class_ind in range(n_classes): - gradient_correction = \ - val * (gradient[class_ind] - - gradient_memory[s_idx + class_ind]) - if saga: - weights[f_idx + class_ind] -= \ - (gradient_correction * step_size - * (1 - 1. 
/ num_seen) / wscale) - sum_gradient[f_idx + class_ind] += gradient_correction - - # fit the intercept - if fit_intercept: - for class_ind in range(n_classes): - gradient_correction = (gradient[class_ind] - - gradient_memory[s_idx + class_ind]) - intercept_sum_gradient[class_ind] += gradient_correction - gradient_correction *= step_size * (1. - 1. / num_seen) - if saga: - intercept[class_ind] -= \ - (step_size * intercept_sum_gradient[class_ind] / - num_seen * intercept_decay) + gradient_correction - else: - intercept[class_ind] -= \ - (step_size * intercept_sum_gradient[class_ind] / - num_seen * intercept_decay) - - # check to see that the intercept is not inf or NaN - if not skl_isfinite32(intercept[class_ind]): - status = -1 - break - # Break from the n_samples outer loop if an error happened - # in the fit_intercept n_classes inner loop - if status == -1: - break - - # update the gradient memory for this sample - for class_ind in range(n_classes): - gradient_memory[s_idx + class_ind] = gradient[class_ind] - - if sample_itr == 0: - cumulative_sums[0] = step_size / (wscale * num_seen) - if prox: - cumulative_sums_prox[0] = step_size * beta / wscale - else: - cumulative_sums[sample_itr] = \ - (cumulative_sums[sample_itr - 1] + - step_size / (wscale * num_seen)) - if prox: - cumulative_sums_prox[sample_itr] = \ - (cumulative_sums_prox[sample_itr - 1] + - step_size * beta / wscale) - # If wscale gets too small, we need to reset the scale. - if wscale < 1e-9: - if verbose: - with gil: - print("rescaling...") - status = scale_weights32( - weights, &wscale, n_features, n_samples, n_classes, - sample_itr, cumulative_sums, - cumulative_sums_prox, - feature_hist, - prox, sum_gradient, n_iter) - if status == -1: - break - - # Break from the n_iter outer loop if an error happened in the - # n_samples inner loop - if status == -1: - break - - # we scale the weights every n_samples iterations and reset the - # just-in-time update system for numerical stability. - status = scale_weights32(weights, &wscale, n_features, - n_samples, - n_classes, n_samples - 1, - cumulative_sums, - cumulative_sums_prox, - feature_hist, - prox, sum_gradient, n_iter) - - if status == -1: - break - # check if the stopping criteria is reached - max_change = 0.0 - max_weight = 0.0 - for idx in range(n_features * n_classes): - max_weight = fmax32(max_weight, fabs(weights[idx])) - max_change = fmax32(max_change, - fabs(weights[idx] - - previous_weights[idx])) - previous_weights[idx] = weights[idx] - if ((max_weight != 0 and max_change / max_weight <= tol) - or max_weight == 0 and max_change == 0): - if verbose: - end_time = time(NULL) - with gil: - print("convergence after %d epochs took %d seconds" % - (n_iter + 1, end_time - start_time)) - break - elif verbose: - printf('Epoch %d, change: %.8f\n', n_iter + 1, - max_change / max_weight) - n_iter += 1 - # We do the error treatment here based on error code in status to avoid - # re-acquiring the GIL within the cython code, which slows the computation - # when the sag/saga solver is used concurrently in multiple Python threads. - if status == -1: - raise ValueError(("Floating-point under-/overflow occurred at epoch" - " #%d. 
Scaling input data with StandardScaler or" - " MinMaxScaler might help.") % n_iter) - - if verbose and n_iter >= max_iter: - end_time = time(NULL) - print(("max_iter reached after %d seconds") % - (end_time - start_time)) - - return num_seen, n_iter - -cdef int scale_weights64(double* weights, double* wscale, - int n_features, - int n_samples, int n_classes, int sample_itr, - double* cumulative_sums, - double* cumulative_sums_prox, - int* feature_hist, - bint prox, - double* sum_gradient, - int n_iter) nogil: - """Scale the weights with wscale for numerical stability. - - wscale = (1 - step_size * alpha) ** (n_iter * n_samples + sample_itr) - can become very small, so we reset it every n_samples iterations to 1.0 for - numerical stability. To be able to scale, we first need to update every - coefficients and reset the just-in-time update system. - This also limits the size of `cumulative_sums`. - """ - - cdef int status - status = lagged_update64(weights, wscale[0], n_features, - n_samples, n_classes, sample_itr + 1, - cumulative_sums, - cumulative_sums_prox, - feature_hist, - prox, - sum_gradient, - NULL, - True, - n_iter) - # if lagged update succeeded, reset wscale to 1.0 - if status == 0: - wscale[0] = 1.0 - return status - -cdef int scale_weights32(float* weights, float* wscale, - int n_features, - int n_samples, int n_classes, int sample_itr, - float* cumulative_sums, - float* cumulative_sums_prox, - int* feature_hist, - bint prox, - float* sum_gradient, - int n_iter) nogil: - """Scale the weights with wscale for numerical stability. - - wscale = (1 - step_size * alpha) ** (n_iter * n_samples + sample_itr) - can become very small, so we reset it every n_samples iterations to 1.0 for - numerical stability. To be able to scale, we first need to update every - coefficients and reset the just-in-time update system. - This also limits the size of `cumulative_sums`. - """ - - cdef int status - status = lagged_update32(weights, wscale[0], n_features, - n_samples, n_classes, sample_itr + 1, - cumulative_sums, - cumulative_sums_prox, - feature_hist, - prox, - sum_gradient, - NULL, - True, - n_iter) - # if lagged update succeeded, reset wscale to 1.0 - if status == 0: - wscale[0] = 1.0 - return status - -cdef int lagged_update64(double* weights, double wscale, int xnnz, - int n_samples, int n_classes, int sample_itr, - double* cumulative_sums, - double* cumulative_sums_prox, - int* feature_hist, - bint prox, - double* sum_gradient, - int* x_ind_ptr, - bint reset, - int n_iter) nogil: - """Hard perform the JIT updates for non-zero features of present sample. - The updates that awaits are kept in memory using cumulative_sums, - cumulative_sums_prox, wscale and feature_hist. See original SAGA paper - (Defazio et al. 2014) for details. If reset=True, we also reset wscale to - 1 (this is done at the end of each epoch). 
- """ - cdef int feature_ind, class_ind, idx, f_idx, lagged_ind, last_update_ind - cdef double cum_sum, grad_step, prox_step, cum_sum_prox - for feature_ind in range(xnnz): - if not reset: - feature_ind = x_ind_ptr[feature_ind] - f_idx = feature_ind * n_classes - - cum_sum = cumulative_sums[sample_itr - 1] - if prox: - cum_sum_prox = cumulative_sums_prox[sample_itr - 1] - if feature_hist[feature_ind] != 0: - cum_sum -= cumulative_sums[feature_hist[feature_ind] - 1] - if prox: - cum_sum_prox -= cumulative_sums_prox[feature_hist[feature_ind] - 1] - if not prox: - for class_ind in range(n_classes): - idx = f_idx + class_ind - weights[idx] -= cum_sum * sum_gradient[idx] - if reset: - weights[idx] *= wscale - if not skl_isfinite64(weights[idx]): - # returning here does not require the gil as the return - # type is a C integer - return -1 - else: - for class_ind in range(n_classes): - idx = f_idx + class_ind - if fabs(sum_gradient[idx] * cum_sum) < cum_sum_prox: - # In this case, we can perform all the gradient steps and - # all the proximal steps in this order, which is more - # efficient than unrolling all the lagged updates. - # Idea taken from scikit-learn-contrib/lightning. - weights[idx] -= cum_sum * sum_gradient[idx] - weights[idx] = _soft_thresholding64(weights[idx], - cum_sum_prox) - else: - last_update_ind = feature_hist[feature_ind] - if last_update_ind == -1: - last_update_ind = sample_itr - 1 - for lagged_ind in range(sample_itr - 1, - last_update_ind - 1, -1): - if lagged_ind > 0: - grad_step = (cumulative_sums[lagged_ind] - - cumulative_sums[lagged_ind - 1]) - prox_step = (cumulative_sums_prox[lagged_ind] - - cumulative_sums_prox[lagged_ind - 1]) - else: - grad_step = cumulative_sums[lagged_ind] - prox_step = cumulative_sums_prox[lagged_ind] - weights[idx] -= sum_gradient[idx] * grad_step - weights[idx] = _soft_thresholding64(weights[idx], - prox_step) - - if reset: - weights[idx] *= wscale - # check to see that the weight is not inf or NaN - if not skl_isfinite64(weights[idx]): - return -1 - if reset: - feature_hist[feature_ind] = sample_itr % n_samples - else: - feature_hist[feature_ind] = sample_itr - - if reset: - cumulative_sums[sample_itr - 1] = 0.0 - if prox: - cumulative_sums_prox[sample_itr - 1] = 0.0 - - return 0 - -cdef int lagged_update32(float* weights, float wscale, int xnnz, - int n_samples, int n_classes, int sample_itr, - float* cumulative_sums, - float* cumulative_sums_prox, - int* feature_hist, - bint prox, - float* sum_gradient, - int* x_ind_ptr, - bint reset, - int n_iter) nogil: - """Hard perform the JIT updates for non-zero features of present sample. - The updates that awaits are kept in memory using cumulative_sums, - cumulative_sums_prox, wscale and feature_hist. See original SAGA paper - (Defazio et al. 2014) for details. If reset=True, we also reset wscale to - 1 (this is done at the end of each epoch). 
- """ - cdef int feature_ind, class_ind, idx, f_idx, lagged_ind, last_update_ind - cdef float cum_sum, grad_step, prox_step, cum_sum_prox - for feature_ind in range(xnnz): - if not reset: - feature_ind = x_ind_ptr[feature_ind] - f_idx = feature_ind * n_classes - - cum_sum = cumulative_sums[sample_itr - 1] - if prox: - cum_sum_prox = cumulative_sums_prox[sample_itr - 1] - if feature_hist[feature_ind] != 0: - cum_sum -= cumulative_sums[feature_hist[feature_ind] - 1] - if prox: - cum_sum_prox -= cumulative_sums_prox[feature_hist[feature_ind] - 1] - if not prox: - for class_ind in range(n_classes): - idx = f_idx + class_ind - weights[idx] -= cum_sum * sum_gradient[idx] - if reset: - weights[idx] *= wscale - if not skl_isfinite32(weights[idx]): - # returning here does not require the gil as the return - # type is a C integer - return -1 - else: - for class_ind in range(n_classes): - idx = f_idx + class_ind - if fabs(sum_gradient[idx] * cum_sum) < cum_sum_prox: - # In this case, we can perform all the gradient steps and - # all the proximal steps in this order, which is more - # efficient than unrolling all the lagged updates. - # Idea taken from scikit-learn-contrib/lightning. - weights[idx] -= cum_sum * sum_gradient[idx] - weights[idx] = _soft_thresholding32(weights[idx], - cum_sum_prox) - else: - last_update_ind = feature_hist[feature_ind] - if last_update_ind == -1: - last_update_ind = sample_itr - 1 - for lagged_ind in range(sample_itr - 1, - last_update_ind - 1, -1): - if lagged_ind > 0: - grad_step = (cumulative_sums[lagged_ind] - - cumulative_sums[lagged_ind - 1]) - prox_step = (cumulative_sums_prox[lagged_ind] - - cumulative_sums_prox[lagged_ind - 1]) - else: - grad_step = cumulative_sums[lagged_ind] - prox_step = cumulative_sums_prox[lagged_ind] - weights[idx] -= sum_gradient[idx] * grad_step - weights[idx] = _soft_thresholding32(weights[idx], - prox_step) - - if reset: - weights[idx] *= wscale - # check to see that the weight is not inf or NaN - if not skl_isfinite32(weights[idx]): - return -1 - if reset: - feature_hist[feature_ind] = sample_itr % n_samples - else: - feature_hist[feature_ind] = sample_itr - - if reset: - cumulative_sums[sample_itr - 1] = 0.0 - if prox: - cumulative_sums_prox[sample_itr - 1] = 0.0 - - return 0 - -cdef void predict_sample64(double* x_data_ptr, int* x_ind_ptr, int xnnz, - double* w_data_ptr, double wscale, - double* intercept, double* prediction, - int n_classes) nogil: - """Compute the prediction given sparse sample x and dense weight w. - - Parameters - ---------- - x_data_ptr : pointer - Pointer to the data of the sample x - - x_ind_ptr : pointer - Pointer to the indices of the sample x - - xnnz : int - Number of non-zero element in the sample x - - w_data_ptr : pointer - Pointer to the data of the weights w - - wscale : double - Scale of the weights w - - intercept : pointer - Pointer to the intercept - - prediction : pointer - Pointer to store the resulting prediction - - n_classes : int - Number of classes in multinomial case. Equals 1 in binary case. 
- - """ - cdef int feature_ind, class_ind, j - cdef double innerprod - - for class_ind in range(n_classes): - innerprod = 0.0 - # Compute the dot product only on non-zero elements of x - for j in range(xnnz): - feature_ind = x_ind_ptr[j] - innerprod += (w_data_ptr[feature_ind * n_classes + class_ind] * - x_data_ptr[j]) - - prediction[class_ind] = wscale * innerprod + intercept[class_ind] - - -cdef void predict_sample32(float* x_data_ptr, int* x_ind_ptr, int xnnz, - float* w_data_ptr, float wscale, - float* intercept, float* prediction, - int n_classes) nogil: - """Compute the prediction given sparse sample x and dense weight w. - - Parameters - ---------- - x_data_ptr : pointer - Pointer to the data of the sample x - - x_ind_ptr : pointer - Pointer to the indices of the sample x - - xnnz : int - Number of non-zero element in the sample x - - w_data_ptr : pointer - Pointer to the data of the weights w - - wscale : float - Scale of the weights w - - intercept : pointer - Pointer to the intercept - - prediction : pointer - Pointer to store the resulting prediction - - n_classes : int - Number of classes in multinomial case. Equals 1 in binary case. - - """ - cdef int feature_ind, class_ind, j - cdef float innerprod - - for class_ind in range(n_classes): - innerprod = 0.0 - # Compute the dot product only on non-zero elements of x - for j in range(xnnz): - feature_ind = x_ind_ptr[j] - innerprod += (w_data_ptr[feature_ind * n_classes + class_ind] * - x_data_ptr[j]) - - prediction[class_ind] = wscale * innerprod + intercept[class_ind] - - - -def _multinomial_grad_loss_all_samples( - SequentialDataset64 dataset, - np.ndarray[double, ndim=2, mode='c'] weights_array, - np.ndarray[double, ndim=1, mode='c'] intercept_array, - int n_samples, int n_features, int n_classes): - """Compute multinomial gradient and loss across all samples. - - Used for testing purpose only. 
- """ - cdef double* weights = weights_array.data - cdef double* intercept = intercept_array.data - - cdef double *x_data_ptr = NULL - cdef int *x_ind_ptr = NULL - cdef int xnnz = -1 - cdef double y - cdef double sample_weight - - cdef double wscale = 1.0 - cdef int i, j, class_ind, feature_ind - cdef double val - cdef double sum_loss = 0.0 - - cdef MultinomialLogLoss64 multiloss = MultinomialLogLoss64() - - cdef np.ndarray[double, ndim=2] sum_gradient_array = \ - np.zeros((n_features, n_classes), dtype=np.double, order="c") - cdef double* sum_gradient = sum_gradient_array.data - - cdef np.ndarray[double, ndim=1] prediction_array = \ - np.zeros(n_classes, dtype=np.double, order="c") - cdef double* prediction = prediction_array.data - - cdef np.ndarray[double, ndim=1] gradient_array = \ - np.zeros(n_classes, dtype=np.double, order="c") - cdef double* gradient = gradient_array.data - - with nogil: - for i in range(n_samples): - # get next sample on the dataset - dataset.next(&x_data_ptr, &x_ind_ptr, &xnnz, - &y, &sample_weight) - - # prediction of the multinomial classifier for the sample - predict_sample64(x_data_ptr, x_ind_ptr, xnnz, weights, wscale, - intercept, prediction, n_classes) - - # compute the gradient for this sample, given the prediction - multiloss._dloss(prediction, y, n_classes, sample_weight, gradient) - - # compute the loss for this sample, given the prediction - sum_loss += multiloss._loss(prediction, y, n_classes, sample_weight) - - # update the sum of the gradient - for j in range(xnnz): - feature_ind = x_ind_ptr[j] - val = x_data_ptr[j] - for class_ind in range(n_classes): - sum_gradient[feature_ind * n_classes + class_ind] += \ - gradient[class_ind] * val - - return sum_loss, sum_gradient_array diff --git a/sklearn/utils/seq_dataset.pxd b/sklearn/utils/seq_dataset.pxd deleted file mode 100644 index 67ce3b68b4474..0000000000000 --- a/sklearn/utils/seq_dataset.pxd +++ /dev/null @@ -1,116 +0,0 @@ - -#------------------------------------------------------------------------------ - -""" -Dataset abstractions for sequential data access. -WARNING: Do not edit .pxd file directly, it is generated from .pxd.tp -""" - -cimport numpy as np - -# SequentialDataset and its two concrete subclasses are (optionally randomized) -# iterators over the rows of a matrix X and corresponding target values y. 
- - -cdef class SequentialDataset64: - cdef int current_index - cdef np.ndarray index - cdef int *index_data_ptr - cdef Py_ssize_t n_samples - cdef np.uint32_t seed - - cdef void shuffle(self, np.uint32_t seed) nogil - cdef int _get_next_index(self) nogil - cdef int _get_random_index(self) nogil - - cdef void _sample(self, double **x_data_ptr, int **x_ind_ptr, - int *nnz, double *y, double *sample_weight, - int current_index) nogil - cdef void next(self, double **x_data_ptr, int **x_ind_ptr, - int *nnz, double *y, double *sample_weight) nogil - cdef int random(self, double **x_data_ptr, int **x_ind_ptr, - int *nnz, double *y, double *sample_weight) nogil - - -cdef class ArrayDataset64(SequentialDataset64): - cdef np.ndarray X - cdef np.ndarray Y - cdef np.ndarray sample_weights - cdef Py_ssize_t n_features - cdef np.npy_intp X_stride - cdef double *X_data_ptr - cdef double *Y_data_ptr - cdef np.ndarray feature_indices - cdef int *feature_indices_ptr - cdef double *sample_weight_data - - -cdef class CSRDataset64(SequentialDataset64): - cdef np.ndarray X_data - cdef np.ndarray X_indptr - cdef np.ndarray X_indices - cdef np.ndarray Y - cdef np.ndarray sample_weights - cdef double *X_data_ptr - cdef int *X_indptr_ptr - cdef int *X_indices_ptr - cdef double *Y_data_ptr - cdef double *sample_weight_data - -#------------------------------------------------------------------------------ - -""" -Dataset abstractions for sequential data access. -WARNING: Do not edit .pxd file directly, it is generated from .pxd.tp -""" - -cimport numpy as np - -# SequentialDataset and its two concrete subclasses are (optionally randomized) -# iterators over the rows of a matrix X and corresponding target values y. - - -cdef class SequentialDataset32: - cdef int current_index - cdef np.ndarray index - cdef int *index_data_ptr - cdef Py_ssize_t n_samples - cdef np.uint32_t seed - - cdef void shuffle(self, np.uint32_t seed) nogil - cdef int _get_next_index(self) nogil - cdef int _get_random_index(self) nogil - - cdef void _sample(self, float **x_data_ptr, int **x_ind_ptr, - int *nnz, float *y, float *sample_weight, - int current_index) nogil - cdef void next(self, float **x_data_ptr, int **x_ind_ptr, - int *nnz, float *y, float *sample_weight) nogil - cdef int random(self, float **x_data_ptr, int **x_ind_ptr, - int *nnz, float *y, float *sample_weight) nogil - - -cdef class ArrayDataset32(SequentialDataset32): - cdef np.ndarray X - cdef np.ndarray Y - cdef np.ndarray sample_weights - cdef Py_ssize_t n_features - cdef np.npy_intp X_stride - cdef float *X_data_ptr - cdef float *Y_data_ptr - cdef np.ndarray feature_indices - cdef int *feature_indices_ptr - cdef float *sample_weight_data - - -cdef class CSRDataset32(SequentialDataset32): - cdef np.ndarray X_data - cdef np.ndarray X_indptr - cdef np.ndarray X_indices - cdef np.ndarray Y - cdef np.ndarray sample_weights - cdef float *X_data_ptr - cdef int *X_indptr_ptr - cdef int *X_indices_ptr - cdef float *Y_data_ptr - cdef float *sample_weight_data diff --git a/sklearn/utils/seq_dataset.pyx b/sklearn/utils/seq_dataset.pyx deleted file mode 100644 index 6fa274771defe..0000000000000 --- a/sklearn/utils/seq_dataset.pyx +++ /dev/null @@ -1,653 +0,0 @@ -# cython: cdivision=True -# cython: boundscheck=False -# cython: wraparound=False - -#------------------------------------------------------------------------------ - -""" -Dataset abstractions for sequential data access. 
-WARNING: Do not edit .pyx file directly, it is generated from .pyx.tp -""" - -cimport cython -from libc.limits cimport INT_MAX -cimport numpy as np -import numpy as np - -np.import_array() - -from ._random cimport our_rand_r - -cdef class SequentialDataset64: - """Base class for datasets with sequential data access. - - SequentialDataset is used to iterate over the rows of a matrix X and - corresponding target values y, i.e. to iterate over samples. - There are two methods to get the next sample: - - next : Iterate sequentially (optionally randomized) - - random : Iterate randomly (with replacement) - - Attributes - ---------- - index : np.ndarray - Index array for fast shuffling. - - index_data_ptr : int - Pointer to the index array. - - current_index : int - Index of current sample in ``index``. - The index of current sample in the data is given by - index_data_ptr[current_index]. - - n_samples : Py_ssize_t - Number of samples in the dataset. - - seed : np.uint32_t - Seed used for random sampling. - - """ - - cdef void next(self, double **x_data_ptr, int **x_ind_ptr, - int *nnz, double *y, double *sample_weight) nogil: - """Get the next example ``x`` from the dataset. - - This method gets the next sample looping sequentially over all samples. - The order can be shuffled with the method ``shuffle``. - Shuffling once before iterating over all samples corresponds to a - random draw without replacement. It is used for instance in SGD solver. - - Parameters - ---------- - x_data_ptr : double** - A pointer to the double array which holds the feature - values of the next example. - - x_ind_ptr : np.intc** - A pointer to the int array which holds the feature - indices of the next example. - - nnz : int* - A pointer to an int holding the number of non-zero - values of the next example. - - y : double* - The target value of the next example. - - sample_weight : double* - The weight of the next example. - """ - cdef int current_index = self._get_next_index() - self._sample(x_data_ptr, x_ind_ptr, nnz, y, sample_weight, - current_index) - - cdef int random(self, double **x_data_ptr, int **x_ind_ptr, - int *nnz, double *y, double *sample_weight) nogil: - """Get a random example ``x`` from the dataset. - - This method gets next sample chosen randomly over a uniform - distribution. It corresponds to a random draw with replacement. - It is used for instance in SAG solver. - - Parameters - ---------- - x_data_ptr : double** - A pointer to the double array which holds the feature - values of the next example. - - x_ind_ptr : np.intc** - A pointer to the int array which holds the feature - indices of the next example. - - nnz : int* - A pointer to an int holding the number of non-zero - values of the next example. - - y : double* - The target value of the next example. - - sample_weight : double* - The weight of the next example. - - Returns - ------- - current_index : int - Index of current sample. 
- """ - cdef int current_index = self._get_random_index() - self._sample(x_data_ptr, x_ind_ptr, nnz, y, sample_weight, - current_index) - return current_index - - cdef void shuffle(self, np.uint32_t seed) nogil: - """Permutes the ordering of examples.""" - # Fisher-Yates shuffle - cdef int *ind = self.index_data_ptr - cdef int n = self.n_samples - cdef unsigned i, j - for i in range(n - 1): - j = i + our_rand_r(&seed) % (n - i) - ind[i], ind[j] = ind[j], ind[i] - - cdef int _get_next_index(self) nogil: - cdef int current_index = self.current_index - if current_index >= (self.n_samples - 1): - current_index = -1 - - current_index += 1 - self.current_index = current_index - return self.current_index - - cdef int _get_random_index(self) nogil: - cdef int n = self.n_samples - cdef int current_index = our_rand_r(&self.seed) % n - self.current_index = current_index - return current_index - - cdef void _sample(self, double **x_data_ptr, int **x_ind_ptr, - int *nnz, double *y, double *sample_weight, - int current_index) nogil: - pass - - def _shuffle_py(self, np.uint32_t seed): - """python function used for easy testing""" - self.shuffle(seed) - - def _next_py(self): - """python function used for easy testing""" - cdef int current_index = self._get_next_index() - return self._sample_py(current_index) - - def _random_py(self): - """python function used for easy testing""" - cdef int current_index = self._get_random_index() - return self._sample_py(current_index) - - def _sample_py(self, int current_index): - """python function used for easy testing""" - cdef double* x_data_ptr - cdef int* x_indices_ptr - cdef int nnz, j - cdef double y, sample_weight - - # call _sample in cython - self._sample(&x_data_ptr, &x_indices_ptr, &nnz, &y, &sample_weight, - current_index) - - # transform the pointed data in numpy CSR array - cdef np.ndarray[double, ndim=1] x_data = np.empty(nnz, - dtype=np.float64) - cdef np.ndarray[int, ndim=1] x_indices = np.empty(nnz, dtype=np.int32) - cdef np.ndarray[int, ndim=1] x_indptr = np.asarray([0, nnz], - dtype=np.int32) - - for j in range(nnz): - x_data[j] = x_data_ptr[j] - x_indices[j] = x_indices_ptr[j] - - cdef int sample_idx = self.index_data_ptr[current_index] - - return (x_data, x_indices, x_indptr), y, sample_weight, sample_idx - - -cdef class ArrayDataset64(SequentialDataset64): - """Dataset backed by a two-dimensional numpy array. - - The dtype of the numpy array is expected to be ``np.float64`` (double) - and C-style memory layout. - """ - - def __cinit__(self, np.ndarray[double, ndim=2, mode='c'] X, - np.ndarray[double, ndim=1, mode='c'] Y, - np.ndarray[double, ndim=1, mode='c'] sample_weights, - np.uint32_t seed=1): - """A ``SequentialDataset`` backed by a two-dimensional numpy array. - - Parameters - ---------- - X : ndarray, dtype=double, ndim=2, mode='c' - The sample array, of shape(n_samples, n_features) - - Y : ndarray, dtype=double, ndim=1, mode='c' - The target array, of shape(n_samples, ) - - sample_weights : ndarray, dtype=double, ndim=1, mode='c' - The weight of each sample, of shape(n_samples,) - """ - if X.shape[0] > INT_MAX or X.shape[1] > INT_MAX: - raise ValueError("More than %d samples or features not supported;" - " got (%d, %d)." 
- % (INT_MAX, X.shape[0], X.shape[1])) - - # keep a reference to the data to prevent garbage collection - self.X = X - self.Y = Y - self.sample_weights = sample_weights - - self.n_samples = X.shape[0] - self.n_features = X.shape[1] - - cdef np.ndarray[int, ndim=1, mode='c'] feature_indices = \ - np.arange(0, self.n_features, dtype=np.intc) - self.feature_indices = feature_indices - self.feature_indices_ptr = feature_indices.data - - self.current_index = -1 - self.X_stride = X.strides[0] / X.itemsize - self.X_data_ptr = X.data - self.Y_data_ptr = Y.data - self.sample_weight_data = sample_weights.data - - # Use index array for fast shuffling - cdef np.ndarray[int, ndim=1, mode='c'] index = \ - np.arange(0, self.n_samples, dtype=np.intc) - self.index = index - self.index_data_ptr = index.data - # seed should not be 0 for our_rand_r - self.seed = max(seed, 1) - - cdef void _sample(self, double **x_data_ptr, int **x_ind_ptr, - int *nnz, double *y, double *sample_weight, - int current_index) nogil: - cdef long long sample_idx = self.index_data_ptr[current_index] - cdef long long offset = sample_idx * self.X_stride - - y[0] = self.Y_data_ptr[sample_idx] - x_data_ptr[0] = self.X_data_ptr + offset - x_ind_ptr[0] = self.feature_indices_ptr - nnz[0] = self.n_features - sample_weight[0] = self.sample_weight_data[sample_idx] - - -cdef class CSRDataset64(SequentialDataset64): - """A ``SequentialDataset`` backed by a scipy sparse CSR matrix. """ - - def __cinit__(self, np.ndarray[double, ndim=1, mode='c'] X_data, - np.ndarray[int, ndim=1, mode='c'] X_indptr, - np.ndarray[int, ndim=1, mode='c'] X_indices, - np.ndarray[double, ndim=1, mode='c'] Y, - np.ndarray[double, ndim=1, mode='c'] sample_weights, - np.uint32_t seed=1): - """Dataset backed by a scipy sparse CSR matrix. - - The feature indices of ``x`` are given by x_ind_ptr[0:nnz]. - The corresponding feature values are given by - x_data_ptr[0:nnz]. - - Parameters - ---------- - X_data : ndarray, dtype=double, ndim=1, mode='c' - The data array of the CSR features matrix. - - X_indptr : ndarray, dtype=np.intc, ndim=1, mode='c' - The index pointer array of the CSR features matrix. - - X_indices : ndarray, dtype=np.intc, ndim=1, mode='c' - The column indices array of the CSR features matrix. - - Y : ndarray, dtype=double, ndim=1, mode='c' - The target values. - - sample_weights : ndarray, dtype=double, ndim=1, mode='c' - The weight of each sample. 
- """ - # keep a reference to the data to prevent garbage collection - self.X_data = X_data - self.X_indptr = X_indptr - self.X_indices = X_indices - self.Y = Y - self.sample_weights = sample_weights - - self.n_samples = Y.shape[0] - self.current_index = -1 - self.X_data_ptr = X_data.data - self.X_indptr_ptr = X_indptr.data - self.X_indices_ptr = X_indices.data - - self.Y_data_ptr = Y.data - self.sample_weight_data = sample_weights.data - - # Use index array for fast shuffling - cdef np.ndarray[int, ndim=1, mode='c'] idx = np.arange(self.n_samples, - dtype=np.intc) - self.index = idx - self.index_data_ptr = idx.data - # seed should not be 0 for our_rand_r - self.seed = max(seed, 1) - - cdef void _sample(self, double **x_data_ptr, int **x_ind_ptr, - int *nnz, double *y, double *sample_weight, - int current_index) nogil: - cdef long long sample_idx = self.index_data_ptr[current_index] - cdef long long offset = self.X_indptr_ptr[sample_idx] - y[0] = self.Y_data_ptr[sample_idx] - x_data_ptr[0] = self.X_data_ptr + offset - x_ind_ptr[0] = self.X_indices_ptr + offset - nnz[0] = self.X_indptr_ptr[sample_idx + 1] - offset - sample_weight[0] = self.sample_weight_data[sample_idx] - - -#------------------------------------------------------------------------------ - -""" -Dataset abstractions for sequential data access. -WARNING: Do not edit .pyx file directly, it is generated from .pyx.tp -""" - -cimport cython -from libc.limits cimport INT_MAX -cimport numpy as np -import numpy as np - -np.import_array() - -from ._random cimport our_rand_r - -cdef class SequentialDataset32: - """Base class for datasets with sequential data access. - - SequentialDataset is used to iterate over the rows of a matrix X and - corresponding target values y, i.e. to iterate over samples. - There are two methods to get the next sample: - - next : Iterate sequentially (optionally randomized) - - random : Iterate randomly (with replacement) - - Attributes - ---------- - index : np.ndarray - Index array for fast shuffling. - - index_data_ptr : int - Pointer to the index array. - - current_index : int - Index of current sample in ``index``. - The index of current sample in the data is given by - index_data_ptr[current_index]. - - n_samples : Py_ssize_t - Number of samples in the dataset. - - seed : np.uint32_t - Seed used for random sampling. - - """ - - cdef void next(self, float **x_data_ptr, int **x_ind_ptr, - int *nnz, float *y, float *sample_weight) nogil: - """Get the next example ``x`` from the dataset. - - This method gets the next sample looping sequentially over all samples. - The order can be shuffled with the method ``shuffle``. - Shuffling once before iterating over all samples corresponds to a - random draw without replacement. It is used for instance in SGD solver. - - Parameters - ---------- - x_data_ptr : float** - A pointer to the float array which holds the feature - values of the next example. - - x_ind_ptr : np.intc** - A pointer to the int array which holds the feature - indices of the next example. - - nnz : int* - A pointer to an int holding the number of non-zero - values of the next example. - - y : float* - The target value of the next example. - - sample_weight : float* - The weight of the next example. - """ - cdef int current_index = self._get_next_index() - self._sample(x_data_ptr, x_ind_ptr, nnz, y, sample_weight, - current_index) - - cdef int random(self, float **x_data_ptr, int **x_ind_ptr, - int *nnz, float *y, float *sample_weight) nogil: - """Get a random example ``x`` from the dataset. 
- - This method gets next sample chosen randomly over a uniform - distribution. It corresponds to a random draw with replacement. - It is used for instance in SAG solver. - - Parameters - ---------- - x_data_ptr : float** - A pointer to the float array which holds the feature - values of the next example. - - x_ind_ptr : np.intc** - A pointer to the int array which holds the feature - indices of the next example. - - nnz : int* - A pointer to an int holding the number of non-zero - values of the next example. - - y : float* - The target value of the next example. - - sample_weight : float* - The weight of the next example. - - Returns - ------- - current_index : int - Index of current sample. - """ - cdef int current_index = self._get_random_index() - self._sample(x_data_ptr, x_ind_ptr, nnz, y, sample_weight, - current_index) - return current_index - - cdef void shuffle(self, np.uint32_t seed) nogil: - """Permutes the ordering of examples.""" - # Fisher-Yates shuffle - cdef int *ind = self.index_data_ptr - cdef int n = self.n_samples - cdef unsigned i, j - for i in range(n - 1): - j = i + our_rand_r(&seed) % (n - i) - ind[i], ind[j] = ind[j], ind[i] - - cdef int _get_next_index(self) nogil: - cdef int current_index = self.current_index - if current_index >= (self.n_samples - 1): - current_index = -1 - - current_index += 1 - self.current_index = current_index - return self.current_index - - cdef int _get_random_index(self) nogil: - cdef int n = self.n_samples - cdef int current_index = our_rand_r(&self.seed) % n - self.current_index = current_index - return current_index - - cdef void _sample(self, float **x_data_ptr, int **x_ind_ptr, - int *nnz, float *y, float *sample_weight, - int current_index) nogil: - pass - - def _shuffle_py(self, np.uint32_t seed): - """python function used for easy testing""" - self.shuffle(seed) - - def _next_py(self): - """python function used for easy testing""" - cdef int current_index = self._get_next_index() - return self._sample_py(current_index) - - def _random_py(self): - """python function used for easy testing""" - cdef int current_index = self._get_random_index() - return self._sample_py(current_index) - - def _sample_py(self, int current_index): - """python function used for easy testing""" - cdef float* x_data_ptr - cdef int* x_indices_ptr - cdef int nnz, j - cdef float y, sample_weight - - # call _sample in cython - self._sample(&x_data_ptr, &x_indices_ptr, &nnz, &y, &sample_weight, - current_index) - - # transform the pointed data in numpy CSR array - cdef np.ndarray[float, ndim=1] x_data = np.empty(nnz, - dtype=np.float32) - cdef np.ndarray[int, ndim=1] x_indices = np.empty(nnz, dtype=np.int32) - cdef np.ndarray[int, ndim=1] x_indptr = np.asarray([0, nnz], - dtype=np.int32) - - for j in range(nnz): - x_data[j] = x_data_ptr[j] - x_indices[j] = x_indices_ptr[j] - - cdef int sample_idx = self.index_data_ptr[current_index] - - return (x_data, x_indices, x_indptr), y, sample_weight, sample_idx - - -cdef class ArrayDataset32(SequentialDataset32): - """Dataset backed by a two-dimensional numpy array. - - The dtype of the numpy array is expected to be ``np.float32`` (float) - and C-style memory layout. - """ - - def __cinit__(self, np.ndarray[float, ndim=2, mode='c'] X, - np.ndarray[float, ndim=1, mode='c'] Y, - np.ndarray[float, ndim=1, mode='c'] sample_weights, - np.uint32_t seed=1): - """A ``SequentialDataset`` backed by a two-dimensional numpy array. 
- - Parameters - ---------- - X : ndarray, dtype=float, ndim=2, mode='c' - The sample array, of shape(n_samples, n_features) - - Y : ndarray, dtype=float, ndim=1, mode='c' - The target array, of shape(n_samples, ) - - sample_weights : ndarray, dtype=float, ndim=1, mode='c' - The weight of each sample, of shape(n_samples,) - """ - if X.shape[0] > INT_MAX or X.shape[1] > INT_MAX: - raise ValueError("More than %d samples or features not supported;" - " got (%d, %d)." - % (INT_MAX, X.shape[0], X.shape[1])) - - # keep a reference to the data to prevent garbage collection - self.X = X - self.Y = Y - self.sample_weights = sample_weights - - self.n_samples = X.shape[0] - self.n_features = X.shape[1] - - cdef np.ndarray[int, ndim=1, mode='c'] feature_indices = \ - np.arange(0, self.n_features, dtype=np.intc) - self.feature_indices = feature_indices - self.feature_indices_ptr = feature_indices.data - - self.current_index = -1 - self.X_stride = X.strides[0] / X.itemsize - self.X_data_ptr = X.data - self.Y_data_ptr = Y.data - self.sample_weight_data = sample_weights.data - - # Use index array for fast shuffling - cdef np.ndarray[int, ndim=1, mode='c'] index = \ - np.arange(0, self.n_samples, dtype=np.intc) - self.index = index - self.index_data_ptr = index.data - # seed should not be 0 for our_rand_r - self.seed = max(seed, 1) - - cdef void _sample(self, float **x_data_ptr, int **x_ind_ptr, - int *nnz, float *y, float *sample_weight, - int current_index) nogil: - cdef long long sample_idx = self.index_data_ptr[current_index] - cdef long long offset = sample_idx * self.X_stride - - y[0] = self.Y_data_ptr[sample_idx] - x_data_ptr[0] = self.X_data_ptr + offset - x_ind_ptr[0] = self.feature_indices_ptr - nnz[0] = self.n_features - sample_weight[0] = self.sample_weight_data[sample_idx] - - -cdef class CSRDataset32(SequentialDataset32): - """A ``SequentialDataset`` backed by a scipy sparse CSR matrix. """ - - def __cinit__(self, np.ndarray[float, ndim=1, mode='c'] X_data, - np.ndarray[int, ndim=1, mode='c'] X_indptr, - np.ndarray[int, ndim=1, mode='c'] X_indices, - np.ndarray[float, ndim=1, mode='c'] Y, - np.ndarray[float, ndim=1, mode='c'] sample_weights, - np.uint32_t seed=1): - """Dataset backed by a scipy sparse CSR matrix. - - The feature indices of ``x`` are given by x_ind_ptr[0:nnz]. - The corresponding feature values are given by - x_data_ptr[0:nnz]. - - Parameters - ---------- - X_data : ndarray, dtype=float, ndim=1, mode='c' - The data array of the CSR features matrix. - - X_indptr : ndarray, dtype=np.intc, ndim=1, mode='c' - The index pointer array of the CSR features matrix. - - X_indices : ndarray, dtype=np.intc, ndim=1, mode='c' - The column indices array of the CSR features matrix. - - Y : ndarray, dtype=float, ndim=1, mode='c' - The target values. - - sample_weights : ndarray, dtype=float, ndim=1, mode='c' - The weight of each sample. 
- """ - # keep a reference to the data to prevent garbage collection - self.X_data = X_data - self.X_indptr = X_indptr - self.X_indices = X_indices - self.Y = Y - self.sample_weights = sample_weights - - self.n_samples = Y.shape[0] - self.current_index = -1 - self.X_data_ptr = X_data.data - self.X_indptr_ptr = X_indptr.data - self.X_indices_ptr = X_indices.data - - self.Y_data_ptr = Y.data - self.sample_weight_data = sample_weights.data - - # Use index array for fast shuffling - cdef np.ndarray[int, ndim=1, mode='c'] idx = np.arange(self.n_samples, - dtype=np.intc) - self.index = idx - self.index_data_ptr = idx.data - # seed should not be 0 for our_rand_r - self.seed = max(seed, 1) - - cdef void _sample(self, float **x_data_ptr, int **x_ind_ptr, - int *nnz, float *y, float *sample_weight, - int current_index) nogil: - cdef long long sample_idx = self.index_data_ptr[current_index] - cdef long long offset = self.X_indptr_ptr[sample_idx] - y[0] = self.Y_data_ptr[sample_idx] - x_data_ptr[0] = self.X_data_ptr + offset - x_ind_ptr[0] = self.X_indices_ptr + offset - nnz[0] = self.X_indptr_ptr[sample_idx + 1] - offset - sample_weight[0] = self.sample_weight_data[sample_idx] - From 038b7d1fc3aa3d209bd5fe8efc2fad42e8d574a8 Mon Sep 17 00:00:00 2001 From: Angela Ambroz Date: Fri, 25 Oct 2019 15:27:21 -0400 Subject: [PATCH 07/19] Better coverage --- .../linear_model/tests/test_least_angle.py | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index d3fb5a31fbe8d..ec151637b758b 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -732,29 +732,33 @@ def test_lasso_lars_fit_copyX_behaviour(copy_X): assert copy_X == np.array_equal(X, X_copy) -def test_lars_with_jitter(): +@pytest.mark.parametrize('y_list, expected_y', [ + ([-2.5, -2.5], [0, 2.5, 0, 2.5, 0]), + ([[-2.5, -2.5], [-2.5, -2.5]], + [[0, 2.5, 0, 2.5, 0], [0, 2.5, 0, 2.5, 0]])]) +def test_lars_with_jitter(y_list, expected_y): """ Test that user input of a small amount of jitter, using example provided in issue #2746 """ - A = np.array([[0.0, 0.0, 0.0, -1.0, 0.0], [0.0, -1.0, 0.0, 0.0, 0.0]]) - b = np.array([-2.5, -2.5]) - expected_output = np.array([0, 2.5, 0, 2.5, 0]) + X = np.array([[0.0, 0.0, 0.0, -1.0, 0.0], [0.0, -1.0, 0.0, 0.0, 0.0]]) + y = np.array(y_list) + expected_output = np.array(expected_y) alpha = 0.001 fit_intercept = False lars = linear_model.LassoLars(alpha=alpha, fit_intercept=fit_intercept) - lars_with_jiggle = linear_model.LassoLars(alpha=alpha, + lars_with_jitter = linear_model.LassoLars(alpha=alpha, fit_intercept=fit_intercept, jitter=10e-5) - lars.fit(A, b) - lars_with_jiggle.fit(A, b) + lars.fit(X, y) + lars_with_jitter.fit(X, y) - w_nojiggle = lars.coef_ - w_jiggle = lars_with_jiggle.coef_ + w_nojitter = lars.coef_ + w_jitter = lars_with_jitter.coef_ - assert not np.array_equal(w_jiggle, w_nojiggle) - assert_array_almost_equal(w_jiggle, expected_output, decimal=2) + assert not np.array_equal(w_jitter, w_nojitter) + assert_array_almost_equal(w_jitter, expected_output, decimal=2) From f1cefce833e88b73bd20cfadf623c4e95d76f46f Mon Sep 17 00:00:00 2001 From: Angela Ambroz Date: Sat, 2 Nov 2019 18:42:37 -0400 Subject: [PATCH 08/19] PR comments --- sklearn/linear_model/_least_angle.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py 
index ed17b8b7f30c4..9a4b2f4746eb1 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -24,7 +24,6 @@ from ..exceptions import ConvergenceWarning SOLVE_TRIANGULAR_ARGS = {'check_finite': False} -DEFAULT_JITTER = None def lars_path(X, y, Xy=None, Gram=None, max_iter=500, alpha_min=0, @@ -862,7 +861,7 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): def __init__(self, fit_intercept=True, verbose=False, normalize=True, precompute='auto', n_nonzero_coefs=500, eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, - jitter=DEFAULT_JITTER): + jitter=None): self.fit_intercept = fit_intercept self.verbose = verbose self.normalize = normalize @@ -1046,8 +1045,8 @@ class LassoLars(Lars): with a small alpha. jitter : float, default=None - Uniform noise parameter, added to the y values, to satisfy \ - the model's assumption of one-at-a-time computations \ + Uniform noise parameter, added to the y values, to satisfy + the model's assumption of one-at-a-time computations (Efron et al. 2004). positive : bool, default=False @@ -1094,7 +1093,7 @@ class LassoLars(Lars): >>> reg.fit([[-1, 1], [0, 0], [1, 1]], [-1, 0, -1]) LassoLars(alpha=0.01) >>> print(reg.coef_) - [ 0. -0.9632...] + [ 0. -0.963257...] See also -------- @@ -1112,7 +1111,7 @@ class LassoLars(Lars): def __init__(self, alpha=1.0, fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, - positive=False, jitter=DEFAULT_JITTER): + positive=False, jitter=None): self.alpha = alpha self.fit_intercept = fit_intercept self.max_iter = max_iter From d82ce51a570aff77484eed7a2bd971a6c1aca629 Mon Sep 17 00:00:00 2001 From: Angela Ambroz Date: Fri, 13 Dec 2019 11:04:09 -0500 Subject: [PATCH 09/19] PR comments --- doc/modules/linear_model.rst | 2 +- sklearn/linear_model/_least_angle.py | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 330e6a1008c5c..c0799528ecd07 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -511,7 +511,7 @@ function of the norm of its coefficients. >>> reg.fit([[0, 0], [1, 1]], [0, 1]) LassoLars(alpha=0.1) >>> reg.coef_ - array([0.717..., 0. ]) + array([0.0.717157..., 0. ]) .. topic:: Examples: diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 3bc0f42a93b4c..ffe05de762664 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -19,7 +19,7 @@ from ._base import LinearModel from ..base import RegressorMixin, MultiOutputMixin -from ..utils import arrayfuncs, as_float_array, check_X_y +from ..utils import arrayfuncs, as_float_array, check_X_y, check_random_state from ..model_selection import check_cv from ..exceptions import ConvergenceWarning @@ -814,6 +814,11 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): the model's assumption of one-at-a-time computations (Efron et al. 2004). + random_state : int, RandomState instance or None (default) + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. 
+ Attributes ---------- alphas_ : array-like of shape (n_alphas + 1,) | list of n_targets such \ @@ -861,7 +866,7 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): def __init__(self, fit_intercept=True, verbose=False, normalize=True, precompute='auto', n_nonzero_coefs=500, eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, - jitter=None): + jitter=None, random_state=None): self.fit_intercept = fit_intercept self.verbose = verbose self.normalize = normalize @@ -871,6 +876,7 @@ def __init__(self, fit_intercept=True, verbose=False, normalize=True, self.copy_X = copy_X self.fit_path = fit_path self.jitter = jitter + self.random_state = random_state @staticmethod def _get_gram(precompute, X, y): @@ -971,7 +977,9 @@ def fit(self, X, y, Xy=None): max_iter = self.max_iter if self.jitter: - noise = np.random.RandomState(0).uniform(high=self.jitter, + generator = check_random_state(self.random_state) + + noise = generator.uniform(high=self.jitter, size=len(y)) if y.ndim == 2: noise = noise.reshape(-1, 1) @@ -1740,7 +1748,8 @@ class LassoLarsIC(LassoLars): """ def __init__(self, criterion='aic', fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, - eps=np.finfo(np.float).eps, copy_X=True, positive=False): + eps=np.finfo(np.float).eps, copy_X=True, positive=False, + random_state=None): self.criterion = criterion self.fit_intercept = fit_intercept self.positive = positive @@ -1751,6 +1760,7 @@ def __init__(self, criterion='aic', fit_intercept=True, verbose=False, self.precompute = precompute self.eps = eps self.fit_path = True + self.random_state = random_state def _more_tags(self): return {'multioutput': False} From ec5156c4d2602aea88a741e278288e2f2ea1252a Mon Sep 17 00:00:00 2001 From: Angela Ambroz Date: Fri, 13 Dec 2019 11:29:13 -0500 Subject: [PATCH 10/19] PR comments --- doc/modules/linear_model.rst | 2 +- sklearn/linear_model/_least_angle.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index c0799528ecd07..19205385f311b 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -511,7 +511,7 @@ function of the norm of its coefficients. >>> reg.fit([[0, 0], [1, 1]], [0, 1]) LassoLars(alpha=0.1) >>> reg.coef_ - array([0.0.717157..., 0. ]) + array([0.717157..., 0. ]) .. 
topic:: Examples: diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index ffe05de762664..0b33de6f6619c 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -979,8 +979,7 @@ def fit(self, X, y, Xy=None): if self.jitter: generator = check_random_state(self.random_state) - noise = generator.uniform(high=self.jitter, - size=len(y)) + noise = generator.uniform(high=self.jitter, size=len(y)) if y.ndim == 2: noise = noise.reshape(-1, 1) y = y + noise From b817abf1bdc516745ccdafcf8ca29b9e2237ddb5 Mon Sep 17 00:00:00 2001 From: Angela Ambroz Date: Fri, 13 Dec 2019 12:36:01 -0500 Subject: [PATCH 11/19] PR comments --- sklearn/linear_model/_least_angle.py | 20 +++++++++++-------- .../linear_model/tests/test_least_angle.py | 5 +++-- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 0b33de6f6619c..757ed1cd06372 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -980,8 +980,6 @@ def fit(self, X, y, Xy=None): generator = check_random_state(self.random_state) noise = generator.uniform(high=self.jitter, size=len(y)) - if y.ndim == 2: - noise = noise.reshape(-1, 1) y = y + noise self._fit(X, y, max_iter=max_iter, alpha=alpha, fit_path=self.fit_path, @@ -1051,11 +1049,6 @@ class LassoLars(Lars): setting ``fit_path`` to ``False`` will lead to a speedup, especially with a small alpha. - jitter : float, default=None - Uniform noise parameter, added to the y values, to satisfy - the model's assumption of one-at-a-time computations - (Efron et al. 2004). - positive : bool, default=False Restrict coefficients to be >= 0. Be aware that you might want to remove fit_intercept which is set True by default. @@ -1066,6 +1059,16 @@ class LassoLars(Lars): algorithm are typically in congruence with the solution of the coordinate descent Lasso estimator. + jitter : float, default=None + Uniform noise parameter, added to the y values, to satisfy + the model's assumption of one-at-a-time computations + (Efron et al. 2004). + + random_state : int, RandomState instance or None (default) + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. 
+ Attributes ---------- alphas_ : array-like of shape (n_alphas + 1,) | list of n_targets such \ @@ -1118,7 +1121,7 @@ class LassoLars(Lars): def __init__(self, alpha=1.0, fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, - positive=False, jitter=None): + positive=False, jitter=None, random_state=None): self.alpha = alpha self.fit_intercept = fit_intercept self.max_iter = max_iter @@ -1130,6 +1133,7 @@ def __init__(self, alpha=1.0, fit_intercept=True, verbose=False, self.eps = eps self.fit_path = fit_path self.jitter = jitter + self.random_state = random_state ############################################################################### diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index 979f5edf335bc..8967f017480f4 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -735,7 +735,7 @@ def test_lasso_lars_fit_copyX_behaviour(copy_X): @pytest.mark.parametrize('y_list, expected_y', [ ([-2.5, -2.5], [0, 2.5, 0, 2.5, 0]), ([[-2.5, -2.5], [-2.5, -2.5]], - [[0, 2.5, 0, 2.5, 0], [0, 2.5, 0, 2.5, 0]])]) + [[0, 5, 0, 2.5, 0], [0, 5, 0, 2.5, 0]])]) def test_lars_with_jitter(y_list, expected_y): """ Test that user input of a small amount of jitter, @@ -752,7 +752,8 @@ def test_lars_with_jitter(y_list, expected_y): lars = linear_model.LassoLars(alpha=alpha, fit_intercept=fit_intercept) lars_with_jitter = linear_model.LassoLars(alpha=alpha, fit_intercept=fit_intercept, - jitter=10e-5) + jitter=10e-8, + random_state=0) lars.fit(X, y) lars_with_jitter.fit(X, y) From 6c3a24f3748e29d3838052544d804aef1484df90 Mon Sep 17 00:00:00 2001 From: Angela Ambroz Date: Fri, 13 Dec 2019 15:10:20 -0500 Subject: [PATCH 12/19] PR comments --- sklearn/linear_model/_least_angle.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 757ed1cd06372..7aa9a3e61fc2f 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -811,8 +811,7 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): jitter : float, default=None Uniform noise parameter, added to the y values, to satisfy - the model's assumption of one-at-a-time computations - (Efron et al. 2004). + the model's assumption of one-at-a-time computations. random_state : int, RandomState instance or None (default) Determines random number generation for dataset creation. Pass an int @@ -976,10 +975,10 @@ def fit(self, X, y, Xy=None): else: max_iter = self.max_iter - if self.jitter: - generator = check_random_state(self.random_state) + if self.jitter is not None: + rng = check_random_state(self.random_state) - noise = generator.uniform(high=self.jitter, size=len(y)) + noise = rng.uniform(high=self.jitter, size=len(y)) y = y + noise self._fit(X, y, max_iter=max_iter, alpha=alpha, fit_path=self.fit_path, @@ -1061,8 +1060,7 @@ class LassoLars(Lars): jitter : float, default=None Uniform noise parameter, added to the y values, to satisfy - the model's assumption of one-at-a-time computations - (Efron et al. 2004). + the model's assumption of one-at-a-time computations. random_state : int, RandomState instance or None (default) Determines random number generation for dataset creation. 
Pass an int From f6e7aa59dced51438cd0c180b54dfe7c9406096b Mon Sep 17 00:00:00 2001 From: Angela Ambroz Date: Fri, 17 Jan 2020 15:44:17 -0500 Subject: [PATCH 13/19] Linting --- sklearn/linear_model/tests/test_least_angle.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index 08348a8c56032..a848df6149586 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -770,4 +770,3 @@ def test_X_none_gram_not_none(): with pytest.raises(ValueError, match="X cannot be None if Gram is not None"): lars_path(X=None, y=[1], Gram='not None') - From 0a1922567f5c9e0cd9fa5e8d627c38c04949ce00 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 15 Apr 2020 09:04:14 -0400 Subject: [PATCH 14/19] Apply suggestions from code review --- sklearn/linear_model/_least_angle.py | 18 ++++++++++-------- sklearn/linear_model/tests/test_least_angle.py | 11 ++++------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 5c75d3cd45ef8..a61ffcadc77c5 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -800,13 +800,14 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): with a small alpha. jitter : float, default=None - Uniform noise parameter, added to the y values, to satisfy - the model's assumption of one-at-a-time computations. + Upper bound on a uniform noise parameter to be added to the + `y` values, to satisfy the model's assumption of + one-at-a-time computations. Might help with stability. random_state : int, RandomState instance or None (default) - Determines random number generation for dataset creation. Pass an int + Determines random number generation for jittering. Pass an int for reproducible output across multiple function calls. - See :term:`Glossary `. + See :term:`Glossary `. Ignored if `jitter` is None. Attributes ---------- @@ -1049,13 +1050,14 @@ class LassoLars(Lars): coordinate descent Lasso estimator. jitter : float, default=None - Uniform noise parameter, added to the y values, to satisfy - the model's assumption of one-at-a-time computations. + Upper bound on a uniform noise parameter to be added to the + `y` values, to satisfy the model's assumption of + one-at-a-time computations. Might help with stability. random_state : int, RandomState instance or None (default) - Determines random number generation for dataset creation. Pass an int + Determines random number generation for jittering. Pass an int for reproducible output across multiple function calls. - See :term:`Glossary `. + See :term:`Glossary `. Ignored if `jitter` is None. 
Attributes ---------- diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index a848df6149586..481b44740e3ac 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -733,16 +733,13 @@ def test_lasso_lars_fit_copyX_behaviour(copy_X): assert copy_X == np.array_equal(X, X_copy) -@pytest.mark.parametrize('y_list, expected_y', [ +@pytest.mark.parametrize('y, expected_coef', [ ([-2.5, -2.5], [0, 2.5, 0, 2.5, 0]), ([[-2.5, -2.5], [-2.5, -2.5]], [[0, 5, 0, 2.5, 0], [0, 5, 0, 2.5, 0]])]) def test_lars_with_jitter(y_list, expected_y): - """ - Test that user input of a small amount of jitter, - using example provided in issue #2746 - - """ + # Test that a small amount of jitter helps stability, + # using example provided in issue #2746 X = np.array([[0.0, 0.0, 0.0, -1.0, 0.0], [0.0, -1.0, 0.0, 0.0, 0.0]]) y = np.array(y_list) @@ -763,7 +760,7 @@ def test_lars_with_jitter(y_list, expected_y): w_jitter = lars_with_jitter.coef_ assert not np.array_equal(w_jitter, w_nojitter) - assert_array_almost_equal(w_jitter, expected_output, decimal=2) + np.testing.assert_allclose(w_jitter, expected_coeff)``` def test_X_none_gram_not_none(): From 58e807bd88e1893cc04c1def332b29fe7fedd451 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 15 Apr 2020 09:22:47 -0400 Subject: [PATCH 15/19] addressed comments --- .../linear_model/tests/test_least_angle.py | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index 1fe12d1cf9293..fdcbb96c56cf9 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -733,16 +733,16 @@ def test_lasso_lars_fit_copyX_behaviour(copy_X): assert copy_X == np.array_equal(X, X_copy) -@pytest.mark.parametrize('y, expected_coef', [ - ([-2.5, -2.5], [0, 2.5, 0, 2.5, 0]), - ([[-2.5, -2.5], [-2.5, -2.5]], - [[0, 5, 0, 2.5, 0], [0, 5, 0, 2.5, 0]])]) -def test_lars_with_jitter(y, expected_coef): +def test_lars_with_jitter(): # Test that a small amount of jitter helps stability, # using example provided in issue #2746 - X = np.array([[0.0, 0.0, 0.0, -1.0, 0.0], [0.0, -1.0, 0.0, 0.0, 0.0]]) + X = np.array([[0.0, 0.0, 0.0, -1.0, 0.0], + [0.0, -1.0, 0.0, 0.0, 0.0]]) + y = [-2.5, -2.5] + expected_coef = [0, 2.5, 0, 2.5, 0] alpha = 0.001 + # set to False since target is constant and we check the value of coef fit_intercept = False lars = linear_model.LassoLars(alpha=alpha, fit_intercept=fit_intercept) @@ -754,11 +754,9 @@ def test_lars_with_jitter(y, expected_coef): lars.fit(X, y) lars_with_jitter.fit(X, y) - w_nojitter = lars.coef_ - w_jitter = lars_with_jitter.coef_ - - assert not np.array_equal(w_jitter, w_nojitter) - np.testing.assert_allclose(w_jitter, expected_coef, rtol=1e-3) + assert np.mean((lars.coef_ - lars_with_jitter.coef_)**2) > .1 + np.testing.assert_allclose(lars_with_jitter.coef_, expected_coef, + rtol=1e-3) def test_X_none_gram_not_none(): From 547d4167296e8c746aebee64553c78211c056bcd Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 15 Apr 2020 09:24:50 -0400 Subject: [PATCH 16/19] added whatnew entry --- doc/whats_new/v0.23.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 4c489c1887815..e5e0999e20557 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -298,6 +298,10 @@ Changelog of strictly inferior 
for maximum of `absgrad` and `tol` in `utils.optimize._newton_cg`. :pr:`16266` by :user:`Rushabh Vasani `. +- |Enhancement| :class:`linear_model.LassoLars` now support a `jitter` + parameter that adds random noise to the target. This might help with + stability in some edge cases. :pr:`15179` by :user:`angelaambroz`. + :mod:`sklearn.metrics` ...................... From e9839f59e303b559e34ddb98148ea2500c3c7dfc Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 15 Apr 2020 09:32:34 -0400 Subject: [PATCH 17/19] test both estimators --- .../linear_model/tests/test_least_angle.py | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index fdcbb96c56cf9..e198dfb15e323 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -6,6 +6,7 @@ import pytest from scipy import linalg +from sklearn.base import clone from sklearn.model_selection import train_test_split from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_almost_equal @@ -17,6 +18,7 @@ from sklearn import linear_model, datasets from sklearn.linear_model._least_angle import _lars_path_residues from sklearn.linear_model import LassoLarsIC, lars_path +from sklearn.linear_model import Lars, LassoLars # TODO: use another dataset that has multiple drops diabetes = datasets.load_diabetes() @@ -733,7 +735,8 @@ def test_lasso_lars_fit_copyX_behaviour(copy_X): assert copy_X == np.array_equal(X, X_copy) -def test_lars_with_jitter(): +@pytest.mark.parametrize('est', (LassoLars(alpha=1e-3), Lars())) +def test_lars_with_jitter(est): # Test that a small amount of jitter helps stability, # using example provided in issue #2746 @@ -741,22 +744,17 @@ def test_lars_with_jitter(): [0.0, -1.0, 0.0, 0.0, 0.0]]) y = [-2.5, -2.5] expected_coef = [0, 2.5, 0, 2.5, 0] - alpha = 0.001 - # set to False since target is constant and we check the value of coef - fit_intercept = False - lars = linear_model.LassoLars(alpha=alpha, fit_intercept=fit_intercept) - lars_with_jitter = linear_model.LassoLars(alpha=alpha, - fit_intercept=fit_intercept, - jitter=10e-8, - random_state=0) + # set to fit_intercept to False since target is constant and we want check + # the value of coef. coef would be all zeros otherwise. + est.set_params(fit_intercept=False) + est_jitter = clone(est).set_params(jitter=10e-8, random_state=0) - lars.fit(X, y) - lars_with_jitter.fit(X, y) + est.fit(X, y) + est_jitter.fit(X, y) - assert np.mean((lars.coef_ - lars_with_jitter.coef_)**2) > .1 - np.testing.assert_allclose(lars_with_jitter.coef_, expected_coef, - rtol=1e-3) + assert np.mean((est.coef_ - est_jitter.coef_)**2) > .1 + np.testing.assert_allclose(est_jitter.coef_, expected_coef, rtol=1e-3) def test_X_none_gram_not_none(): From 09e3cc3b03e679528f92cd90021e15c5e80984de Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 15 Apr 2020 09:33:39 -0400 Subject: [PATCH 18/19] update whatsnew --- doc/whats_new/v0.23.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index e5e0999e20557..c4fa3818aa1bc 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -298,9 +298,10 @@ Changelog of strictly inferior for maximum of `absgrad` and `tol` in `utils.optimize._newton_cg`. :pr:`16266` by :user:`Rushabh Vasani `. 
-- |Enhancement| :class:`linear_model.LassoLars` now support a `jitter` - parameter that adds random noise to the target. This might help with - stability in some edge cases. :pr:`15179` by :user:`angelaambroz`. +- |Enhancement| :class:`linear_model.LassoLars` and + :class:`linear_model.Lars` now support a `jitter` parameter that adds + random noise to the target. This might help with stability in some edge + cases. :pr:`15179` by :user:`angelaambroz`. :mod:`sklearn.metrics` ...................... From 24e7b81a04f6eb012e4cb2aa467711d5989bbece Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 17 Apr 2020 10:00:09 -0400 Subject: [PATCH 19/19] removed random_state for lassolarsIC --- sklearn/linear_model/_least_angle.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 8e940153ed145..bc71d7a1fccbd 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -1743,8 +1743,7 @@ class LassoLarsIC(LassoLars): """ def __init__(self, criterion='aic', fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, - eps=np.finfo(np.float).eps, copy_X=True, positive=False, - random_state=None): + eps=np.finfo(np.float).eps, copy_X=True, positive=False): self.criterion = criterion self.fit_intercept = fit_intercept self.positive = positive @@ -1755,7 +1754,6 @@ def __init__(self, criterion='aic', fit_intercept=True, verbose=False, self.precompute = precompute self.eps = eps self.fit_path = True - self.random_state = random_state def _more_tags(self): return {'multioutput': False}
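
For reference, a minimal usage sketch of the `jitter`/`random_state` API that this patch series settles on. It is illustrative only: it assumes a scikit-learn build with these patches applied, and it simply re-runs the degenerate example from issue #2746 that the regression test in this series exercises.

    import numpy as np
    from sklearn.linear_model import LassoLars

    # Degenerate design from issue #2746: duplicated targets violate the
    # one-at-a-time assumption of LARS unless a little noise is added.
    X = np.array([[0.0, 0.0, 0.0, -1.0, 0.0],
                  [0.0, -1.0, 0.0, 0.0, 0.0]])
    y = np.array([-2.5, -2.5])

    # `jitter` is the upper bound of the uniform noise added to y before
    # fitting; `random_state` makes that noise, and hence coef_, reproducible.
    reg = LassoLars(alpha=0.001, fit_intercept=False,
                    jitter=10e-8, random_state=0)
    reg.fit(X, y)
    print(reg.coef_)  # close to [0, 2.5, 0, 2.5, 0] per the regression test

Leaving `jitter=None` (the default) skips the added noise entirely, so existing callers see no change in behaviour.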