From 55ff4698a3f2229a8e81965913c5c653d7fa0391 Mon Sep 17 00:00:00 2001 From: Alexandre Gramfort Date: Tue, 26 Feb 2019 18:22:41 +0100 Subject: [PATCH 1/5] use pytest and not unitest in test_sgd.py --- sklearn/linear_model/tests/test_sgd.py | 2222 +++++++++++++----------- 1 file changed, 1164 insertions(+), 1058 deletions(-) diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index 629933a3bc8e9..236d3a9bea426 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -1,6 +1,5 @@ from distutils.version import LooseVersion import pickle -import unittest import pytest import numpy as np @@ -21,7 +20,6 @@ from sklearn import linear_model, datasets, metrics from sklearn.base import clone, is_classifier -from sklearn.linear_model import SGDClassifier, SGDRegressor from sklearn.preprocessing import LabelEncoder, scale, MinMaxScaler from sklearn.preprocessing import StandardScaler from sklearn.exceptions import ConvergenceWarning @@ -38,8 +36,17 @@ "ignore:max_iter and tol parameters have been") -class SparseSGDClassifier(SGDClassifier): +def _update_kwargs(kwargs): + if "random_state" not in kwargs: + kwargs["random_state"] = 42 + if "tol" not in kwargs: + kwargs["tol"] = None + if "max_iter" not in kwargs: + kwargs["max_iter"] = 5 + + +class _SparseSGDClassifier(linear_model.SGDClassifier): def fit(self, X, y, *args, **kw): X = sp.csr_matrix(X) return super().fit(X, y, *args, **kw) @@ -57,19 +64,39 @@ def predict_proba(self, X): return super().predict_proba(X) -class SparseSGDRegressor(SGDRegressor): - +class _SparseSGDRegressor(linear_model.SGDRegressor): def fit(self, X, y, *args, **kw): X = sp.csr_matrix(X) - return SGDRegressor.fit(self, X, y, *args, **kw) + return linear_model.SGDRegressor.fit(self, X, y, *args, **kw) def partial_fit(self, X, y, *args, **kw): X = sp.csr_matrix(X) - return SGDRegressor.partial_fit(self, X, y, *args, **kw) + return 
linear_model.SGDRegressor.partial_fit(self, X, y, *args, **kw) def decision_function(self, X, *args, **kw): X = sp.csr_matrix(X) - return SGDRegressor.decision_function(self, X, *args, **kw) + return linear_model.SGDRegressor.decision_function(self, X, *args, + **kw) + + +def SGDClassifier(**kwargs): + _update_kwargs(kwargs) + return linear_model.SGDClassifier(**kwargs) + + +def SGDRegressor(**kwargs): + _update_kwargs(kwargs) + return linear_model.SGDRegressor(**kwargs) + + +def SparseSGDClassifier(**kwargs): + _update_kwargs(kwargs) + return _SparseSGDClassifier(**kwargs) + + +def SparseSGDRegressor(**kwargs): + _update_kwargs(kwargs) + return _SparseSGDRegressor(**kwargs) # Test Data @@ -113,1102 +140,1181 @@ def decision_function(self, X, *args, **kw): ############################################################################### # Tests common to classification and regression -class CommonTest: - - def factory(self, **kwargs): - if "random_state" not in kwargs: - kwargs["random_state"] = 42 +# a simple implementation of ASGD to use for testing +# uses squared loss to find the gradient +def asgd(klass, X, y, eta, alpha, weight_init=None, intercept_init=0.0): + if weight_init is None: + weights = np.zeros(X.shape[1]) + else: + weights = weight_init + + average_weights = np.zeros(X.shape[1]) + intercept = intercept_init + average_intercept = 0.0 + decay = 1.0 - if "tol" not in kwargs: - kwargs["tol"] = None - if "max_iter" not in kwargs: - kwargs["max_iter"] = 5 + # sparse data has a fixed decay of .01 + if klass in (SparseSGDClassifier, SparseSGDRegressor): + decay = .01 - return self.factory_class(**kwargs) + for i, entry in enumerate(X): + p = np.dot(entry, weights) + p += intercept + gradient = p - y[i] + weights *= 1.0 - (eta * alpha) + weights += -(eta * gradient * entry) + intercept += -(eta * gradient) * decay + + average_weights *= i + average_weights += weights + average_weights /= i + 1.0 + + average_intercept *= i + average_intercept += intercept + 
average_intercept /= i + 1.0 + + return average_weights, average_intercept + + +def _test_warm_start(klass, X, Y, lr): + # Test that explicit warm restart... + clf = klass(alpha=0.01, eta0=0.01, shuffle=False, + learning_rate=lr) + clf.fit(X, Y) + + clf2 = klass(alpha=0.001, eta0=0.01, shuffle=False, + learning_rate=lr) + clf2.fit(X, Y, + coef_init=clf.coef_.copy(), + intercept_init=clf.intercept_.copy()) + + # ... and implicit warm restart are equivalent. + clf3 = klass(alpha=0.01, eta0=0.01, shuffle=False, + warm_start=True, learning_rate=lr) + clf3.fit(X, Y) + + assert_equal(clf3.t_, clf.t_) + assert_array_almost_equal(clf3.coef_, clf.coef_) + + clf3.set_params(alpha=0.001) + clf3.fit(X, Y) + + assert_equal(clf3.t_, clf2.t_) + assert_array_almost_equal(clf3.coef_, clf2.coef_) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize('lr', + ["constant", "optimal", "invscaling", "adaptive"]) +def test_warm_start(klass, lr): + _test_warm_start(klass, X, Y, lr) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) +def test_input_format(klass): + # Input format tests. + clf = klass(alpha=0.01, shuffle=False) + clf.fit(X, Y) + Y_ = np.array(Y)[:, np.newaxis] + + Y_ = np.c_[Y_, Y_] + assert_raises(ValueError, clf.fit, X, Y_) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) +def test_clone(klass): + # Test whether clone works ok. 
+ clf = klass(alpha=0.01, penalty='l1') + clf = clone(clf) + clf.set_params(penalty='l2') + clf.fit(X, Y) + + clf2 = klass(alpha=0.01, penalty='l2') + clf2.fit(X, Y) + + assert_array_equal(clf.coef_, clf2.coef_) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) +def test_plain_has_no_average_attr(klass): + clf = klass(average=True, eta0=.01) + clf.fit(X, Y) + + assert hasattr(clf, 'average_coef_') + assert hasattr(clf, 'average_intercept_') + assert hasattr(clf, 'standard_intercept_') + assert hasattr(clf, 'standard_coef_') + + clf = klass() + clf.fit(X, Y) + + assert not hasattr(clf, 'average_coef_') + assert not hasattr(clf, 'average_intercept_') + assert not hasattr(clf, 'standard_intercept_') + assert not hasattr(clf, 'standard_coef_') - # a simple implementation of ASGD to use for testing - # uses squared loss to find the gradient - def asgd(self, X, y, eta, alpha, weight_init=None, intercept_init=0.0): - if weight_init is None: - weights = np.zeros(X.shape[1]) + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) +def test_late_onset_averaging_not_reached(klass): + clf1 = klass(average=600) + clf2 = klass() + for _ in range(100): + if is_classifier(clf1): + clf1.partial_fit(X, Y, classes=np.unique(Y)) + clf2.partial_fit(X, Y, classes=np.unique(Y)) else: - weights = weight_init - - average_weights = np.zeros(X.shape[1]) - intercept = intercept_init - average_intercept = 0.0 - decay = 1.0 - - # sparse data has a fixed decay of .01 - if (isinstance(self, SparseSGDClassifierTestCase) or - isinstance(self, SparseSGDRegressorTestCase)): - decay = .01 - - for i, entry in enumerate(X): - p = np.dot(entry, weights) - p += intercept - gradient = p - y[i] - weights *= 1.0 - (eta * alpha) - weights += -(eta * gradient * entry) - intercept += -(eta * gradient) * decay - - average_weights *= i - average_weights += weights - average_weights /= i + 1.0 - - 
average_intercept *= i - average_intercept += intercept - average_intercept /= i + 1.0 - - return average_weights, average_intercept - - def _test_warm_start(self, X, Y, lr): - # Test that explicit warm restart... - clf = self.factory(alpha=0.01, eta0=0.01, shuffle=False, - learning_rate=lr) - clf.fit(X, Y) - - clf2 = self.factory(alpha=0.001, eta0=0.01, shuffle=False, - learning_rate=lr) - clf2.fit(X, Y, - coef_init=clf.coef_.copy(), - intercept_init=clf.intercept_.copy()) - - # ... and implicit warm restart are equivalent. - clf3 = self.factory(alpha=0.01, eta0=0.01, shuffle=False, - warm_start=True, learning_rate=lr) - clf3.fit(X, Y) - - assert_equal(clf3.t_, clf.t_) - assert_array_almost_equal(clf3.coef_, clf.coef_) - - clf3.set_params(alpha=0.001) - clf3.fit(X, Y) - - assert_equal(clf3.t_, clf2.t_) - assert_array_almost_equal(clf3.coef_, clf2.coef_) - - def test_warm_start_constant(self): - self._test_warm_start(X, Y, "constant") - - def test_warm_start_invscaling(self): - self._test_warm_start(X, Y, "invscaling") - - def test_warm_start_optimal(self): - self._test_warm_start(X, Y, "optimal") - - def test_warm_start_adaptive(self): - self._test_warm_start(X, Y, "adaptive") - - def test_input_format(self): - # Input format tests. - clf = self.factory(alpha=0.01, shuffle=False) - clf.fit(X, Y) - Y_ = np.array(Y)[:, np.newaxis] - - Y_ = np.c_[Y_, Y_] - assert_raises(ValueError, clf.fit, X, Y_) - - def test_clone(self): - # Test whether clone works ok. 
- clf = self.factory(alpha=0.01, penalty='l1') - clf = clone(clf) - clf.set_params(penalty='l2') - clf.fit(X, Y) - - clf2 = self.factory(alpha=0.01, penalty='l2') - clf2.fit(X, Y) - - assert_array_equal(clf.coef_, clf2.coef_) - - def test_plain_has_no_average_attr(self): - clf = self.factory(average=True, eta0=.01) - clf.fit(X, Y) - - assert hasattr(clf, 'average_coef_') - assert hasattr(clf, 'average_intercept_') - assert hasattr(clf, 'standard_intercept_') - assert hasattr(clf, 'standard_coef_') - - clf = self.factory() - clf.fit(X, Y) - - assert not hasattr(clf, 'average_coef_') - assert not hasattr(clf, 'average_intercept_') - assert not hasattr(clf, 'standard_intercept_') - assert not hasattr(clf, 'standard_coef_') - - def test_late_onset_averaging_not_reached(self): - clf1 = self.factory(average=600) - clf2 = self.factory() - for _ in range(100): - if isinstance(clf1, SGDClassifier): - clf1.partial_fit(X, Y, classes=np.unique(Y)) - clf2.partial_fit(X, Y, classes=np.unique(Y)) - else: - clf1.partial_fit(X, Y) - clf2.partial_fit(X, Y) - - assert_array_almost_equal(clf1.coef_, clf2.coef_, decimal=16) - assert_almost_equal(clf1.intercept_, clf2.intercept_, decimal=16) - - def test_late_onset_averaging_reached(self): - eta0 = .001 - alpha = .0001 - Y_encode = np.array(Y) - Y_encode[Y_encode == 1] = -1.0 - Y_encode[Y_encode == 2] = 1.0 - - clf1 = self.factory(average=7, learning_rate="constant", - loss='squared_loss', eta0=eta0, - alpha=alpha, max_iter=2, shuffle=False) - clf2 = self.factory(average=0, learning_rate="constant", - loss='squared_loss', eta0=eta0, - alpha=alpha, max_iter=1, shuffle=False) - - clf1.fit(X, Y_encode) - clf2.fit(X, Y_encode) - - average_weights, average_intercept = \ - self.asgd(X, Y_encode, eta0, alpha, - weight_init=clf2.coef_.ravel(), - intercept_init=clf2.intercept_) - - assert_array_almost_equal(clf1.coef_.ravel(), - average_weights.ravel(), - decimal=16) - assert_almost_equal(clf1.intercept_, average_intercept, decimal=16) - - def 
test_sgd_bad_alpha_for_optimal_learning_rate(self): - # Check whether expected ValueError on bad alpha, i.e. 0 - # since alpha is used to compute the optimal learning rate - assert_raises(ValueError, self.factory, - alpha=0, learning_rate="optimal") - - def test_early_stopping(self): - X = iris.data[iris.target > 0] - Y = iris.target[iris.target > 0] - for early_stopping in [True, False]: - max_iter = 1000 - clf = self.factory(early_stopping=early_stopping, tol=1e-3, - max_iter=max_iter).fit(X, Y) - assert clf.n_iter_ < max_iter - - def test_adaptive_longer_than_constant(self): - clf1 = self.factory(learning_rate="adaptive", eta0=0.01, tol=1e-3, - max_iter=100) - clf1.fit(iris.data, iris.target) - clf2 = self.factory(learning_rate="constant", eta0=0.01, tol=1e-3, - max_iter=100) - clf2.fit(iris.data, iris.target) - assert clf1.n_iter_ > clf2.n_iter_ - - def test_validation_set_not_used_for_training(self): - X, Y = iris.data, iris.target - validation_fraction = 0.4 - seed = 42 - shuffle = False - max_iter = 10 - clf1 = self.factory(early_stopping=True, - random_state=np.random.RandomState(seed), - validation_fraction=validation_fraction, - learning_rate='constant', eta0=0.01, - tol=None, max_iter=max_iter, shuffle=shuffle) - clf1.fit(X, Y) - assert clf1.n_iter_ == max_iter - - clf2 = self.factory(early_stopping=False, - random_state=np.random.RandomState(seed), - learning_rate='constant', eta0=0.01, - tol=None, max_iter=max_iter, shuffle=shuffle) - - if is_classifier(clf2): - cv = StratifiedShuffleSplit(test_size=validation_fraction, - random_state=seed) - else: - cv = ShuffleSplit(test_size=validation_fraction, - random_state=seed) - idx_train, idx_val = next(cv.split(X, Y)) - idx_train = np.sort(idx_train) # remove shuffling - clf2.fit(X[idx_train], Y[idx_train]) - assert clf2.n_iter_ == max_iter - - assert_array_equal(clf1.coef_, clf2.coef_) - - def test_n_iter_no_change(self): - X, Y = iris.data, iris.target - # test that n_iter_ increases monotonically with 
n_iter_no_change - for early_stopping in [True, False]: - n_iter_list = [self.factory(early_stopping=early_stopping, - n_iter_no_change=n_iter_no_change, - tol=1e-4, max_iter=1000 - ).fit(X, Y).n_iter_ - for n_iter_no_change in [2, 3, 10]] - assert_array_equal(n_iter_list, sorted(n_iter_list)) - - def test_not_enough_sample_for_early_stopping(self): - # test an error is raised if the training or validation set is empty - clf = self.factory(early_stopping=True, validation_fraction=0.99) - with pytest.raises(ValueError): - clf.fit(X3, Y3) + clf1.partial_fit(X, Y) + clf2.partial_fit(X, Y) + + assert_array_almost_equal(clf1.coef_, clf2.coef_, decimal=16) + assert_almost_equal(clf1.intercept_, clf2.intercept_, decimal=16) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) +def test_late_onset_averaging_reached(klass): + eta0 = .001 + alpha = .0001 + Y_encode = np.array(Y) + Y_encode[Y_encode == 1] = -1.0 + Y_encode[Y_encode == 2] = 1.0 + + clf1 = klass(average=7, learning_rate="constant", + loss='squared_loss', eta0=eta0, + alpha=alpha, max_iter=2, shuffle=False) + clf2 = klass(average=0, learning_rate="constant", + loss='squared_loss', eta0=eta0, + alpha=alpha, max_iter=1, shuffle=False) + + clf1.fit(X, Y_encode) + clf2.fit(X, Y_encode) + + average_weights, average_intercept = \ + asgd(klass, X, Y_encode, eta0, alpha, + weight_init=clf2.coef_.ravel(), + intercept_init=clf2.intercept_) + + assert_array_almost_equal(clf1.coef_.ravel(), + average_weights.ravel(), + decimal=16) + assert_almost_equal(clf1.intercept_, average_intercept, decimal=16) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) +def test_sgd_bad_alpha_for_optimal_learning_rate(klass): + # Check whether expected ValueError on bad alpha, i.e. 
0 + # since alpha is used to compute the optimal learning rate + assert_raises(ValueError, klass, + alpha=0, learning_rate="optimal") + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) +def test_early_stopping(klass): + X = iris.data[iris.target > 0] + Y = iris.target[iris.target > 0] + for early_stopping in [True, False]: + max_iter = 1000 + clf = klass(early_stopping=early_stopping, tol=1e-3, + max_iter=max_iter).fit(X, Y) + assert clf.n_iter_ < max_iter + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) +def test_adaptive_longer_than_constant(klass): + clf1 = klass(learning_rate="adaptive", eta0=0.01, tol=1e-3, + max_iter=100) + clf1.fit(iris.data, iris.target) + clf2 = klass(learning_rate="constant", eta0=0.01, tol=1e-3, + max_iter=100) + clf2.fit(iris.data, iris.target) + assert clf1.n_iter_ > clf2.n_iter_ + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) +def test_validation_set_not_used_for_training(klass): + X, Y = iris.data, iris.target + validation_fraction = 0.4 + seed = 42 + shuffle = False + max_iter = 10 + clf1 = klass(early_stopping=True, + random_state=np.random.RandomState(seed), + validation_fraction=validation_fraction, + learning_rate='constant', eta0=0.01, + tol=None, max_iter=max_iter, shuffle=shuffle) + clf1.fit(X, Y) + assert clf1.n_iter_ == max_iter + + clf2 = klass(early_stopping=False, + random_state=np.random.RandomState(seed), + learning_rate='constant', eta0=0.01, + tol=None, max_iter=max_iter, shuffle=shuffle) + + if is_classifier(clf2): + cv = StratifiedShuffleSplit(test_size=validation_fraction, + random_state=seed) + else: + cv = ShuffleSplit(test_size=validation_fraction, + random_state=seed) + idx_train, idx_val = next(cv.split(X, Y)) + idx_train = np.sort(idx_train) # remove shuffling + clf2.fit(X[idx_train], Y[idx_train]) + assert clf2.n_iter_ 
== max_iter + + assert_array_equal(clf1.coef_, clf2.coef_) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) +def test_n_iter_no_change(klass): + X, Y = iris.data, iris.target + # test that n_iter_ increases monotonically with n_iter_no_change + for early_stopping in [True, False]: + n_iter_list = [klass(early_stopping=early_stopping, + n_iter_no_change=n_iter_no_change, + tol=1e-4, max_iter=1000 + ).fit(X, Y).n_iter_ + for n_iter_no_change in [2, 3, 10]] + assert_array_equal(n_iter_list, sorted(n_iter_list)) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) +def test_not_enough_sample_for_early_stopping(klass): + # test an error is raised if the training or validation set is empty + clf = klass(early_stopping=True, validation_fraction=0.99) + with pytest.raises(ValueError): + clf.fit(X3, Y3) ############################################################################### # Classification Test Case -class DenseSGDClassifierTestCase(unittest.TestCase, CommonTest): - """Test suite for the dense representation variant of SGD""" - factory_class = SGDClassifier - - def test_sgd(self): - # Check that SGD gives any results :-) - - for loss in ("hinge", "squared_hinge", "log", "modified_huber"): - clf = self.factory(penalty='l2', alpha=0.01, fit_intercept=True, - loss=loss, max_iter=10, shuffle=True) - clf.fit(X, Y) - # assert_almost_equal(clf.coef_[0], clf.coef_[1], decimal=7) - assert_array_equal(clf.predict(T), true_result) - - def test_sgd_bad_l1_ratio(self): - # Check whether expected ValueError on bad l1_ratio - assert_raises(ValueError, self.factory, l1_ratio=1.1) - - def test_sgd_bad_learning_rate_schedule(self): - # Check whether expected ValueError on bad learning_rate - assert_raises(ValueError, self.factory, learning_rate="") - - def test_sgd_bad_eta0(self): - # Check whether expected ValueError on bad eta0 - assert_raises(ValueError, 
self.factory, eta0=0, - learning_rate="constant") - - def test_sgd_bad_alpha(self): - # Check whether expected ValueError on bad alpha - assert_raises(ValueError, self.factory, alpha=-.1) - - def test_sgd_bad_penalty(self): - # Check whether expected ValueError on bad penalty - assert_raises(ValueError, self.factory, penalty='foobar', - l1_ratio=0.85) - - def test_sgd_bad_loss(self): - # Check whether expected ValueError on bad loss - assert_raises(ValueError, self.factory, loss="foobar") - - def test_sgd_max_iter_param(self): - # Test parameter validity check - assert_raises(ValueError, self.factory, max_iter=-10000) - - def test_sgd_shuffle_param(self): - # Test parameter validity check - assert_raises(ValueError, self.factory, shuffle="false") - - def test_sgd_early_stopping_param(self): - # Test parameter validity check - assert_raises(ValueError, self.factory, early_stopping="false") - - def test_sgd_validation_fraction(self): - # Test parameter validity check - assert_raises(ValueError, self.factory, validation_fraction=-.1) - - def test_sgd_n_iter_no_change(self): - # Test parameter validity check - assert_raises(ValueError, self.factory, n_iter_no_change=0) - - def test_argument_coef(self): - # Checks coef_init not allowed as model argument (only fit) - # Provided coef_ does not match dataset - assert_raises(TypeError, self.factory, coef_init=np.zeros((3,))) - - def test_provide_coef(self): - # Checks coef_init shape for the warm starts - # Provided coef_ does not match dataset. - assert_raises(ValueError, self.factory().fit, - X, Y, coef_init=np.zeros((3,))) - - def test_set_intercept(self): - # Checks intercept_ shape for the warm starts - # Provided intercept_ does not match dataset. 
- assert_raises(ValueError, self.factory().fit, - X, Y, intercept_init=np.zeros((3,))) - - def test_sgd_early_stopping_with_partial_fit(self): - # Test parameter validity check - assert_raises(ValueError, - self.factory(early_stopping=True).partial_fit, X, Y) - - def test_set_intercept_binary(self): - # Checks intercept_ shape for the warm starts in binary case - self.factory().fit(X5, Y5, intercept_init=0) - - def test_average_binary_computed_correctly(self): - # Checks the SGDClassifier correctly computes the average weights - eta = .1 - alpha = 2. - n_samples = 20 - n_features = 10 - rng = np.random.RandomState(0) - X = rng.normal(size=(n_samples, n_features)) - w = rng.normal(size=n_features) - - clf = self.factory(loss='squared_loss', - learning_rate='constant', - eta0=eta, alpha=alpha, - fit_intercept=True, - max_iter=1, average=True, shuffle=False) - - # simple linear function without noise - y = np.dot(X, w) - y = np.sign(y) - - clf.fit(X, y) - - average_weights, average_intercept = self.asgd(X, y, eta, alpha) - average_weights = average_weights.reshape(1, -1) - assert_array_almost_equal(clf.coef_, - average_weights, - decimal=14) - assert_almost_equal(clf.intercept_, average_intercept, decimal=14) - - def test_set_intercept_to_intercept(self): - # Checks intercept_ shape consistency for the warm starts - # Inconsistent intercept_ shape. - clf = self.factory().fit(X5, Y5) - self.factory().fit(X5, Y5, intercept_init=clf.intercept_) - clf = self.factory().fit(X, Y) - self.factory().fit(X, Y, intercept_init=clf.intercept_) - - def test_sgd_at_least_two_labels(self): - # Target must have at least two labels - clf = self.factory(alpha=0.01, max_iter=20) - assert_raises(ValueError, clf.fit, X2, np.ones(9)) - - def test_partial_fit_weight_class_balanced(self): - # partial_fit with class_weight='balanced' not supported""" - regex = (r"class_weight 'balanced' is not supported for " - r"partial_fit\. 
In order to use 'balanced' weights, " - r"use compute_class_weight\('balanced', classes, y\). " - r"In place of y you can us a large enough sample " - r"of the full training set target to properly " - r"estimate the class frequency distributions\. " - r"Pass the resulting weights as the class_weight " - r"parameter\.") - assert_raises_regexp(ValueError, - regex, - self.factory(class_weight='balanced').partial_fit, - X, Y, classes=np.unique(Y)) - - def test_sgd_multiclass(self): - # Multi-class test case - clf = self.factory(alpha=0.01, max_iter=20).fit(X2, Y2) - assert_equal(clf.coef_.shape, (3, 2)) - assert_equal(clf.intercept_.shape, (3,)) - assert_equal(clf.decision_function([[0, 0]]).shape, (1, 3)) - pred = clf.predict(T2) - assert_array_equal(pred, true_result2) - - def test_sgd_multiclass_average(self): - eta = .001 - alpha = .01 - # Multi-class average test case - clf = self.factory(loss='squared_loss', - learning_rate='constant', - eta0=eta, alpha=alpha, - fit_intercept=True, - max_iter=1, average=True, shuffle=False) - - np_Y2 = np.array(Y2) - clf.fit(X2, np_Y2) - classes = np.unique(np_Y2) - - for i, cl in enumerate(classes): - y_i = np.ones(np_Y2.shape[0]) - y_i[np_Y2 != cl] = -1 - average_coef, average_intercept = self.asgd(X2, y_i, eta, alpha) - assert_array_almost_equal(average_coef, clf.coef_[i], decimal=16) - assert_almost_equal(average_intercept, - clf.intercept_[i], - decimal=16) - - def test_sgd_multiclass_with_init_coef(self): - # Multi-class test case - clf = self.factory(alpha=0.01, max_iter=20) - clf.fit(X2, Y2, coef_init=np.zeros((3, 2)), - intercept_init=np.zeros(3)) - assert_equal(clf.coef_.shape, (3, 2)) - assert clf.intercept_.shape, (3,) - pred = clf.predict(T2) - assert_array_equal(pred, true_result2) - - def test_sgd_multiclass_njobs(self): - # Multi-class test case with multi-core support - clf = self.factory(alpha=0.01, max_iter=20, n_jobs=2).fit(X2, Y2) - assert_equal(clf.coef_.shape, (3, 2)) - assert_equal(clf.intercept_.shape, 
(3,)) - assert_equal(clf.decision_function([[0, 0]]).shape, (1, 3)) - pred = clf.predict(T2) - assert_array_equal(pred, true_result2) - - def test_set_coef_multiclass(self): - # Checks coef_init and intercept_init shape for multi-class - # problems - # Provided coef_ does not match dataset - clf = self.factory() - assert_raises(ValueError, clf.fit, X2, Y2, coef_init=np.zeros((2, 2))) - - # Provided coef_ does match dataset - clf = self.factory().fit(X2, Y2, coef_init=np.zeros((3, 2))) - - # Provided intercept_ does not match dataset - clf = self.factory() - assert_raises(ValueError, clf.fit, X2, Y2, - intercept_init=np.zeros((1,))) - - # Provided intercept_ does match dataset. - clf = self.factory().fit(X2, Y2, intercept_init=np.zeros((3,))) - - def test_sgd_predict_proba_method_access(self): - # Checks that SGDClassifier predict_proba and predict_log_proba methods - # can either be accessed or raise an appropriate error message - # otherwise. See - # https://github.com/scikit-learn/scikit-learn/issues/10938 for more - # details. - for loss in SGDClassifier.loss_functions: - clf = SGDClassifier(loss=loss) - if loss in ('log', 'modified_huber'): - assert hasattr(clf, 'predict_proba') - assert hasattr(clf, 'predict_log_proba') - else: - message = ("probability estimates are not " - "available for loss={!r}".format(loss)) - assert not hasattr(clf, 'predict_proba') - assert not hasattr(clf, 'predict_log_proba') - with pytest.raises(AttributeError, - match=message): - clf.predict_proba - with pytest.raises(AttributeError, - match=message): - clf.predict_log_proba - - def test_sgd_proba(self): - # Check SGD.predict_proba - - # Hinge loss does not allow for conditional prob estimate. - # We cannot use the factory here, because it defines predict_proba - # anyway. 
- clf = SGDClassifier(loss="hinge", alpha=0.01, - max_iter=10, tol=None).fit(X, Y) - assert not hasattr(clf, "predict_proba") - assert not hasattr(clf, "predict_log_proba") - - # log and modified_huber losses can output probability estimates - # binary case - for loss in ["log", "modified_huber"]: - clf = self.factory(loss=loss, alpha=0.01, max_iter=10) - clf.fit(X, Y) - p = clf.predict_proba([[3, 2]]) - assert p[0, 1] > 0.5 - p = clf.predict_proba([[-1, -1]]) - assert p[0, 1] < 0.5 - - p = clf.predict_log_proba([[3, 2]]) - assert p[0, 1] > p[0, 0] - p = clf.predict_log_proba([[-1, -1]]) - assert p[0, 1] < p[0, 0] - - # log loss multiclass probability estimates - clf = self.factory(loss="log", alpha=0.01, max_iter=10).fit(X2, Y2) - - d = clf.decision_function([[.1, -.1], [.3, .2]]) - p = clf.predict_proba([[.1, -.1], [.3, .2]]) - assert_array_equal(np.argmax(p, axis=1), np.argmax(d, axis=1)) - assert_almost_equal(p[0].sum(), 1) - assert np.all(p[0] >= 0) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_sgd_clf(klass): + # Check that SGD gives any results :-) - p = clf.predict_proba([[-1, -1]]) - d = clf.decision_function([[-1, -1]]) - assert_array_equal(np.argsort(p[0]), np.argsort(d[0])) - - l = clf.predict_log_proba([[3, 2]]) - p = clf.predict_proba([[3, 2]]) - assert_array_almost_equal(np.log(p), l) - - l = clf.predict_log_proba([[-1, -1]]) - p = clf.predict_proba([[-1, -1]]) - assert_array_almost_equal(np.log(p), l) - - # Modified Huber multiclass probability estimates; requires a separate - # test because the hard zero/one probabilities may destroy the - # ordering present in decision_function output. - clf = self.factory(loss="modified_huber", alpha=0.01, max_iter=10) - clf.fit(X2, Y2) - d = clf.decision_function([[3, 2]]) - p = clf.predict_proba([[3, 2]]) - if not isinstance(self, SparseSGDClassifierTestCase): - assert_equal(np.argmax(d, axis=1), np.argmax(p, axis=1)) - else: # XXX the sparse test gets a different X2 (?) 
- assert_equal(np.argmin(d, axis=1), np.argmin(p, axis=1)) - - # the following sample produces decision_function values < -1, - # which would cause naive normalization to fail (see comment - # in SGDClassifier.predict_proba) - x = X.mean(axis=0) - d = clf.decision_function([x]) - if np.all(d < -1): # XXX not true in sparse test case (why?) - p = clf.predict_proba([x]) - assert_array_almost_equal(p[0], [1 / 3.] * 3) - - def test_sgd_l1(self): - # Test L1 regularization - n = len(X4) - rng = np.random.RandomState(13) - idx = np.arange(n) - rng.shuffle(idx) - - X = X4[idx, :] - Y = Y4[idx] - - clf = self.factory(penalty='l1', alpha=.2, fit_intercept=False, - max_iter=2000, tol=None, shuffle=False) - clf.fit(X, Y) - assert_array_equal(clf.coef_[0, 1:-1], np.zeros((4,))) - pred = clf.predict(X) - assert_array_equal(pred, Y) - - # test sparsify with dense inputs - clf.sparsify() - assert sp.issparse(clf.coef_) - pred = clf.predict(X) - assert_array_equal(pred, Y) - - # pickle and unpickle with sparse coef_ - clf = pickle.loads(pickle.dumps(clf)) - assert sp.issparse(clf.coef_) - pred = clf.predict(X) - assert_array_equal(pred, Y) - - def test_class_weights(self): - # Test class weights. - X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], - [1.0, 1.0], [1.0, 0.0]]) - y = [1, 1, 1, -1, -1] - - clf = self.factory(alpha=0.1, max_iter=1000, fit_intercept=False, - class_weight=None) - clf.fit(X, y) - assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1])) - - # we give a small weights to class 1 - clf = self.factory(alpha=0.1, max_iter=1000, fit_intercept=False, - class_weight={1: 0.001}) - clf.fit(X, y) - - # now the hyperplane should rotate clock-wise and - # the prediction on this point should shift - assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1])) - - def test_equal_class_weight(self): - # Test if equal class weights approx. equals no class weights. 
- X = [[1, 0], [1, 0], [0, 1], [0, 1]] - y = [0, 0, 1, 1] - clf = self.factory(alpha=0.1, max_iter=1000, class_weight=None) - clf.fit(X, y) - - X = [[1, 0], [0, 1]] - y = [0, 1] - clf_weighted = self.factory(alpha=0.1, max_iter=1000, - class_weight={0: 0.5, 1: 0.5}) - clf_weighted.fit(X, y) - - # should be similar up to some epsilon due to learning rate schedule - assert_almost_equal(clf.coef_, clf_weighted.coef_, decimal=2) - - def test_wrong_class_weight_label(self): - # ValueError due to not existing class label. - clf = self.factory(alpha=0.1, max_iter=1000, class_weight={0: 0.5}) - assert_raises(ValueError, clf.fit, X, Y) - - def test_wrong_class_weight_format(self): - # ValueError due to wrong class_weight argument type. - clf = self.factory(alpha=0.1, max_iter=1000, class_weight=[0.5]) - assert_raises(ValueError, clf.fit, X, Y) - - def test_weights_multiplied(self): - # Tests that class_weight and sample_weight are multiplicative - class_weights = {1: .6, 2: .3} - rng = np.random.RandomState(0) - sample_weights = rng.random_sample(Y4.shape[0]) - multiplied_together = np.copy(sample_weights) - multiplied_together[Y4 == 1] *= class_weights[1] - multiplied_together[Y4 == 2] *= class_weights[2] - - clf1 = self.factory(alpha=0.1, max_iter=20, class_weight=class_weights) - clf2 = self.factory(alpha=0.1, max_iter=20) - - clf1.fit(X4, Y4, sample_weight=sample_weights) - clf2.fit(X4, Y4, sample_weight=multiplied_together) - - assert_almost_equal(clf1.coef_, clf2.coef_) - - def test_balanced_weight(self): - # Test class weights for imbalanced data""" - # compute reference metrics on iris dataset that is quite balanced by - # default - X, y = iris.data, iris.target - X = scale(X) - idx = np.arange(X.shape[0]) - rng = np.random.RandomState(6) - rng.shuffle(idx) - X = X[idx] - y = y[idx] - clf = self.factory(alpha=0.0001, max_iter=1000, - class_weight=None, shuffle=False).fit(X, y) - f1 = metrics.f1_score(y, clf.predict(X), average='weighted') - assert_almost_equal(f1, 
0.96, decimal=1) - - # make the same prediction using balanced class_weight - clf_balanced = self.factory(alpha=0.0001, max_iter=1000, - class_weight="balanced", - shuffle=False).fit(X, y) - f1 = metrics.f1_score(y, clf_balanced.predict(X), average='weighted') - assert_almost_equal(f1, 0.96, decimal=1) - - # Make sure that in the balanced case it does not change anything - # to use "balanced" - assert_array_almost_equal(clf.coef_, clf_balanced.coef_, 6) - - # build an very very imbalanced dataset out of iris data - X_0 = X[y == 0, :] - y_0 = y[y == 0] - - X_imbalanced = np.vstack([X] + [X_0] * 10) - y_imbalanced = np.concatenate([y] + [y_0] * 10) - - # fit a model on the imbalanced data without class weight info - clf = self.factory(max_iter=1000, class_weight=None, shuffle=False) - clf.fit(X_imbalanced, y_imbalanced) - y_pred = clf.predict(X) - assert_less(metrics.f1_score(y, y_pred, average='weighted'), 0.96) - - # fit a model with balanced class_weight enabled - clf = self.factory(max_iter=1000, class_weight="balanced", - shuffle=False) - clf.fit(X_imbalanced, y_imbalanced) - y_pred = clf.predict(X) - assert_greater(metrics.f1_score(y, y_pred, average='weighted'), 0.96) - - def test_sample_weights(self): - # Test weights on individual samples - X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], - [1.0, 1.0], [1.0, 0.0]]) - y = [1, 1, 1, -1, -1] - - clf = self.factory(alpha=0.1, max_iter=1000, fit_intercept=False) - clf.fit(X, y) - assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1])) - - # we give a small weights to class 1 - clf.fit(X, y, sample_weight=[0.001] * 3 + [1] * 2) - - # now the hyperplane should rotate clock-wise and - # the prediction on this point should shift - assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1])) - - def test_wrong_sample_weights(self): - # Test if ValueError is raised if sample_weight has wrong shape - clf = self.factory(alpha=0.1, max_iter=1000, fit_intercept=False) - # provided sample_weight too long - 
assert_raises(ValueError, clf.fit, X, Y, sample_weight=np.arange(7)) - - def test_partial_fit_exception(self): - clf = self.factory(alpha=0.01) - # classes was not specified - assert_raises(ValueError, clf.partial_fit, X3, Y3) - - def test_partial_fit_binary(self): - third = X.shape[0] // 3 - clf = self.factory(alpha=0.01) - classes = np.unique(Y) - - clf.partial_fit(X[:third], Y[:third], classes=classes) - assert_equal(clf.coef_.shape, (1, X.shape[1])) - assert_equal(clf.intercept_.shape, (1,)) - assert_equal(clf.decision_function([[0, 0]]).shape, (1, )) - id1 = id(clf.coef_.data) - - clf.partial_fit(X[third:], Y[third:]) - id2 = id(clf.coef_.data) - # check that coef_ haven't been re-allocated - assert id1, id2 - - y_pred = clf.predict(T) - assert_array_equal(y_pred, true_result) - - def test_partial_fit_multiclass(self): - third = X2.shape[0] // 3 - clf = self.factory(alpha=0.01) - classes = np.unique(Y2) - - clf.partial_fit(X2[:third], Y2[:third], classes=classes) - assert_equal(clf.coef_.shape, (3, X2.shape[1])) - assert_equal(clf.intercept_.shape, (3,)) - assert_equal(clf.decision_function([[0, 0]]).shape, (1, 3)) - id1 = id(clf.coef_.data) - - clf.partial_fit(X2[third:], Y2[third:]) - id2 = id(clf.coef_.data) - # check that coef_ haven't been re-allocated - assert id1, id2 - - def test_partial_fit_multiclass_average(self): - third = X2.shape[0] // 3 - clf = self.factory(alpha=0.01, average=X2.shape[0]) - classes = np.unique(Y2) - - clf.partial_fit(X2[:third], Y2[:third], classes=classes) - assert_equal(clf.coef_.shape, (3, X2.shape[1])) - assert_equal(clf.intercept_.shape, (3,)) - - clf.partial_fit(X2[third:], Y2[third:]) - assert_equal(clf.coef_.shape, (3, X2.shape[1])) - assert_equal(clf.intercept_.shape, (3,)) - - def test_fit_then_partial_fit(self): - # Partial_fit should work after initial fit in the multiclass case. - # Non-regression test for #2496; fit would previously produce a - # Fortran-ordered coef_ that subsequent partial_fit couldn't handle. 
- clf = self.factory() - clf.fit(X2, Y2) - clf.partial_fit(X2, Y2) # no exception here - - def _test_partial_fit_equal_fit(self, lr): - for X_, Y_, T_ in ((X, Y, T), (X2, Y2, T2)): - clf = self.factory(alpha=0.01, eta0=0.01, max_iter=2, - learning_rate=lr, shuffle=False) - clf.fit(X_, Y_) - y_pred = clf.decision_function(T_) - t = clf.t_ - - classes = np.unique(Y_) - clf = self.factory(alpha=0.01, eta0=0.01, learning_rate=lr, - shuffle=False) - for i in range(2): - clf.partial_fit(X_, Y_, classes=classes) - y_pred2 = clf.decision_function(T_) - - assert_equal(clf.t_, t) - assert_array_almost_equal(y_pred, y_pred2, decimal=2) - - def test_partial_fit_equal_fit_constant(self): - self._test_partial_fit_equal_fit("constant") - - def test_partial_fit_equal_fit_optimal(self): - self._test_partial_fit_equal_fit("optimal") - - def test_partial_fit_equal_fit_invscaling(self): - self._test_partial_fit_equal_fit("invscaling") - - def test_partial_fit_equal_fit_adaptive(self): - self._test_partial_fit_equal_fit("adaptive") - - def test_regression_losses(self): - clf = self.factory(alpha=0.01, learning_rate="constant", - eta0=0.1, loss="epsilon_insensitive") + for loss in ("hinge", "squared_hinge", "log", "modified_huber"): + clf = klass(penalty='l2', alpha=0.01, fit_intercept=True, + loss=loss, max_iter=10, shuffle=True) clf.fit(X, Y) - assert_equal(1.0, np.mean(clf.predict(X) == Y)) + # assert_almost_equal(clf.coef_[0], clf.coef_[1], decimal=7) + assert_array_equal(clf.predict(T), true_result) - clf = self.factory(alpha=0.01, learning_rate="constant", - eta0=0.1, loss="squared_epsilon_insensitive") - clf.fit(X, Y) - assert_equal(1.0, np.mean(clf.predict(X) == Y)) - clf = self.factory(alpha=0.01, loss="huber") - clf.fit(X, Y) - assert_equal(1.0, np.mean(clf.predict(X) == Y)) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_sgd_bad_l1_ratio(klass): + # Check whether expected ValueError on bad l1_ratio + assert_raises(ValueError, klass, 
l1_ratio=1.1) - clf = self.factory(alpha=0.01, learning_rate="constant", eta0=0.01, - loss="squared_loss") - clf.fit(X, Y) - assert_equal(1.0, np.mean(clf.predict(X) == Y)) - def test_warm_start_multiclass(self): - self._test_warm_start(X2, Y2, "optimal") +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_sgd_bad_learning_rate_schedule(klass): + # Check whether expected ValueError on bad learning_rate + assert_raises(ValueError, klass, learning_rate="") - def test_multiple_fit(self): - # Test multiple calls of fit w/ different shaped inputs. - clf = self.factory(alpha=0.01, shuffle=False) - clf.fit(X, Y) - assert hasattr(clf, "coef_") - # Non-regression test: try fitting with a different label set. - y = [["ham", "spam"][i] for i in LabelEncoder().fit_transform(Y)] - clf.fit(X[:, :-1], y) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_sgd_bad_eta0(klass): + # Check whether expected ValueError on bad eta0 + assert_raises(ValueError, klass, eta0=0, + learning_rate="constant") -class SparseSGDClassifierTestCase(DenseSGDClassifierTestCase): - """Run exactly the same tests using the sparse representation variant""" +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) +def test_sgd_bad_alpha(klass): + # Check whether expected ValueError on bad alpha + assert_raises(ValueError, klass, alpha=-.1) - factory_class = SparseSGDClassifier +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) +def test_sgd_bad_penalty(klass): + # Check whether expected ValueError on bad penalty + assert_raises(ValueError, klass, penalty='foobar', + l1_ratio=0.85) -############################################################################### -# Regression Test Case -class DenseSGDRegressorTestCase(unittest.TestCase, CommonTest): - """Test suite for the dense representation variant of SGD""" 
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_sgd_bad_loss(klass): + # Check whether expected ValueError on bad loss + assert_raises(ValueError, klass, loss="foobar") - factory_class = SGDRegressor - def test_sgd(self): - # Check that SGD gives any results. - clf = self.factory(alpha=0.1, max_iter=2, - fit_intercept=False) - clf.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2]) - assert_equal(clf.coef_[0], clf.coef_[1]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_sgd_max_iter_param(klass): + # Test parameter validity check + assert_raises(ValueError, klass, max_iter=-10000) - def test_sgd_bad_penalty(self): - # Check whether expected ValueError on bad penalty - assert_raises(ValueError, self.factory, - penalty='foobar', l1_ratio=0.85) - def test_sgd_bad_loss(self): - # Check whether expected ValueError on bad loss - assert_raises(ValueError, self.factory, loss="foobar") +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_sgd_shuffle_param(klass): + # Test parameter validity check + assert_raises(ValueError, klass, shuffle="false") - def test_sgd_averaged_computed_correctly(self): - # Tests the average regressor matches the naive implementation - eta = .001 - alpha = .01 - n_samples = 20 - n_features = 10 - rng = np.random.RandomState(0) - X = rng.normal(size=(n_samples, n_features)) - w = rng.normal(size=n_features) - - # simple linear function without noise - y = np.dot(X, w) - - clf = self.factory(loss='squared_loss', - learning_rate='constant', - eta0=eta, alpha=alpha, - fit_intercept=True, - max_iter=1, average=True, shuffle=False) - - clf.fit(X, y) - average_weights, average_intercept = self.asgd(X, y, eta, alpha) - - assert_array_almost_equal(clf.coef_, - average_weights, - decimal=16) - assert_almost_equal(clf.intercept_, average_intercept, decimal=16) - - def test_sgd_averaged_partial_fit(self): - # Tests whether the partial fit yields the same average as the 
fit - eta = .001 - alpha = .01 - n_samples = 20 - n_features = 10 - rng = np.random.RandomState(0) - X = rng.normal(size=(n_samples, n_features)) - w = rng.normal(size=n_features) - - # simple linear function without noise - y = np.dot(X, w) - - clf = self.factory(loss='squared_loss', - learning_rate='constant', - eta0=eta, alpha=alpha, - fit_intercept=True, - max_iter=1, average=True, shuffle=False) - - clf.partial_fit(X[:int(n_samples / 2)][:], y[:int(n_samples / 2)]) - clf.partial_fit(X[int(n_samples / 2):][:], y[int(n_samples / 2):]) - average_weights, average_intercept = self.asgd(X, y, eta, alpha) - - assert_array_almost_equal(clf.coef_, - average_weights, - decimal=16) - assert_almost_equal(clf.intercept_[0], average_intercept, decimal=16) - - def test_average_sparse(self): - # Checks the average weights on data with 0s - - eta = .001 - alpha = .01 - clf = self.factory(loss='squared_loss', - learning_rate='constant', - eta0=eta, alpha=alpha, - fit_intercept=True, - max_iter=1, average=True, shuffle=False) - - n_samples = Y3.shape[0] - - clf.partial_fit(X3[:int(n_samples / 2)][:], Y3[:int(n_samples / 2)]) - clf.partial_fit(X3[int(n_samples / 2):][:], Y3[int(n_samples / 2):]) - average_weights, average_intercept = self.asgd(X3, Y3, eta, alpha) - - assert_array_almost_equal(clf.coef_, - average_weights, - decimal=16) - assert_almost_equal(clf.intercept_, average_intercept, decimal=16) - - def test_sgd_least_squares_fit(self): - xmin, xmax = -5, 5 - n_samples = 100 - rng = np.random.RandomState(0) - X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_sgd_early_stopping_param(klass): + # Test parameter validity check + assert_raises(ValueError, klass, early_stopping="false") - # simple linear function without noise - y = 0.5 * X.ravel() - clf = self.factory(loss='squared_loss', alpha=0.1, max_iter=20, - fit_intercept=False) - clf.fit(X, y) - score = clf.score(X, y) - 
assert_greater(score, 0.99) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_sgd_validation_fraction(klass): + # Test parameter validity check + assert_raises(ValueError, klass, validation_fraction=-.1) - # simple linear function with noise - y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() - clf = self.factory(loss='squared_loss', alpha=0.1, max_iter=20, - fit_intercept=False) - clf.fit(X, y) - score = clf.score(X, y) - assert_greater(score, 0.5) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_sgd_n_iter_no_change(klass): + # Test parameter validity check + assert_raises(ValueError, klass, n_iter_no_change=0) - def test_sgd_epsilon_insensitive(self): - xmin, xmax = -5, 5 - n_samples = 100 - rng = np.random.RandomState(0) - X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1) - - # simple linear function without noise - y = 0.5 * X.ravel() - - clf = self.factory(loss='epsilon_insensitive', epsilon=0.01, - alpha=0.1, max_iter=20, - fit_intercept=False) - clf.fit(X, y) - score = clf.score(X, y) - assert score > 0.99 - - # simple linear function with noise - y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() - - clf = self.factory(loss='epsilon_insensitive', epsilon=0.01, - alpha=0.1, max_iter=20, - fit_intercept=False) - clf.fit(X, y) - score = clf.score(X, y) - assert score > 0.5 - - def test_sgd_huber_fit(self): - xmin, xmax = -5, 5 - n_samples = 100 - rng = np.random.RandomState(0) - X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1) - # simple linear function without noise - y = 0.5 * X.ravel() +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_argument_coef(klass): + # Checks coef_init not allowed as model argument (only fit) + # Provided coef_ does not match dataset + assert_raises(TypeError, klass, coef_init=np.zeros((3,))) - clf = self.factory(loss="huber", epsilon=0.1, alpha=0.1, max_iter=20, - fit_intercept=False) - clf.fit(X, y) - 
score = clf.score(X, y) - assert_greater(score, 0.99) - # simple linear function with noise - y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_provide_coef(klass): + # Checks coef_init shape for the warm starts + # Provided coef_ does not match dataset. + assert_raises(ValueError, klass().fit, + X, Y, coef_init=np.zeros((3,))) - clf = self.factory(loss="huber", epsilon=0.1, alpha=0.1, max_iter=20, - fit_intercept=False) - clf.fit(X, y) - score = clf.score(X, y) - assert_greater(score, 0.5) - def test_elasticnet_convergence(self): - # Check that the SGD output is consistent with coordinate descent +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_set_intercept(klass): + # Checks intercept_ shape for the warm starts + # Provided intercept_ does not match dataset. + assert_raises(ValueError, klass().fit, + X, Y, intercept_init=np.zeros((3,))) + - n_samples, n_features = 1000, 5 - rng = np.random.RandomState(0) - X = rng.randn(n_samples, n_features) - # ground_truth linear model that generate y from X and to which the - # models should converge if the regularizer would be set to 0.0 - ground_truth_coef = rng.randn(n_features) - y = np.dot(X, ground_truth_coef) - - # XXX: alpha = 0.1 seems to cause convergence problems - for alpha in [0.01, 0.001]: - for l1_ratio in [0.5, 0.8, 1.0]: - cd = linear_model.ElasticNet(alpha=alpha, l1_ratio=l1_ratio, - fit_intercept=False) - cd.fit(X, y) - sgd = self.factory(penalty='elasticnet', max_iter=50, - alpha=alpha, l1_ratio=l1_ratio, - fit_intercept=False) - sgd.fit(X, y) - err_msg = ("cd and sgd did not converge to comparable " - "results for alpha=%f and l1_ratio=%f" - % (alpha, l1_ratio)) - assert_almost_equal(cd.coef_, sgd.coef_, decimal=2, - err_msg=err_msg) - - @ignore_warnings - def test_partial_fit(self): - third = X.shape[0] // 3 - clf = self.factory(alpha=0.01) - - clf.partial_fit(X[:third], Y[:third]) - 
assert_equal(clf.coef_.shape, (X.shape[1], )) - assert_equal(clf.intercept_.shape, (1,)) - assert_equal(clf.predict([[0, 0]]).shape, (1, )) - id1 = id(clf.coef_.data) - - clf.partial_fit(X[third:], Y[third:]) - id2 = id(clf.coef_.data) - # check that coef_ haven't been re-allocated - assert id1, id2 - - def _test_partial_fit_equal_fit(self, lr): - clf = self.factory(alpha=0.01, max_iter=2, eta0=0.01, - learning_rate=lr, shuffle=False) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_sgd_early_stopping_with_partial_fit(klass): + # Test parameter validity check + assert_raises(ValueError, + klass(early_stopping=True).partial_fit, X, Y) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_set_intercept_binary(klass): + # Checks intercept_ shape for the warm starts in binary case + klass().fit(X5, Y5, intercept_init=0) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_average_binary_computed_correctly(klass): + # Checks the SGDClassifier correctly computes the average weights + eta = .1 + alpha = 2. + n_samples = 20 + n_features = 10 + rng = np.random.RandomState(0) + X = rng.normal(size=(n_samples, n_features)) + w = rng.normal(size=n_features) + + clf = klass(loss='squared_loss', + learning_rate='constant', + eta0=eta, alpha=alpha, + fit_intercept=True, + max_iter=1, average=True, shuffle=False) + + # simple linear function without noise + y = np.dot(X, w) + y = np.sign(y) + + clf.fit(X, y) + + average_weights, average_intercept = asgd(klass, X, y, eta, alpha) + average_weights = average_weights.reshape(1, -1) + assert_array_almost_equal(clf.coef_, + average_weights, + decimal=14) + assert_almost_equal(clf.intercept_, average_intercept, decimal=14) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_set_intercept_to_intercept(klass): + # Checks intercept_ shape consistency for the warm starts + # Inconsistent intercept_ shape. 
+ clf = klass().fit(X5, Y5) + klass().fit(X5, Y5, intercept_init=clf.intercept_) + clf = klass().fit(X, Y) + klass().fit(X, Y, intercept_init=clf.intercept_) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_sgd_at_least_two_labels(klass): + # Target must have at least two labels + clf = klass(alpha=0.01, max_iter=20) + assert_raises(ValueError, clf.fit, X2, np.ones(9)) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_partial_fit_weight_class_balanced(klass): + # partial_fit with class_weight='balanced' not supported""" + regex = (r"class_weight 'balanced' is not supported for " + r"partial_fit\. In order to use 'balanced' weights, " + r"use compute_class_weight\('balanced', classes, y\). " + r"In place of y you can us a large enough sample " + r"of the full training set target to properly " + r"estimate the class frequency distributions\. " + r"Pass the resulting weights as the class_weight " + r"parameter\.") + assert_raises_regexp(ValueError, + regex, + klass(class_weight='balanced').partial_fit, + X, Y, classes=np.unique(Y)) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_sgd_multiclass(klass): + # Multi-class test case + clf = klass(alpha=0.01, max_iter=20).fit(X2, Y2) + assert_equal(clf.coef_.shape, (3, 2)) + assert_equal(clf.intercept_.shape, (3,)) + assert_equal(clf.decision_function([[0, 0]]).shape, (1, 3)) + pred = clf.predict(T2) + assert_array_equal(pred, true_result2) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_sgd_multiclass_average(klass): + eta = .001 + alpha = .01 + # Multi-class average test case + clf = klass(loss='squared_loss', + learning_rate='constant', + eta0=eta, alpha=alpha, + fit_intercept=True, + max_iter=1, average=True, shuffle=False) + + np_Y2 = np.array(Y2) + clf.fit(X2, np_Y2) + classes = np.unique(np_Y2) + + for i, cl in enumerate(classes): + y_i = np.ones(np_Y2.shape[0]) + 
y_i[np_Y2 != cl] = -1 + average_coef, average_intercept = asgd(klass, X2, y_i, eta, alpha) + assert_array_almost_equal(average_coef, clf.coef_[i], decimal=16) + assert_almost_equal(average_intercept, + clf.intercept_[i], + decimal=16) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_sgd_multiclass_with_init_coef(klass): + # Multi-class test case + clf = klass(alpha=0.01, max_iter=20) + clf.fit(X2, Y2, coef_init=np.zeros((3, 2)), + intercept_init=np.zeros(3)) + assert_equal(clf.coef_.shape, (3, 2)) + assert clf.intercept_.shape, (3,) + pred = clf.predict(T2) + assert_array_equal(pred, true_result2) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_sgd_multiclass_njobs(klass): + # Multi-class test case with multi-core support + clf = klass(alpha=0.01, max_iter=20, n_jobs=2).fit(X2, Y2) + assert_equal(clf.coef_.shape, (3, 2)) + assert_equal(clf.intercept_.shape, (3,)) + assert_equal(clf.decision_function([[0, 0]]).shape, (1, 3)) + pred = clf.predict(T2) + assert_array_equal(pred, true_result2) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_set_coef_multiclass(klass): + # Checks coef_init and intercept_init shape for multi-class + # problems + # Provided coef_ does not match dataset + clf = klass() + assert_raises(ValueError, clf.fit, X2, Y2, coef_init=np.zeros((2, 2))) + + # Provided coef_ does match dataset + clf = klass().fit(X2, Y2, coef_init=np.zeros((3, 2))) + + # Provided intercept_ does not match dataset + clf = klass() + assert_raises(ValueError, clf.fit, X2, Y2, + intercept_init=np.zeros((1,))) + + # Provided intercept_ does match dataset. 
+ clf = klass().fit(X2, Y2, intercept_init=np.zeros((3,))) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_sgd_predict_proba_method_access(klass): + # Checks that SGDClassifier predict_proba and predict_log_proba methods + # can either be accessed or raise an appropriate error message + # otherwise. See + # https://github.com/scikit-learn/scikit-learn/issues/10938 for more + # details. + for loss in linear_model.SGDClassifier.loss_functions: + clf = SGDClassifier(loss=loss) + if loss in ('log', 'modified_huber'): + assert hasattr(clf, 'predict_proba') + assert hasattr(clf, 'predict_log_proba') + else: + message = ("probability estimates are not " + "available for loss={!r}".format(loss)) + assert not hasattr(clf, 'predict_proba') + assert not hasattr(clf, 'predict_log_proba') + with pytest.raises(AttributeError, + match=message): + clf.predict_proba + with pytest.raises(AttributeError, + match=message): + clf.predict_log_proba + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_sgd_proba(klass): + # Check SGD.predict_proba + + # Hinge loss does not allow for conditional prob estimate. + # We cannot use the factory here, because it defines predict_proba + # anyway. 
+ clf = SGDClassifier(loss="hinge", alpha=0.01, + max_iter=10, tol=None).fit(X, Y) + assert not hasattr(clf, "predict_proba") + assert not hasattr(clf, "predict_log_proba") + + # log and modified_huber losses can output probability estimates + # binary case + for loss in ["log", "modified_huber"]: + clf = klass(loss=loss, alpha=0.01, max_iter=10) clf.fit(X, Y) - y_pred = clf.predict(T) + p = clf.predict_proba([[3, 2]]) + assert p[0, 1] > 0.5 + p = clf.predict_proba([[-1, -1]]) + assert p[0, 1] < 0.5 + + p = clf.predict_log_proba([[3, 2]]) + assert p[0, 1] > p[0, 0] + p = clf.predict_log_proba([[-1, -1]]) + assert p[0, 1] < p[0, 0] + + # log loss multiclass probability estimates + clf = klass(loss="log", alpha=0.01, max_iter=10).fit(X2, Y2) + + d = clf.decision_function([[.1, -.1], [.3, .2]]) + p = clf.predict_proba([[.1, -.1], [.3, .2]]) + assert_array_equal(np.argmax(p, axis=1), np.argmax(d, axis=1)) + assert_almost_equal(p[0].sum(), 1) + assert np.all(p[0] >= 0) + + p = clf.predict_proba([[-1, -1]]) + d = clf.decision_function([[-1, -1]]) + assert_array_equal(np.argsort(p[0]), np.argsort(d[0])) + + l = clf.predict_log_proba([[3, 2]]) + p = clf.predict_proba([[3, 2]]) + assert_array_almost_equal(np.log(p), l) + + l = clf.predict_log_proba([[-1, -1]]) + p = clf.predict_proba([[-1, -1]]) + assert_array_almost_equal(np.log(p), l) + + # Modified Huber multiclass probability estimates; requires a separate + # test because the hard zero/one probabilities may destroy the + # ordering present in decision_function output. + clf = klass(loss="modified_huber", alpha=0.01, max_iter=10) + clf.fit(X2, Y2) + d = clf.decision_function([[3, 2]]) + p = clf.predict_proba([[3, 2]]) + if klass != SparseSGDClassifier: + assert_equal(np.argmax(d, axis=1), np.argmax(p, axis=1)) + else: # XXX the sparse test gets a different X2 (?) 
+ assert_equal(np.argmin(d, axis=1), np.argmin(p, axis=1)) + + # the following sample produces decision_function values < -1, + # which would cause naive normalization to fail (see comment + # in SGDClassifier.predict_proba) + x = X.mean(axis=0) + d = clf.decision_function([x]) + if np.all(d < -1): # XXX not true in sparse test case (why?) + p = clf.predict_proba([x]) + assert_array_almost_equal(p[0], [1 / 3.] * 3) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_sgd_l1(klass): + # Test L1 regularization + n = len(X4) + rng = np.random.RandomState(13) + idx = np.arange(n) + rng.shuffle(idx) + + X = X4[idx, :] + Y = Y4[idx] + + clf = klass(penalty='l1', alpha=.2, fit_intercept=False, + max_iter=2000, tol=None, shuffle=False) + clf.fit(X, Y) + assert_array_equal(clf.coef_[0, 1:-1], np.zeros((4,))) + pred = clf.predict(X) + assert_array_equal(pred, Y) + + # test sparsify with dense inputs + clf.sparsify() + assert sp.issparse(clf.coef_) + pred = clf.predict(X) + assert_array_equal(pred, Y) + + # pickle and unpickle with sparse coef_ + clf = pickle.loads(pickle.dumps(clf)) + assert sp.issparse(clf.coef_) + pred = clf.predict(X) + assert_array_equal(pred, Y) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_class_weights(klass): + # Test class weights. 
+ X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], + [1.0, 1.0], [1.0, 0.0]]) + y = [1, 1, 1, -1, -1] + + clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False, + class_weight=None) + clf.fit(X, y) + assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1])) + + # we give a small weights to class 1 + clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False, + class_weight={1: 0.001}) + clf.fit(X, y) + + # now the hyperplane should rotate clock-wise and + # the prediction on this point should shift + assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1])) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_equal_class_weight(klass): + # Test if equal class weights approx. equals no class weights. + X = [[1, 0], [1, 0], [0, 1], [0, 1]] + y = [0, 0, 1, 1] + clf = klass(alpha=0.1, max_iter=1000, class_weight=None) + clf.fit(X, y) + + X = [[1, 0], [0, 1]] + y = [0, 1] + clf_weighted = klass(alpha=0.1, max_iter=1000, + class_weight={0: 0.5, 1: 0.5}) + clf_weighted.fit(X, y) + + # should be similar up to some epsilon due to learning rate schedule + assert_almost_equal(clf.coef_, clf_weighted.coef_, decimal=2) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_wrong_class_weight_label(klass): + # ValueError due to not existing class label. + clf = klass(alpha=0.1, max_iter=1000, class_weight={0: 0.5}) + assert_raises(ValueError, clf.fit, X, Y) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_wrong_class_weight_format(klass): + # ValueError due to wrong class_weight argument type. 
+ clf = klass(alpha=0.1, max_iter=1000, class_weight=[0.5]) + assert_raises(ValueError, clf.fit, X, Y) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_weights_multiplied(klass): + # Tests that class_weight and sample_weight are multiplicative + class_weights = {1: .6, 2: .3} + rng = np.random.RandomState(0) + sample_weights = rng.random_sample(Y4.shape[0]) + multiplied_together = np.copy(sample_weights) + multiplied_together[Y4 == 1] *= class_weights[1] + multiplied_together[Y4 == 2] *= class_weights[2] + + clf1 = klass(alpha=0.1, max_iter=20, class_weight=class_weights) + clf2 = klass(alpha=0.1, max_iter=20) + + clf1.fit(X4, Y4, sample_weight=sample_weights) + clf2.fit(X4, Y4, sample_weight=multiplied_together) + + assert_almost_equal(clf1.coef_, clf2.coef_) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_balanced_weight(klass): + # Test class weights for imbalanced data""" + # compute reference metrics on iris dataset that is quite balanced by + # default + X, y = iris.data, iris.target + X = scale(X) + idx = np.arange(X.shape[0]) + rng = np.random.RandomState(6) + rng.shuffle(idx) + X = X[idx] + y = y[idx] + clf = klass(alpha=0.0001, max_iter=1000, + class_weight=None, shuffle=False).fit(X, y) + f1 = metrics.f1_score(y, clf.predict(X), average='weighted') + assert_almost_equal(f1, 0.96, decimal=1) + + # make the same prediction using balanced class_weight + clf_balanced = klass(alpha=0.0001, max_iter=1000, + class_weight="balanced", + shuffle=False).fit(X, y) + f1 = metrics.f1_score(y, clf_balanced.predict(X), average='weighted') + assert_almost_equal(f1, 0.96, decimal=1) + + # Make sure that in the balanced case it does not change anything + # to use "balanced" + assert_array_almost_equal(clf.coef_, clf_balanced.coef_, 6) + + # build an very very imbalanced dataset out of iris data + X_0 = X[y == 0, :] + y_0 = y[y == 0] + + X_imbalanced = np.vstack([X] + [X_0] * 10) + y_imbalanced = 
np.concatenate([y] + [y_0] * 10) + + # fit a model on the imbalanced data without class weight info + clf = klass(max_iter=1000, class_weight=None, shuffle=False) + clf.fit(X_imbalanced, y_imbalanced) + y_pred = clf.predict(X) + assert_less(metrics.f1_score(y, y_pred, average='weighted'), 0.96) + + # fit a model with balanced class_weight enabled + clf = klass(max_iter=1000, class_weight="balanced", + shuffle=False) + clf.fit(X_imbalanced, y_imbalanced) + y_pred = clf.predict(X) + assert_greater(metrics.f1_score(y, y_pred, average='weighted'), 0.96) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_sample_weights(klass): + # Test weights on individual samples + X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], + [1.0, 1.0], [1.0, 0.0]]) + y = [1, 1, 1, -1, -1] + + clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False) + clf.fit(X, y) + assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1])) + + # we give a small weights to class 1 + clf.fit(X, y, sample_weight=[0.001] * 3 + [1] * 2) + + # now the hyperplane should rotate clock-wise and + # the prediction on this point should shift + assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1])) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_wrong_sample_weights(klass): + # Test if ValueError is raised if sample_weight has wrong shape + clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False) + # provided sample_weight too long + assert_raises(ValueError, clf.fit, X, Y, sample_weight=np.arange(7)) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_partial_fit_exception(klass): + clf = klass(alpha=0.01) + # classes was not specified + assert_raises(ValueError, clf.partial_fit, X3, Y3) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_partial_fit_binary(klass): + third = X.shape[0] // 3 + clf = klass(alpha=0.01) + classes = np.unique(Y) + + 
clf.partial_fit(X[:third], Y[:third], classes=classes) + assert_equal(clf.coef_.shape, (1, X.shape[1])) + assert_equal(clf.intercept_.shape, (1,)) + assert_equal(clf.decision_function([[0, 0]]).shape, (1, )) + id1 = id(clf.coef_.data) + + clf.partial_fit(X[third:], Y[third:]) + id2 = id(clf.coef_.data) + # check that coef_ haven't been re-allocated + assert id1, id2 + + y_pred = clf.predict(T) + assert_array_equal(y_pred, true_result) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_partial_fit_multiclass(klass): + third = X2.shape[0] // 3 + clf = klass(alpha=0.01) + classes = np.unique(Y2) + + clf.partial_fit(X2[:third], Y2[:third], classes=classes) + assert_equal(clf.coef_.shape, (3, X2.shape[1])) + assert_equal(clf.intercept_.shape, (3,)) + assert_equal(clf.decision_function([[0, 0]]).shape, (1, 3)) + id1 = id(clf.coef_.data) + + clf.partial_fit(X2[third:], Y2[third:]) + id2 = id(clf.coef_.data) + # check that coef_ haven't been re-allocated + assert id1, id2 + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_partial_fit_multiclass_average(klass): + third = X2.shape[0] // 3 + clf = klass(alpha=0.01, average=X2.shape[0]) + classes = np.unique(Y2) + + clf.partial_fit(X2[:third], Y2[:third], classes=classes) + assert_equal(clf.coef_.shape, (3, X2.shape[1])) + assert_equal(clf.intercept_.shape, (3,)) + + clf.partial_fit(X2[third:], Y2[third:]) + assert_equal(clf.coef_.shape, (3, X2.shape[1])) + assert_equal(clf.intercept_.shape, (3,)) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_fit_then_partial_fit(klass): + # Partial_fit should work after initial fit in the multiclass case. + # Non-regression test for #2496; fit would previously produce a + # Fortran-ordered coef_ that subsequent partial_fit couldn't handle. 
+ clf = klass() + clf.fit(X2, Y2) + clf.partial_fit(X2, Y2) # no exception here + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('lr', + ["constant", "optimal", "invscaling", "adaptive"]) +def test_partial_fit_equal_fit_classif(klass, lr): + for X_, Y_, T_ in ((X, Y, T), (X2, Y2, T2)): + clf = klass(alpha=0.01, eta0=0.01, max_iter=2, + learning_rate=lr, shuffle=False) + clf.fit(X_, Y_) + y_pred = clf.decision_function(T_) t = clf.t_ - clf = self.factory(alpha=0.01, eta0=0.01, - learning_rate=lr, shuffle=False) + classes = np.unique(Y_) + clf = klass(alpha=0.01, eta0=0.01, learning_rate=lr, + shuffle=False) for i in range(2): - clf.partial_fit(X, Y) - y_pred2 = clf.predict(T) + clf.partial_fit(X_, Y_, classes=classes) + y_pred2 = clf.decision_function(T_) assert_equal(clf.t_, t) assert_array_almost_equal(y_pred, y_pred2, decimal=2) - def test_partial_fit_equal_fit_constant(self): - self._test_partial_fit_equal_fit("constant") - def test_partial_fit_equal_fit_optimal(self): - self._test_partial_fit_equal_fit("optimal") +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_regression_losses(klass): + clf = klass(alpha=0.01, learning_rate="constant", + eta0=0.1, loss="epsilon_insensitive") + clf.fit(X, Y) + assert_equal(1.0, np.mean(clf.predict(X) == Y)) + + clf = klass(alpha=0.01, learning_rate="constant", + eta0=0.1, loss="squared_epsilon_insensitive") + clf.fit(X, Y) + assert_equal(1.0, np.mean(clf.predict(X) == Y)) + + clf = klass(alpha=0.01, loss="huber") + clf.fit(X, Y) + assert_equal(1.0, np.mean(clf.predict(X) == Y)) + + clf = klass(alpha=0.01, learning_rate="constant", eta0=0.01, + loss="squared_loss") + clf.fit(X, Y) + assert_equal(1.0, np.mean(clf.predict(X) == Y)) - def test_partial_fit_equal_fit_invscaling(self): - self._test_partial_fit_equal_fit("invscaling") - def test_partial_fit_equal_fit_adaptive(self): - self._test_partial_fit_equal_fit("adaptive") 
+@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_warm_start_multiclass(klass): + _test_warm_start(klass, X2, Y2, "optimal") - def test_loss_function_epsilon(self): - clf = self.factory(epsilon=0.9) - clf.set_params(epsilon=0.1) - assert clf.loss_functions['huber'][1] == 0.1 +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +def test_multiple_fit(klass): + # Test multiple calls of fit w/ different shaped inputs. + clf = klass(alpha=0.01, shuffle=False) + clf.fit(X, Y) + assert hasattr(clf, "coef_") -class SparseSGDRegressorTestCase(DenseSGDRegressorTestCase): - # Run exactly the same tests using the sparse representation variant + # Non-regression test: try fitting with a different label set. + y = [["ham", "spam"][i] for i in LabelEncoder().fit_transform(Y)] + clf.fit(X[:, :-1], y) + + +############################################################################### +# Regression Test Case - factory_class = SparseSGDRegressor +@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) +def test_sgd_reg(klass): + # Check that SGD gives any results. 
+ clf = klass(alpha=0.1, max_iter=2, fit_intercept=False) + clf.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2]) + assert_equal(clf.coef_[0], clf.coef_[1]) + + +@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) +def test_sgd_averaged_computed_correctly(klass): + # Tests the average regressor matches the naive implementation + + eta = .001 + alpha = .01 + n_samples = 20 + n_features = 10 + rng = np.random.RandomState(0) + X = rng.normal(size=(n_samples, n_features)) + w = rng.normal(size=n_features) + + # simple linear function without noise + y = np.dot(X, w) + + clf = klass(loss='squared_loss', + learning_rate='constant', + eta0=eta, alpha=alpha, + fit_intercept=True, + max_iter=1, average=True, shuffle=False) + + clf.fit(X, y) + average_weights, average_intercept = asgd(klass, X, y, eta, alpha) + + assert_array_almost_equal(clf.coef_, + average_weights, + decimal=16) + assert_almost_equal(clf.intercept_, average_intercept, decimal=16) + + +@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) +def test_sgd_averaged_partial_fit(klass): + # Tests whether the partial fit yields the same average as the fit + eta = .001 + alpha = .01 + n_samples = 20 + n_features = 10 + rng = np.random.RandomState(0) + X = rng.normal(size=(n_samples, n_features)) + w = rng.normal(size=n_features) + + # simple linear function without noise + y = np.dot(X, w) + + clf = klass(loss='squared_loss', + learning_rate='constant', + eta0=eta, alpha=alpha, + fit_intercept=True, + max_iter=1, average=True, shuffle=False) + + clf.partial_fit(X[:int(n_samples / 2)][:], y[:int(n_samples / 2)]) + clf.partial_fit(X[int(n_samples / 2):][:], y[int(n_samples / 2):]) + average_weights, average_intercept = asgd(klass, X, y, eta, alpha) + + assert_array_almost_equal(clf.coef_, + average_weights, + decimal=16) + assert_almost_equal(clf.intercept_[0], average_intercept, decimal=16) + + +@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) +def 
test_average_sparse(klass): + # Checks the average weights on data with 0s + + eta = .001 + alpha = .01 + clf = klass(loss='squared_loss', + learning_rate='constant', + eta0=eta, alpha=alpha, + fit_intercept=True, + max_iter=1, average=True, shuffle=False) + + n_samples = Y3.shape[0] + + clf.partial_fit(X3[:int(n_samples / 2)][:], Y3[:int(n_samples / 2)]) + clf.partial_fit(X3[int(n_samples / 2):][:], Y3[int(n_samples / 2):]) + average_weights, average_intercept = asgd(klass, X3, Y3, eta, alpha) + + assert_array_almost_equal(clf.coef_, + average_weights, + decimal=16) + assert_almost_equal(clf.intercept_, average_intercept, decimal=16) + + +@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) +def test_sgd_least_squares_fit(klass): + xmin, xmax = -5, 5 + n_samples = 100 + rng = np.random.RandomState(0) + X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1) + + # simple linear function without noise + y = 0.5 * X.ravel() + + clf = klass(loss='squared_loss', alpha=0.1, max_iter=20, + fit_intercept=False) + clf.fit(X, y) + score = clf.score(X, y) + assert_greater(score, 0.99) + + # simple linear function with noise + y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() + + clf = klass(loss='squared_loss', alpha=0.1, max_iter=20, + fit_intercept=False) + clf.fit(X, y) + score = clf.score(X, y) + assert_greater(score, 0.5) + + +@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) +def test_sgd_epsilon_insensitive(klass): + xmin, xmax = -5, 5 + n_samples = 100 + rng = np.random.RandomState(0) + X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1) + + # simple linear function without noise + y = 0.5 * X.ravel() + + clf = klass(loss='epsilon_insensitive', epsilon=0.01, + alpha=0.1, max_iter=20, + fit_intercept=False) + clf.fit(X, y) + score = clf.score(X, y) + assert score > 0.99 + + # simple linear function with noise + y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() + + clf = klass(loss='epsilon_insensitive', 
epsilon=0.01, + alpha=0.1, max_iter=20, + fit_intercept=False) + clf.fit(X, y) + score = clf.score(X, y) + assert score > 0.5 + + +@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) +def test_sgd_huber_fit(klass): + xmin, xmax = -5, 5 + n_samples = 100 + rng = np.random.RandomState(0) + X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1) + + # simple linear function without noise + y = 0.5 * X.ravel() + + clf = klass(loss="huber", epsilon=0.1, alpha=0.1, max_iter=20, + fit_intercept=False) + clf.fit(X, y) + score = clf.score(X, y) + assert_greater(score, 0.99) + + # simple linear function with noise + y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() + + clf = klass(loss="huber", epsilon=0.1, alpha=0.1, max_iter=20, + fit_intercept=False) + clf.fit(X, y) + score = clf.score(X, y) + assert_greater(score, 0.5) + + +@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) +def test_elasticnet_convergence(klass): + # Check that the SGD output is consistent with coordinate descent + + n_samples, n_features = 1000, 5 + rng = np.random.RandomState(0) + X = rng.randn(n_samples, n_features) + # ground-truth linear model that generates y from X and to which the + # models should converge if the regularizer were set to 0.0 + ground_truth_coef = rng.randn(n_features) + y = np.dot(X, ground_truth_coef) + + # XXX: alpha = 0.1 seems to cause convergence problems + for alpha in [0.01, 0.001]: + for l1_ratio in [0.5, 0.8, 1.0]: + cd = linear_model.ElasticNet(alpha=alpha, l1_ratio=l1_ratio, + fit_intercept=False) + cd.fit(X, y) + sgd = klass(penalty='elasticnet', max_iter=50, + alpha=alpha, l1_ratio=l1_ratio, + fit_intercept=False) + sgd.fit(X, y) + err_msg = ("cd and sgd did not converge to comparable " + "results for alpha=%f and l1_ratio=%f" + % (alpha, l1_ratio)) + assert_almost_equal(cd.coef_, sgd.coef_, decimal=2, + err_msg=err_msg) + + +@ignore_warnings +@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) +def
test_partial_fit(klass): + third = X.shape[0] // 3 + clf = klass(alpha=0.01) + + clf.partial_fit(X[:third], Y[:third]) + assert_equal(clf.coef_.shape, (X.shape[1], )) + assert_equal(clf.intercept_.shape, (1,)) + assert_equal(clf.predict([[0, 0]]).shape, (1, )) + id1 = clf.coef_.__array_interface__['data'][0] + + clf.partial_fit(X[third:], Y[third:]) + id2 = clf.coef_.__array_interface__['data'][0] + # same data pointer => coef_ memory has not been re-allocated + assert id1 == id2 + + +@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize('lr', + ["constant", "optimal", "invscaling", "adaptive"]) +def _test_partial_fit_equal_fit(klass, lr): + clf = klass(alpha=0.01, max_iter=2, eta0=0.01, + learning_rate=lr, shuffle=False) + clf.fit(X, Y) + y_pred = clf.predict(T) + t = clf.t_ + + clf = klass(alpha=0.01, eta0=0.01, + learning_rate=lr, shuffle=False) + for i in range(2): + clf.partial_fit(X, Y) + y_pred2 = clf.predict(T) + + assert_equal(clf.t_, t) + assert_array_almost_equal(y_pred, y_pred2, decimal=2) + + +@pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) +def test_loss_function_epsilon(klass): + clf = klass(epsilon=0.9) + clf.set_params(epsilon=0.1) + assert clf.loss_functions['huber'][1] == 0.1 def test_l1_ratio(): @@ -1352,27 +1458,27 @@ def init(max_iter=None, tol=None, n_iter=None, for_partial_fit=False): @ignore_warnings(category=(DeprecationWarning, FutureWarning)) def test_tol_and_max_iter_default_values(): # Test that the default values are correctly changed - est = SGDClassifier() + est = linear_model.SGDClassifier() est._validate_params() assert_equal(est._tol, None) assert_equal(est._max_iter, 5) - est = SGDClassifier(n_iter=42) + est = linear_model.SGDClassifier(n_iter=42) est._validate_params() assert_equal(est._tol, None) assert_equal(est._max_iter, 42) - est = SGDClassifier(tol=1e-2) + est = linear_model.SGDClassifier(tol=1e-2) est._validate_params() assert_equal(est._tol, 1e-2) assert_equal(est._max_iter, 1000) - est = SGDClassifier(max_iter=42) +
est = linear_model.SGDClassifier(max_iter=42) est._validate_params() assert_equal(est._tol, None) assert_equal(est._max_iter, 42) - est = SGDClassifier(max_iter=42, tol=1e-2) + est = linear_model.SGDClassifier(max_iter=42, tol=1e-2) est._validate_params() assert_equal(est._tol, 1e-2) assert_equal(est._max_iter, 42) @@ -1521,10 +1627,10 @@ def test_multi_core_gridsearch_and_early_stopping(): @pytest.mark.skipif( - not hasattr(sp, "random"), - reason="this test uses scipy.random, that was introduced in version " - "0.17. This skip condition can be dropped as soon as we drop support " - "for scipy versions older than 0.17") + not hasattr(sp, "random"), + reason="this test uses scipy.random, that was introduced in version " + "0.17. This skip condition can be dropped as soon as we drop " + "support for scipy versions older than 0.17") @pytest.mark.parametrize("backend", ["loky", "multiprocessing", "threading"]) def test_SGDClassifier_fit_for_all_backends(backend): From 25e5ba1170b42ac8493c22175c3bafdd078df43f Mon Sep 17 00:00:00 2001 From: Alexandre Gramfort Date: Wed, 27 Feb 2019 10:15:21 +0100 Subject: [PATCH 2/5] review --- sklearn/linear_model/tests/test_sgd.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index 236d3a9bea426..5b8787b9012ca 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -445,7 +445,8 @@ def test_sgd_bad_penalty(klass): l1_ratio=0.85) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) def test_sgd_bad_loss(klass): # Check whether expected ValueError on bad loss assert_raises(ValueError, klass, loss="foobar") @@ -1293,7 +1294,7 @@ def test_partial_fit(klass): @pytest.mark.parametrize('klass', [SGDRegressor, SparseSGDRegressor]) @pytest.mark.parametrize('lr', 
["constant", "optimal", "invscaling", "adaptive"]) -def _test_partial_fit_equal_fit(klass, lr): +def test_partial_fit_equal_fit(klass, lr): clf = klass(alpha=0.01, max_iter=2, eta0=0.01, learning_rate=lr, shuffle=False) clf.fit(X, Y) From 3dbe40d0bd0c08fa78a875404a9ff0a93e48185d Mon Sep 17 00:00:00 2001 From: Alexandre Gramfort Date: Wed, 27 Feb 2019 10:19:15 +0100 Subject: [PATCH 3/5] move together tests that are common between reg and classifiers --- sklearn/linear_model/tests/test_sgd.py | 45 +++++++++++++------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index 5b8787b9012ca..06c696d83550f 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -138,7 +138,7 @@ def SparseSGDRegressor(**kwargs): ############################################################################### -# Tests common to classification and regression +# Common Test Case to classification and regression # a simple implementation of ASGD to use for testing # uses squared loss to find the gradient @@ -176,6 +176,27 @@ def asgd(klass, X, y, eta, alpha, weight_init=None, intercept_init=0.0): return average_weights, average_intercept +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) +def test_sgd_bad_alpha(klass): + # Check whether expected ValueError on bad alpha + assert_raises(ValueError, klass, alpha=-.1) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) +def test_sgd_bad_penalty(klass): + # Check whether expected ValueError on bad penalty + assert_raises(ValueError, klass, penalty='foobar', + l1_ratio=0.85) + + +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDRegressor, SparseSGDRegressor]) +def test_sgd_bad_loss(klass): + # Check whether expected ValueError on bad loss + assert_raises(ValueError, 
klass, loss="foobar") + def _test_warm_start(klass, X, Y, lr): # Test that explicit warm restart... clf = klass(alpha=0.01, eta0=0.01, shuffle=False, @@ -430,28 +451,6 @@ def test_sgd_bad_eta0(klass): learning_rate="constant") -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, - SGDRegressor, SparseSGDRegressor]) -def test_sgd_bad_alpha(klass): - # Check whether expected ValueError on bad alpha - assert_raises(ValueError, klass, alpha=-.1) - - -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, - SGDRegressor, SparseSGDRegressor]) -def test_sgd_bad_penalty(klass): - # Check whether expected ValueError on bad penalty - assert_raises(ValueError, klass, penalty='foobar', - l1_ratio=0.85) - - -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, - SGDRegressor, SparseSGDRegressor]) -def test_sgd_bad_loss(klass): - # Check whether expected ValueError on bad loss - assert_raises(ValueError, klass, loss="foobar") - - @pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_sgd_max_iter_param(klass): # Test parameter validity check From 33a34554ebf15e904f3261ddef7e2a5baea993db Mon Sep 17 00:00:00 2001 From: Alexandre Gramfort Date: Wed, 27 Feb 2019 10:19:26 +0100 Subject: [PATCH 4/5] pep8 --- sklearn/linear_model/tests/test_sgd.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index 06c696d83550f..4824c06b7dd7f 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -197,6 +197,7 @@ def test_sgd_bad_loss(klass): # Check whether expected ValueError on bad loss assert_raises(ValueError, klass, loss="foobar") + def _test_warm_start(klass, X, Y, lr): # Test that explicit warm restart... 
clf = klass(alpha=0.01, eta0=0.01, shuffle=False, From dac3cbbbac119afd79829591901931a49ba71a72 Mon Sep 17 00:00:00 2001 From: Alexandre Gramfort Date: Wed, 27 Feb 2019 15:48:43 +0100 Subject: [PATCH 5/5] lint --- sklearn/linear_model/tests/test_sgd.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index 4824c06b7dd7f..405f7003798ff 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -727,13 +727,13 @@ def test_sgd_proba(klass): d = clf.decision_function([[-1, -1]]) assert_array_equal(np.argsort(p[0]), np.argsort(d[0])) - l = clf.predict_log_proba([[3, 2]]) + lp = clf.predict_log_proba([[3, 2]]) p = clf.predict_proba([[3, 2]]) - assert_array_almost_equal(np.log(p), l) + assert_array_almost_equal(np.log(p), lp) - l = clf.predict_log_proba([[-1, -1]]) + lp = clf.predict_log_proba([[-1, -1]]) p = clf.predict_proba([[-1, -1]]) - assert_array_almost_equal(np.log(p), l) + assert_array_almost_equal(np.log(p), lp) # Modified Huber multiclass probability estimates; requires a separate # test because the hard zero/one probabilities may destroy the