From 94dc0a68b90bac47e783e4257109d2e0ce19f289 Mon Sep 17 00:00:00 2001 From: imaculate Date: Fri, 24 Jun 2016 19:28:37 +0200 Subject: [PATCH 1/6] Fresh branch for linearsvr_fit_sample_weight with weights and documentation --- sklearn/svm/classes.py | 14 ++++++++++-- sklearn/svm/tests/test_svm.py | 40 +++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/sklearn/svm/classes.py b/sklearn/svm/classes.py index 0dd969aa215ed..394ff782404fa 100644 --- a/sklearn/svm/classes.py +++ b/sklearn/svm/classes.py @@ -165,7 +165,7 @@ def __init__(self, penalty='l2', loss='squared_hinge', dual=True, tol=1e-4, self.penalty = penalty self.loss = loss - def fit(self, X, y): + def fit(self, X, y, sample_weight=None): """Fit the model according to the given training data. Parameters @@ -177,6 +177,11 @@ def fit(self, X, y): y : array-like, shape = [n_samples] Target vector relative to X + sample_weight : array-like, shape = [n_samples], optional + Array of weights that are assigned to individual + samples. If not provided, + then each sample is given unit weight. + Returns ------- self : object @@ -210,7 +215,7 @@ def fit(self, X, y): X, y, self.C, self.fit_intercept, self.intercept_scaling, self.class_weight, self.penalty, self.dual, self.verbose, self.max_iter, self.tol, self.random_state, self.multi_class, - self.loss) + self.loss, sample_weight=sample_weight) if self.multi_class == "crammer_singer" and len(self.classes_) == 2: self.coef_ = (self.coef_[1] - self.coef_[0]).reshape(1, -1) @@ -341,6 +346,11 @@ def fit(self, X, y, sample_weight=None): y : array-like, shape = [n_samples] Target vector relative to X + sample_weight : array-like, shape = [n_samples], optional + Array of weights that are assigned to individual + samples. If not provided, + then each sample is given unit weight. + Returns ------- self : object diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py index 6c25c6d9da10e..afe0a71a34875 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -655,6 +655,46 @@ def test_linearsvc_crammer_singer(): assert_array_almost_equal(dec_func, cs_clf.decision_function(iris.data)) +def test_linearsvc_fit_sampleweight(): + # check correct result when sample_weight is 1 + # check that SVR(kernel='linear') and LinearSVC() give + # comparable results + + # Test basic routines using LinearSVC + n_samples = len(X) + unit_weight = np.ones(n_samples) + clf = svm.LinearSVC(random_state=0).fit(X, Y) + clf_unitweight = svm.LinearSVC(random_state=0).fit(X, Y, + sample_weight=unit_weight) + + # sanity check, by default should have intercept + assert_true(clf_unitweight.fit_intercept) + assert_array_almost_equal(clf_unitweight.intercept_, [0], decimal=3) + + # check if same as sample_weight=None + assert_array_equal(clf_unitweight.predict(T), clf.predict(T)) + assert_allclose(np.linalg.norm(clf.coef_), + np.linalg.norm(clf_unitweight.coef_), 1, 0.0001) + + # check that fit(X) = fit([X1, X2, X3],sample_weight = [n1, n2, n3]) where + # X = X1 repeated n1 times, X2 repeated n2 times and so forth + + random_state = check_random_state(0) + random_weight = random_state.randint(0, 10, n_samples) + lsvc_unflat = svm.LinearSVC(random_state=0).fit(X, Y, + sample_weight=random_weight) + pred1 = lsvc_unflat.predict(T) + + X_flat = np.repeat(X, random_weight, axis=0) + y_flat = np.repeat(Y, random_weight, axis=0) + lsvc_flat = svm.LinearSVC(random_state=0).fit(X_flat, y_flat) + pred2 = lsvc_flat.predict(T) + + assert_array_equal(pred1, pred2) + assert_allclose(np.linalg.norm(lsvc_unflat.coef_), + np.linalg.norm(lsvc_flat.coef_), 1, 0.0001) + + def test_crammer_singer_binary(): # Test Crammer-Singer formulation in the binary case X, y = make_classification(n_classes=2, random_state=0) From e3aeb37d2f7bc6d0e822e21a79835864d2da754a Mon Sep 17 00:00:00 2001 From: imaculate Date: Tue, 28 Jun 2016 13:06:06 +0200 Subject: [PATCH 2/6] Fixed pep8 violations, changed CI tests to allow linearsvr with sample_weight --- sklearn/svm/tests/test_svm.py | 16 ++++------------ sklearn/tests/test_calibration.py | 6 ------ 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py index afe0a71a34875..5612aaa994bbc 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -657,19 +657,11 @@ def test_linearsvc_crammer_singer(): def test_linearsvc_fit_sampleweight(): # check correct result when sample_weight is 1 - # check that SVR(kernel='linear') and LinearSVC() give - # comparable results - - # Test basic routines using LinearSVC n_samples = len(X) unit_weight = np.ones(n_samples) clf = svm.LinearSVC(random_state=0).fit(X, Y) - clf_unitweight = svm.LinearSVC(random_state=0).fit(X, Y, - sample_weight=unit_weight) - - # sanity check, by default should have intercept - assert_true(clf_unitweight.fit_intercept) - assert_array_almost_equal(clf_unitweight.intercept_, [0], decimal=3) + clf_unitweight = svm.LinearSVC(random_state=0).\ + fit(X, Y, sample_weight=unit_weight) # check if same as sample_weight=None assert_array_equal(clf_unitweight.predict(T), clf.predict(T)) @@ -681,8 +673,8 @@ def test_linearsvc_fit_sampleweight(): random_state = check_random_state(0) random_weight = random_state.randint(0, 10, n_samples) - lsvc_unflat = svm.LinearSVC(random_state=0).fit(X, Y, - sample_weight=random_weight) + lsvc_unflat = svm.LinearSVC(random_state=0).\ + fit(X, Y, sample_weight=random_weight) pred1 = lsvc_unflat.predict(T) X_flat = np.repeat(X, random_weight, axis=0) diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py index 296d28a4ba94e..a2bb47984c8b4 100644 --- a/sklearn/tests/test_calibration.py +++ b/sklearn/tests/test_calibration.py @@ -119,12 +119,6 @@ def test_sample_weight_warning(): for method in ['sigmoid', 'isotonic']: base_estimator = LinearSVC(random_state=42) calibrated_clf = CalibratedClassifierCV(base_estimator, method=method) - # LinearSVC does not currently support sample weights but they - # can still be used for the calibration step (with a warning) - msg = "LinearSVC does not support sample_weight." - assert_warns_message( - UserWarning, msg, - calibrated_clf.fit, X_train, y_train, sample_weight=sw_train) probs_with_sw = calibrated_clf.predict_proba(X_test) # As the weights are used for the calibration, they should still yield From 976412730b545158df7f4f0560ca271b8cbee1b8 Mon Sep 17 00:00:00 2001 From: imaculate Date: Tue, 28 Jun 2016 13:23:19 +0200 Subject: [PATCH 3/6] Changed test_calibration --- sklearn/tests/test_calibration.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py index a2bb47984c8b4..763faf4df37f6 100644 --- a/sklearn/tests/test_calibration.py +++ b/sklearn/tests/test_calibration.py @@ -119,6 +119,7 @@ def test_sample_weight_warning(): for method in ['sigmoid', 'isotonic']: base_estimator = LinearSVC(random_state=42) calibrated_clf = CalibratedClassifierCV(base_estimator, method=method) + calibrated_clf.fit(X_train, y_train, sample_weight=sw_train) probs_with_sw = calibrated_clf.predict_proba(X_test) # As the weights are used for the calibration, they should still yield From 98c14b9edbf8779be5bd6ccc0fe7593bb9f0fb41 Mon Sep 17 00:00:00 2001 From: imaculate Date: Tue, 28 Jun 2016 17:37:39 +0200 Subject: [PATCH 4/6] Corrected indentation for docstrings --- sklearn/svm/classes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/svm/classes.py b/sklearn/svm/classes.py index 394ff782404fa..f1a7923979b98 100644 --- a/sklearn/svm/classes.py +++ b/sklearn/svm/classes.py @@ -347,9 +347,9 @@ def fit(self, X, y, sample_weight=None): Target vector relative to X sample_weight : array-like, shape = [n_samples], optional - Array of weights that are assigned to individual - samples. If not provided, - then each sample is given unit weight. + Array of weights that are assigned to individual + samples. If not provided, + then each sample is given unit weight. Returns ------- From ae27e3cc5d507e650b3467b643253447c825800e Mon Sep 17 00:00:00 2001 From: imaculate Date: Fri, 1 Jul 2016 00:06:16 +0200 Subject: [PATCH 5/6] Fixed docstring, remove normalization of coefficients in tests --- sklearn/svm/classes.py | 6 +++--- sklearn/svm/tests/test_svm.py | 6 ++---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/sklearn/svm/classes.py b/sklearn/svm/classes.py index f1a7923979b98..1a309693d9d1c 100644 --- a/sklearn/svm/classes.py +++ b/sklearn/svm/classes.py @@ -178,9 +178,9 @@ def fit(self, X, y, sample_weight=None): Target vector relative to X sample_weight : array-like, shape = [n_samples], optional - Array of weights that are assigned to individual - samples. If not provided, - then each sample is given unit weight. + Array of weights that are assigned to individual + samples. If not provided, + then each sample is given unit weight. Returns ------- diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py index 5612aaa994bbc..73ae3728662f3 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -665,8 +665,7 @@ def test_linearsvc_fit_sampleweight(): # check if same as sample_weight=None assert_array_equal(clf_unitweight.predict(T), clf.predict(T)) - assert_allclose(np.linalg.norm(clf.coef_), - np.linalg.norm(clf_unitweight.coef_), 1, 0.0001) + assert_allclose(clf.coef_, clf_unitweight.coef_, 1, 0.0001) # check that fit(X) = fit([X1, X2, X3],sample_weight = [n1, n2, n3]) where # X = X1 repeated n1 times, X2 repeated n2 times and so forth @@ -683,8 +682,7 @@ def test_linearsvc_fit_sampleweight(): pred2 = lsvc_flat.predict(T) assert_array_equal(pred1, pred2) - assert_allclose(np.linalg.norm(lsvc_unflat.coef_), - np.linalg.norm(lsvc_flat.coef_), 1, 0.0001) + assert_allclose(lsvc_unflat.coef_, lsvc_flat.coef_, 1, 0.0001) def test_crammer_singer_binary(): From f4ac81bebefe580d5bf316ca7da6d1061e35e3ef Mon Sep 17 00:00:00 2001 From: imaculate Date: Mon, 11 Jul 2016 13:35:19 +0200 Subject: [PATCH 6/6] Renamed function (test_sample_weight_calibration) --- sklearn/tests/test_calibration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py index 763faf4df37f6..61cb51c67365d 100644 --- a/sklearn/tests/test_calibration.py +++ b/sklearn/tests/test_calibration.py @@ -106,7 +106,7 @@ def test_calibration(): assert_raises(RuntimeError, clf_base_regressor.fit, X_train, y_train) -def test_sample_weight_warning(): +def test_sample_weight(): n_samples = 100 X, y = make_classification(n_samples=2 * n_samples, n_features=6, random_state=42)