From d0c32a51d07116e435bc6cabad3630fef2964da2 Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Mon, 24 Dec 2012 16:44:27 +0100
Subject: [PATCH 01/13] TST start on testing consistent class weights

---
 sklearn/tests/test_common.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index 6a1de892ed2bf..973c12a7323af 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -53,6 +53,8 @@
 from sklearn.random_projection import (GaussianRandomProjection,
                                        SparseRandomProjection)
 
+from sklearn.cross_validation import train_test_split
+
 dont_test = [SparseCoder, EllipticEnvelope, EllipticEnvelop, DictVectorizer,
              LabelBinarizer, LabelEncoder, TfidfTransformer,
              IsotonicRegression, OneHotEncoder, RandomTreesEmbedding,
@@ -645,3 +647,32 @@ def test_configure():
     finally:
         sys.argv = old_argv
         os.chdir(cwd)
+
+
+def test_class_weight_classifiers():
+    # test that class_weight works and that the semantics are consistent
+    classifiers = all_estimators(type_filter='classifier')
+
+    with warnings.catch_warnings(record=True):
+        classifiers = [c for c in classifiers
+                       if 'class_weight' in c[1]().get_params().keys()]
+
+    # first balanced classification
+    for n_centers in [2, 3]:
+        X, y = make_blobs(centers=n_centers, random_state=0, cluster_std=0.1)
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
+                                                            random_state=0)
+        for name, Clf in classifiers:
+            if n_centers == 2:
+                class_weight = {0: 1000, 1: 0.0001}
+            else:
+                class_weight = {0: 1000, 1: 0.0001, 2: 0.0001}
+
+            with warnings.catch_warnings(record=True):
+                clf = Clf(class_weight=class_weight)
+            set_random_state(clf)
+            clf.fit(X_train, y_train)
+            y_pred = clf.predict(X_test)
+            #assert_array_equal(y_pred, 0)
+            if (y_pred != 0).any():
+                print name, y_pred
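Note: the test above encodes the contract that every estimator exposing a
class_weight parameter is expected to honor: a much larger weight on one class
should pull predictions toward that class. A minimal standalone sketch of that
contract; the estimator choice (LinearSVC) and the exact weight values are
illustrative, not prescribed by the patch:

    import numpy as np

    from sklearn.datasets import make_blobs
    from sklearn.svm import LinearSVC

    X, y = make_blobs(centers=2, random_state=0, cluster_std=20)
    # weight class 0 so heavily that misclassifying it dominates the loss
    clf = LinearSVC(class_weight={0: 1000, 1: 0.0001}, random_state=0)
    clf.fit(X, y)
    # with consistent semantics, (almost) every prediction is class 0
    print(np.mean(clf.predict(X) == 0))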
From f25412fd5bb0cd28e4a4fd300d0fbf34148969bf Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Mon, 24 Dec 2012 17:17:45 +0100
Subject: [PATCH 02/13] ENH nu-SVC doesn't support class_weights

---
 sklearn/svm/classes.py        |  7 -------
 sklearn/svm/sparse/classes.py | 15 +++++++++------
 2 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/sklearn/svm/classes.py b/sklearn/svm/classes.py
index f3b6eb02a827e..af9ecbb50e0a8 100644
--- a/sklearn/svm/classes.py
+++ b/sklearn/svm/classes.py
@@ -311,13 +311,6 @@ class NuSVC(BaseSVC):
     cache_size : float, optional
         Specify the size of the kernel cache (in MB)
 
-    class_weight : {dict, 'auto'}, optional
-        Set the parameter C of class i to class_weight[i]*C for
-        SVC. If not given, all classes are supposed to have
-        weight one. The 'auto' mode uses the values of y to
-        automatically adjust weights inversely proportional to
-        class frequencies.
-
     verbose : bool, default: False
         Enable verbose output. Note that this setting takes advantage of a
         per-process runtime setting in libsvm that, if enabled, may not work
diff --git a/sklearn/svm/sparse/classes.py b/sklearn/svm/sparse/classes.py
index ca2b75c29c84c..35f931b2e944c 100644
--- a/sklearn/svm/sparse/classes.py
+++ b/sklearn/svm/sparse/classes.py
@@ -1,3 +1,5 @@
+import warnings
+
 from .. import LinearSVC as GeneralLinearSVC
 from ..base import BaseSVC
 from ...base import RegressorMixin
@@ -71,14 +73,15 @@ class NuSVC(SparseBaseLibSVM, BaseSVC):
     [1]
     """
 
-    def __init__(self, nu=0.5, kernel='rbf', degree=3, gamma=0.0,
-                 coef0=0.0, shrinking=True, probability=False,
-                 tol=1e-3, cache_size=200, class_weight=None,
-                 verbose=False, max_iter=-1):
-
+    def __init__(self, nu=0.5, kernel='rbf', degree=3, gamma=0.0, coef0=0.0,
+                 shrinking=True, probability=False, tol=1e-3, cache_size=200,
+                 class_weight=None, verbose=False, max_iter=-1):
+        if class_weight is not None:
+            warnings.warn("Parameter class_weight is not supported in NuSVC "
+                          "and will be ignored.", stacklevel=2)
         super(NuSVC, self).__init__(
             'nu_svc', kernel, degree, gamma, coef0, tol, 0., nu, 0., shrinking,
-            probability, cache_size, class_weight, verbose, max_iter)
+            probability, cache_size, None, verbose, max_iter)
 
 
 @deprecated("""to be removed in v0.14;

From 0fbf223d7e3dfd43d5eac954f06b989436da8e7c Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Mon, 24 Dec 2012 19:19:07 +0100
Subject: [PATCH 03/13] FIX liblinear class weight in binary case, robust testing.

---
 sklearn/svm/src/liblinear/linear.cpp |  2 +-
 sklearn/tests/test_common.py         | 14 +++++++++++---
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/sklearn/svm/src/liblinear/linear.cpp b/sklearn/svm/src/liblinear/linear.cpp
index ed7429e8678f7..5ef458334f59f 100644
--- a/sklearn/svm/src/liblinear/linear.cpp
+++ b/sklearn/svm/src/liblinear/linear.cpp
@@ -2410,7 +2410,7 @@ model* train(const problem *prob, const parameter *param)
 			for(; k<sub_prob.l; k++)
 				sub_prob.y[k] = -1;
 
-			train_one(&sub_prob, param, &model_->w[0], weighted_C[0], weighted_C[1]);
+			train_one(&sub_prob, param, &model_->w[0], weighted_C[1], weighted_C[0]);
 		}
 		else
 		{
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index 973c12a7323af..bb37e66535edd 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -659,10 +659,14 @@ def test_class_weight_classifiers():
 
     # first balanced classification
     for n_centers in [2, 3]:
-        X, y = make_blobs(centers=n_centers, random_state=0, cluster_std=0.1)
+        # create a very noisy dataset
+        X, y = make_blobs(centers=n_centers, random_state=0, cluster_std=20)
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
                                                             random_state=0)
         for name, Clf in classifiers:
+            if name == "NuSVC":
+                # the sparse version has a parameter that doesn't do anything
+                continue
             if n_centers == 2:
                 class_weight = {0: 1000, 1: 0.0001}
             else:
@@ -670,9 +674,13 @@ def test_class_weight_classifiers():
 
             with warnings.catch_warnings(record=True):
                 clf = Clf(class_weight=class_weight)
+            if hasattr(clf, "n_iter"):
+                clf.set_params(n_iter=100)
+
             set_random_state(clf)
             clf.fit(X_train, y_train)
             y_pred = clf.predict(X_test)
-            #assert_array_equal(y_pred, 0)
-            if (y_pred != 0).any():
+            try:
+                assert_greater(np.mean(y_pred == 0), 0.9)
+            except:
                 print name, y_pred
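Note: the one-character liblinear change above matters because, in the binary
shortcut path, liblinear trains a single weight vector with one class mapped
to +1 while scikit-learn's label order is the reverse (see the header comment
added in patch 05), so the two per-class C penalties must be passed swapped.
A quick sanity check, in the spirit of the fixed test, that binary LinearSVC
honors class_weight in the right direction; dataset parameters and weights
are illustrative:

    import numpy as np

    from sklearn.datasets import make_classification
    from sklearn.svm import LinearSVC

    X, y = make_classification(n_samples=200, n_features=10,
                               weights=[0.833, 0.167], random_state=2)
    up = LinearSVC(class_weight={0: .1, 1: 10}, random_state=0).fit(X, y)
    down = LinearSVC(class_weight={0: 10, 1: .1}, random_state=0).fit(X, y)
    # upweighting class 1 must yield more class-1 predictions than
    # downweighting it; before the fix the direction was inverted
    assert np.sum(up.predict(X) == 1) > np.sum(down.predict(X) == 1)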
From 06f4077e24feaf86c1d4276badfdbc975f9640e5 Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Mon, 24 Dec 2012 19:19:37 +0100
Subject: [PATCH 04/13] cosmit whitespace

---
 sklearn/svm/liblinear.pyx | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/svm/liblinear.pyx b/sklearn/svm/liblinear.pyx
index 5278c3486b4ff..c7bd3eb31e092 100644
--- a/sklearn/svm/liblinear.pyx
+++ b/sklearn/svm/liblinear.pyx
@@ -11,7 +11,7 @@ cimport liblinear
 
 def train_wrap(np.ndarray[np.float64_t, ndim=2, mode='c'] X,
                np.ndarray[np.float64_t, ndim=1, mode='c'] Y,
-               int solver_type, double eps, double bias, double C, 
+               int solver_type, double eps, double bias, double C,
                np.ndarray[np.float64_t, ndim=1] class_weight,
                unsigned random_seed):
     """
@@ -35,12 +35,12 @@ def train_wrap(np.ndarray[np.float64_t, ndim=2, mode='c'] X,
         free_problem(problem)
         free_parameter(param)
         raise ValueError(error_msg)
-    
+
     # early return
     model = train(problem, param)
 
     # coef matrix holder created as fortran since that's what's used in liblinear
-    cdef np.ndarray[np.float64_t, ndim=2, mode='fortran'] w 
+    cdef np.ndarray[np.float64_t, ndim=2, mode='fortran'] w
     cdef int nr_class = get_nr_class(model)
     cdef int nr_feature = get_nr_feature(model)
     if bias > 0: nr_feature = nr_feature + 1
@@ -49,7 +49,7 @@ def train_wrap(np.ndarray[np.float64_t, ndim=2, mode='c'] X,
         copy_w(w.data, model, nr_feature)
     else:
         len_w = (nr_class) * nr_feature
-        w = np.empty((nr_class, nr_feature),order='F') 
+        w = np.empty((nr_class, nr_feature),order='F')
         copy_w(w.data, model, len_w)
 
     ### FREE

From ca9dcfd053a6ad2e801fd60cf1a380e0b3d8b330 Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Mon, 24 Dec 2012 19:23:36 +0100
Subject: [PATCH 05/13] DOC add comment in liblinear

---
 sklearn/svm/src/liblinear/linear.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sklearn/svm/src/liblinear/linear.cpp b/sklearn/svm/src/liblinear/linear.cpp
index 5ef458334f59f..55e85d3c28b17 100644
--- a/sklearn/svm/src/liblinear/linear.cpp
+++ b/sklearn/svm/src/liblinear/linear.cpp
@@ -7,6 +7,8 @@
 
    - Changes roles of +1 and -1 to match scikit API, Andreas Mueller
      See issue 546: https://github.com/scikit-learn/scikit-learn/pull/546
+   - Also changed roles for pairwise class weights, Andreas Mueller
+     See issue 1491: https://github.com/scikit-learn/scikit-learn/pull/1491
 
 */
 

From bad361358c15273b1609661d56d2257cd5f81a10 Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Tue, 25 Dec 2012 14:43:55 +0100
Subject: [PATCH 06/13] TST better test for class weights (that actually tests something)

---
 sklearn/svm/tests/test_svm.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py
index 3026701e1f756..c75de6469d8cf 100644
--- a/sklearn/svm/tests/test_svm.py
+++ b/sklearn/svm/tests/test_svm.py
@@ -13,6 +13,7 @@
 
 from sklearn import svm, linear_model, datasets, metrics, base
 from sklearn.datasets.samples_generator import make_classification
+from sklearn.metrics import f1_score
 from sklearn.utils import check_random_state
 from sklearn.utils import ConvergenceWarning
 from sklearn.utils.testing import assert_greater, assert_less
@@ -305,15 +306,15 @@ def test_weight():
     # so all predicted values belong to class 2
     assert_array_almost_equal(clf.predict(X), [2] * 6)
 
-    X_, y_ = make_classification(n_samples=200, n_features=100,
-                                 weights=[0.833, 0.167], random_state=0)
+    X_, y_ = make_classification(n_samples=200, n_features=10,
+                                 weights=[0.833, 0.167], random_state=2)
 
     for clf in (linear_model.LogisticRegression(),
                 svm.LinearSVC(random_state=0), svm.SVC()):
-        clf.set_params(class_weight={0: 5})
-        clf.fit(X_[: 180], y_[: 180])
-        y_pred = clf.predict(X_[180:])
-        assert_true(np.sum(y_pred == y_[180:]) >= 11)
+        clf.set_params(class_weight={0: .1, 1: 10})
+        clf.fit(X_[:100], y_[:100])
+        y_pred = clf.predict(X_[100:])
+        assert_true(f1_score(y_[100:], y_pred) > .3)
 
 
 def test_sample_weights():
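Note: the next patch tests class_weight='auto', which (per the docstring
quoted in patch 02) adjusts weights inversely proportional to class
frequencies. A rough sketch of that heuristic; the n_samples / (n_classes *
count) normalization shown here matches the later 'balanced' rule, and the
exact scaling used by the 0.13-era 'auto' mode may differ:

    import numpy as np

    def inverse_frequency_weights(y):
        # rare classes get proportionally larger weights
        classes, counts = np.unique(y, return_counts=True)
        weights = len(y) / (len(classes) * counts.astype(float))
        return dict(zip(classes, weights))

    print(inverse_frequency_weights(np.array([0, 0, 0, 0, 1, 1])))
    # {0: 0.75, 1: 1.5}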
From 7dc838e8d1f91466165a7f97fbf3eeb5b0c4b704 Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Tue, 25 Dec 2012 15:05:23 +0100
Subject: [PATCH 07/13] ENH test automatic setting of class weights in common test

---
 sklearn/tests/test_common.py | 41 +++++++++++++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index bb37e66535edd..eb1f14d2d7244 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -29,7 +29,8 @@
     TransformerMixin, ClusterMixin)
 from sklearn.utils import shuffle
 from sklearn.preprocessing import StandardScaler, Scaler
-from sklearn.datasets import load_iris, load_boston, make_blobs
+from sklearn.datasets import (load_iris, load_boston, make_blobs,
+                              make_classification)
 from sklearn.metrics import zero_one_score, adjusted_rand_score
 from sklearn.lda import LDA
 from sklearn.svm.base import BaseLibSVM
@@ -52,6 +53,7 @@
 from sklearn.isotonic import IsotonicRegression
 from sklearn.random_projection import (GaussianRandomProjection,
                                        SparseRandomProjection)
+from sklearn.metrics import f1_score
 
 from sklearn.cross_validation import train_test_split
 
@@ -684,3 +686,40 @@ def test_class_weight_classifiers():
                 assert_greater(np.mean(y_pred == 0), 0.9)
             except:
                 print name, y_pred
+
+
+def test_class_weight_auto_classifies():
+    # test that class_weight="auto" improves f1-score
+    classifiers = all_estimators(type_filter='classifier')
+
+    with warnings.catch_warnings(record=True):
+        classifiers = [c for c in classifiers
+                       if 'class_weight' in c[1]().get_params().keys()]
+
+    for n_classes in [2, 3]:
+        # create unbalanced dataset
+        X, y = make_classification(n_classes=n_classes, n_samples=200,
+                                   n_features=10, weights=[0.7, 0.3],
+                                   random_state=0)
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
+                                                            random_state=0)
+        for name, Clf in classifiers:
+            if name == "NuSVC":
+                # the sparse version has a parameter that doesn't do anything
+                continue
+
+            with warnings.catch_warnings(record=True):
+                clf = Clf()
+            if hasattr(clf, "n_iter"):
+                clf.set_params(n_iter=100)
+
+            set_random_state(clf)
+            clf.fit(X_train, y_train)
+            y_pred = clf.predict(X_test)
+
+            clf.set_params(class_weight='auto')
+            clf.fit(X_train, y_train)
+            y_pred_auto = clf.predict(X_test)
+            assert_greater(f1_score(y_test, y_pred_auto),
+                           f1_score(y_test, y_pred))
+            print(f1_score(y_test, y_pred_auto), f1_score(y_test, y_pred))

From c75decd577756fb30339dad8eeb6752682e12ad2 Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Wed, 26 Dec 2012 13:07:17 +0100
Subject: [PATCH 08/13] TST skip RidgeClassifier in class weight test for the moment

---
 sklearn/tests/test_common.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index eb1f14d2d7244..8e7aac1d0a5b3 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -669,6 +669,10 @@ def test_class_weight_classifiers():
             if name == "NuSVC":
                 # the sparse version has a parameter that doesn't do anything
                 continue
+            if name.startswith("RidgeClassifier"):
+                # RidgeClassifier behaves unexpected
+                # FIXME!
+                continue
             if n_centers == 2:
                 class_weight = {0: 1000, 1: 0.0001}
             else:
@@ -682,10 +686,7 @@ def test_class_weight_classifiers():
             set_random_state(clf)
             clf.fit(X_train, y_train)
             y_pred = clf.predict(X_test)
-            try:
-                assert_greater(np.mean(y_pred == 0), 0.9)
-            except:
-                print name, y_pred
+            assert_greater(np.mean(y_pred == 0), 0.9)
 
 
 def test_class_weight_auto_classifies():
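Note: both common tests rely on the same discovery idiom: enumerate all
classifiers and keep those whose constructor exposes class_weight, then skip
problem cases by name (NuSVC, RidgeClassifier above). A standalone sketch of
that idiom; in the 0.13-era tree all_estimators is a helper inside
test_common.py itself, while this sketch assumes the current import location
in sklearn.utils:

    import warnings

    from sklearn.utils import all_estimators

    classifiers = []
    for name, Est in all_estimators(type_filter='classifier'):
        try:
            with warnings.catch_warnings(record=True):
                params = Est().get_params()
        except TypeError:
            continue  # skip estimators that require constructor arguments
        if 'class_weight' in params:
            classifiers.append(name)
    print(classifiers)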
From 7edaaca4066a4adbe78c94df753aba4010570b3e Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Wed, 26 Dec 2012 17:15:49 +0100
Subject: [PATCH 09/13] DOC added fix to whatsnew.

---
 doc/whats_new.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index f80947d0a8f1d..4581393cd732a 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -112,6 +112,9 @@ Changelog
      :fun:`metrics.mean_absolute_error` and :fun:`metrics.r2_score`
      metrics support multioutput by `Arnaud Joly`_.
 
+   - Fixed `class_weight` support in :class:`svm.LinearSVC` and
+     :class:`linear_model.LogisticRegression` by `Andreas Müller`_.
+
 API changes summary
 -------------------
    - Renamed all occurences of ``n_atoms`` to ``n_components`` for consistency.

From 2d5034ded6b8eded98e913e4876cfe97168e4b69 Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Wed, 26 Dec 2012 17:52:31 +0100
Subject: [PATCH 10/13] FIX don't test auto in ridge classifier as it is not supported currently

---
 sklearn/tests/test_common.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index 8e7aac1d0a5b3..d3fd27f2f45ff 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -709,6 +709,11 @@ def test_class_weight_auto_classifies():
                 # the sparse version has a parameter that doesn't do anything
                 continue
 
+            if name.startswith("RidgeClassifier"):
+                # RidgeClassifier behaves unexpected
+                # FIXME!
+                continue
+
             with warnings.catch_warnings(record=True):
                 clf = Clf()
             if hasattr(clf, "n_iter"):

From 2d0ef35389df4be9782ca62d19ce8db1d2cd5cc0 Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Wed, 26 Dec 2012 18:28:31 +0100
Subject: [PATCH 11/13] FIX tests for auto class weights

---
 sklearn/tests/test_common.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index d3fd27f2f45ff..6acc46cba330a 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -659,7 +659,6 @@ def test_class_weight_classifiers():
         classifiers = [c for c in classifiers
                        if 'class_weight' in c[1]().get_params().keys()]
 
-    # first balanced classification
     for n_centers in [2, 3]:
         # create a very noisy dataset
         X, y = make_blobs(centers=n_centers, random_state=0, cluster_std=20)
@@ -697,11 +696,12 @@ def test_class_weight_auto_classifies():
         classifiers = [c for c in classifiers
                        if 'class_weight' in c[1]().get_params().keys()]
 
-    for n_classes in [2, 3]:
+    for n_classes, weights in zip([2, 3], [[.8, .2], [.8, .1, .1]]):
         # create unbalanced dataset
         X, y = make_classification(n_classes=n_classes, n_samples=200,
-                                   n_features=10, weights=[0.7, 0.3],
-                                   random_state=0)
+                                   n_features=10, weights=weights,
+                                   random_state=0, n_informative=n_classes)
+        X = StandardScaler().fit_transform(X)
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
                                                             random_state=0)
         for name, Clf in classifiers:
@@ -728,4 +728,3 @@ def test_class_weight_auto_classifies():
             y_pred_auto = clf.predict(X_test)
             assert_greater(f1_score(y_test, y_pred_auto),
                            f1_score(y_test, y_pred))
-            print(f1_score(y_test, y_pred_auto), f1_score(y_test, y_pred))
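Note: the weights argument used above fixes the class proportions of the
generated dataset, which is what makes class_weight='auto' worth testing in
the first place; n_informative must grow with n_classes so the classes stay
separable. A sketch, reusing the patch's own parameters, that simply verifies
the resulting class proportions:

    import numpy as np

    from sklearn.datasets import make_classification

    for n_classes, weights in zip([2, 3], [[.8, .2], [.8, .1, .1]]):
        X, y = make_classification(n_classes=n_classes, n_samples=200,
                                   n_features=10, weights=weights,
                                   random_state=0, n_informative=n_classes)
        # proportions come out close to the requested weights
        print(n_classes, np.bincount(y) / float(len(y)))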
From 6902e07f3211c42f49fd95e626163ce85fe90826 Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Sun, 30 Dec 2012 16:49:49 +0100
Subject: [PATCH 12/13] DOC more concrete whatsnew

---
 doc/whats_new.rst            | 6 ++++--
 sklearn/tests/test_common.py | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 4581393cd732a..4976fdf9b60fd 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -112,8 +112,10 @@ Changelog
      :fun:`metrics.mean_absolute_error` and :fun:`metrics.r2_score`
      metrics support multioutput by `Arnaud Joly`_.
 
-   - Fixed `class_weight` support in :class:`svm.LinearSVC` and
-     :class:`linear_model.LogisticRegression` by `Andreas Müller`_.
+   - Fixed ``class_weight`` support in :class:`svm.LinearSVC` and
+     :class:`linear_model.LogisticRegression` by `Andreas Müller`_. The meaning
+     of ``class_weight`` was reversed: in earlier releases, a higher weight
+     erroneously meant fewer positives of a given class.
 
 API changes summary
 -------------------
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index 6acc46cba330a..42343d72b4c0d 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -669,7 +669,7 @@ def test_class_weight_classifiers():
                 # the sparse version has a parameter that doesn't do anything
                 continue
             if name.startswith("RidgeClassifier"):
-                # RidgeClassifier behaves unexpected
+                # RidgeClassifier shows unexpected behavior
                 # FIXME!
                 continue
             if n_centers == 2:

From 30045e161b88ada96f8d9aeb04e6109aa7b0884b Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Thu, 3 Jan 2013 13:56:02 +0100
Subject: [PATCH 13/13] FIX skip tests for naive bayes for the moment.

---
 sklearn/tests/test_common.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index 42343d72b4c0d..945f6423c7e5f 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -672,6 +672,10 @@ def test_class_weight_classifiers():
                 # RidgeClassifier shows unexpected behavior
                 # FIXME!
                 continue
+            if name.endswith("NB"):
+                # NaiveBayes classifiers have a somewhat different interface.
+                # FIXME SOON!
+                continue
             if n_centers == 2:
                 class_weight = {0: 1000, 1: 0.0001}
             else:
@@ -714,6 +718,11 @@ def test_class_weight_auto_classifies():
                 # FIXME!
                 continue
 
+            if name.endswith("NB"):
+                # NaiveBayes classifiers have a somewhat different interface.
+                # FIXME SOON!
+                continue
+
             with warnings.catch_warnings(record=True):
                 clf = Clf()
             if hasattr(clf, "n_iter"):
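Note: the "somewhat different interface" behind the NB skip above is that the
discrete naive Bayes estimators express class preference as a prior over
classes rather than as a misclassification-cost weight, so the common test's
class_weight contract does not map onto them directly. A rough illustration
using the current MultinomialNB API (data and prior values are invented for
the example):

    import numpy as np

    from sklearn.naive_bayes import MultinomialNB

    rng = np.random.RandomState(0)
    X = rng.randint(5, size=(6, 10))
    y = np.array([0, 0, 0, 1, 1, 2])
    # class_prior acts as a prior belief about class frequencies,
    # not as a per-class penalty on the training loss
    clf = MultinomialNB(class_prior=[0.1, 0.1, 0.8]).fit(X, y)
    print(clf.predict(X))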