From a1633f939357af79a1794a3c062563906d3eb345 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 10 Apr 2021 23:01:20 +0200 Subject: [PATCH 01/25] TST check multilabel common check for supported estimators --- sklearn/ensemble/_forest.py | 5 +++++ sklearn/linear_model/_ridge.py | 3 ++- sklearn/neighbors/_classification.py | 10 ++++++++++ sklearn/neural_network/_multilayer_perceptron.py | 5 +++++ sklearn/tree/_classes.py | 5 +++++ sklearn/utils/estimator_checks.py | 1 - 6 files changed, 27 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 140c1c93e8eef..2e1d81d906498 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -824,6 +824,11 @@ def predict_log_proba(self, X): return proba + def _more_tags(self): + return { + "multilabel": True, + } + class ForestRegressor(RegressorMixin, BaseForest, metaclass=ABCMeta): """ diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 343bc6a170c9b..c925f55a423dd 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -1993,5 +1993,6 @@ def _more_tags(self): '_xfail_checks': { 'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples', - } + }, + "multilabel": True, } diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index 71b869977f6aa..38d46dafb8444 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -273,6 +273,11 @@ def predict_proba(self, X): return probabilities + def _more_tags(self): + return { + "multilabel": True, + } + class RadiusNeighborsClassifier(RadiusNeighborsMixin, ClassifierMixin, @@ -613,3 +618,8 @@ def predict_proba(self, X): probabilities = probabilities[0] return probabilities + + def _more_tags(self): + return { + "multilabel": True, + } diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index 04822360791e7..e68e8e960b81f 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -1123,6 +1123,11 @@ def predict_proba(self, X): else: return y_pred + def _more_tags(self): + return { + "multilabel": True, + } + class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): """Multi-layer Perceptron regressor. diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 420292881f7db..c0217b4ee49ed 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -993,6 +993,11 @@ def predict_log_proba(self, X): return proba + def _more_tags(self): + return { + "multilabel": True, + } + class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): """A decision tree regressor. 
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 71f5b3b42de42..824573d3da322 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2088,7 +2088,6 @@ def check_outliers_train(name, estimator_orig, readonly_memmap=True): def check_classifiers_multilabel_representation_invariance( name, classifier_orig ): - X, y = make_multilabel_classification(n_samples=100, n_features=20, n_classes=5, n_labels=3, length=50, allow_unlabeled=True, From bc8a96f8982df9bd7748692809c69b2c318463eb Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 10 Apr 2021 23:41:30 +0200 Subject: [PATCH 02/25] iter --- sklearn/utils/estimator_checks.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 824573d3da322..e3a41916865c1 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -637,7 +637,7 @@ def _set_checking_parameters(estimator): estimator.set_params(strategy='stratified') # Speed-up by reducing the number of CV or splits for CV estimators - loo_cv = ['RidgeCV'] + loo_cv = ['RidgeCV', 'RidgeClassifierCV'] if name not in loo_cv and hasattr(estimator, 'cv'): estimator.set_params(cv=3) if hasattr(estimator, 'n_splits'): @@ -2088,10 +2088,11 @@ def check_outliers_train(name, estimator_orig, readonly_memmap=True): def check_classifiers_multilabel_representation_invariance( name, classifier_orig ): - X, y = make_multilabel_classification(n_samples=100, n_features=20, + X, y = make_multilabel_classification(n_samples=100, n_features=2, n_classes=5, n_labels=3, length=50, allow_unlabeled=True, random_state=0) + X = scale(X) X_train, y_train = X[:80], y[:80] X_test = X[80:] From d366492e25cb3b2dacc08bb63e407bdfc6904f32 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 11 Apr 2021 00:07:51 +0200 Subject: [PATCH 03/25] iter --- sklearn/utils/estimator_checks.py | 42 ++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index e3a41916865c1..740a8e6b8a136 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -120,6 +120,7 @@ def _yield_checks(estimator): yield check_estimator_get_tags_default_keys + def _yield_classifier_checks(classifier): tags = _safe_tags(classifier) @@ -139,6 +140,7 @@ def _yield_classifier_checks(classifier): yield check_classifiers_regression_target if tags["multilabel"]: yield check_classifiers_multilabel_representation_invariance + yield check_classifiers_multilabel_format_output if not tags["no_validation"]: yield check_supervised_y_no_nan if not tags['multioutput_only']: @@ -2084,7 +2086,7 @@ def check_outliers_train(name, estimator_orig, readonly_memmap=True): estimator.fit(X) -@ignore_warnings(category=(FutureWarning)) +@ignore_warnings(category=FutureWarning) def check_classifiers_multilabel_representation_invariance( name, classifier_orig ): @@ -2120,6 +2122,44 @@ def check_classifiers_multilabel_representation_invariance( assert type(y_pred) == type(y_pred_list_of_lists) +@ignore_warnings(category=FutureWarning) +def check_classifiers_multilabel_format_output(name, classifier_orig): + classifier = clone(classifier_orig) + set_random_state(classifier) + + n_outputs = 5 + X, y = make_multilabel_classification(n_samples=100, n_features=2, + n_classes=n_outputs, n_labels=3, + length=50, allow_unlabeled=True, + random_state=0) + X = 
scale(X) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + classifier.fit(X_train, y_train) + + response_method_name = ["predict", "predict_proba", "decision_function"] + for method_name in response_method_name: + response_method = getattr(classifier, method_name, None) + if response_method is not None: + y_pred = response_method(X_test) + if method_name == "predict": + assert y_pred.shape == y_test.shape, ( + f"{name}.{method_name} output an array of shape " + f"{y_pred.shape} instead of {y_test.shape}" + ) + else: + assert isinstance(y_pred, list), ( + f"{name}.{method_name} output a/an {type(y_pred)} instead " + f"of a list of ndarray" + ) + assert len(y_pred) == n_outputs + for pred in y_pred: + # 25% of the original data with 0/1 labels so we expect + # array of shape (25, 2) + assert pred.shape == (25, 2) + else: + print(f"{name} does not support method {method_name}") + + @ignore_warnings(category=FutureWarning) def check_estimators_fit_returns_self( name, estimator_orig, readonly_memmap=False From 5ec328202e4d298e4645c145187a04346880f476 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 12 Apr 2021 13:58:17 +0200 Subject: [PATCH 04/25] iter --- sklearn/base.py | 7 +++++++ sklearn/ensemble/_forest.py | 16 +++++++++------- sklearn/linear_model/_ridge.py | 16 +++++++++++++--- sklearn/neighbors/_classification.py | 14 +++----------- sklearn/tree/_classes.py | 10 ++++------ sklearn/utils/estimator_checks.py | 26 +++++++++++++------------- 6 files changed, 49 insertions(+), 40 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index ec264b0cf5edc..be4e9a540a79b 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -767,6 +767,13 @@ class MetaEstimatorMixin: """Mixin class for all meta estimators in scikit-learn.""" +class MultiLabelMixin: + """Mixin to mark estimators that support multilabel-indicator target.""" + + def _more_tags(self): + return {"multilabel": True} + + class MultiOutputMixin: """Mixin to mark estimators that support multioutput.""" def _more_tags(self): diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 2e1d81d906498..e8e4b10263ba7 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -51,7 +51,12 @@ class calls the ``fit`` method of each sub-estimator on random samples from joblib import Parallel from ..base import is_classifier -from ..base import ClassifierMixin, RegressorMixin, MultiOutputMixin +from ..base import ( + ClassifierMixin, + MultiLabelMixin, + MultiOutputMixin, + RegressorMixin, +) from ..metrics import accuracy_score, r2_score from ..preprocessing import OneHotEncoder from ..tree import (DecisionTreeClassifier, DecisionTreeRegressor, @@ -572,7 +577,9 @@ def _accumulate_prediction(predict, X, out, lock): out[i] += prediction[i] -class ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): +class ForestClassifier( + ClassifierMixin, MultiLabelMixin, BaseForest, metaclass=ABCMeta, +): """ Base class for forest of trees-based classifiers. 
@@ -824,11 +831,6 @@ def predict_log_proba(self, X): return proba - def _more_tags(self): - return { - "multilabel": True, - } - class ForestRegressor(RegressorMixin, BaseForest, metaclass=ABCMeta): """ diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index c925f55a423dd..4f4a8a002dd80 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -20,7 +20,12 @@ from ._base import LinearClassifierMixin, LinearModel from ._base import _deprecate_normalize, _rescale_data from ._sag import sag_solver -from ..base import RegressorMixin, MultiOutputMixin, is_classifier +from ..base import ( + MultiLabelMixin, + MultiOutputMixin, + RegressorMixin, + is_classifier, +) from ..utils.extmath import safe_sparse_dot from ..utils.extmath import row_norms from ..utils import check_array @@ -1817,7 +1822,7 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): """ -class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): +class RidgeClassifierCV(LinearClassifierMixin, MultiLabelMixin, _BaseRidgeCV): """Ridge classifier with built-in cross-validation. See glossary entry for :term:`cross-validation estimator`. @@ -1993,6 +1998,11 @@ def _more_tags(self): '_xfail_checks': { 'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples', + # FIXME: see + # https://github.com/scikit-learn/scikit-learn/issues/19858 + # to track progress to resolve this issue + 'check_classifiers_multilabel_format_output': + 'RidgeClassifierCV.predict output an array of shape (25,) ' + 'instead of (25, 5)', }, - "multilabel": True, } diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index 38d46dafb8444..bf76ed5bd22b9 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -16,12 +16,13 @@ import warnings from ._base import _check_weights, _get_weights from ._base import NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin -from ..base import ClassifierMixin +from ..base import ClassifierMixin, MultiLabelMixin from ..utils.validation import _deprecate_positional_args class KNeighborsClassifier(KNeighborsMixin, ClassifierMixin, + MultiLabelMixin, NeighborsBase): """Classifier implementing the k-nearest neighbors vote. @@ -273,14 +274,10 @@ def predict_proba(self, X): return probabilities - def _more_tags(self): - return { - "multilabel": True, - } - class RadiusNeighborsClassifier(RadiusNeighborsMixin, ClassifierMixin, + MultiLabelMixin, NeighborsBase): """Classifier implementing a vote among neighbors within a given radius @@ -618,8 +615,3 @@ def predict_proba(self, X): probabilities = probabilities[0] return probabilities - - def _more_tags(self): - return { - "multilabel": True, - } diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index c0217b4ee49ed..5d0fd67cdcf0f 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -29,6 +29,7 @@ from ..base import clone from ..base import RegressorMixin from ..base import is_classifier +from ..base import MultiLabelMixin from ..base import MultiOutputMixin from ..utils import Bunch from ..utils import check_random_state @@ -619,7 +620,9 @@ def feature_importances_(self): # Public estimators # ============================================================================= -class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): +class DecisionTreeClassifier( + ClassifierMixin, MultiLabelMixin, BaseDecisionTree +): """A decision tree classifier. 
Read more in the :ref:`User Guide `. @@ -993,11 +996,6 @@ def predict_log_proba(self, X): return proba - def _more_tags(self): - return { - "multilabel": True, - } - class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): """A decision tree regressor. diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 740a8e6b8a136..d4ee55111ea73 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2124,16 +2124,20 @@ def check_classifiers_multilabel_representation_invariance( @ignore_warnings(category=FutureWarning) def check_classifiers_multilabel_format_output(name, classifier_orig): + """Check the output of the response methods for classifiers supporting + multilabel-indicator targets.""" classifier = clone(classifier_orig) set_random_state(classifier) - n_outputs = 5 - X, y = make_multilabel_classification(n_samples=100, n_features=2, + n_samples, test_size, n_outputs = 100, 25, 5 + X, y = make_multilabel_classification(n_samples=n_samples, n_features=2, n_classes=n_outputs, n_labels=3, length=50, allow_unlabeled=True, random_state=0) X = scale(X) - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + X_train, X_test = X[:-test_size], X[-test_size:] + y_train, y_test = y[:-test_size], y[-test_size:] classifier.fit(X_train, y_train) response_method_name = ["predict", "predict_proba", "decision_function"] @@ -2141,21 +2145,17 @@ def check_classifiers_multilabel_format_output(name, classifier_orig): response_method = getattr(classifier, method_name, None) if response_method is not None: y_pred = response_method(X_test) + if method_name == "predict": + # y_pred.shape -> y_test.shape with the same dtype assert y_pred.shape == y_test.shape, ( f"{name}.{method_name} output an array of shape " f"{y_pred.shape} instead of {y_test.shape}" ) - else: - assert isinstance(y_pred, list), ( - f"{name}.{method_name} output a/an {type(y_pred)} instead " - f"of a list of ndarray" - ) - assert len(y_pred) == n_outputs - for pred in y_pred: - # 25% of the original data with 0/1 labels so we expect - # array of shape (25, 2) - assert pred.shape == (25, 2) + assert y_pred.dtype == y_test.dtype + elif method_name == "decision_function": + # y_pred.shape -> y_test.shape with floating dtype + pass else: print(f"{name} does not support method {method_name}") From 157bb2c0c8e7eedb6ddd09629ce3aa41995e5e94 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 12 Apr 2021 14:11:53 +0200 Subject: [PATCH 05/25] iter --- sklearn/utils/estimator_checks.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index d4ee55111ea73..73b829d767d4d 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2155,9 +2155,28 @@ def check_classifiers_multilabel_format_output(name, classifier_orig): assert y_pred.dtype == y_test.dtype elif method_name == "decision_function": # y_pred.shape -> y_test.shape with floating dtype - pass - else: - print(f"{name} does not support method {method_name}") + assert y_pred.shape == (test_size, 2) + assert y_pred.dtype.kind == "f" + else: # predict_proba + # y_pred.shape -> 2 possibilities: + # - list of length n_outputs of shape (n_samples, 2); + # - ndarray of shape (n_samples, n_outputs). 
+ # dtype should be floating + if isinstance(y_pred, list): + assert len(y_pred) == n_outputs + for pred in y_pred: + assert pred.shape == (test_size, 2) + assert pred.dtype.kind == "f" + # check that we have the correct probabilities + assert_allclose(pred.sum(axis=1), 1) + elif isinstance(y_pred, np.ndarray): + assert y_pred.shape == (test_size, n_outputs) + assert y_pred.dtype.kind == "f" + else: + raise ValueError( + f"Unknown return type {type(y_pred)} in " + f"{name}.{method_name}" + ) @ignore_warnings(category=FutureWarning) From 520911c6f2247c10d9ed17cf2fb109bb474c0359 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 12 Apr 2021 14:22:50 +0200 Subject: [PATCH 06/25] iter --- sklearn/utils/estimator_checks.py | 40 +++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 73b829d767d4d..883f01b4e1c34 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2152,11 +2152,22 @@ def check_classifiers_multilabel_format_output(name, classifier_orig): f"{name}.{method_name} output an array of shape " f"{y_pred.shape} instead of {y_test.shape}" ) - assert y_pred.dtype == y_test.dtype + assert y_pred.dtype == y_test.dtype, ( + f"{name}.{method_name} does not output the same dtype than" + f" the targets. Got {y_pred.dtype} instead of " + f"{y_test.dtype}" + ) elif method_name == "decision_function": # y_pred.shape -> y_test.shape with floating dtype - assert y_pred.shape == (test_size, 2) - assert y_pred.dtype.kind == "f" + assert y_pred.shape == (test_size, 2), ( + f"{name}.{method_name} is expected to provide a NumPy " + f"array of shape (n_samples, n_outputs). Got " + f"{y_pred.shape} instead of {(test_size, 2)}" + ) + assert y_pred.dtype.kind == "f", ( + f"{name}.{method_name} is expected to output a floating " + f"dtype. Got {y_pred.dtype} instead." + ) else: # predict_proba # y_pred.shape -> 2 possibilities: # - list of length n_outputs of shape (n_samples, 2); @@ -2165,13 +2176,28 @@ def check_classifiers_multilabel_format_output(name, classifier_orig): if isinstance(y_pred, list): assert len(y_pred) == n_outputs for pred in y_pred: - assert pred.shape == (test_size, 2) - assert pred.dtype.kind == "f" + assert pred.shape == (test_size, 2), ( + f"{name}.{method_name} is expected to output a" + f"list of NumPy array of shape (n_samples, 2). Got" + f" {y_pred.shape} instead of {(test_size, 2)}" + ) + assert pred.dtype.kind == "f", ( + f"{name}.{method_name} is expected to output a " + f"list of NumPy array of floating dtype. Got" + f"{y_pred.dtype} instead." + ) # check that we have the correct probabilities assert_allclose(pred.sum(axis=1), 1) elif isinstance(y_pred, np.ndarray): - assert y_pred.shape == (test_size, n_outputs) - assert y_pred.dtype.kind == "f" + assert y_pred.shape == (test_size, n_outputs), ( + f"{name}.{method_name} is expected to output a NumPy " + f"array of shape (n_samples, n_outputs). Got " + f"{y_pred.shape} instead of {(test_size, n_outputs)}" + ) + assert y_pred.dtype.kind == "f", ( + f"{name}.{method_name} is expected to output a NumPy " + f"array of floating dtype. Got {y_pred.dtype} instead." 
+ ) else: raise ValueError( f"Unknown return type {type(y_pred)} in " From 5881ef80a0560c8c38b22f69591425591fce1d64 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 12 Apr 2021 15:46:21 +0200 Subject: [PATCH 07/25] TST add test --- .../neural_network/_multilayer_perceptron.py | 16 +- sklearn/utils/estimator_checks.py | 79 ++++-- sklearn/utils/tests/test_estimator_checks.py | 266 +++++++++++++++++- 3 files changed, 318 insertions(+), 43 deletions(-) diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index e68e8e960b81f..bea82a26eccd6 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -13,7 +13,12 @@ import scipy.optimize -from ..base import BaseEstimator, ClassifierMixin, RegressorMixin +from ..base import ( + BaseEstimator, + ClassifierMixin, + MultiLabelMixin, + RegressorMixin, +) from ..base import is_classifier from ._base import ACTIVATIONS, DERIVATIVES, LOSS_FUNCTIONS from ._stochastic_optimizers import SGDOptimizer, AdamOptimizer @@ -698,7 +703,9 @@ def _partial_fit(self, X, y): return self._fit(X, y, incremental=True) -class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): +class MLPClassifier( + ClassifierMixin, MultiLabelMixin, BaseMultilayerPerceptron +): """Multi-layer Perceptron classifier. This model optimizes the log-loss function using LBFGS or stochastic @@ -1123,11 +1130,6 @@ def predict_proba(self, X): else: return y_pred - def _more_tags(self): - return { - "multilabel": True, - } - class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): """Multi-layer Perceptron regressor. diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 883f01b4e1c34..d0d30a2647660 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -19,6 +19,7 @@ from ._testing import assert_array_almost_equal from ._testing import assert_allclose from ._testing import assert_allclose_dense_sparse +from ._testing import assert_array_less from ._testing import set_random_state from ._testing import SkipTest from ._testing import ignore_warnings @@ -2148,61 +2149,91 @@ def check_classifiers_multilabel_format_output(name, classifier_orig): if method_name == "predict": # y_pred.shape -> y_test.shape with the same dtype + assert isinstance(y_pred, np.ndarray), ( + f"{name}.{method_name} is expected to output a NumPy " + f"array. Got {type(y_pred)} instead." + ) assert y_pred.shape == y_test.shape, ( - f"{name}.{method_name} output an array of shape " - f"{y_pred.shape} instead of {y_test.shape}" + f"{name}.{method_name} output a NumPy array of shape " + f"{y_pred.shape} instead of {y_test.shape}." ) assert y_pred.dtype == y_test.dtype, ( f"{name}.{method_name} does not output the same dtype than" f" the targets. Got {y_pred.dtype} instead of " - f"{y_test.dtype}" - ) - elif method_name == "decision_function": - # y_pred.shape -> y_test.shape with floating dtype - assert y_pred.shape == (test_size, 2), ( - f"{name}.{method_name} is expected to provide a NumPy " - f"array of shape (n_samples, n_outputs). Got " - f"{y_pred.shape} instead of {(test_size, 2)}" - ) - assert y_pred.dtype.kind == "f", ( - f"{name}.{method_name} is expected to output a floating " - f"dtype. Got {y_pred.dtype} instead." + f"{y_test.dtype}." 
) - else: # predict_proba + elif method_name == "predict_proba": # y_pred.shape -> 2 possibilities: # - list of length n_outputs of shape (n_samples, 2); # - ndarray of shape (n_samples, n_outputs). # dtype should be floating if isinstance(y_pred, list): - assert len(y_pred) == n_outputs + assert len(y_pred) == n_outputs, ( + f"{name}.{method_name} is expected to output a list " + f"of length n_outputs of Numpy array. Got length of " + f"{len(y_pred)} instead of {n_outputs}." + ) for pred in y_pred: assert pred.shape == (test_size, 2), ( - f"{name}.{method_name} is expected to output a" + f"{name}.{method_name} is expected to output a " f"list of NumPy array of shape (n_samples, 2). Got" - f" {y_pred.shape} instead of {(test_size, 2)}" + f" {pred.shape} instead of {(test_size, 2)}." ) assert pred.dtype.kind == "f", ( f"{name}.{method_name} is expected to output a " - f"list of NumPy array of floating dtype. Got" - f"{y_pred.dtype} instead." + f"list of NumPy array of floating dtype. Got " + f"{pred.dtype} instead." ) # check that we have the correct probabilities - assert_allclose(pred.sum(axis=1), 1) + assert_allclose( + pred.sum(axis=1), + 1, + err_msg=( + f"{name}.{method_name} is expected to provide " + f"probabilities such that each array rows " + f"should sum to 1." + ), + ) elif isinstance(y_pred, np.ndarray): assert y_pred.shape == (test_size, n_outputs), ( f"{name}.{method_name} is expected to output a NumPy " f"array of shape (n_samples, n_outputs). Got " - f"{y_pred.shape} instead of {(test_size, n_outputs)}" + f"{y_pred.shape} instead of {(test_size, n_outputs)}." ) assert y_pred.dtype.kind == "f", ( f"{name}.{method_name} is expected to output a NumPy " f"array of floating dtype. Got {y_pred.dtype} instead." ) + assert_array_less( + y_pred, + 1, + err_msg=( + f"{name}.{method_name} is expected to provide " + f"probabilities of the positive class and should " + f"therefore contain values below 1." + ), + ) else: raise ValueError( - f"Unknown return type {type(y_pred)} in " - f"{name}.{method_name}" + f"Unknown returned type {type(y_pred)} by " + f"{name}.{method_name}. A list or a Numpy array are " + f"expected." ) + else: # "decision_function" + # y_pred.shape -> y_test.shape with floating dtype + assert isinstance(y_pred, np.ndarray), ( + f"{name}.{method_name} is expected to output a NumPy " + f"array. Got {type(y_pred)} instead." + ) + assert y_pred.shape == (test_size, n_outputs), ( + f"{name}.{method_name} is expected to provide a NumPy " + f"array of shape (n_samples, n_outputs). Got " + f"{y_pred.shape} instead of {(test_size, n_outputs)}." + ) + assert y_pred.dtype.kind == "f", ( + f"{name}.{method_name} is expected to output a floating " + f"dtype. Got {y_pred.dtype} instead." 
+ ) @ignore_warnings(category=FutureWarning) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 4792f50f2baef..d2d3cdf60282c 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -10,6 +10,7 @@ import joblib from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.datasets import make_multilabel_classification from sklearn.utils import deprecated from sklearn.utils._testing import ( assert_raises, @@ -21,18 +22,6 @@ MinimalTransformer, SkipTest, ) -from sklearn.utils.estimator_checks import check_estimator, _NotAnArray -from sklearn.utils.estimator_checks \ - import check_class_weight_balanced_linear_classifier -from sklearn.utils.estimator_checks import set_random_state -from sklearn.utils.estimator_checks import _set_checking_parameters -from sklearn.utils.estimator_checks import check_estimators_unfitted -from sklearn.utils.estimator_checks import check_fit_score_takes_y -from sklearn.utils.estimator_checks import check_no_attributes_set_in_init -from sklearn.utils.estimator_checks import check_classifier_data_not_an_array -from sklearn.utils.estimator_checks import check_regressor_data_not_an_array -from sklearn.utils.estimator_checks import \ - check_estimator_get_tags_default_keys from sklearn.utils.validation import check_is_fitted from sklearn.utils.estimator_checks import check_outlier_corruption from sklearn.utils.fixes import np_version, parse_version @@ -48,6 +37,21 @@ from sklearn.utils import all_estimators from sklearn.exceptions import SkipTestWarning +from sklearn.utils.estimator_checks import ( + _NotAnArray, + _set_checking_parameters, + check_class_weight_balanced_linear_classifier, + check_classifier_data_not_an_array, + check_classifiers_multilabel_format_output, + check_estimator, + check_estimator_get_tags_default_keys, + check_estimators_unfitted, + check_fit_score_takes_y, + check_no_attributes_set_in_init, + check_regressor_data_not_an_array, + set_random_state, +) + class CorrectNotFittedError(ValueError): """Exception class to raise if estimator is used before fitting. @@ -673,6 +677,244 @@ def test_check_estimator_get_tags_default_keys(): ) +def test_check_classifiers_multilabel_output_format(): + n_samples, test_size, n_outputs = 100, 25, 5 + _, y = make_multilabel_classification(n_samples=n_samples, n_features=2, + n_classes=n_outputs, n_labels=3, + length=50, allow_unlabeled=True, + random_state=0) + y_train, y_test = y[:-test_size], y[-test_size:] + + class BaseMultiLabelClassifierMock(ClassifierMixin, BaseEstimator): + def __init__(self, response_output): + self.response_output = response_output + + def fit(self, X, y): + return self + + class MultiLabelClassifierPredict(BaseMultiLabelClassifierMock): + def predict(self, X): + return self.response_output + + class MultiLabelClassifierPredictProba(BaseMultiLabelClassifierMock): + def predict_proba(self, X): + return self.response_output + + class MultiLabelClassifierDecisionFunction(BaseMultiLabelClassifierMock): + def decision_function(self, X): + return self.response_output + + # 1.method predict + # 1.1 inconsistent array type + clf = MultiLabelClassifierPredict(response_output=y_test.tolist()) + err_msg = ( + r"MultiLabelClassifierPredict.predict is expected to output a " + r"NumPy array. Got instead." + ) + assert_raises_regex( + AssertionError, + err_msg, + check_classifiers_multilabel_format_output, + clf.__class__.__name__, + clf, + ) + # 1.2. 
inconsistent shape + clf = MultiLabelClassifierPredict(response_output=y_test[:, :-1]) + err_msg = ( + r"MultiLabelClassifierPredict.predict output a NumPy array of " + r"shape \(25, 4\) instead of \(25, 5\)." + ) + assert_raises_regex( + AssertionError, + err_msg, + check_classifiers_multilabel_format_output, + clf.__class__.__name__, + clf, + ) + # 1.3 inconsistent dtype + clf = MultiLabelClassifierPredict( + response_output=y_test.astype(np.float64) + ) + err_msg = ( + r"MultiLabelClassifierPredict.predict does not output the same " + r"dtype than the targets. Got float64 instead of int64." + ) + assert_raises_regex( + AssertionError, + err_msg, + check_classifiers_multilabel_format_output, + clf.__class__.__name__, + clf, + ) + + # 2. method predict_proba + # 2.1 unknown output type + clf = MultiLabelClassifierPredictProba( + response_output=sp.csr_matrix(y_test) + ) + err_msg = ( + r"Unknown returned type by " + r"MultiLabelClassifierPredictProba.predict_proba. A list or a Numpy " + r"array are expected." + ) + assert_raises_regex( + ValueError, + err_msg, + check_classifiers_multilabel_format_output, + clf.__class__.__name__, + clf, + ) + # 2.2 for list output + # 2.2.1 iconsistent length + clf = MultiLabelClassifierPredictProba(response_output=y_test.tolist()) + err_msg = ( + r"MultiLabelClassifierPredictProba.predict_proba is expected to " + r"output a list of length n_outputs of Numpy array. Got length of 25 " + r"instead of 5." + ) + assert_raises_regex( + AssertionError, + err_msg, + check_classifiers_multilabel_format_output, + clf.__class__.__name__, + clf, + ) + # 2.2.2 array of inconsistent shape + response_output = [np.ones_like(y_test) for _ in range(n_outputs)] + clf = MultiLabelClassifierPredictProba(response_output=response_output) + err_msg = ( + r"MultiLabelClassifierPredictProba.predict_proba is expected to output" + r" a list of NumPy array of shape \(n_samples, 2\). Got \(25, 5\) " + r"instead of \(25, 2\)." + ) + assert_raises_regex( + AssertionError, + err_msg, + check_classifiers_multilabel_format_output, + clf.__class__.__name__, + clf, + ) + # 2.2.3 array of inconsistent dtype + response_output = [ + np.ones(shape=(y_test.shape[0], 2), dtype=np.int64) + for _ in range(n_outputs) + ] + clf = MultiLabelClassifierPredictProba(response_output=response_output) + err_msg = ( + r"MultiLabelClassifierPredictProba.predict_proba is expected to output" + r" a list of NumPy array of floating dtype. Got int64 instead." + ) + assert_raises_regex( + AssertionError, + err_msg, + check_classifiers_multilabel_format_output, + clf.__class__.__name__, + clf, + ) + # 2.2.4 array does not contain probability (each row should sum to 1) + response_output = [ + np.ones(shape=(y_test.shape[0], 2), dtype=np.float64) + for _ in range(n_outputs) + ] + clf = MultiLabelClassifierPredictProba(response_output=response_output) + err_msg = ( + r"MultiLabelClassifierPredictProba.predict_proba is expected to " + r"provide probabilities such that each array rows should sum to 1." + ) + assert_raises_regex( + AssertionError, + err_msg, + check_classifiers_multilabel_format_output, + clf.__class__.__name__, + clf, + ) + # 2.3 for array output + # 2.3.1 array of inconsistent shape + clf = MultiLabelClassifierPredictProba(response_output=y_test[:, :-1]) + err_msg = ( + r"MultiLabelClassifierPredictProba.predict_proba is expected to " + r"output a NumPy array of shape \(n_samples, n_outputs\). Got " + r"\(25, 4\) instead of \(25, 5\)." 
+    )
+    assert_raises_regex(
+        AssertionError,
+        err_msg,
+        check_classifiers_multilabel_format_output,
+        clf.__class__.__name__,
+        clf,
+    )
+    # 2.3.2 array of inconsistent dtype
+    response_output = np.zeros_like(y_test, dtype=np.int64)
+    clf = MultiLabelClassifierPredictProba(response_output=response_output)
+    err_msg = (
+        r"MultiLabelClassifierPredictProba.predict_proba is expected to "
+        r"output a NumPy array of floating dtype. Got int64 instead."
+    )
+    assert_raises_regex(
+        AssertionError,
+        err_msg,
+        check_classifiers_multilabel_format_output,
+        clf.__class__.__name__,
+        clf,
+    )
+    # 2.3.3 array does not contain probabilities
+    clf = MultiLabelClassifierPredictProba(response_output=y_test * 2.0)
+    err_msg = (
+        r"MultiLabelClassifierPredictProba.predict_proba is expected to "
+        r"provide probabilities of the positive class and should therefore "
+        r"contain values below 1."
+    )
+    assert_raises_regex(
+        AssertionError,
+        err_msg,
+        check_classifiers_multilabel_format_output,
+        clf.__class__.__name__,
+        clf,
+    )
+
+    # 3. decision_function
+    # 3.1 inconsistent array type
+    clf = MultiLabelClassifierDecisionFunction(response_output=y_test.tolist())
+    err_msg = (
+        r"MultiLabelClassifierDecisionFunction.decision_function is expected "
+        r"to output a NumPy array. Got <class 'list'> instead."
+    )
+    assert_raises_regex(
+        AssertionError,
+        err_msg,
+        check_classifiers_multilabel_format_output,
+        clf.__class__.__name__,
+        clf,
+    )
+    # 3.2. inconsistent shape
+    clf = MultiLabelClassifierDecisionFunction(response_output=y_test[:, :-1])
+    err_msg = (
+        r"MultiLabelClassifierDecisionFunction.decision_function is expected "
+        r"to provide a NumPy array of shape \(n_samples, n_outputs\). Got "
+        r"\(25, 4\) instead of \(25, 5\)"
+    )
+    assert_raises_regex(
+        AssertionError,
+        err_msg,
+        check_classifiers_multilabel_format_output,
+        clf.__class__.__name__,
+        clf,
+    )
+    # 3.3 inconsistent dtype
+    clf = MultiLabelClassifierDecisionFunction(response_output=y_test)
+    err_msg = (
+        r"MultiLabelClassifierDecisionFunction.decision_function is expected "
+        r"to output a floating dtype. Got int64 instead."
+    )
+    assert_raises_regex(
+        AssertionError,
+        err_msg,
+        check_classifiers_multilabel_format_output,
+        clf.__class__.__name__,
+        clf,
+    )
+
+
 def run_tests_without_pytest():
     """Runs the tests in this file without using pytest.
""" From c59e7eb83c56aa525b0aa20c495593657af4156b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 12 Apr 2021 15:49:35 +0200 Subject: [PATCH 08/25] iter --- sklearn/utils/tests/test_estimator_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index d2d3cdf60282c..dffba210033b2 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -683,7 +683,7 @@ def test_check_classifiers_multilabel_output_format(): n_classes=n_outputs, n_labels=3, length=50, allow_unlabeled=True, random_state=0) - y_train, y_test = y[:-test_size], y[-test_size:] + y_test = y[-test_size:] class BaseMultiLabelClassifierMock(ClassifierMixin, BaseEstimator): def __init__(self, response_output): From 7e9f70bfa4391da5b69731b7d8bac5c1b38b8133 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 12 Apr 2021 16:21:20 +0200 Subject: [PATCH 09/25] iter --- sklearn/utils/tests/test_estimator_checks.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index dffba210033b2..f3b92c3678aff 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -737,7 +737,7 @@ def decision_function(self, X): ) err_msg = ( r"MultiLabelClassifierPredict.predict does not output the same " - r"dtype than the targets. Got float64 instead of int64." + r"dtype than the targets." ) assert_raises_regex( AssertionError, @@ -802,7 +802,7 @@ def decision_function(self, X): clf = MultiLabelClassifierPredictProba(response_output=response_output) err_msg = ( r"MultiLabelClassifierPredictProba.predict_proba is expected to output" - r" a list of NumPy array of floating dtype. Got int64 instead." + r" a list of NumPy array of floating dtype." ) assert_raises_regex( AssertionError, @@ -848,7 +848,7 @@ def decision_function(self, X): clf = MultiLabelClassifierPredictProba(response_output=response_output) err_msg = ( r"MultiLabelClassifierPredictProba.predict_proba is expected to " - r"output a NumPy array of floating dtype. Got int64 instead." + r"output a NumPy array of floating dtype." ) assert_raises_regex( AssertionError, @@ -904,7 +904,7 @@ def decision_function(self, X): clf = MultiLabelClassifierDecisionFunction(response_output=y_test) err_msg = ( r"MultiLabelClassifierDecisionFunction.decision_function is expected " - r"to output a floating dtype. Got int64 instead." + r"to output a floating dtype." ) assert_raises_regex( AssertionError, From 6b2b57429c7ad3440762a1eec5588bc5a2ec8a93 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 12 Apr 2021 23:40:26 +0200 Subject: [PATCH 10/25] FIX make predict returning a multilabel indicator matrix in RidgeClassifierCV --- doc/whats_new/v1.0.rst | 7 +++++- sklearn/linear_model/_base.py | 20 ++++++++-------- sklearn/linear_model/_ridge.py | 30 +++++++++++++++++++++--- sklearn/linear_model/tests/test_ridge.py | 7 ++++++ 4 files changed, 50 insertions(+), 14 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index ba3f6d6d1110d..1c56311ee0ac3 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -229,6 +229,11 @@ Changelog is now faster. This is especially noticeable on large sparse input. :pr:`19734` by :user:`Fred Robinson `. 
+- |Fix| Fix a bug in :class:`linear_model.RidgeClassifierCV` where the method
+  `predict` was performing an `argmax` on the scores obtained from
+  `decision_function` instead of returning the multilabel indicator matrix.
+  :pr:`xxxxxx` by :user:`Guillaume Lemaitre `.
+
 :mod:`sklearn.manifold`
 .......................
 
@@ -276,7 +281,7 @@ Changelog
   :pr:`18649` by `Leandro Hermida ` and `Rodion Martynov `.
 
-- |Fix| The `fit` method of the successive halving parameter search
+- |Fix| The `fit` method of the successive halving parameter search
   (:class:`model_selection.HalvingGridSearchCV`, and
   :class:`model_selection.HalvingRandomSearchCV`) now correctly handles the
   `groups` parameter. :pr:`19847` by :user:`Xiaoyu Chai `.
 
diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py
index c80c2db622921..653e3810fea78 100644
--- a/sklearn/linear_model/_base.py
+++ b/sklearn/linear_model/_base.py
@@ -387,15 +387,15 @@ def decision_function(self, X):
 
         Parameters
         ----------
-        X : array-like or sparse matrix, shape (n_samples, n_features)
-            Samples.
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The data matrix for which we want to get the confidence scores.
 
         Returns
        -------
-        array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes)
-            Confidence scores per (sample, class) combination. In the binary
-            case, confidence score for self.classes_[1] where >0 means this
-            class would be predicted.
+        scores : ndarray of shape (n_samples,) or (n_samples, n_classes)
+            Confidence scores per `(n_samples, n_classes)` combination. In the
+            binary case, confidence score for `self.classes_[1]` where >0 means
+            this class would be predicted.
         """
         check_is_fitted(self)
 
@@ -410,13 +410,13 @@ def predict(self, X):
 
         Parameters
         ----------
-        X : array-like or sparse matrix, shape (n_samples, n_features)
-            Samples.
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The data matrix for which we want to get the predictions.
 
         Returns
         -------
-        C : array, shape [n_samples]
-            Predicted class label per sample.
+        y_pred : ndarray of shape (n_samples,)
+            Vector containing the class labels for each sample.
         """
         scores = self.decision_function(X)
         if len(scores.shape) == 1:
diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py
index 4f4a8a002dd80..65cd8f7352826 100644
--- a/sklearn/linear_model/_ridge.py
+++ b/sklearn/linear_model/_ridge.py
@@ -1993,6 +1993,30 @@ def fit(self, X, y, sample_weight=None):
     def classes_(self):
         return self._label_binarizer.classes_
 
+    def predict(self, X):
+        """Predict class labels for samples in `X`.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The data matrix for which we want to predict the targets.
+
+        Returns
+        -------
+        y_pred : ndarray of shape (n_samples,) or (n_samples, n_outputs)
+            Vector or matrix containing the predictions. In binary and
+            multiclass problems, this is a vector containing `n_samples`. In
+            a multilabel problem, it returns a matrix of shape
+            `(n_samples, n_outputs)`.
+        """
+        if self._label_binarizer.y_type_.startswith("multilabel"):
+            # Threshold such that the negative label is -1 and positive label
+            # is 1 to use the inverse transform of the label binarizer fitted
+            # during fit.
+ scores = 2 * (self.decision_function(X) > 0) - 1 + return self._label_binarizer.inverse_transform(scores) + return super().predict(X) + def _more_tags(self): return { '_xfail_checks': { @@ -2001,8 +2025,8 @@ def _more_tags(self): # FIXME: see # https://github.com/scikit-learn/scikit-learn/issues/19858 # to track progress to resolve this issue - 'check_classifiers_multilabel_format_output': - 'RidgeClassifierCV.predict output an array of shape (25,) ' - 'instead of (25, 5)', + # 'check_classifiers_multilabel_format_output': + # 'RidgeClassifierCV.predict output an array of shape (25,) ' + # 'instead of (25, 5)', }, } diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index b812788239b14..83a1445c9434d 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -1412,3 +1412,10 @@ def test_ridge_sag_with_X_fortran(): X = X[::2, :] y = y[::2] Ridge(solver='sag').fit(X, y) + + +def test_ridge_xxx(): + # X, y = datasets.load_iris(return_X_y=True) + X, y = datasets.make_multilabel_classification() + clf = RidgeClassifierCV().fit(X, y) + print(clf.predict(X)) From 42548fce029e53db9b54bb5f6112b51445d1b2b0 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 12 Apr 2021 23:44:11 +0200 Subject: [PATCH 11/25] update whats new --- doc/whats_new/v1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 1c56311ee0ac3..b011e203ce357 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -232,7 +232,7 @@ Changelog - |Fix| Fix a bug in :class:`linear_model.RidgeClassifierCV` where the method `predict` was performing an `argmax` on the scores obtain from `decision_function` instead of returning the multilabel indicator matrix. - :pr:`xxxxxx` by :user:`Guillaume Lemaitre `. + :pr:`19869` by :user:`Guillaume Lemaitre `. :mod:`sklearn.manifold` ....................... From c37e68ff93be6532442d21bad34a154c4060d747 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 13 Apr 2021 00:09:30 +0200 Subject: [PATCH 12/25] iter --- sklearn/utils/estimator_checks.py | 175 +++++++++++++++--------------- 1 file changed, 85 insertions(+), 90 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index d0d30a2647660..506d1b284f7f1 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2141,99 +2141,94 @@ def check_classifiers_multilabel_format_output(name, classifier_orig): y_train, y_test = y[:-test_size], y[-test_size:] classifier.fit(X_train, y_train) - response_method_name = ["predict", "predict_proba", "decision_function"] - for method_name in response_method_name: - response_method = getattr(classifier, method_name, None) - if response_method is not None: - y_pred = response_method(X_test) - - if method_name == "predict": - # y_pred.shape -> y_test.shape with the same dtype - assert isinstance(y_pred, np.ndarray), ( - f"{name}.{method_name} is expected to output a NumPy " - f"array. Got {type(y_pred)} instead." - ) - assert y_pred.shape == y_test.shape, ( - f"{name}.{method_name} output a NumPy array of shape " - f"{y_pred.shape} instead of {y_test.shape}." - ) - assert y_pred.dtype == y_test.dtype, ( - f"{name}.{method_name} does not output the same dtype than" - f" the targets. Got {y_pred.dtype} instead of " - f"{y_test.dtype}." 
- ) - elif method_name == "predict_proba": - # y_pred.shape -> 2 possibilities: - # - list of length n_outputs of shape (n_samples, 2); - # - ndarray of shape (n_samples, n_outputs). - # dtype should be floating - if isinstance(y_pred, list): - assert len(y_pred) == n_outputs, ( - f"{name}.{method_name} is expected to output a list " - f"of length n_outputs of Numpy array. Got length of " - f"{len(y_pred)} instead of {n_outputs}." - ) - for pred in y_pred: - assert pred.shape == (test_size, 2), ( - f"{name}.{method_name} is expected to output a " - f"list of NumPy array of shape (n_samples, 2). Got" - f" {pred.shape} instead of {(test_size, 2)}." - ) - assert pred.dtype.kind == "f", ( - f"{name}.{method_name} is expected to output a " - f"list of NumPy array of floating dtype. Got " - f"{pred.dtype} instead." - ) - # check that we have the correct probabilities - assert_allclose( - pred.sum(axis=1), - 1, - err_msg=( - f"{name}.{method_name} is expected to provide " - f"probabilities such that each array rows " - f"should sum to 1." - ), - ) - elif isinstance(y_pred, np.ndarray): - assert y_pred.shape == (test_size, n_outputs), ( - f"{name}.{method_name} is expected to output a NumPy " - f"array of shape (n_samples, n_outputs). Got " - f"{y_pred.shape} instead of {(test_size, n_outputs)}." - ) - assert y_pred.dtype.kind == "f", ( - f"{name}.{method_name} is expected to output a NumPy " - f"array of floating dtype. Got {y_pred.dtype} instead." - ) - assert_array_less( - y_pred, - 1, - err_msg=( - f"{name}.{method_name} is expected to provide " - f"probabilities of the positive class and should " - f"therefore contain values below 1." - ), - ) - else: - raise ValueError( - f"Unknown returned type {type(y_pred)} by " - f"{name}.{method_name}. A list or a Numpy array are " - f"expected." - ) - else: # "decision_function" - # y_pred.shape -> y_test.shape with floating dtype - assert isinstance(y_pred, np.ndarray), ( - f"{name}.{method_name} is expected to output a NumPy " - f"array. Got {type(y_pred)} instead." + predict_method = getattr(classifier, "predict", None) + if predict_method is not None: + y_pred = predict_method(X_test) + + # y_pred.shape -> y_test.shape with the same dtype + assert isinstance(y_pred, np.ndarray), ( + f"{name}.predict is expected to output a NumPy array. Got " + f"{type(y_pred)} instead." + ) + assert y_pred.shape == y_test.shape, ( + f"{name}.predict output a NumPy array of shape {y_pred.shape} " + f"instead of {y_test.shape}." + ) + assert y_pred.dtype == y_test.dtype, ( + f"{name}.predict does not output the same dtype than the targets. " + f"Got {y_pred.dtype} instead of {y_test.dtype}." + ) + + predict_proba_method = getattr(classifier, "predict_proba", None) + if predict_proba_method is not None: + y_pred = predict_proba_method(X_test) + + # y_pred.shape -> 2 possibilities: + # - list of length n_outputs of shape (n_samples, 2); + # - ndarray of shape (n_samples, n_outputs). + # dtype should be floating + if isinstance(y_pred, list): + assert len(y_pred) == n_outputs, ( + f"{name}.predict_proba is expected to output a list of length " + f"n_outputs of Numpy array. Got length of {len(y_pred)} " + f"instead of {n_outputs}." + ) + for pred in y_pred: + assert pred.shape == (test_size, 2), ( + f"{name}.predict_proba is expected to output a list of " + f"NumPy array of shape (n_samples, 2). Got {pred.shape} " + f"instead of {(test_size, 2)}." 
) - assert y_pred.shape == (test_size, n_outputs), ( - f"{name}.{method_name} is expected to provide a NumPy " - f"array of shape (n_samples, n_outputs). Got " - f"{y_pred.shape} instead of {(test_size, n_outputs)}." + assert pred.dtype.kind == "f", ( + f"{name}.predict_proba is expected to output a list of " + f"NumPy array of floating dtype. Got {pred.dtype} instead." ) - assert y_pred.dtype.kind == "f", ( - f"{name}.{method_name} is expected to output a floating " - f"dtype. Got {y_pred.dtype} instead." + # check that we have the correct probabilities + err_msg = ( + f"{name}.predict_proba is expected to provide " + f"probabilities such that each array rows should sum to 1." ) + assert_allclose(pred.sum(axis=1), 1, err_msg=err_msg) + elif isinstance(y_pred, np.ndarray): + assert y_pred.shape == (test_size, n_outputs), ( + f"{name}.predict_proba is expected to output a NumPy array of " + f"shape (n_samples, n_outputs). Got {y_pred.shape} instead of " + f"{(test_size, n_outputs)}." + ) + assert y_pred.dtype.kind == "f", ( + f"{name}.predict_proba is expected to output a NumPy array of " + f"floating dtype. Got {y_pred.dtype} instead." + ) + err_msg = ( + f"{name}.predict_proba is expected to provide probabilities " + f"of the positive class and should therefore contain values " + f"below 1." + ) + assert_array_less(y_pred, 1,err_msg=err_msg) + else: + raise ValueError( + f"Unknown returned type {type(y_pred)} by {name}." + f"predict_proba. A list or a Numpy array are expected." + ) + + decision_function_method = getattr(classifier, "decision_function", None) + if decision_function_method is not None: + y_pred = decision_function_method(X_test) + + # y_pred.shape -> y_test.shape with floating dtype + assert isinstance(y_pred, np.ndarray), ( + f"{name}.decision_function is expected to output a NumPy array." + f" Got {type(y_pred)} instead." + ) + assert y_pred.shape == (test_size, n_outputs), ( + f"{name}.decision_function is expected to provide a NumPy array " + f"of shape (n_samples, n_outputs). Got {y_pred.shape} instead of " + f"{(test_size, n_outputs)}." + ) + assert y_pred.dtype.kind == "f", ( + f"{name}.decision_function is expected to output a floating dtype." + f" Got {y_pred.dtype} instead." + ) @ignore_warnings(category=FutureWarning) From 079caad38f38cb19ab4282612fd6898c9e6172be Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 13 Apr 2021 00:14:36 +0200 Subject: [PATCH 13/25] PEP8 --- sklearn/utils/estimator_checks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 506d1b284f7f1..faf578e8fc421 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2204,7 +2204,7 @@ def check_classifiers_multilabel_format_output(name, classifier_orig): f"of the positive class and should therefore contain values " f"below 1." ) - assert_array_less(y_pred, 1,err_msg=err_msg) + assert_array_less(y_pred, 1, err_msg=err_msg) else: raise ValueError( f"Unknown returned type {type(y_pred)} by {name}." @@ -2215,7 +2215,7 @@ def check_classifiers_multilabel_format_output(name, classifier_orig): if decision_function_method is not None: y_pred = decision_function_method(X_test) - # y_pred.shape -> y_test.shape with floating dtype + # y_pred.shape -> y_test.shape with floating dtype assert isinstance(y_pred, np.ndarray), ( f"{name}.decision_function is expected to output a NumPy array." f" Got {type(y_pred)} instead." 
From 4e728f6b15b314dbcefebfb8148fe0e03efb6943 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 13 Apr 2021 00:18:34 +0200 Subject: [PATCH 14/25] add check is fitted --- sklearn/linear_model/_ridge.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 65cd8f7352826..955a867d31f42 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -32,6 +32,7 @@ from ..utils import check_consistent_length from ..utils import compute_sample_weight from ..utils import column_or_1d +from ..utils.validation import check_is_fitted from ..utils.validation import _check_sample_weight from ..utils.validation import _deprecate_positional_args from ..preprocessing import LabelBinarizer @@ -2009,6 +2010,7 @@ def predict(self, X): a multilabel problem, it returns a matrix of shape `(n_samples, n_outputs)`. """ + check_is_fitted(self, attributes=["_label_binarizer"]) if self._label_binarizer.y_type_.startswith("multilabel"): # Threshold such that the negative label is -1 and positive label # is 1 to use the inverse transform of the label binarizer fitted From 34fed9a8ed57d3a8c82c249fc20910d6a19c0e27 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 13 Apr 2021 20:31:26 +0200 Subject: [PATCH 15/25] iter --- doc/whats_new/v1.0.rst | 4 ++ sklearn/linear_model/_ridge.py | 70 ++++++++++++------------ sklearn/linear_model/tests/test_ridge.py | 21 ++++--- 3 files changed, 48 insertions(+), 47 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index b011e203ce357..ba6b9781efa61 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -234,6 +234,10 @@ Changelog `decision_function` instead of returning the multilabel indicator matrix. :pr:`19869` by :user:`Guillaume Lemaitre `. +- |Enhancement| :class:`linear_model.RidgeClassifier` is now supporting + multilabel classification. + :pr:`19689` by :user:`Guillaume Lemaitre `. + :mod:`sklearn.manifold` ....................... diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 955a867d31f42..d215677b750a8 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -777,7 +777,39 @@ def fit(self, X, y, sample_weight=None): return super().fit(X, y, sample_weight=sample_weight) -class RidgeClassifier(LinearClassifierMixin, _BaseRidge): +class _BaseRidgeClassifier(LinearClassifierMixin, MultiLabelMixin): + + def predict(self, X): + """Predict class labels for samples in `X`. + + Parameters + ---------- + X : {array-like, spare matrix} of shape (n_samples, n_features) + The data matrix for which we want to predict the targets. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) or (n_samples, n_outputs) + Vector or matrix containing the predictions. In binary and + multiclass problems, this is a vector containing `n_samples`. In + a multilabel problem, it returns a matrix of shape + `(n_samples, n_outputs)`. + """ + check_is_fitted(self, attributes=["_label_binarizer"]) + if self._label_binarizer.y_type_.startswith("multilabel"): + # Threshold such that the negative label is -1 and positive label + # is 1 to use the inverse transform of the label binarizer fitted + # during fit. 
+ scores = 2 * (self.decision_function(X) > 0) - 1 + return self._label_binarizer.inverse_transform(scores) + return super().predict(X) + + @property + def classes_(self): + return self._label_binarizer.classes_ + + +class RidgeClassifier(_BaseRidgeClassifier, _BaseRidge): """Classifier using Ridge regression. This classifier first converts the target values into ``{-1, 1}`` and @@ -951,11 +983,6 @@ def fit(self, X, y, sample_weight=None): Y = self._label_binarizer.fit_transform(y) if not self._label_binarizer.y_type_.startswith('multilabel'): y = column_or_1d(y, warn=True) - else: - # we don't (yet) support multi-label classification in Ridge - raise ValueError( - "%s doesn't support multi-label classification" % ( - self.__class__.__name__)) if self.class_weight: # modify the sample weights with the corresponding class weight @@ -965,10 +992,6 @@ def fit(self, X, y, sample_weight=None): super().fit(X, Y, sample_weight=sample_weight) return self - @property - def classes_(self): - return self._label_binarizer.classes_ - def _check_gcv_mode(X, gcv_mode): possible_gcv_modes = [None, 'auto', 'svd', 'eigen'] @@ -1823,7 +1846,7 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): """ -class RidgeClassifierCV(LinearClassifierMixin, MultiLabelMixin, _BaseRidgeCV): +class RidgeClassifierCV(_BaseRidgeClassifier, _BaseRidgeCV): """Ridge classifier with built-in cross-validation. See glossary entry for :term:`cross-validation estimator`. @@ -1994,31 +2017,6 @@ def fit(self, X, y, sample_weight=None): def classes_(self): return self._label_binarizer.classes_ - def predict(self, X): - """Predict class labels for samples in `X`. - - Parameters - ---------- - X : {array-like, spare matrix} of shape (n_samples, n_features) - The data matrix for which we want to predict the targets. - - Returns - ------- - y_pred : ndarray of shape (n_samples,) or (n_samples, n_outputs) - Vector or matrix containing the predictions. In binary and - multiclass problems, this is a vector containing `n_samples`. In - a multilabel problem, it returns a matrix of shape - `(n_samples, n_outputs)`. - """ - check_is_fitted(self, attributes=["_label_binarizer"]) - if self._label_binarizer.y_type_.startswith("multilabel"): - # Threshold such that the negative label is -1 and positive label - # is 1 to use the inverse transform of the label binarizer fitted - # during fit. 
- scores = 2 * (self.decision_function(X) > 0) - 1 - return self._label_binarizer.inverse_transform(scores) - return super().predict(X) - def _more_tags(self): return { '_xfail_checks': { diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index 83a1445c9434d..8f481adc61c8e 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -1306,12 +1306,6 @@ def test_ridge_regression_check_arguments_validity(return_intercept, assert_allclose(out, true_coefs, rtol=0, atol=atol) -def test_ridge_classifier_no_support_multilabel(): - X, y = make_multilabel_classification(n_samples=10, random_state=0) - with pytest.raises(ValueError): - RidgeClassifier().fit(X, y) - - @pytest.mark.parametrize( "solver", ["svd", "sparse_cg", "cholesky", "lsqr", "sag", "saga"]) def test_dtype_match(solver): @@ -1414,8 +1408,13 @@ def test_ridge_sag_with_X_fortran(): Ridge(solver='sag').fit(X, y) -def test_ridge_xxx(): - # X, y = datasets.load_iris(return_X_y=True) - X, y = datasets.make_multilabel_classification() - clf = RidgeClassifierCV().fit(X, y) - print(clf.predict(X)) +@pytest.mark.parametrize("Classifier", [RidgeClassifier, RidgeClassifierCV]) +def test_ridgeclassifier_multilabel(Classifier): + X, y = make_multilabel_classification(n_classes=1, random_state=0) + y = y.reshape(-1, 1) + Y = np.concatenate([y, y], axis=1) + clf = Classifier().fit(X, Y) + Y_pred = clf.predict(X) + + assert Y_pred.shape == Y.shape + assert_array_equal(Y_pred[:, 0], Y_pred[:, 1]) From 465c07147a6a83f5b04434953178f63d46208f3a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 13 Apr 2021 20:31:47 +0200 Subject: [PATCH 16/25] iter --- sklearn/linear_model/_ridge.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index d215677b750a8..5097d50a1e2d2 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -2022,11 +2022,5 @@ def _more_tags(self): '_xfail_checks': { 'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples', - # FIXME: see - # https://github.com/scikit-learn/scikit-learn/issues/19858 - # to track progress to resolve this issue - # 'check_classifiers_multilabel_format_output': - # 'RidgeClassifierCV.predict output an array of shape (25,) ' - # 'instead of (25, 5)', }, } From 9cdbf7b68b3c614f8055532aebc02ea6d0bf0782 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 13 Apr 2021 22:02:37 +0200 Subject: [PATCH 17/25] refactor fit --- sklearn/linear_model/_ridge.py | 95 ++++++++++++++++-------- sklearn/linear_model/tests/test_ridge.py | 15 +++- 2 files changed, 74 insertions(+), 36 deletions(-) diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 5097d50a1e2d2..5fabab457a242 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -779,6 +779,57 @@ def fit(self, X, y, sample_weight=None): class _BaseRidgeClassifier(LinearClassifierMixin, MultiLabelMixin): + def _prepare_data(self, X, y, sample_weight, solver): + """Validate `X` and `y` and binarize `y`. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : ndarray of shape (n_samples,) + Target values. + + sample_weight : float or ndarray of shape (n_samples,), default=None + Individual weights for each sample. If given a float, every sample + will have the same weight. 
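When `class_weight` is set, `_prepare_data` additionally rescales these sample weights by the per-class weights through `compute_sample_weight`. A short sketch of that step in isolation (toy values, assuming uniform initial weights):

    import numpy as np
    from sklearn.utils import compute_sample_weight

    y = np.array([0, 0, 0, 1])
    sample_weight = np.ones_like(y, dtype=np.float64)

    # With class_weight={0: 1, 1: 3}, the single positive sample weighs 3x more.
    sample_weight = sample_weight * compute_sample_weight({0: 1, 1: 3}, y)
    print(sample_weight)  # [1. 1. 1. 3.]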
+ + solver : str + The solver used in `Ridge` to know which sparse format to support. + + Returns + ------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + Validated training data. + + y : ndarray of shape (n_samples,) + Validated target values. + + sample_weight : ndarray of shape (n_samples,) + Validated sample weights. + + Y : ndarray of shape (n_samples, n_classes) + The binarized version of `y`. + """ + accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), solver) + X, y = self._validate_data( + X, y, accept_sparse=accept_sparse, multi_output=True, + y_numeric=False, + ) + + self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) + Y = self._label_binarizer.fit_transform(y) + if not self._label_binarizer.y_type_.startswith("multilabel"): + y = column_or_1d(y, warn=True) + + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + if self.class_weight: + # modify the sample weights with the corresponding class weight + sample_weight = ( + sample_weight * compute_sample_weight(self.class_weight, y) + ) + return X, y, sample_weight, Y + def predict(self, X): """Predict class labels for samples in `X`. @@ -966,28 +1017,16 @@ def fit(self, X, y, sample_weight=None): will have the same weight. .. versionadded:: 0.17 - *sample_weight* support to Classifier. + *sample_weight* support to RidgeClassifier. Returns ------- self : object Instance of the estimator. """ - _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), - self.solver) - X, y = self._validate_data(X, y, accept_sparse=_accept_sparse, - multi_output=True, y_numeric=False) - sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - - self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) - Y = self._label_binarizer.fit_transform(y) - if not self._label_binarizer.y_type_.startswith('multilabel'): - y = column_or_1d(y, warn=True) - - if self.class_weight: - # modify the sample weights with the corresponding class weight - sample_weight = (sample_weight * - compute_sample_weight(self.class_weight, y)) + X, y, sample_weight, Y = self._prepare_data( + X, y, sample_weight, self.solver + ) super().fit(X, Y, sample_weight=sample_weight) return self @@ -1995,28 +2034,18 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], - multi_output=True, y_numeric=False) - sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - - self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) - Y = self._label_binarizer.fit_transform(y) - if not self._label_binarizer.y_type_.startswith('multilabel'): - y = column_or_1d(y, warn=True) - - if self.class_weight: - # modify the sample weights with the corresponding class weight - sample_weight = (sample_weight * - compute_sample_weight(self.class_weight, y)) + # by using solver="eigen" we force to accept all sparse format + X, y, sample_weight, Y = self._prepare_data( + X, y, sample_weight, solver="eigen" + ) + # If cv is None, gcv mode will be used and we will directly used the + # binarized Y. If cv is not None, a GridSearchCV will be used and `y` + # will be binarized again and thus we pass y instead of Y. 
target = Y if self.cv is None else y _BaseRidgeCV.fit(self, X, target, sample_weight=sample_weight) return self - @property - def classes_(self): - return self._label_binarizer.classes_ - def _more_tags(self): return { '_xfail_checks': { diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index 8f481adc61c8e..87c53558a62e6 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -1408,12 +1408,21 @@ def test_ridge_sag_with_X_fortran(): Ridge(solver='sag').fit(X, y) -@pytest.mark.parametrize("Classifier", [RidgeClassifier, RidgeClassifierCV]) -def test_ridgeclassifier_multilabel(Classifier): +@pytest.mark.parametrize( + "Classifier, params", + [ + (RidgeClassifier, {}), + (RidgeClassifierCV, {"cv": None}), + (RidgeClassifierCV, {"cv": 3}) + ] +) +def test_ridgeclassifier_multilabel(Classifier, params): + """Check that multilabel classification is supported and give meaningful + results.""" X, y = make_multilabel_classification(n_classes=1, random_state=0) y = y.reshape(-1, 1) Y = np.concatenate([y, y], axis=1) - clf = Classifier().fit(X, Y) + clf = Classifier(**params).fit(X, Y) Y_pred = clf.predict(X) assert Y_pred.shape == Y.shape From b50f9b2bbd1aa1929bdbe17e963b5e67fdda68a0 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 13 Apr 2021 22:14:23 +0200 Subject: [PATCH 18/25] doc --- sklearn/linear_model/_ridge.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 5fabab457a242..fff4d658d75a2 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -824,7 +824,6 @@ def _prepare_data(self, X, y, sample_weight, solver): sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) if self.class_weight: - # modify the sample weights with the corresponding class weight sample_weight = ( sample_weight * compute_sample_weight(self.class_weight, y) ) @@ -946,7 +945,7 @@ class RidgeClassifier(_BaseRidgeClassifier, _BaseRidge): .. versionadded:: 0.17 Stochastic Average Gradient descent solver. .. versionadded:: 0.19 - SAGA solver. + SAGA solver. random_state : int, RandomState instance, default=None Used when ``solver`` == 'sag' or 'saga' to shuffle the data. @@ -2039,9 +2038,11 @@ def fit(self, X, y, sample_weight=None): X, y, sample_weight, solver="eigen" ) - # If cv is None, gcv mode will be used and we will directly used the - # binarized Y. If cv is not None, a GridSearchCV will be used and `y` - # will be binarized again and thus we pass y instead of Y. + # If cv is None, gcv mode will be used and we used the binarized Y + # since y will not be binarized in _RidgeGCV estimator. + # If cv is not None, a GridSearchCV with some RidgeClassifier + # estimators are used where y will be binarized. Thus, we pass y + # instead of the binarized Y. 
target = Y if self.cv is None else y _BaseRidgeCV.fit(self, X, target, sample_weight=sample_weight) return self From c787be0135986274338daa01ba14716f19ca96ad Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 13 Apr 2021 22:17:04 +0200 Subject: [PATCH 19/25] add support in user guide --- doc/modules/multiclass.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/modules/multiclass.rst b/doc/modules/multiclass.rst index b3ea8d838574e..810d9bf13c508 100644 --- a/doc/modules/multiclass.rst +++ b/doc/modules/multiclass.rst @@ -102,6 +102,7 @@ can provide additional strategies beyond what is built-in: - :class:`neural_network.MLPClassifier` - :class:`neighbors.RadiusNeighborsClassifier` - :class:`ensemble.RandomForestClassifier` + - :class:`linear_model.RidgeClassifier` - :class:`linear_model.RidgeClassifierCV` From f47252d44e8d30ec82994e39375a7ecc7407f3e2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 6 Aug 2021 11:54:04 +0200 Subject: [PATCH 20/25] fix --- doc/whats_new/v1.0.rst | 5 - sklearn/base.py | 7 - sklearn/ensemble/_forest.py | 10 +- sklearn/linear_model/_ridge.py | 5 +- .../neural_network/_multilayer_perceptron.py | 5 +- sklearn/tree/_classes.py | 1 - sklearn/utils/tests/test_estimator_checks.py | 237 ------------------ 7 files changed, 10 insertions(+), 260 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 87beff9c4afe5..1d6032de6a7f7 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -589,11 +589,6 @@ Changelog :pr:`18649` by `Leandro Hermida ` and `Rodion Martynov `. -- |Fix| The `fit` method of the successive halving parameter search - (:class:`model_selection.HalvingGridSearchCV`, and - :class:`model_selection.HalvingRandomSearchCV`) now correctly handles the - `groups` parameter. :pr:`19847` by :user:`Xiaoyu Chai `. - :mod:`sklearn.naive_bayes` .......................... diff --git a/sklearn/base.py b/sklearn/base.py index 85f6048b033ba..6730ea8fd4590 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -812,13 +812,6 @@ class MetaEstimatorMixin: """Mixin class for all meta estimators in scikit-learn.""" -class MultiLabelMixin: - """Mixin to mark estimators that support multilabel-indicator target.""" - - def _more_tags(self): - return {"multilabel": True} - - class MultiOutputMixin: """Mixin to mark estimators that support multioutput.""" diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index f717eebd11c3e..d56a5e9856872 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -633,12 +633,7 @@ def _accumulate_prediction(predict, X, out, lock): out[i] += prediction[i] -class ForestClassifier( - ClassifierMixin, - MultiLabelMixin, - BaseForest, - metaclass=ABCMeta, -): +class ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): """ Base class for forest of trees-based classifiers. 
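This patch replaces the `MultiLabelMixin` used earlier in the series with per-estimator `_more_tags` overrides. A rough sketch of that pattern on a hypothetical estimator (`ToyMultilabelClassifier` is made up for illustration, and `_get_tags` is private API at the time of this series):

    import numpy as np
    from sklearn.base import BaseEstimator, ClassifierMixin

    class ToyMultilabelClassifier(ClassifierMixin, BaseEstimator):
        """Hypothetical classifier advertising multilabel support via tags."""

        def fit(self, X, Y):
            self.n_outputs_ = Y.shape[1]
            return self

        def predict(self, X):
            return np.zeros((len(X), self.n_outputs_), dtype=int)

        def _more_tags(self):
            # Merged into the estimator tags; the multilabel common checks
            # only run for estimators whose "multilabel" tag is True.
            return {"multilabel": True}

    print(ToyMultilabelClassifier()._get_tags()["multilabel"])  # True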
@@ -899,6 +894,9 @@ def predict_log_proba(self, X): return proba + def _more_tags(self): + return {"multilabel": True} + class ForestRegressor(RegressorMixin, BaseForest, metaclass=ABCMeta): """ diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 3c254cc066bfa..4c1ca04621da9 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -1006,7 +1006,7 @@ def fit(self, X, y, sample_weight=None): return super().fit(X, y, sample_weight=sample_weight) -class _BaseRidgeClassifier(LinearClassifierMixin, MultiLabelMixin): +class _BaseRidgeClassifier(LinearClassifierMixin): def _prepare_data(self, X, y, sample_weight, solver): """Validate `X` and `y` and binarize `y`. @@ -1087,6 +1087,9 @@ def predict(self, X): def classes_(self): return self._label_binarizer.classes_ + def _more_tags(self): + return {"multilabel": True} + class RidgeClassifier(_BaseRidgeClassifier, _BaseRidge): """Classifier using Ridge regression. diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index 668b4aee7c082..35ccb83a658ba 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -6,7 +6,6 @@ # Jiyuan Qian # License: BSD 3 clause -from tkinter.tix import Tree import numpy as np from abc import ABCMeta, abstractmethod @@ -779,7 +778,7 @@ def _partial_fit(self, X, y): return self._fit(X, y, incremental=True) -class MLPClassifier(ClassifierMixin, MultiLabelMixin, BaseMultilayerPerceptron): +class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): """Multi-layer Perceptron classifier. This model optimizes the log-loss function using LBFGS or stochastic @@ -1252,7 +1251,7 @@ def predict_proba(self, X): return y_pred def _more_tags(self): - return {"multilabel": Tree} + return {"multilabel": True} class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 6ae23a21cd813..87a9b5f815e28 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -29,7 +29,6 @@ from ..base import clone from ..base import RegressorMixin from ..base import is_classifier -from ..base import MultiLabelMixin from ..base import MultiOutputMixin from ..utils import Bunch from ..utils import check_random_state diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index a08e982d9e2fa..ea158234ea785 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -927,243 +927,6 @@ def decision_function(self, X): ) -def test_check_classifiers_multilabel_output_format(): - n_samples, test_size, n_outputs = 100, 25, 5 - _, y = make_multilabel_classification( - n_samples=n_samples, - n_features=2, - n_classes=n_outputs, - n_labels=3, - length=50, - allow_unlabeled=True, - random_state=0, - ) - y_test = y[-test_size:] - - class BaseMultiLabelClassifierMock(ClassifierMixin, BaseEstimator): - def __init__(self, response_output): - self.response_output = response_output - - def fit(self, X, y): - return self - - class MultiLabelClassifierPredict(BaseMultiLabelClassifierMock): - def predict(self, X): - return self.response_output - - class MultiLabelClassifierPredictProba(BaseMultiLabelClassifierMock): - def predict_proba(self, X): - return self.response_output - - class MultiLabelClassifierDecisionFunction(BaseMultiLabelClassifierMock): - def decision_function(self, X): - return self.response_output - 
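These mocks feed `check_classifiers_multilabel_format_output` with deliberately non-compliant outputs; the error messages asserted below describe what the check expects from a compliant multilabel classifier. Roughly, with shapes inferred from those messages (illustrative random values only):

    import numpy as np

    n_samples, n_outputs = 25, 5
    rng = np.random.RandomState(0)

    # predict: indicator matrix with the same shape and dtype as the target.
    y_pred = rng.randint(0, 2, size=(n_samples, n_outputs))

    # predict_proba, list flavour: one (n_samples, 2) float array per output,
    # each row summing to 1.
    proba_list = []
    for _ in range(n_outputs):
        p = rng.uniform(size=(n_samples, 1))        # P(positive class) per sample
        proba_list.append(np.hstack([1 - p, p]))    # columns [P(neg), P(pos)]

    # predict_proba, array flavour: (n_samples, n_outputs) floats in [0, 1],
    # the probability of the positive class for each output.
    proba_array = rng.uniform(size=(n_samples, n_outputs))

    # decision_function: (n_samples, n_outputs) floating-point scores.
    scores = rng.normal(size=(n_samples, n_outputs))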
- # 1.method predict - # 1.1 inconsistent array type - clf = MultiLabelClassifierPredict(response_output=y_test.tolist()) - err_msg = ( - r"MultiLabelClassifierPredict.predict is expected to output a " - r"NumPy array. Got instead." - ) - assert_raises_regex( - AssertionError, - err_msg, - check_classifiers_multilabel_format_output, - clf.__class__.__name__, - clf, - ) - # 1.2. inconsistent shape - clf = MultiLabelClassifierPredict(response_output=y_test[:, :-1]) - err_msg = ( - r"MultiLabelClassifierPredict.predict output a NumPy array of " - r"shape \(25, 4\) instead of \(25, 5\)." - ) - assert_raises_regex( - AssertionError, - err_msg, - check_classifiers_multilabel_format_output, - clf.__class__.__name__, - clf, - ) - # 1.3 inconsistent dtype - clf = MultiLabelClassifierPredict(response_output=y_test.astype(np.float64)) - err_msg = ( - r"MultiLabelClassifierPredict.predict does not output the same " - r"dtype than the targets." - ) - assert_raises_regex( - AssertionError, - err_msg, - check_classifiers_multilabel_format_output, - clf.__class__.__name__, - clf, - ) - - # 2. method predict_proba - # 2.1 unknown output type - clf = MultiLabelClassifierPredictProba(response_output=sp.csr_matrix(y_test)) - err_msg = ( - r"Unknown returned type by " - r"MultiLabelClassifierPredictProba.predict_proba. A list or a Numpy " - r"array are expected." - ) - assert_raises_regex( - ValueError, - err_msg, - check_classifiers_multilabel_format_output, - clf.__class__.__name__, - clf, - ) - # 2.2 for list output - # 2.2.1 iconsistent length - clf = MultiLabelClassifierPredictProba(response_output=y_test.tolist()) - err_msg = ( - r"MultiLabelClassifierPredictProba.predict_proba is expected to " - r"output a list of length n_outputs of Numpy array. Got length of 25 " - r"instead of 5." - ) - assert_raises_regex( - AssertionError, - err_msg, - check_classifiers_multilabel_format_output, - clf.__class__.__name__, - clf, - ) - # 2.2.2 array of inconsistent shape - response_output = [np.ones_like(y_test) for _ in range(n_outputs)] - clf = MultiLabelClassifierPredictProba(response_output=response_output) - err_msg = ( - r"MultiLabelClassifierPredictProba.predict_proba is expected to output" - r" a list of NumPy array of shape \(n_samples, 2\). Got \(25, 5\) " - r"instead of \(25, 2\)." - ) - assert_raises_regex( - AssertionError, - err_msg, - check_classifiers_multilabel_format_output, - clf.__class__.__name__, - clf, - ) - # 2.2.3 array of inconsistent dtype - response_output = [ - np.ones(shape=(y_test.shape[0], 2), dtype=np.int64) for _ in range(n_outputs) - ] - clf = MultiLabelClassifierPredictProba(response_output=response_output) - err_msg = ( - r"MultiLabelClassifierPredictProba.predict_proba is expected to output" - r" a list of NumPy array of floating dtype." - ) - assert_raises_regex( - AssertionError, - err_msg, - check_classifiers_multilabel_format_output, - clf.__class__.__name__, - clf, - ) - # 2.2.4 array does not contain probability (each row should sum to 1) - response_output = [ - np.ones(shape=(y_test.shape[0], 2), dtype=np.float64) for _ in range(n_outputs) - ] - clf = MultiLabelClassifierPredictProba(response_output=response_output) - err_msg = ( - r"MultiLabelClassifierPredictProba.predict_proba is expected to " - r"provide probabilities such that each array rows should sum to 1." 
- ) - assert_raises_regex( - AssertionError, - err_msg, - check_classifiers_multilabel_format_output, - clf.__class__.__name__, - clf, - ) - # 2.3 for array output - # 2.3.1 array of inconsistent shape - clf = MultiLabelClassifierPredictProba(response_output=y_test[:, :-1]) - err_msg = ( - r"MultiLabelClassifierPredictProba.predict_proba is expected to " - r"output a NumPy array of shape \(n_samples, n_outputs\). Got " - r"\(25, 4\) instead of \(25, 5\)." - ) - assert_raises_regex( - AssertionError, - err_msg, - check_classifiers_multilabel_format_output, - clf.__class__.__name__, - clf, - ) - # 2.3.2 array of inconsistent dtype - response_output = np.zeros_like(y_test, dtype=np.int64) - clf = MultiLabelClassifierPredictProba(response_output=response_output) - err_msg = ( - r"MultiLabelClassifierPredictProba.predict_proba is expected to " - r"output a NumPy array of floating dtype." - ) - assert_raises_regex( - AssertionError, - err_msg, - check_classifiers_multilabel_format_output, - clf.__class__.__name__, - clf, - ) - # 2.2.4 array does not contain probabilities - clf = MultiLabelClassifierPredictProba(response_output=y_test * 2.0) - err_msg = ( - r"MultiLabelClassifierPredictProba.predict_proba is expected to " - r"provide probabilities of the positive class and should therefore " - r"contain values below 1." - ) - assert_raises_regex( - AssertionError, - err_msg, - check_classifiers_multilabel_format_output, - clf.__class__.__name__, - clf, - ) - - # 3. decision_function - # 3.1 inconsistent array type - clf = MultiLabelClassifierDecisionFunction(response_output=y_test.tolist()) - err_msg = ( - r"MultiLabelClassifierDecisionFunction.decision_function is expected " - r"to output a NumPy array. Got instead." - ) - assert_raises_regex( - AssertionError, - err_msg, - check_classifiers_multilabel_format_output, - clf.__class__.__name__, - clf, - ) - # 3.2. inconsistent shape - clf = MultiLabelClassifierDecisionFunction(response_output=y_test[:, :-1]) - err_msg = ( - r"MultiLabelClassifierDecisionFunction.decision_function is expected " - r"to provide a NumPy array of shape \(n_samples, n_outputs\). Got " - r"\(25, 4\) instead of \(25, 5\)" - ) - assert_raises_regex( - AssertionError, - err_msg, - check_classifiers_multilabel_format_output, - clf.__class__.__name__, - clf, - ) - # 3.3 inconsistent dtype - clf = MultiLabelClassifierDecisionFunction(response_output=y_test) - err_msg = ( - r"MultiLabelClassifierDecisionFunction.decision_function is expected " - r"to output a floating dtype." - ) - assert_raises_regex( - AssertionError, - err_msg, - check_classifiers_multilabel_format_output, - clf.__class__.__name__, - clf, - ) - - def run_tests_without_pytest(): """Runs the tests in this file without using pytest.""" main_module = sys.modules["__main__"] From a33cbe0f93a2129db6c9bbc4ad9e385b58ac2cc2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 6 Aug 2021 11:56:30 +0200 Subject: [PATCH 21/25] iter --- doc/whats_new/v1.0.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 1d6032de6a7f7..caa1504aa00fc 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -583,11 +583,12 @@ Changelog .............................. 
- |Feature| added :class:`model_selection.StratifiedGroupKFold`, that combines - :class:`model_selection.StratifiedKFold` and `model_selection.GroupKFold`, - providing an ability to split data preserving the distribution of classes in - each split while keeping each group within a single split. - :pr:`18649` by `Leandro Hermida ` and - `Rodion Martynov `. + :class:`model_selection.StratifiedKFold` and + :class:`model_selection.GroupKFold`, providing an ability to split data + preserving the distribution of classes in each split while keeping each + group within a single split. + :pr:`18649` by :user:`Leandro Hermida ` and + :user:`Rodion Martynov `. :mod:`sklearn.naive_bayes` .......................... From d5d33b14129c8c4875f2d2863e37597bb0bf3dab Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 7 Aug 2021 11:29:03 +0200 Subject: [PATCH 22/25] Apply suggestions from code review Co-authored-by: Alexandre Gramfort --- sklearn/linear_model/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 697588e889b92..ca0f4b7a9d15e 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -398,7 +398,7 @@ def decision_function(self, X): Returns ------- scores : ndarray of shape (n_samples,) or (n_samples, n_classes) - Confidence scores per `(n_sample, n_classes)` combination. In the + Confidence scores per `(n_samples, n_classes)` combination. In the binary case, confidence score for `self.classes_[1]` where >0 means this class would be predicted. """ From f9b086b240c8917b2af2e5674b6988ae83185937 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 18 Oct 2021 14:00:37 +0200 Subject: [PATCH 23/25] iter --- doc/whats_new/v1.0.rst | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index ae4c93d304971..25d6e95064a09 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -603,27 +603,6 @@ Changelog a model perfectly on some datasets when `residual_threshold=0`. :pr:`19499` by :user:`Gregory Strubel `. -- |Efficiency| The implementation of `fit` for `PolynomialFeatures` transformer - is now faster. This is especially noticeable on large sparse input. - :pr:`19734` by :user:`Fred Robinson `. - -- |Enhancement| `fit` method preserves dtype for numpy.float32 in - :class:`Lars`, :class:`LassoLars`, :class:`LassoLars`, :class:`LarsCV` and - :class:`LassoLarsCV`. :pr:`20155` by :user:`Takeshi Oura `. - -- |Fix| Sample weight invariance for class:`Ridge` was fixed in :pr:`19616` by - :user:`Oliver Grisel ` and - :user:`Christian Lorentzen `. - -- |Fix| The :func:`preprocessing.StandardScaler.inverse_transform` method - now raises error when the input data is 1D. - :pr:`19752` by :user:`Zhehao Liu `. - -- |Feature| Added new solver `lbfgs` (available with `solver="lbfgs"`) - and `positive` argument to class:`linear_model.Ridge`. - When `positive` is set to True, forces the coefficients to be positive - (only supported by `lbfgs`). - :pr:`20231` by :user:`Toshihiro Nakae `. - |Fix| Sample weight invariance for :class:`linear_model.Ridge` was fixed in :pr:`19616` by :user:`Oliver Grisel ` and :user:`Christian Lorentzen `. 
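The reworded `decision_function` docstring keeps the binary-case convention that a strictly positive score predicts `self.classes_[1]`. A quick illustrative check on toy data:

    import numpy as np
    from sklearn.linear_model import RidgeClassifier

    X = np.array([[0.0], [1.0], [2.0], [3.0]])
    y = np.array([0, 0, 1, 1])
    clf = RidgeClassifier().fit(X, y)

    scores = clf.decision_function(X)   # shape (n_samples,) in the binary case
    print(np.array_equal((scores > 0).astype(int), clf.predict(X)))  # True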
From d4454f3a4983dfd7bde4380da87ecc75ce6731cb Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 27 Oct 2021 14:11:23 +0200 Subject: [PATCH 24/25] Update doc/whats_new/v1.1.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> --- doc/whats_new/v1.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 4cf10cac404dd..53e90a2d5c49d 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -76,7 +76,7 @@ Changelog :pr:`21093` by `Tom Dupre la Tour`_. - |Fix| Fix a bug in :class:`linear_model.RidgeClassifierCV` where the method - `predict` was performing an `argmax` on the scores obtain from + `predict` was performing an `argmax` on the scores obtained from `decision_function` instead of returning the multilabel indicator matrix. :pr:`19869` by :user:`Guillaume Lemaitre `. From 65fb1b02fb1eae58edbb71e6993dcea1ae1fade3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 27 Oct 2021 14:17:31 +0200 Subject: [PATCH 25/25] Apply changes from review --- sklearn/linear_model/_ridge.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 416abcaebbfd3..ab3f2aaacc23e 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -1012,7 +1012,7 @@ def fit(self, X, y, sample_weight=None): return super().fit(X, y, sample_weight=sample_weight) -class _BaseRidgeClassifier(LinearClassifierMixin): +class _RidgeClassifierMixin(LinearClassifierMixin): def _prepare_data(self, X, y, sample_weight, solver): """Validate `X` and `y` and binarize `y`. @@ -1098,7 +1098,7 @@ def _more_tags(self): return {"multilabel": True} -class RidgeClassifier(_BaseRidgeClassifier, _BaseRidge): +class RidgeClassifier(_RidgeClassifierMixin, _BaseRidge): """Classifier using Ridge regression. This classifier first converts the target values into ``{-1, 1}`` and @@ -2209,7 +2209,7 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): """ -class RidgeClassifierCV(_BaseRidgeClassifier, _BaseRidgeCV): +class RidgeClassifierCV(_RidgeClassifierMixin, _BaseRidgeCV): """Ridge classifier with built-in cross-validation. See glossary entry for :term:`cross-validation estimator`. @@ -2382,7 +2382,9 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. """ - # by using solver="eigen" we force to accept all sparse format + # `RidgeClassifier` does not accept "sag" or "saga" solver and thus support + # csr, csc, and coo sparse matrices. By using solver="eigen" we force to accept + # all sparse format. X, y, sample_weight, Y = self._prepare_data(X, y, sample_weight, solver="eigen") # If cv is None, gcv mode will be used and we used the binarized Y @@ -2391,7 +2393,7 @@ def fit(self, X, y, sample_weight=None): # estimators are used where y will be binarized. Thus, we pass y # instead of the binarized Y. target = Y if self.cv is None else y - _BaseRidgeCV.fit(self, X, target, sample_weight=sample_weight) + super().fit(X, target, sample_weight=sample_weight) return self def _more_tags(self):
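Taken together, the series lets `RidgeClassifier` and `RidgeClassifierCV` consume a multilabel indicator target directly. An end-to-end sketch of the resulting behaviour, assuming the full patch series is applied (shapes follow the new `predict` docstring):

    import numpy as np
    from sklearn.datasets import make_multilabel_classification
    from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV

    X, Y = make_multilabel_classification(n_samples=100, n_classes=5, random_state=0)

    for clf in (RidgeClassifier(), RidgeClassifierCV(cv=3)):
        clf.fit(X, Y)
        Y_pred = clf.predict(X)            # (n_samples, n_outputs) 0/1 indicator
        scores = clf.decision_function(X)  # (n_samples, n_outputs) float scores
        assert Y_pred.shape == Y.shape == scores.shape
        assert set(np.unique(Y_pred)) <= {0, 1}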