FIX/ENH CheckingClassifier support parameters and sparse matrices by glemaitre · Pull Request #17259 · scikit-learn/scikit-learn · GitHub

FIX/ENH CheckingClassifier support parameters and sparse matrices #17259

Merged · 11 commits · May 18, 2020
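In short, this PR lets `CheckingClassifier` forward extra keyword arguments to its validation callables through the new `check_X_params` / `check_y_params` parameters, and makes the mock work with sparse matrices. A minimal sketch of the resulting usage, mirroring the new tests further down (the random data and the `_check_array` helper are illustrative, not part of the diff):

import numpy as np
from scipy import sparse

from sklearn.utils import check_array
from sklearn.utils._mocking import CheckingClassifier

X = np.random.RandomState(0).rand(10, 4)
y = np.array([0, 1] * 5)

def _check_array(X, **params):
    # forward the keyword arguments received through `check_X_params`
    check_array(X, **params)
    return True

clf = CheckingClassifier(check_X=_check_array,
                         check_X_params={"accept_sparse": False})
clf.fit(X, y)                        # dense input passes the check
# clf.fit(sparse.csr_matrix(X), y)   # would raise: check_array rejects sparse here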
135 changes: 108 additions & 27 deletions sklearn/utils/_mocking.py
@@ -51,31 +51,46 @@ def __ne__(self, other):
class CheckingClassifier(ClassifierMixin, BaseEstimator):
"""Dummy classifier to test pipelining and meta-estimators.

Checks some property of X and y in fit / predict.
Checks some property of `X` and `y` in fit / predict.
This allows testing whether pipelines / cross-validation or meta-estimators
changed the input.

Parameters
----------
check_y
check_X
foo_param
expected_fit_params
check_y, check_X : callable, default=None
The callables used to validate `X` and `y`. These callables should return
a bool; returning `False` will trigger an `AssertionError`.

check_y_params, check_X_params : dict, default=None
The optional parameters to pass to `check_X` and `check_y`.

foo_param : int, default=0
A `foo` param. When `foo_param > 1`, the output of :meth:`score` will be 1,
otherwise it is 0.

expected_fit_params : list of str, default=None
A list of the expected parameters given when calling `fit`.

Attributes
----------
classes_
classes_ : ndarray of shape (n_classes,)
The classes seen during `fit`.

n_features_in_ : int
The number of features seen during `fit`.
"""
def __init__(self, check_y=None, check_X=None, foo_param=0,
def __init__(self, *, check_y=None, check_y_params=None,
check_X=None, check_X_params=None, foo_param=0,
expected_fit_params=None):
self.check_y = check_y
self.check_y_params = check_y_params
self.check_X = check_X
self.check_X_params = check_X_params
self.foo_param = foo_param
self.expected_fit_params = expected_fit_params

def fit(self, X, y, **fit_params):
"""
Fit classifier
"""Fit classifier.

Parameters
----------
@@ -89,48 +104,114 @@ def fit(self, X, y, **fit_params):

**fit_params : dict of string -> object
Parameters passed to the ``fit`` method of the estimator

Returns
-------
self
"""
assert len(X) == len(y)
assert _num_samples(X) == _num_samples(y)
if self.check_X is not None:
assert self.check_X(X)
params = {} if self.check_X_params is None else self.check_X_params
assert self.check_X(X, **params)
if self.check_y is not None:
params = {} if self.check_y_params is None else self.check_y_params
assert self.check_y(y, **params)
self.n_features_in_ = len(X)
self.classes_ = np.unique(check_array(y, ensure_2d=False,
allow_nd=True))
self.n_features_in_ = np.shape(X)[1]
self.classes_ = np.unique(
check_array(y, ensure_2d=False, allow_nd=True)
)
if self.expected_fit_params:
missing = set(self.expected_fit_params) - set(fit_params)
assert len(missing) == 0, 'Expected fit parameter(s) %s not ' \
'seen.' % list(missing)
if missing:
raise AssertionError(
f'Expected fit parameter(s) {list(missing)} not seen.'
)
for key, value in fit_params.items():
assert len(value) == len(X), (
'Fit parameter %s has length %d; '
'expected %d.'
% (key, len(value), len(X)))
if _num_samples(value) != _num_samples(X):
raise AssertionError(
f'Fit parameter {key} has length {_num_samples(value)}'
f'; expected {_num_samples(X)}.'
)

return self

def predict(self, T):
"""
def predict(self, X):
"""Predict the first class seen in `classes_`.

Parameters
----------
T : indexable, length n_samples
X : array-like of shape (n_samples, n_features)
The input data.

Returns
-------
preds : ndarray of shape (n_samples,)
Predictions of the first class seen in `classes_`.
"""
if self.check_X is not None:
assert self.check_X(T)
return self.classes_[np.zeros(_num_samples(T), dtype=np.int)]
params = {} if self.check_X_params is None else self.check_X_params
assert self.check_X(X, **params)
return self.classes_[np.zeros(_num_samples(X), dtype=np.int)]

def score(self, X=None, Y=None):
def predict_proba(self, X):
"""Predict probabilities for each class.

Here, the dummy classifier will provide a probability of 1 for the
first class of `classes_` and 0 otherwise.

Parameters
----------
X : array-like of shape (n_samples, n_features)
The input data.

Returns
-------
proba : ndarray of shape (n_samples, n_classes)
The probabilities for each sample and class.
"""
proba = np.zeros((_num_samples(X), len(self.classes_)))
proba[:, 0] = 1
return proba

def decision_function(self, X):
"""Confidence score.

Parameters
----------
X : array-like of shape (n_samples, n_features)
The input data.

Returns
-------
decision : ndarray of shape (n_samples,) if n_classes == 2\
else (n_samples, n_classes)
Confidence score.
"""
if len(self.classes_) == 2:
# for a binary classifier, the confidence score is related to
# classes_[1] and is therefore zero here.
return np.zeros(_num_samples(X))
else:
return self.predict_proba(X)

def score(self, X=None, Y=None):
"""Fake score.

Parameters
----------
X : array-like of shape (n_samples, n_features)
Input data, where n_samples is the number of samples and
n_features is the number of features.

Y : array-like of shape (n_samples, n_output) or (n_samples,), optional
Y : array-like of shape (n_samples, n_output) or (n_samples,)
Target relative to X for classification or regression;
None for unsupervised learning.

Returns
-------
score : float
Either 0 or 1 depending on `foo_param` (i.e. `foo_param > 1 =>
score=1`, otherwise `score=0`).
"""
if self.foo_param > 1:
score = 1.
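To summarize the new prediction API added above, here is a small sketch of the expected outputs (the behaviour follows the docstrings and tests in this PR; the toy data is illustrative):

import numpy as np
from sklearn.utils._mocking import CheckingClassifier

X = np.zeros((4, 2))
y = np.array([0, 1, 0, 1])
clf = CheckingClassifier().fit(X, y)

clf.predict(X)            # array([0, 0, 0, 0]): always the first class
clf.predict_proba(X)      # column 0 is 1., every other column is 0.
clf.decision_function(X)  # array of zeros in the binary case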
108 changes: 108 additions & 0 deletions sklearn/utils/tests/test_mocking.py
@@ -0,0 +1,108 @@
import numpy as np
import pytest
from scipy import sparse

from numpy.testing import assert_array_equal
from numpy.testing import assert_allclose

from sklearn.datasets import load_iris
from sklearn.utils import check_array
from sklearn.utils import _safe_indexing
from sklearn.utils._testing import _convert_container

from sklearn.utils._mocking import CheckingClassifier


@pytest.fixture
def iris():
return load_iris(return_X_y=True)


@pytest.mark.parametrize(
"input_type", ["list", "array", "sparse", "dataframe"]
)
def test_checking_classifier(iris, input_type):
# Check that the CheckingClassifier outputs what we expect
X, y = iris
X = _convert_container(X, input_type)
clf = CheckingClassifier()
clf.fit(X, y)

assert_array_equal(clf.classes_, np.unique(y))
assert len(clf.classes_) == 3
assert clf.n_features_in_ == 4

y_pred = clf.predict(X)
assert_array_equal(y_pred, np.zeros(y_pred.size, dtype=np.int))

assert clf.score(X) == pytest.approx(0)
Member: we should be able to check for strict equality here

Member (Author): Since they are floating-point, why would you make strict equality?

Member: because they're hard-coded ones and zeros. There's not going to be any floating issue there

Member (Author): I still consider this a bad pattern to have in the code.

clf.set_params(foo_param=10)
assert clf.fit(X, y).score(X) == pytest.approx(1)
y_proba = clf.predict_proba(X)
assert y_proba.shape == (150, 3)
assert_allclose(y_proba[:, 0], 1)
Member: same here and for most other checks

Member (Author): proba are also floating point

assert_allclose(y_proba[:, 1:], 0)

y_decision = clf.decision_function(X)
assert y_decision.shape == (150, 3)
assert_allclose(y_decision[:, 0], 1)
assert_allclose(y_decision[:, 1:], 0)

# check the shape in case of binary classification
first_2_classes = np.logical_or(y == 0, y == 1)
X = _safe_indexing(X, first_2_classes)
y = _safe_indexing(y, first_2_classes)
clf.fit(X, y)

y_proba = clf.predict_proba(X)
assert y_proba.shape == (100, 2)
assert_allclose(y_proba[:, 0], 1)
assert_allclose(y_proba[:, 1], 0)

y_decision = clf.decision_function(X)
assert y_decision.shape == (100,)
assert_allclose(y_decision, 0)


def test_checking_classifier_with_params(iris):
X, y = iris
X_sparse = sparse.csr_matrix(X)

def check_X_is_sparse(X):
if not sparse.issparse(X):
raise ValueError("X is not sparse")
return True

clf = CheckingClassifier(check_X=check_X_is_sparse)
with pytest.raises(ValueError, match="X is not sparse"):
clf.fit(X, y)
clf.fit(X_sparse, y)

def _check_array(X, **params):
check_array(X, **params)
return True

clf = CheckingClassifier(
check_X=_check_array, check_X_params={"accept_sparse": False}
)
clf.fit(X, y)
with pytest.raises(TypeError, match="A sparse matrix was passed"):
clf.fit(X_sparse, y)


def test_checking_classifier_fit_params(iris):
# check the error raised when the number of samples is not the one expected
X, y = iris
clf = CheckingClassifier(expected_fit_params=["sample_weight"])
sample_weight = np.ones(len(X) // 2)

with pytest.raises(AssertionError, match="Fit parameter sample_weight"):
clf.fit(X, y, sample_weight=sample_weight)


def test_checking_classifier_missing_fit_params(iris):
X, y = iris
clf = CheckingClassifier(expected_fit_params=["sample_weight"])
with pytest.raises(AssertionError, match="Expected fit parameter"):
clf.fit(X, y)
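Beyond these unit tests, the intended use of `CheckingClassifier` is inside pipeline and meta-estimator tests. A hypothetical sketch (not part of this PR; `CountVectorizer` and the toy corpus are assumptions) of how the sparse support can be exercised:

from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.utils._mocking import CheckingClassifier

docs = ["spam spam eggs", "ham eggs", "spam ham", "eggs eggs"]
y = [1, 0, 1, 0]

clf = CheckingClassifier(check_X=sparse.issparse)
pipe = make_pipeline(CountVectorizer(), clf)
# fitting succeeds only because CountVectorizer hands a sparse matrix to clf
pipe.fit(docs, y)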