FIX Fix handling of binary_only tag in check_estimator by brcharron · Pull Request #17812 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

FIX Fix handling of binary_only tag in check_estimator #17812

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 6, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions doc/whats_new/v0.24.rst
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,13 @@ Changelog
:meth:`tree.DecisionTreeRegressor.fit`, and has no effect.
:pr:`17614` by :user:`Juan Carlos Alfaro Jiménez <alfaro96>`.

:mod:`sklearn.utils`
.........................

- |Fix| Fix :func:`utils.estimator_checks.check_estimator` so that all test
cases support the `binary_only` estimator tag.
:pr:`17812` by :user:`Bruno Charron <brcharron>`.

Code and Documentation Contributors
-----------------------------------

Expand Down
56 changes: 15 additions & 41 deletions sklearn/utils/estimator_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -701,15 +701,12 @@ def check_estimator_sparse_data(name, estimator_orig):
X[X < .8] = 0
X = _pairwise_estimator_convert_X(X, estimator_orig)
X_csr = sparse.csr_matrix(X)
tags = estimator_orig._get_tags()
if tags['binary_only']:
y = (2 * rng.rand(40)).astype(int)
else:
y = (4 * rng.rand(40)).astype(int)
y = (4 * rng.rand(40)).astype(int)
# catch deprecation warnings
with ignore_warnings(category=FutureWarning):
estimator = clone(estimator_orig)
y = _enforce_estimator_tags_y(estimator, y)
tags = estimator_orig._get_tags()
for matrix_format, X in _generate_sparse_matrix(X_csr):
# catch deprecation warnings
with ignore_warnings(category=FutureWarning):
Expand Down Expand Up @@ -807,10 +804,7 @@ def check_sample_weights_list(name, estimator_orig):
n_samples = 30
X = _pairwise_estimator_convert_X(rnd.uniform(size=(n_samples, 3)),
estimator_orig)
if estimator._get_tags()['binary_only']:
y = np.arange(n_samples) % 2
else:
y = np.arange(n_samples) % 3
y = np.arange(n_samples) % 3
y = _enforce_estimator_tags_y(estimator, y)
sample_weight = [3] * n_samples
# Test that estimators don't raise any exception
Expand Down Expand Up @@ -901,10 +895,7 @@ def check_dtype_object(name, estimator_orig):
X = _pairwise_estimator_convert_X(rng.rand(40, 10), estimator_orig)
X = X.astype(object)
tags = estimator_orig._get_tags()
if tags['binary_only']:
y = (X[:, 0] * 2).astype(int)
else:
y = (X[:, 0] * 4).astype(int)
y = (X[:, 0] * 4).astype(int)
estimator = clone(estimator_orig)
y = _enforce_estimator_tags_y(estimator, y)

Expand Down Expand Up @@ -998,8 +989,6 @@ def check_dont_overwrite_parameters(name, estimator_orig):
X = 3 * rnd.uniform(size=(20, 3))
X = _pairwise_estimator_convert_X(X, estimator_orig)
y = X[:, 0].astype(int)
if estimator._get_tags()['binary_only']:
y[y == 2] = 1
y = _enforce_estimator_tags_y(estimator, y)

if hasattr(estimator, "n_components"):
Expand Down Expand Up @@ -1050,8 +1039,6 @@ def check_fit2d_predict1d(name, estimator_orig):
X = _pairwise_estimator_convert_X(X, estimator_orig)
y = X[:, 0].astype(int)
tags = estimator_orig._get_tags()
if tags['binary_only']:
y[y == 2] = 1
estimator = clone(estimator_orig)
y = _enforce_estimator_tags_y(estimator, y)

Expand Down Expand Up @@ -1100,8 +1087,6 @@ def check_methods_subset_invariance(name, estimator_orig):
X = 3 * rnd.uniform(size=(20, 3))
X = _pairwise_estimator_convert_X(X, estimator_orig)
y = X[:, 0].astype(int)
if estimator_orig._get_tags()['binary_only']:
y[y == 2] = 1
estimator = clone(estimator_orig)
y = _enforce_estimator_tags_y(estimator, y)

Expand Down Expand Up @@ -1373,10 +1358,7 @@ def check_fit_score_takes_y(name, estimator_orig):
n_samples = 30
X = rnd.uniform(size=(n_samples, 3))
X = _pairwise_estimator_convert_X(X, estimator_orig)
if estimator_orig._get_tags()['binary_only']:
y = np.arange(n_samples) % 2
else:
y = np.arange(n_samples) % 3
y = np.arange(n_samples) % 3
estimator = clone(estimator_orig)
y = _enforce_estimator_tags_y(estimator, y)
set_random_state(estimator)
Expand Down Expand Up @@ -1406,8 +1388,6 @@ def check_estimators_dtypes(name, estimator_orig):
X_train_int_64 = X_train_32.astype(np.int64)
X_train_int_32 = X_train_32.astype(np.int32)
y = X_train_int_64[:, 0]
if estimator_orig._get_tags()['binary_only']:
y[y == 2] = 1
y = _enforce_estimator_tags_y(estimator_orig, y)

methods = ["predict", "transform", "decision_function", "predict_proba"]
Expand Down Expand Up @@ -1581,6 +1561,7 @@ def check_estimators_partial_fit_n_features(name, estimator_orig):
estimator = clone(estimator_orig)
X, y = make_blobs(n_samples=50, random_state=1)
X -= X.min()
y = _enforce_estimator_tags_y(estimator_orig, y)

try:
if is_classifier(estimator):
Expand Down Expand Up @@ -2047,11 +2028,7 @@ def check_classifiers_multilabel_representation_invariance(name,
def check_estimators_fit_returns_self(name, estimator_orig,
readonly_memmap=False):
"""Check if self is returned when calling fit"""
if estimator_orig._get_tags()['binary_only']:
n_centers = 2
else:
n_centers = 3
X, y = make_blobs(random_state=0, n_samples=21, centers=n_centers)
X, y = make_blobs(random_state=0, n_samples=21)
# some want non-negative input
X -= X.min()
X = _pairwise_estimator_convert_X(X, estimator_orig)
Expand Down Expand Up @@ -2093,10 +2070,7 @@ def check_supervised_y_2d(name, estimator_orig):
X = _pairwise_estimator_convert_X(
rnd.uniform(size=(n_samples, 3)), estimator_orig
)
if tags['binary_only']:
y = np.arange(n_samples) % 2
else:
y = np.arange(n_samples) % 3
y = np.arange(n_samples) % 3
y = _enforce_estimator_tags_y(estimator_orig, y)
estimator = clone(estimator_orig)
set_random_state(estimator)
Expand Down Expand Up @@ -2414,11 +2388,7 @@ def check_class_weight_balanced_linear_classifier(name, Classifier):

@ignore_warnings(category=FutureWarning)
def check_estimators_overwrite_params(name, estimator_orig):
if estimator_orig._get_tags()['binary_only']:
n_centers = 2
else:
n_centers = 3
X, y = make_blobs(random_state=0, n_samples=21, centers=n_centers)
X, y = make_blobs(random_state=0, n_samples=21)
# some want non-negative input
X -= X.min()
X = _pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel)
Expand Down Expand Up @@ -2489,7 +2459,8 @@ def check_no_attributes_set_in_init(name, estimator_orig):
def check_sparsify_coefficients(name, estimator_orig):
X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1],
[-1, -2], [2, 2], [-2, -2]])
y = [1, 1, 1, 2, 2, 2, 3, 3, 3]
y = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3])
y = _enforce_estimator_tags_y(estimator_orig, y)
est = clone(estimator_orig)

est.fit(X, y)
Expand All @@ -2513,7 +2484,7 @@ def check_classifier_data_not_an_array(name, estimator_orig):
X = np.array([[3, 0], [0, 1], [0, 2], [1, 1], [1, 2], [2, 1],
[0, 3], [1, 0], [2, 0], [4, 4], [2, 3], [3, 2]])
X = _pairwise_estimator_convert_X(X, estimator_orig)
y = [1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2]
y = np.array([1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2])
y = _enforce_estimator_tags_y(estimator_orig, y)
for obj_type in ["NotAnArray", "PandasDataframe"]:
check_estimators_data_not_an_array(name, estimator_orig, X, y,
Expand Down Expand Up @@ -2649,6 +2620,9 @@ def _enforce_estimator_tags_y(estimator, y):
# Create strictly positive y. The minimal increment above 0 is 1, as
# y could be of integer dtype.
y += 1 + abs(y.min())
# Estimators with a `binary_only` tag only accept up to two unique y values
if estimator._get_tags()["binary_only"] and y.size > 0:
y = np.where(y == y.flat[0], y, y.flat[0] + 1)
# Estimators in mono_output_task_error raise ValueError if y is of 1-D
# Convert into a 2-D y for those estimators.
if estimator._get_tags()["multioutput_only"]:
Expand Down
17 changes: 12 additions & 5 deletions sklearn/utils/tests/test_estimator_checks.py
return np.array([self.value_] * X.shape[0])
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@
from sklearn.linear_model import MultiTaskElasticNet, LogisticRegression
from sklearn.svm import SVC, NuSVC
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.validation import check_array
from sklearn.utils import all_estimators
from sklearn.exceptions import SkipTestWarning
Expand Down Expand Up @@ -307,11 +306,19 @@ def predict(self, X):


class UntaggedBinaryClassifier(DecisionTreeClassifier):
class UntaggedBinaryClassifier(SGDClassifier):
# Toy classifier that only supports binary classification, will fail tests.
def fit(self, X, y, sample_weight=None):
super().fit(X, y, sample_weight)
if np.all(self.n_classes_ > 2):
def fit(self, X, y, coef_init=None, intercept_init=None,
sample_weight=None):
super().fit(X, y, coef_init, intercept_init, sample_weight)
if len(self.classes_) > 2:
raise ValueError('Only 2 classes are supported')
return self

def partial_fit(self, X, y, classes=None, sample_weight=None):
super().partial_fit(X=X, y=y, classes=classes,
sample_weight=sample_weight)
if len(self.classes_) > 2:
raise ValueError('Only 2 classes are supported')
return self

Expand Down
0