glemaitre
diff --git a/‎doc/whats_new/v0.23.rst
Lines changed: 8 additions & 1 deletion b/‎doc/whats_new/v0.23.rst
Lines changed: 8 additions & 1 deletion
diff --git a/‎sklearn/utils/estimator_checks.py
Lines changed: 17 additions & 43 deletions b/‎sklearn/utils/estimator_checks.py
Lines changed: 17 additions & 43 deletions
diff --git a/‎sklearn/utils/tests/test_estimator_checks.py
Lines changed: 12 additions & 5 deletions b/‎sklearn/utils/tests/test_estimator_checks.py
Lines changed: 12 additions & 5 deletions
@@ -90,7 +90,14 @@ Changelog
 - |Fix| Fixed a bug in :func:`metrics.mean_squared_error` where the
   average of multiple RMSE values was incorrectly calculated as the root of the
   average of multiple MSE values.
-  :pr:`17309` by :user:`Swier Heeres <swierh>`
+  :pr:`17309` by :user:`Swier Heeres <swierh>`.
+
+:mod:`sklearn.utils`
+....................
+
+- |Fix| Fix :func:`utils.estimator_checks.check_estimator` so that all test
+  cases support the `binary_only` estimator tag.
+  :pr:`17812` by :user:`Bruno Charron <brcharron>`.
 
 .. _changes_0_23_1:
 
 
@@ -719,15 +719,12 @@ def check_estimator_sparse_data(name, estimator_orig):
     X[X < .8] = 0
     X = _pairwise_estimator_convert_X(X, estimator_orig)
     X_csr = sparse.csr_matrix(X)
-    tags = estimator_orig._get_tags()
-    if tags['binary_only']:
-        y = (2 * rng.rand(40)).astype(np.int)
-    else:
-        y = (4 * rng.rand(40)).astype(np.int)
+    y = (4 * rng.rand(40)).astype(int)
     # catch deprecation warnings
     with ignore_warnings(category=FutureWarning):
         estimator = clone(estimator_orig)
     y = _enforce_estimator_tags_y(estimator, y)
+    tags = estimator_orig._get_tags()
     for matrix_format, X in _generate_sparse_matrix(X_csr):
         # catch deprecation warnings
         with ignore_warnings(category=FutureWarning):
@@ -825,10 +822,7 @@ def check_sample_weights_list(name, estimator_orig):
         n_samples = 30
         X = _pairwise_estimator_convert_X(rnd.uniform(size=(n_samples, 3)),
                                           estimator_orig)
-        if estimator._get_tags()['binary_only']:
-            y = np.arange(n_samples) % 2
-        else:
-            y = np.arange(n_samples) % 3
+        y = np.arange(n_samples) % 3
         y = _enforce_estimator_tags_y(estimator, y)
         sample_weight = [3] * n_samples
         # Test that estimators don't raise any exception
@@ -905,10 +899,7 @@ def check_dtype_object(name, estimator_orig):
     X = _pairwise_estimator_convert_X(rng.rand(40, 10), estimator_orig)
     X = X.astype(object)
     tags = estimator_orig._get_tags()
-    if tags['binary_only']:
-        y = (X[:, 0] * 2).astype(np.int)
-    else:
-        y = (X[:, 0] * 4).astype(np.int)
+    y = (X[:, 0] * 4).astype(int)
     estimator = clone(estimator_orig)
     y = _enforce_estimator_tags_y(estimator, y)
 
@@ -1007,9 +998,7 @@ def check_dont_overwrite_parameters(name, estimator_orig):
     rnd = np.random.RandomState(0)
     X = _pairwise_estimator_convert_X(X, estimator_orig)
-    y = X[:, 0].astype(np.int)
-    if estimator._get_tags()['binary_only']:
-        y[y == 2] = 1
+    y = X[:, 0].astype(int)
     y = _enforce_estimator_tags_y(estimator, y)
 
     if hasattr(estimator, "n_components"):
@@ -1060,8 +1049,6 @@ def check_fit2d_predict1d(name, estimator_orig):
     X = _pairwise_estimator_convert_X(X, estimator_orig)
     y = X[:, 0].astype(np.int)
     tags = estimator_orig._get_tags()
-    if tags['binary_only']:
-        y[y == 2] = 1
     estimator = clone(estimator_orig)
     y = _enforce_estimator_tags_y(estimator, y)
 
@@ -1109,9 +1096,7 @@ def check_methods_subset_invariance(name, estimator_orig):
     rnd = np.random.RandomState(0)
     X = 3 * rnd.uniform(size=(20, 3))
     X = _pairwise_estimator_convert_X(X, estimator_orig)
-    y = X[:, 0].astype(np.int)
-    if estimator_orig._get_tags()['binary_only']:
-        y[y == 2] = 1
+    y = X[:, 0].astype(int)
     estimator = clone(estimator_orig)
     y = _enforce_estimator_tags_y(estimator, y)
 
@@ -1383,10 +1368,7 @@ def check_fit_score_takes_y(name, estimator_orig):
     n_samples = 30
     X = rnd.uniform(size=(n_samples, 3))
     X = _pairwise_estimator_convert_X(X, estimator_orig)
-    if estimator_orig._get_tags()['binary_only']:
-        y = np.arange(n_samples) % 2
-    else:
-        y = np.arange(n_samples) % 3
+    y = np.arange(n_samples) % 3
     estimator = clone(estimator_orig)
     y = _enforce_estimator_tags_y(estimator, y)
     set_random_state(estimator)
@@ -1416,8 +1398,6 @@ def check_estimators_dtypes(name, estimator_orig):
     X_train_int_64 = X_train_32.astype(np.int64)
     X_train_int_32 = X_train_32.astype(np.int32)
     y = X_train_int_64[:, 0]
-    if estimator_orig._get_tags()['binary_only']:
-        y[y == 2] = 1
     y = _enforce_estimator_tags_y(estimator_orig, y)
 
     methods = ["predict", "transform", "decision_function", "predict_proba"]
@@ -1596,6 +1576,7 @@ def check_estimators_partial_fit_n_features(name, estimator_orig):
     estimator = clone(estimator_orig)
     X, y = make_blobs(n_samples=50, random_state=1)
     X -= X.min()
+    y = _enforce_estimator_tags_y(estimator_orig, y)
 
     try:
         if is_classifier(estimator):
@@ -2062,11 +2043,7 @@ def check_classifiers_multilabel_representation_invariance(name,
 def check_estimators_fit_returns_self(name, estimator_orig,
                                       readonly_memmap=False):
     """Check if self is returned when calling fit"""
-    if estimator_orig._get_tags()['binary_only']:
-        n_centers = 2
-    else:
-        n_centers = 3
-    X, y = make_blobs(random_state=0, n_samples=21, centers=n_centers)
+    X, y = make_blobs(random_state=0, n_samples=21)
     # some want non-negative input
     X -= X.min()
     X = _pairwise_estimator_convert_X(X, estimator_orig)
@@ -2108,10 +2085,7 @@ def check_supervised_y_2d(name, estimator_orig):
     X = _pairwise_estimator_convert_X(
         rnd.uniform(size=(n_samples, 3)), estimator_orig
     )
-    if tags['binary_only']:
-        y = np.arange(n_samples) % 2
-    else:
-        y = np.arange(n_samples) % 3
+    y = np.arange(n_samples) % 3
     y = _enforce_estimator_tags_y(estimator_orig, y)
     estimator = clone(estimator_orig)
     set_random_state(estimator)
@@ -2436,11 +2410,7 @@ def check_class_weight_balanced_linear_classifier(name, Classifier):
 
 @ignore_warnings(category=FutureWarning)
 def check_estimators_overwrite_params(name, estimator_orig):
-    if estimator_orig._get_tags()['binary_only']:
-        n_centers = 2
-    else:
-        n_centers = 3
-    X, y = make_blobs(random_state=0, n_samples=21, centers=n_centers)
+    X, y = make_blobs(random_state=0, n_samples=21)
     # some want non-negative input
     X -= X.min()
     X = _pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel)
@@ -2511,7 +2481,8 @@ def check_no_attributes_set_in_init(name, estimator_orig):
 def check_sparsify_coefficients(name, estimator_orig):
     X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1],
                   [-1, -2], [2, 2], [-2, -2]])
-    y = [1, 1, 1, 2, 2, 2, 3, 3, 3]
+    y = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3])
+    y = _enforce_estimator_tags_y(estimator_orig, y)
     est = clone(estimator_orig)
 
     est.fit(X, y)
@@ -2535,7 +2506,7 @@ def check_classifier_data_not_an_array(name, estimator_orig):
     X = np.array([[3, 0], [0, 1], [0, 2], [1, 1], [1, 2], [2, 1],
                   [0, 3], [1, 0], [2, 0], [4, 4], [2, 3], [3, 2]])
     X = _pairwise_estimator_convert_X(X, estimator_orig)
-    y = [1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2]
+    y = np.array([1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2])
     y = _enforce_estimator_tags_y(estimator_orig, y)
     for obj_type in ["NotAnArray", "PandasDataframe"]:
         check_estimators_data_not_an_array(name, estimator_orig, X, y,
@@ -2682,6 +2653,9 @@ def _enforce_estimator_tags_y(estimator, y):
         # Create strictly positive y. The minimal increment above 0 is 1, as
         # y could be of integer dtype.
         y += 1 + abs(y.min())
+    # Estimators with a `binary_only` tag only accept up to two unique y values
+    if estimator._get_tags()["binary_only"] and y.size > 0:
+        y = np.where(y == y.flat[0], y, y.flat[0] + 1)
     # Estimators in mono_output_task_error raise ValueError if y is of 1-D
     # Convert into a 2-D y for those estimators.
     if estimator._get_tags()["multioutput_only"]:
 
@@ -34,7 +34,6 @@
 from sklearn.linear_model import MultiTaskElasticNet, LogisticRegression
 from sklearn.svm import SVC
 from sklearn.neighbors import KNeighborsRegressor
-from sklearn.tree import DecisionTreeClassifier
 from sklearn.utils.validation import check_array
 from sklearn.utils import all_estimators
 
@@ -306,11 +305,19 @@ def predict(self, X):
         return np.array([self.value_] * X.shape[0])
 
 
-class UntaggedBinaryClassifier(DecisionTreeClassifier):
+class UntaggedBinaryClassifier(SGDClassifier):
     # Toy classifier that only supports binary classification, will fail tests.
-    def fit(self, X, y, sample_weight=None):
-        super().fit(X, y, sample_weight)
-        if np.all(self.n_classes_ > 2):
+    def fit(self, X, y, coef_init=None, intercept_init=None,
+            sample_weight=None):
+        super().fit(X, y, coef_init, intercept_init, sample_weight)
+        if len(self.classes_) > 2:
+            raise ValueError('Only 2 classes are supported')
+        return self
+
+    def partial_fit(self, X, y, classes=None, sample_weight=None):
+        super().partial_fit(X=X, y=y, classes=classes,
+                            sample_weight=sample_weight)
+        if len(self.classes_) > 2:
             raise ValueError('Only 2 classes are supported')
         return self