diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index 1fc651e303e56..89eaa5e24598e 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -339,6 +339,10 @@ Changelog in favor of `y_proba`. `y_prob` will be removed in version 1.7. :pr:`28092` by :user:`Adam Li `. +- |API| For classifiers and classification metrics, labels encoded as bytes + are deprecated and will raise an error in v1.7. + :pr:`18555` by :user:`Kaushik Amar Das `. + :mod:`sklearn.mixture` ...................... @@ -418,6 +422,11 @@ Changelog - |API| :func:`utils.tosequence` is deprecated and will be removed in version 1.7. :pr:`28763` by :user:`Jérémie du Boisberranger `. +- |API| :func:`~utils.multiclass.type_of_target` now raises an informative warning + when labels are represented as bytes. For classifiers and classification metrics, + labels encoded as bytes are deprecated and will raise an error in v1.7. + :pr:`18555` by :user:`Kaushik Amar Das `. + - |Fix| :func:`~utils._safe_indexing` now works correctly for polars DataFrame when `axis=0` and supports indexing polars Series. :pr:`28521` by :user:`Yao Xiao `. 
diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 7b3a71978907a..4beeea23446c8 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -29,10 +29,12 @@ from sklearn.preprocessing import label_binarize from sklearn.random_projection import _sparse_random_matrix from sklearn.utils._testing import ( + _convert_container, assert_allclose, assert_almost_equal, assert_array_almost_equal, assert_array_equal, + ignore_warnings, ) from sklearn.utils.extmath import softmax from sklearn.utils.fixes import CSR_CONTAINERS @@ -864,17 +866,6 @@ def test_binary_clf_curve_implicit_pos_label(curve_func): with pytest.raises(ValueError, match=msg): curve_func(np.array(["a", "b"], dtype=object), [0.0, 1.0]) - # The error message is slightly different for bytes-encoded - # class labels, but otherwise the behavior is the same: - msg = ( - "y_true takes value in {b'a', b'b'} and pos_label is " - "not specified: either make y_true take " - "value in {0, 1} or {-1, 1} or pass pos_label " - "explicitly." 
- ) - with pytest.raises(ValueError, match=msg): - curve_func(np.array([b"a", b"b"], dtype=" 2 or (y.ndim == 2 and len(first_row) > 1): + if issparse(first_row_or_val): + first_row_or_val = first_row_or_val.data + if xp.unique_values(y).shape[0] > 2 or (y.ndim == 2 and len(first_row_or_val) > 1): # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]] return "multiclass" + suffix else: diff --git a/sklearn/utils/tests/test_multiclass.py b/sklearn/utils/tests/test_multiclass.py index 6603aca206e66..40a13526ab009 100644 --- a/sklearn/utils/tests/test_multiclass.py +++ b/sklearn/utils/tests/test_multiclass.py @@ -10,6 +10,7 @@ from sklearn.utils._array_api import yield_namespace_device_dtype_combinations from sklearn.utils._testing import ( _array_api_for_tests, + _convert_container, assert_allclose, assert_array_almost_equal, assert_array_equal, @@ -595,3 +596,18 @@ def test_ovr_decision_function(): ] assert_allclose(dec_values, dec_values_one, atol=1e-6) + + +# TODO(1.7): Change to ValueError when support for bytes labels is removed. +@pytest.mark.parametrize("input_type", ["list", "array"]) +def test_labels_in_bytes_format(input_type): + # check that we raise a FutureWarning for bytes encoded labels + # non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/16980 + target = _convert_container([b"a", b"b"], input_type) + err_msg = ( + "Support for labels represented as bytes is deprecated in v1.5 and will" + " error in v1.7. Convert the labels to a string or integer format." + ) + with pytest.warns(FutureWarning, match=err_msg): + type_of_target(target)