From 451dbea8c3586ca702b4090a29bdc87c27ffc5bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Sok=C3=B3=C5=82?= Date: Wed, 23 Aug 2023 11:20:08 +0200 Subject: [PATCH 1/3] MAINT: Remove np.in1d and np.trapz usages --- .../plot_label_propagation_digits_active_learning.py | 2 +- sklearn/datasets/_twenty_newsgroups.py | 2 +- sklearn/feature_extraction/image.py | 4 +++- sklearn/metrics/_plot/tests/test_precision_recall_display.py | 5 +++-- sklearn/metrics/_plot/tests/test_roc_curve_display.py | 5 +++-- sklearn/metrics/_ranking.py | 5 +++-- sklearn/model_selection/_split.py | 4 ++-- sklearn/model_selection/tests/test_search.py | 2 +- sklearn/model_selection/tests/test_split.py | 4 ++-- sklearn/naive_bayes.py | 2 +- sklearn/preprocessing/_label.py | 2 +- sklearn/tests/test_isotonic.py | 2 +- sklearn/utils/_encode.py | 2 +- sklearn/utils/class_weight.py | 4 ++-- 14 files changed, 25 insertions(+), 20 deletions(-) diff --git a/examples/semi_supervised/plot_label_propagation_digits_active_learning.py b/examples/semi_supervised/plot_label_propagation_digits_active_learning.py index 215655a287c2d..efd953faa88d6 100644 --- a/examples/semi_supervised/plot_label_propagation_digits_active_learning.py +++ b/examples/semi_supervised/plot_label_propagation_digits_active_learning.py @@ -79,7 +79,7 @@ # select up to 5 digit examples that the classifier is most uncertain about uncertainty_index = np.argsort(pred_entropies)[::-1] uncertainty_index = uncertainty_index[ - np.in1d(uncertainty_index, unlabeled_indices) + np.isin(uncertainty_index, unlabeled_indices).ravel() ][:5] # keep track of indices that we get labels for diff --git a/sklearn/datasets/_twenty_newsgroups.py b/sklearn/datasets/_twenty_newsgroups.py index 95a7274c20f75..637cf8e4fc8d4 100644 --- a/sklearn/datasets/_twenty_newsgroups.py +++ b/sklearn/datasets/_twenty_newsgroups.py @@ -319,7 +319,7 @@ def fetch_20newsgroups( # Sort the categories to have the ordering of the labels labels.sort() labels, categories = zip(*labels) - mask = np.in1d(data.target, labels) + mask = np.isin(data.target, labels) data.filenames = data.filenames[mask] data.target = data.target[mask] # searchsorted to have continuous labels diff --git a/sklearn/feature_extraction/image.py b/sklearn/feature_extraction/image.py index 1ef3895fe2818..13d97f5a80e4e 100644 --- a/sklearn/feature_extraction/image.py +++ b/sklearn/feature_extraction/image.py @@ -76,7 +76,9 @@ def _mask_edges_weights(mask, edges, weights=None): """Apply a mask to edges (weighted or not)""" inds = np.arange(mask.size) inds = inds[mask.ravel()] - ind_mask = np.logical_and(np.in1d(edges[0], inds), np.in1d(edges[1], inds)) + ind_mask = np.logical_and( + np.isin(edges[0], inds).ravel(), np.isin(edges[1], inds).ravel() + ) edges = edges[:, ind_mask] if weights is not None: weights = weights[ind_mask] diff --git a/sklearn/metrics/_plot/tests/test_precision_recall_display.py b/sklearn/metrics/_plot/tests/test_precision_recall_display.py index 3ca94bd96dbe3..772f4eac10ea1 100644 --- a/sklearn/metrics/_plot/tests/test_precision_recall_display.py +++ b/sklearn/metrics/_plot/tests/test_precision_recall_display.py @@ -2,6 +2,7 @@ import numpy as np import pytest +from scipy.integrate import trapz as trapezoid from sklearn.compose import make_column_transformer from sklearn.datasets import load_breast_cancer, make_classification @@ -286,7 +287,7 @@ def test_plot_precision_recall_pos_label(pyplot, constructor_name, response_meth # we should obtain the statistics of the "cancer" class avg_prec_limit = 0.65 assert display.average_precision < avg_prec_limit - assert -np.trapz(display.precision, display.recall) < avg_prec_limit + assert -trapezoid(display.precision, display.recall) < avg_prec_limit # otherwise we should obtain the statistics of the "not cancer" class if constructor_name == "from_estimator": @@ -305,7 +306,7 @@ def test_plot_precision_recall_pos_label(pyplot, constructor_name, response_meth ) avg_prec_limit = 0.95 assert display.average_precision > avg_prec_limit - assert -np.trapz(display.precision, display.recall) > avg_prec_limit + assert -trapezoid(display.precision, display.recall) > avg_prec_limit @pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) diff --git a/sklearn/metrics/_plot/tests/test_roc_curve_display.py b/sklearn/metrics/_plot/tests/test_roc_curve_display.py index b87005e877b77..1cc6bcc7b28b3 100644 --- a/sklearn/metrics/_plot/tests/test_roc_curve_display.py +++ b/sklearn/metrics/_plot/tests/test_roc_curve_display.py @@ -1,6 +1,7 @@ import numpy as np import pytest from numpy.testing import assert_allclose +from scipy.integrate import trapz as trapezoid from sklearn.compose import make_column_transformer from sklearn.datasets import load_breast_cancer, load_iris @@ -290,7 +291,7 @@ def test_plot_roc_curve_pos_label(pyplot, response_method, constructor_name): roc_auc_limit = 0.95679 assert display.roc_auc == pytest.approx(roc_auc_limit) - assert np.trapz(display.tpr, display.fpr) == pytest.approx(roc_auc_limit) + assert trapezoid(display.tpr, display.fpr) == pytest.approx(roc_auc_limit) if constructor_name == "from_estimator": display = RocCurveDisplay.from_estimator( @@ -308,4 +309,4 @@ def test_plot_roc_curve_pos_label(pyplot, response_method, constructor_name): ) assert display.roc_auc == pytest.approx(roc_auc_limit) - assert np.trapz(display.tpr, display.fpr) == pytest.approx(roc_auc_limit) + assert trapezoid(display.tpr, display.fpr) == pytest.approx(roc_auc_limit) diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index 166c2ce20eb87..3835c547d7717 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -24,6 +24,7 @@ from numbers import Integral, Real import numpy as np +from scipy.integrate import trapz as trapezoid from scipy.sparse import csr_matrix, issparse from scipy.stats import rankdata @@ -104,9 +105,9 @@ def auc(x, y): else: raise ValueError("x is neither increasing nor decreasing : {}.".format(x)) - area = direction * np.trapz(y, x) + area = direction * trapezoid(y, x) if isinstance(area, np.memmap): - # Reductions such as .sum used internally in np.trapz do not return a + # Reductions such as .sum used internally in trapezoid do not return a # scalar by default for numpy.memmap instances contrary to # regular numpy.ndarray instances. area = area.dtype.type(area) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 2dceff9b22126..4d30538023abd 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1973,8 +1973,8 @@ def _iter_indices(self, X, y, groups): # these are the indices of classes in the partition # invert them into data indices - train = np.flatnonzero(np.in1d(group_indices, group_train)) - test = np.flatnonzero(np.in1d(group_indices, group_test)) + train = np.flatnonzero(np.isin(group_indices, group_train)) + test = np.flatnonzero(np.isin(group_indices, group_test)) yield train, test diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 04c3f1f156fab..50b519118a2b3 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1418,7 +1418,7 @@ def test_grid_search_correct_score_results(): expected_keys = ("mean_test_score", "rank_test_score") + tuple( "split%d_test_score" % cv_i for cv_i in range(n_splits) ) - assert all(np.in1d(expected_keys, result_keys)) + assert all(np.isin(expected_keys, result_keys)) cv = StratifiedKFold(n_splits=n_splits) n_splits = grid_search.n_splits_ diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 151498205dd39..648f11041cfbf 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -987,8 +987,8 @@ def test_group_shuffle_split(): # First test: no train group is in the test set and vice versa l_train_unique = np.unique(l[train]) l_test_unique = np.unique(l[test]) - assert not np.any(np.in1d(l[train], l_test_unique)) - assert not np.any(np.in1d(l[test], l_train_unique)) + assert not np.any(np.isin(l[train], l_test_unique)) + assert not np.any(np.isin(l[test], l_train_unique)) # Second test: train and test add up to all the data assert l[train].size + l[test].size == l.size diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 22e65f5062586..9ee664bf8b3a4 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -467,7 +467,7 @@ def _partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None): classes = self.classes_ unique_y = np.unique(y) - unique_y_in_classes = np.in1d(unique_y, classes) + unique_y_in_classes = np.isin(unique_y, classes) if not np.all(unique_y_in_classes): raise ValueError( diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index 3008710d3c3dc..41494f2649a01 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -553,7 +553,7 @@ def label_binarize(y, *, classes, neg_label=0, pos_label=1, sparse_output=False) y = column_or_1d(y) # pick out the known labels from y - y_in_classes = np.in1d(y, classes) + y_in_classes = np.isin(y, classes) y_seen = y[y_in_classes] indices = np.searchsorted(sorted_class, y_seen) indptr = np.hstack((0, np.cumsum(y_in_classes))) diff --git a/sklearn/tests/test_isotonic.py b/sklearn/tests/test_isotonic.py index 15e0d0d99cfb0..93df0221236b8 100644 --- a/sklearn/tests/test_isotonic.py +++ b/sklearn/tests/test_isotonic.py @@ -595,7 +595,7 @@ def test_isotonic_thresholds(increasing): # the data is already strictly monotonic which is not the case with # this random data) assert X_thresholds.shape[0] < X.shape[0] - assert np.in1d(X_thresholds, X).all() + assert np.isin(X_thresholds, X).all() # Output thresholds lie in the range of the training set: assert y_thresholds.max() <= y.max() diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index fb3912b27dbfe..d17435b8aab5b 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -296,7 +296,7 @@ def is_valid(value): diff = np.setdiff1d(unique_values, known_values, assume_unique=True) if return_mask: if diff.size: - valid_mask = np.in1d(values, known_values) + valid_mask = np.isin(values, known_values).ravel() else: valid_mask = np.ones(len(values), dtype=bool) diff --git a/sklearn/utils/class_weight.py b/sklearn/utils/class_weight.py index 19e7bcb7ba17a..049ab589c1c97 100644 --- a/sklearn/utils/class_weight.py +++ b/sklearn/utils/class_weight.py @@ -57,7 +57,7 @@ def compute_class_weight(class_weight, *, classes, y): # Find the weight of each class as present in y. le = LabelEncoder() y_ind = le.fit_transform(y) - if not all(np.in1d(classes, le.classes_)): + if not all(np.isin(classes, le.classes_)): raise ValueError("classes should have valid labels that are in y") recip_freq = len(y) / (len(le.classes_) * np.bincount(y_ind).astype(np.float64)) @@ -194,7 +194,7 @@ def compute_sample_weight(class_weight, y, *, indices=None): if classes_missing: # Make missing classes' weight zero - weight_k[np.in1d(y_full, list(classes_missing))] = 0.0 + weight_k[np.isin(y_full, list(classes_missing))] = 0.0 expanded_class_weight.append(weight_k) From e522e0171a05f92a7a18c213b411d8605858da20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Sok=C3=B3=C5=82?= Date: Wed, 30 Aug 2023 19:45:43 +0200 Subject: [PATCH 2/3] add try except for trapezoid import --- .../metrics/_plot/tests/test_precision_recall_display.py | 7 ++++++- sklearn/metrics/_plot/tests/test_roc_curve_display.py | 7 ++++++- sklearn/metrics/_ranking.py | 7 ++++++- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/_plot/tests/test_precision_recall_display.py b/sklearn/metrics/_plot/tests/test_precision_recall_display.py index 629cf7e9771ef..072a490c5ad3d 100644 --- a/sklearn/metrics/_plot/tests/test_precision_recall_display.py +++ b/sklearn/metrics/_plot/tests/test_precision_recall_display.py @@ -2,7 +2,12 @@ import numpy as np import pytest -from scipy.integrate import trapz as trapezoid + +try: + from scipy.integrate import trapezoid +except ImportError: + # NOTE: remove once 1.6.0 is minimum supported scipy version + from scipy.integrate import trapz as trapezoid from sklearn.compose import make_column_transformer from sklearn.datasets import load_breast_cancer, make_classification diff --git a/sklearn/metrics/_plot/tests/test_roc_curve_display.py b/sklearn/metrics/_plot/tests/test_roc_curve_display.py index 6ec10f64c6e46..112d0cb4bf3ba 100644 --- a/sklearn/metrics/_plot/tests/test_roc_curve_display.py +++ b/sklearn/metrics/_plot/tests/test_roc_curve_display.py @@ -1,7 +1,12 @@ import numpy as np import pytest from numpy.testing import assert_allclose -from scipy.integrate import trapz as trapezoid + +try: + from scipy.integrate import trapezoid +except ImportError: + # NOTE: remove once 1.6.0 is minimum supported scipy version + from scipy.integrate import trapz as trapezoid from sklearn.compose import make_column_transformer from sklearn.datasets import load_breast_cancer, load_iris diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index 3835c547d7717..fe93b0c821f78 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -24,10 +24,15 @@ from numbers import Integral, Real import numpy as np -from scipy.integrate import trapz as trapezoid from scipy.sparse import csr_matrix, issparse from scipy.stats import rankdata +try: + from scipy.integrate import trapezoid +except ImportError: + # NOTE: remove once 1.6.0 is minimum supported scipy version + from scipy.integrate import trapz as trapezoid + from ..exceptions import UndefinedMetricWarning from ..preprocessing import label_binarize from ..utils import ( From 1888f6511110602481bbb9a30ea6d44164c62724 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Wed, 6 Sep 2023 15:45:21 +0200 Subject: [PATCH 3/3] Use utils.fixes + remove some unneeded ravel --- .../plot_label_propagation_digits_active_learning.py | 2 +- sklearn/feature_extraction/image.py | 4 +--- .../metrics/_plot/tests/test_precision_recall_display.py | 7 +------ sklearn/metrics/_plot/tests/test_roc_curve_display.py | 7 +------ sklearn/metrics/_ranking.py | 7 +------ sklearn/utils/_encode.py | 2 +- sklearn/utils/fixes.py | 7 +++++++ 7 files changed, 13 insertions(+), 23 deletions(-) diff --git a/examples/semi_supervised/plot_label_propagation_digits_active_learning.py b/examples/semi_supervised/plot_label_propagation_digits_active_learning.py index efd953faa88d6..45af1d7891b2e 100644 --- a/examples/semi_supervised/plot_label_propagation_digits_active_learning.py +++ b/examples/semi_supervised/plot_label_propagation_digits_active_learning.py @@ -79,7 +79,7 @@ # select up to 5 digit examples that the classifier is most uncertain about uncertainty_index = np.argsort(pred_entropies)[::-1] uncertainty_index = uncertainty_index[ - np.isin(uncertainty_index, unlabeled_indices).ravel() + np.isin(uncertainty_index, unlabeled_indices) ][:5] # keep track of indices that we get labels for diff --git a/sklearn/feature_extraction/image.py b/sklearn/feature_extraction/image.py index 13d97f5a80e4e..a2a23b9ec4f3d 100644 --- a/sklearn/feature_extraction/image.py +++ b/sklearn/feature_extraction/image.py @@ -76,9 +76,7 @@ def _mask_edges_weights(mask, edges, weights=None): """Apply a mask to edges (weighted or not)""" inds = np.arange(mask.size) inds = inds[mask.ravel()] - ind_mask = np.logical_and( - np.isin(edges[0], inds).ravel(), np.isin(edges[1], inds).ravel() - ) + ind_mask = np.logical_and(np.isin(edges[0], inds), np.isin(edges[1], inds)) edges = edges[:, ind_mask] if weights is not None: weights = weights[ind_mask] diff --git a/sklearn/metrics/_plot/tests/test_precision_recall_display.py b/sklearn/metrics/_plot/tests/test_precision_recall_display.py index 072a490c5ad3d..0173e5338d722 100644 --- a/sklearn/metrics/_plot/tests/test_precision_recall_display.py +++ b/sklearn/metrics/_plot/tests/test_precision_recall_display.py @@ -3,12 +3,6 @@ import numpy as np import pytest -try: - from scipy.integrate import trapezoid -except ImportError: - # NOTE: remove once 1.6.0 is minimum supported scipy version - from scipy.integrate import trapz as trapezoid - from sklearn.compose import make_column_transformer from sklearn.datasets import load_breast_cancer, make_classification from sklearn.exceptions import NotFittedError @@ -22,6 +16,7 @@ from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler from sklearn.utils import shuffle +from sklearn.utils.fixes import trapezoid # TODO: Remove when https://github.com/numpy/numpy/issues/14397 is resolved pytestmark = pytest.mark.filterwarnings( diff --git a/sklearn/metrics/_plot/tests/test_roc_curve_display.py b/sklearn/metrics/_plot/tests/test_roc_curve_display.py index 112d0cb4bf3ba..8fd9f96576518 100644 --- a/sklearn/metrics/_plot/tests/test_roc_curve_display.py +++ b/sklearn/metrics/_plot/tests/test_roc_curve_display.py @@ -2,12 +2,6 @@ import pytest from numpy.testing import assert_allclose -try: - from scipy.integrate import trapezoid -except ImportError: - # NOTE: remove once 1.6.0 is minimum supported scipy version - from scipy.integrate import trapz as trapezoid - from sklearn.compose import make_column_transformer from sklearn.datasets import load_breast_cancer, load_iris from sklearn.exceptions import NotFittedError @@ -17,6 +11,7 @@ from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler from sklearn.utils import shuffle +from sklearn.utils.fixes import trapezoid @pytest.fixture(scope="module") diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index fe93b0c821f78..a7d4b5ef18d66 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -27,12 +27,6 @@ from scipy.sparse import csr_matrix, issparse from scipy.stats import rankdata -try: - from scipy.integrate import trapezoid -except ImportError: - # NOTE: remove once 1.6.0 is minimum supported scipy version - from scipy.integrate import trapz as trapezoid - from ..exceptions import UndefinedMetricWarning from ..preprocessing import label_binarize from ..utils import ( @@ -44,6 +38,7 @@ from ..utils._encode import _encode, _unique from ..utils._param_validation import Interval, StrOptions, validate_params from ..utils.extmath import stable_cumsum +from ..utils.fixes import trapezoid from ..utils.multiclass import type_of_target from ..utils.sparsefuncs import count_nonzero from ..utils.validation import _check_pos_label_consistency, _check_sample_weight diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index d17435b8aab5b..b3bf1c2a317ec 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -296,7 +296,7 @@ def is_valid(value): diff = np.setdiff1d(unique_values, known_values, assume_unique=True) if return_mask: if diff.size: - valid_mask = np.isin(values, known_values).ravel() + valid_mask = np.isin(values, known_values) else: valid_mask = np.ones(len(values), dtype=bool) diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index aeb01f91590a2..e545054bff96c 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -200,3 +200,10 @@ def _contents(data_module): from numpy.exceptions import ComplexWarning, VisibleDeprecationWarning else: from numpy import ComplexWarning, VisibleDeprecationWarning # type: ignore # noqa + + +# TODO: Remove when Scipy 1.6 is the minimum supported version +try: + from scipy.integrate import trapezoid # type: ignore # noqa +except ImportError: + from scipy.integrate import trapz as trapezoid # type: ignore # noqa