MAINT: Remove np.in1d and np.trapz usages by mtsokol · Pull Request #27140 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

MAINT: Remove np.in1d and np.trapz usages #27140

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Sep 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@
# select up to 5 digit examples that the classifier is most uncertain about
uncertainty_index = np.argsort(pred_entropies)[::-1]
uncertainty_index = uncertainty_index[
np.in1d(uncertainty_index, unlabeled_indices)
np.isin(uncertainty_index, unlabeled_indices)
][:5]

# keep track of indices that we get labels for
Expand Down
2 changes: 1 addition & 1 deletion sklearn/datasets/_twenty_newsgroups.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ def fetch_20newsgroups(
# Sort the categories to have the ordering of the labels
labels.sort()
labels, categories = zip(*labels)
mask = np.in1d(data.target, labels)
mask = np.isin(data.target, labels)
data.filenames = data.filenames[mask]
data.target = data.target[mask]
# searchsorted to have continuous labels
Expand Down
2 changes: 1 addition & 1 deletion sklearn/feature_extraction/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def _mask_edges_weights(mask, edges, weights=None):
"""Apply a mask to edges (weighted or not)"""
inds = np.arange(mask.size)
inds = inds[mask.ravel()]
ind_mask = np.logical_and(np.in1d(edges[0], inds), np.in1d(edges[1], inds))
ind_mask = np.logical_and(np.isin(edges[0], inds), np.isin(edges[1], inds))
edges = edges[:, ind_mask]
if weights is not None:
weights = weights[ind_mask]
Expand Down
5 changes: 3 additions & 2 deletions sklearn/metrics/_plot/tests/test_precision_recall_display.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.utils.fixes import trapezoid

# TODO: Remove when https://github.com/numpy/numpy/issues/14397 is resolved
pytestmark = pytest.mark.filterwarnings(
Expand Down Expand Up @@ -289,7 +290,7 @@ def test_plot_precision_recall_pos_label(pyplot, constructor_name, response_meth
# we should obtain the statistics of the "cancer" class
avg_prec_limit = 0.65
assert display.average_precision < avg_prec_limit
assert -np.trapz(display.precision, display.recall) < avg_prec_limit
assert -trapezoid(display.precision, display.recall) < avg_prec_limit

# otherwise we should obtain the statistics of the "not cancer" class
if constructor_name == "from_estimator":
Expand All @@ -308,7 +309,7 @@ def test_plot_precision_recall_pos_label(pyplot, constructor_name, response_meth
)
avg_prec_limit = 0.95
assert display.average_precision > avg_prec_limit
assert -np.trapz(display.precision, display.recall) > avg_prec_limit
assert -trapezoid(display.precision, display.recall) > avg_prec_limit


@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
Expand Down
5 changes: 3 additions & 2 deletions sklearn/metrics/_plot/tests/test_roc_curve_display.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.utils.fixes import trapezoid


@pytest.fixture(scope="module")
Expand Down Expand Up @@ -293,7 +294,7 @@ def test_plot_roc_curve_pos_label(pyplot, response_method, constructor_name):
roc_auc_limit = 0.95679

assert display.roc_auc == pytest.approx(roc_auc_limit)
assert np.trapz(display.tpr, display.fpr) == pytest.approx(roc_auc_limit)
assert trapezoid(display.tpr, display.fpr) == pytest.approx(roc_auc_limit)

if constructor_name == "from_estimator":
display = RocCurveDisplay.from_estimator(
Expand All @@ -311,4 +312,4 @@ def test_plot_roc_curve_pos_label(pyplot, response_method, constructor_name):
)

assert display.roc_auc == pytest.approx(roc_auc_limit)
assert np.trapz(display.tpr, display.fpr) == pytest.approx(roc_auc_limit)
assert trapezoid(display.tpr, display.fpr) == pytest.approx(roc_auc_limit)
5 changes: 3 additions & 2 deletions sklearn/metrics/_ranking.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
from ..utils._encode import _encode, _unique
from ..utils._param_validation import Interval, StrOptions, validate_params
from ..utils.extmath import stable_cumsum
from ..utils.fixes import trapezoid
from ..utils.multiclass import type_of_target
from ..utils.sparsefuncs import count_nonzero
from ..utils.validation import _check_pos_label_consistency, _check_sample_weight
Expand Down Expand Up @@ -104,9 +105,9 @@ def auc(x, y):
else:
raise ValueError("x is neither increasing nor decreasing : {}.".format(x))

area = direction * np.trapz(y, x)
area = direction * trapezoid(y, x)
if isinstance(area, np.memmap):
# Reductions such as .sum used internally in np.trapz do not return a
# Reductions such as .sum used internally in trapezoid do not return a
# scalar by default for numpy.memmap instances contrary to
# regular numpy.ndarray instances.
area = area.dtype.type(area)
Expand Down
4 changes: 2 additions & 2 deletions sklearn/model_selection/_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -1973,8 +1973,8 @@ def _iter_indices(self, X, y, groups):
# these are the indices of classes in the partition
# invert them into data indices

train = np.flatnonzero(np.in1d(group_indices, group_train))
test = np.flatnonzero(np.in1d(group_indices, group_test))
train = np.flatnonzero(np.isin(group_indices, group_train))
test = np.flatnonzero(np.isin(group_indices, group_test))

yield train, test

Expand Down
2 changes: 1 addition & 1 deletion sklearn/model_selection/tests/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -1418,7 +1418,7 @@ def test_grid_search_correct_score_results():
expected_keys = ("mean_test_score", "rank_test_score") + tuple(
"split%d_test_score" % cv_i for cv_i in range(n_splits)
)
assert all(np.in1d(expected_keys, result_keys))
assert all(np.isin(expected_keys, result_keys))

cv = StratifiedKFold(n_splits=n_splits)
n_splits = grid_search.n_splits_
Expand Down
4 changes: 2 additions & 2 deletions sklearn/model_selection/tests/test_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -987,8 +987,8 @@ def test_group_shuffle_split():
# First test: no train group is in the test set and vice versa
l_train_unique = np.unique(l[train])
l_test_unique = np.unique(l[test])
assert not np.any(np.in1d(l[train], l_test_unique))
assert not np.any(np.in1d(l[test], l_train_unique))
assert not np.any(np.isin(l[train], l_test_unique))
assert not np.any(np.isin(l[test], l_train_unique))

# Second test: train and test add up to all the data
assert l[train].size + l[test].size == l.size
Expand Down
2 changes: 1 addition & 1 deletion sklearn/naive_bayes.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,7 +467,7 @@ def _partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None):
classes = self.classes_

unique_y = np.unique(y)
unique_y_in_classes = np.in1d(unique_y, classes)
unique_y_in_classes = np.isin(unique_y, classes)

if not np.all(unique_y_in_classes):
raise ValueError(
Expand Down
2 changes: 1 addition & 1 deletion sklearn/preprocessing/_label.py
Original file line number Diff line number Diff line change
Expand Up @@ -553,7 +553,7 @@ def label_binarize(y, *, classes, neg_label=0, pos_label=1, sparse_output=False)
y = column_or_1d(y)

# pick out the known labels from y
y_in_classes = np.in1d(y, classes)
y_in_classes = np.isin(y, classes)
y_seen = y[y_in_classes]
indices = np.searchsorted(sorted_class, y_seen)
indptr = np.hstack((0, np.cumsum(y_in_classes)))
Expand Down
2 changes: 1 addition & 1 deletion sklearn/tests/test_isotonic.py
Original file line number Diff line number Diff line change
Expand Up @@ -595,7 +595,7 @@ def test_isotonic_thresholds(increasing):
# the data is already strictly monotonic which is not the case with
# this random data)
assert X_thresholds.shape[0] < X.shape[0]
assert np.in1d(X_thresholds, X).all()
assert np.isin(X_thresholds, X).all()

# Output thresholds lie in the range of the training set:
assert y_thresholds.max() <= y.max()
Expand Down
2 changes: 1 addition & 1 deletion sklearn/utils/_encode.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ def is_valid(value):
diff = np.setdiff1d(unique_values, known_values, assume_unique=True)
if return_mask:
if diff.size:
valid_mask = np.in1d(values, known_values)
valid_mask = np.isin(values, known_values)
else:
valid_mask = np.ones(len(values), dtype=bool)

Expand Down
4 changes: 2 additions & 2 deletions sklearn/utils/class_weight.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def compute_class_weight(class_weight, *, classes, y):
# Find the weight of each class as present in y.
le = LabelEncoder()
y_ind = le.fit_transform(y)
if not all(np.in1d(classes, le.classes_)):
if not all(np.isin(classes, le.classes_)):
raise ValueError("classes should have valid labels that are in y")

recip_freq = len(y) / (len(le.classes_) * np.bincount(y_ind).astype(np.float64))
Expand Down Expand Up @@ -195,7 +195,7 @@ def compute_sample_weight(class_weight, y, *, indices=None):

if classes_missing:
# Make missing classes' weight zero
weight_k[np.in1d(y_full, list(classes_missing))] = 0.0
weight_k[np.isin(y_full, list(classes_missing))] = 0.0

expanded_class_weight.append(weight_k)

Expand Down
7 changes: 7 additions & 0 deletions sklearn/utils/fixes.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,3 +200,10 @@ def _contents(data_module):
from numpy.exceptions import ComplexWarning, VisibleDeprecationWarning
else:
from numpy import ComplexWarning, VisibleDeprecationWarning # type: ignore # noqa


# TODO: Remove when Scipy 1.6 is the minimum supported version
try:
from scipy.integrate import trapezoid # type: ignore # noqa
except ImportError:
from scipy.integrate import trapz as trapezoid # type: ignore # noqa
0