Remove clean-ups to make PR smaller · scikit-learn/scikit-learn@b69fbb8 · GitHub
[go: up one dir, main page]

Skip to content

Commit b69fbb8

Browse files
committed
Remove clean-ups to make PR smaller
1 parent 40445d1 commit b69fbb8

File tree

15 files changed

+240
-30
lines changed

15 files changed

+240
-30
lines changed

sklearn/datasets/_lfw.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
from ..utils import Bunch
2121
from ..utils._param_validation import Hidden, Interval, StrOptions, validate_params
22+
from ..utils.fixes import tarfile_extractall
2223
from ._base import (
2324
RemoteFileMetadata,
2425
_fetch_remote,
@@ -117,11 +118,7 @@ def _check_fetch_lfw(
117118

118119
logger.debug("Decompressing the data archive to %s", data_folder_path)
119120
with tarfile.open(archive_path, "r:gz") as fp:
120-
# Use filter="data" to prevent the most dangerous security issues.
121-
# For more details, see
122-
# https://docs.python.org/3.9/library/tarfile.html#tarfile.TarFile.extractall
123-
fp.extractall(path=lfw_home, filter="data")
124-
121+
tarfile_extractall(fp, path=lfw_home)
125122
remove(archive_path)
126123

127124
return lfw_home, data_folder_path

sklearn/datasets/_twenty_newsgroups.py

Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
from ..feature_extraction.text import CountVectorizer
4444
from ..utils import Bunch, check_random_state
4545
from ..utils._param_validation import Interval, StrOptions, validate_params
46+
from ..utils.fixes import tarfile_extractall
4647
from . import get_data_home, load_files
4748
from ._base import (
4849
RemoteFileMetadata,
@@ -81,10 +82,7 @@ def _download_20newsgroups(target_dir, cache_path, n_retries, delay):
8182

8283
logger.debug("Decompressing %s", archive_path)
8384
with tarfile.open(archive_path, "r:gz") as fp:
84-
# Use filter="data" to prevent the most dangerous security issues.
85-
# For more details, see
86-
# https://docs.python.org/3.9/library/tarfile.html#tarfile.TarFile.extractall
87-
fp.extractall(path=target_dir, filter="data")
85+
tarfile_extractall(fp, path=target_dir)
8886

8987
with suppress(FileNotFoundError):
9088
os.remove(archive_path)
Original file line numberDiff line numberDiff line change
@@ -743,6 +743,8 @@ def test_logistic_regression_solvers_multiclass_unpenalized(
743743
fit_intercept, global_random_seed
744744
):
745745
"""Test and compare solver results for unpenalized multinomial multiclass."""
746+
# Our use of numpy.random.multinomial requires numpy >= 1.22
747+
pytest.importorskip("numpy", minversion="1.22.0")
746748
# We want to avoid perfect separation.
747749
n_samples, n_features, n_classes = 100, 4, 3
748750
rng = np.random.RandomState(global_random_seed)
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
create_memmap_backed_data,
2020
ignore_warnings,
2121
)
22-
from sklearn.utils.fixes import CSR_CONTAINERS
22+
from sklearn.utils.fixes import CSR_CONTAINERS, parse_version, sp_version
2323

2424

2525
def dist_func(x1, x2, p):
@@ -81,6 +81,13 @@ def test_cdist(metric_param_grid, X, Y, csr_container):
8181
# with scipy
8282
rtol_dict = {"rtol": 1e-6}
8383

84+
# TODO: Remove when scipy minimum version >= 1.7.0
85+
# scipy >= 1.7.0 supports 0<p<1 for the minkowski metric
86+
if metric == "minkowski":
87+
p = kwargs["p"]
88+
if sp_version < parse_version("1.7.0") and p < 1:
89+
pytest.skip("scipy does not support 0<p<1 for minkowski metric < 1.7.0")
90+
8491
D_scipy_cdist = cdist(X, Y, metric, **kwargs)
8592

8693
dm = DistanceMetric.get_metric(metric, X.dtype, **kwargs)
@@ -165,6 +172,12 @@ def test_pdist(metric_param_grid, X, csr_container):
165172
# with scipy
166173
rtol_dict = {"rtol": 1e-6}
167174

175+
# TODO: Remove when scipy minimum version >= 1.7.0
176+
# scipy >= 1.7.0 supports 0<p<1 for the minkowski metric
177+
if metric == "minkowski":
178+
p = kwargs["p"]
179+
if sp_version < parse_version("1.7.0") and p < 1:
180+
pytest.skip("scipy does not support 0<p<1 for minkowski metric < 1.7.0")
168181
D_scipy_pdist = cdist(X, X, metric, **kwargs)
169182

170183
dm = DistanceMetric.get_metric(metric, X.dtype, **kwargs)
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@
5454
DIA_CONTAINERS,
5555
DOK_CONTAINERS,
5656
LIL_CONTAINERS,
57+
parse_version,
58+
sp_version,
5759
)
5860
from sklearn.utils.validation import check_random_state
5961

@@ -118,13 +120,13 @@ def _generate_test_params_for(metric: str, n_features: int):
118120
rng = np.random.RandomState(1)
119121

120122
if metric == "minkowski":
121-
return [
122-
dict(p=1.5),
123-
dict(p=2),
124-
dict(p=3),
125-
dict(p=np.inf),
126-
dict(p=3, w=rng.rand(n_features)),
127-
]
123+
minkowski_kwargs = [dict(p=1.5), dict(p=2), dict(p=3), dict(p=np.inf)]
124+
if sp_version >= parse_version("1.8.0.dev0"):
125+
# TODO: remove the test once we no longer support scipy < 1.8.0.
126+
# Recent scipy versions accept weights in the Minkowski metric directly:
127+
# type: ignore
128+
minkowski_kwargs.append(dict(p=3, w=rng.rand(n_features)))
129+
return minkowski_kwargs
128130

129131
if metric == "seuclidean":
130132
return [dict(V=rng.rand(n_features))]
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from ..utils import resample
1212
from ..utils._param_validation import Interval, Options, StrOptions
1313
from ..utils.deprecation import _deprecate_Xt_in_inverse_transform
14+
from ..utils.fixes import np_version, parse_version
1415
from ..utils.stats import _averaged_weighted_percentile, _weighted_percentile
1516
from ..utils.validation import (
1617
_check_feature_names_in,
@@ -345,12 +346,26 @@ def fit(self, X, y=None, sample_weight=None):
345346
elif self.strategy == "quantile":
346347
percentile_levels = np.linspace(0, 100, n_bins[jj] + 1)
347348

349+
# TODO: simplify the following when numpy min version >= 1.22.
350+
348351
# method="linear" is the implicit default for any numpy
349352
# version. So we keep it version independent in that case by
350353
# using an empty param dict.
351354
percentile_kwargs = {}
352355
if quantile_method != "linear" and sample_weight is None:
353-
percentile_kwargs["method"] = quantile_method
356+
if np_version < parse_version("1.22"):
357+
if quantile_method in ["averaged_inverted_cdf", "inverted_cdf"]:
358+
# The method parameter is not supported in numpy <
359+
# 1.22 but we can define unit sample weight to use
360+
# our own implementation instead:
361+
sample_weight = np.ones(X.shape[0], dtype=X.dtype)
362+
else:
363+
raise ValueError(
364+
f"quantile_method='{quantile_method}' is not "
365+
"supported with numpy < 1.22"
366+
)
367+
else:
368+
percentile_kwargs["method"] = quantile_method
354369

355370
if sample_weight is None:
356371
bin_edges[jj] = np.asarray(
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,24 @@ def _create_expansion(X, interaction_only, deg, n_features, cumulative_size=0):
5959
needs_int64 = max(max_indices, max_indptr) > max_int32
6060
index_dtype = np.int64 if needs_int64 else np.int32
6161

62+
# This is a pretty specific bug that is hard to work around by a user,
63+
# hence we do not detail the entire bug and all possible avoidance
64+
# mechanisms. Instead we recommend upgrading scipy or shrinking their data.
65+
cumulative_size += expanded_col
66+
if (
67+
sp_version < parse_version("1.8.0")
68+
and cumulative_size - 1 > max_int32
69+
and not needs_int64
70+
):
71+
raise ValueError(
72+
"In scipy versions `<1.8.0`, the function `scipy.sparse.hstack`"
73+
" sometimes produces negative columns when the output shape contains"
74+
" `n_cols` too large to be represented by a 32bit signed"
75+
" integer. To avoid this error, either use a version"
76+
" of scipy `>=1.8.0` or alter the `PolynomialFeatures`"
77+
" transformer to produce fewer than 2^31 output features."
78+
)
79+
6280
# Result of the expansion, modified in place by the
6381
# `_csr_polynomial_expansion` routine.
6482
expanded_data = np.empty(shape=total_nnz, dtype=X.data.dtype)
@@ -639,7 +657,8 @@ class SplineTransformer(TransformerMixin, BaseEstimator):
639657
may slow down subsequent estimators.
640658
641659
sparse_output : bool, default=False
642-
Will return sparse CSR matrix if set True else will return an array.
660+
Will return sparse CSR matrix if set True else will return an array. This
661+
option is only available with `scipy>=1.8`.
643662
644663
.. versionadded:: 1.2
645664
@@ -851,6 +870,12 @@ def fit(self, X, y=None, sample_weight=None):
851870
elif not np.all(np.diff(base_knots, axis=0) > 0):
852871
raise ValueError("knots must be sorted without duplicates.")
853872

873+
if self.sparse_output and sp_version < parse_version("1.8.0"):
874+
raise ValueError(
875+
"Option sparse_output=True is only available with scipy>=1.8.0, "
876+
f"but here scipy=={sp_version} is used."
877+
)
878+
854879
# number of knots for base interval
855880
n_knots = base_knots.shape[0]
856881

Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
assert_array_equal,
1414
ignore_warnings,
1515
)
16+
from sklearn.utils.fixes import np_version, parse_version
1617

1718
X = [[-2, 1.5, -4, -1], [-1, 2.5, -3, -0.5], [0, 3.5, -2, 0.5], [1, 4.5, -1, 2]]
1819

@@ -687,3 +688,18 @@ def test_KBD_inverse_transform_Xt_deprecation(strategy, quantile_method):
687688

688689
with pytest.warns(FutureWarning, match="Xt was renamed X in version 1.5"):
689690
kbd.inverse_transform(Xt=X)
691+
692+
693+
# TODO: remove this test when numpy min version >= 1.22
694+
@pytest.mark.skipif(
695+
condition=np_version >= parse_version("1.22"),
696+
reason="newer numpy versions do support the 'method' parameter",
697+
)
698+
def test_invalid_quantile_method_on_old_numpy():
699+
expected_msg = (
700+
"quantile_method='closest_observation' is not supported with numpy < 1.22"
701+
)
702+
with pytest.raises(ValueError, match=expected_msg):
703+
KBinsDiscretizer(
704+
quantile_method="closest_observation", strategy="quantile"
705+
).fit(X)
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
SplineTransformer,
1616
)
1717
from sklearn.preprocessing._csr_polynomial_expansion import (
18+
_calc_expanded_nnz,
19+
_calc_total_nnz,
1820
_get_sizeof_LARGEST_INT_t,
1921
)
2022
from sklearn.utils._testing import assert_array_almost_equal
@@ -397,6 +399,10 @@ def test_spline_transformer_kbindiscretizer(global_random_seed):
397399
assert_allclose(splines, kbins, rtol=1e-13)
398400

399401

402+
@pytest.mark.skipif(
403+
sp_version < parse_version("1.8.0"),
404+
reason="The option `sparse_output` is available as of scipy 1.8.0",
405+
)
400406
@pytest.mark.parametrize("degree", range(1, 3))
401407
@pytest.mark.parametrize("knots", ["uniform", "quantile"])
402408
@pytest.mark.parametrize(
@@ -451,6 +457,17 @@ def test_spline_transformer_sparse_output(
451457
)
452458

453459

460+
@pytest.mark.skipif(
461+
sp_version >= parse_version("1.8.0"),
462+
reason="The option `sparse_output` is available as of scipy 1.8.0",
463+
)
464+
def test_spline_transformer_sparse_output_raise_error_for_old_scipy():
465+
"""Test that SplineTransformer with sparse_output=True raises for scipy<1.8.0."""
466+
X = [[1], [2]]
467+
with pytest.raises(ValueError, match="scipy>=1.8.0"):
468+
SplineTransformer(sparse_output=True).fit(X)
469+
470+
454471
@pytest.mark.parametrize("n_knots", [5, 10])
455472
@pytest.mark.parametrize("include_bias", [True, False])
456473
@pytest.mark.parametrize("degree", [3, 4])
@@ -462,6 +479,9 @@ def test_spline_transformer_n_features_out(
462479
n_knots, include_bias, degree, extrapolation, sparse_output
463480
):
464481
"""Test that transform results in n_features_out_ features."""
482+
if sparse_output and sp_version < parse_version("1.8.0"):
483+
pytest.skip("The option `sparse_output` is available as of scipy 1.8.0")
484+
465485
splt = SplineTransformer(
466486
n_knots=n_knots,
467487
degree=degree,
@@ -1078,6 +1098,25 @@ def test_csr_polynomial_expansion_index_overflow(
10781098
pf.fit(X)
10791099
return
10801100

1101+
# In SciPy < 1.8, a bug occurs when an intermediate matrix in
1102+
# `to_stack` in `hstack` fits within int32 but would require int64 when
1103+
# combined with all previous matrices in `to_stack`.
1104+
if sp_version < parse_version("1.8.0"):
1105+
has_bug = False
1106+
max_int32 = np.iinfo(np.int32).max
1107+
cumulative_size = n_features + include_bias
1108+
for deg in range(2, degree + 1):
1109+
max_indptr = _calc_total_nnz(X.indptr, interaction_only, deg)
1110+
max_indices = _calc_expanded_nnz(n_features, interaction_only, deg) - 1
1111+
cumulative_size += max_indices + 1
1112+
needs_int64 = max(max_indices, max_indptr) > max_int32
1113+
has_bug |= not needs_int64 and cumulative_size > max_int32
1114+
if has_bug:
1115+
msg = r"In scipy versions `<1.8.0`, the function `scipy.sparse.hstack`"
1116+
with pytest.raises(ValueError, match=msg):
1117+
X_trans = pf.fit_transform(X)
1118+
return
1119+
10811120
# When `n_features>=65535`, `scipy.sparse.hstack` may not use the right
10821121
# dtype for representing indices and indptr if `n_features` is still
10831122
# small enough so that each block matrix's indices and indptr arrays
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@
5050
_IS_32BIT,
5151
VisibleDeprecationWarning,
5252
_in_unstable_openblas_configuration,
53+
parse_version,
54+
sp_version,
5355
)
5456
from sklearn.utils.multiclass import check_classification_targets
5557
from sklearn.utils.validation import (
@@ -1014,6 +1016,11 @@ def _convert_container(
10141016
# https://github.com/scipy/scipy/pull/18530#issuecomment-1878005149
10151017
container = np.atleast_2d(container)
10161018

1019+
if "array" in constructor_name and sp_version < parse_version("1.8"):
1020+
raise ValueError(
1021+
f"{constructor_name} is only available with scipy>=1.8.0, got "
1022+
f"{sp_version}"
1023+
)
10171024
if constructor_name in ("sparse", "sparse_csr"):
10181025
# sparse and sparse_csr are equivalent for legacy reasons
10191026
return sp.sparse.csr_matrix(container, dtype=dtype)