scikit-learn
diff --git a/sklearn/datasets/_lfw.py b/sklearn/datasets/_lfw.py
Lines changed: 2 additions & 5 deletions
diff --git a/sklearn/datasets/_twenty_newsgroups.py b/sklearn/datasets/_twenty_newsgroups.py
@@ -19,6 +19,7 @@
 
 from ..utils import Bunch
 from ..utils._param_validation import Hidden, Interval, StrOptions, validate_params
+from ..utils.fixes import tarfile_extractall
 from ._base import (
     RemoteFileMetadata,
     _fetch_remote,
@@ -117,11 +118,7 @@ def _check_fetch_lfw(
 
         logger.debug("Decompressing the data archive to %s", data_folder_path)
         with tarfile.open(archive_path, "r:gz") as fp:
-            # Use filter="data" to prevent the most dangerous security issues.
-            # For more details, see
-            # https://docs.python.org/3.9/library/tarfile.html#tarfile.TarFile.extractall
-            fp.extractall(path=lfw_home, filter="data")
-
+            tarfile_extractall(fp, path=lfw_home)
         remove(archive_path)
 
     return lfw_home, data_folder_path
 
@@ -43,6 +43,7 @@
 from ..feature_extraction.text import CountVectorizer
 from ..utils import Bunch, check_random_state
 from ..utils._param_validation import Interval, StrOptions, validate_params
+from ..utils.fixes import tarfile_extractall
 from . import get_data_home, load_files
 from ._base import (
     RemoteFileMetadata,
@@ -81,10 +82,7 @@ def _download_20newsgroups(target_dir, cache_path, n_retries, delay):
 
     logger.debug("Decompressing %s", archive_path)
     with tarfile.open(archive_path, "r:gz") as fp:
-        # Use filter="data" to prevent the most dangerous security issues.
-        # For more details, see
-        # https://docs.python.org/3.9/library/tarfile.html#tarfile.TarFile.extractall
-        fp.extractall(path=target_dir, filter="data")
+        tarfile_extractall(fp, path=target_dir)
 
     with suppress(FileNotFoundError):
         os.remove(archive_path)
 
@@ -743,6 +743,8 @@ def test_logistic_regression_solvers_multiclass_unpenalized(
     fit_intercept, global_random_seed
 ):
     """Test and compare solver results for unpenalized multinomial multiclass."""
+    # Our use of numpy.random.multinomial requires numpy >= 1.22
+    pytest.importorskip("numpy", minversion="1.22.0")
     # We want to avoid perfect separation.
     n_samples, n_features, n_classes = 100, 4, 3
     rng = np.random.RandomState(global_random_seed)
 
@@ -19,7 +19,7 @@
     create_memmap_backed_data,
     ignore_warnings,
 )
-from sklearn.utils.fixes import CSR_CONTAINERS
+from sklearn.utils.fixes import CSR_CONTAINERS, parse_version, sp_version
 
 
 def dist_func(x1, x2, p):
@@ -81,6 +81,13 @@ def test_cdist(metric_param_grid, X, Y, csr_container):
             # with scipy
             rtol_dict = {"rtol": 1e-6}
 
+        # TODO: Remove when the minimum supported scipy version is >= 1.7.0.
+        # scipy only supports 0 < p < 1 for the minkowski metric from 1.7.0 onwards.
+        if metric == "minkowski":
+            p = kwargs["p"]
+            if sp_version < parse_version("1.7.0") and p < 1:
+                pytest.skip("scipy does not support 0<p<1 for minkowski metric < 1.7.0")
+
         D_scipy_cdist = cdist(X, Y, metric, **kwargs)
 
         dm = DistanceMetric.get_metric(metric, X.dtype, **kwargs)
@@ -165,6 +172,12 @@ def test_pdist(metric_param_grid, X, csr_container):
             # with scipy
             rtol_dict = {"rtol": 1e-6}
 
+        # TODO: Remove when the minimum supported scipy version is >= 1.7.0.
+        # scipy only supports 0 < p < 1 for the minkowski metric from 1.7.0 onwards.
+        if metric == "minkowski":
+            p = kwargs["p"]
+            if sp_version < parse_version("1.7.0") and p < 1:
+                pytest.skip("scipy does not support 0<p<1 for minkowski metric < 1.7.0")
         D_scipy_pdist = cdist(X, X, metric, **kwargs)
 
         dm = DistanceMetric.get_metric(metric, X.dtype, **kwargs)
 
@@ -54,6 +54,8 @@
     DIA_CONTAINERS,
     DOK_CONTAINERS,
     LIL_CONTAINERS,
+    parse_version,
+    sp_version,
 )
 from sklearn.utils.validation import check_random_state
 
@@ -118,13 +120,13 @@ def _generate_test_params_for(metric: str, n_features: int):
     rng = np.random.RandomState(1)
 
     if metric == "minkowski":
-        return [
-            dict(p=1.5),
-            dict(p=2),
-            dict(p=3),
-            dict(p=np.inf),
-            dict(p=3, w=rng.rand(n_features)),
-        ]
+        minkowski_kwargs = [dict(p=1.5), dict(p=2), dict(p=3), dict(p=np.inf)]
+        if sp_version >= parse_version("1.8.0.dev0"):
+            # TODO: remove this version check once we no longer support scipy < 1.8.0.
+            # Recent scipy versions accept weights in the Minkowski metric directly:
+            # type: ignore
+            minkowski_kwargs.append(dict(p=3, w=rng.rand(n_features)))
+        return minkowski_kwargs
 
     if metric == "seuclidean":
         return [dict(V=rng.rand(n_features))]
 
@@ -11,6 +11,7 @@
 from ..utils import resample
 from ..utils._param_validation import Interval, Options, StrOptions
 from ..utils.deprecation import _deprecate_Xt_in_inverse_transform
+from ..utils.fixes import np_version, parse_version
 from ..utils.stats import _averaged_weighted_percentile, _weighted_percentile
 from ..utils.validation import (
     _check_feature_names_in,
@@ -345,12 +346,26 @@ def fit(self, X, y=None, sample_weight=None):
             elif self.strategy == "quantile":
                 percentile_levels = np.linspace(0, 100, n_bins[jj] + 1)
 
+                # TODO: simplify the following when numpy min version >= 1.22.
+
                 # method="linear" is the implicit default for any numpy
                 # version. So we keep it version independent in that case by
                 # using an empty param dict.
                 percentile_kwargs = {}
                 if quantile_method != "linear" and sample_weight is None:
-                    percentile_kwargs["method"] = quantile_method
+                    if np_version < parse_version("1.22"):
+                        if quantile_method in ["averaged_inverted_cdf", "inverted_cdf"]:
+                            # The method parameter is not supported in numpy <
+                            # 1.22 but we can define unit sample weight to use
+                            # our own implementation instead:
+                            sample_weight = np.ones(X.shape[0], dtype=X.dtype)
+                        else:
+                            raise ValueError(
+                                f"quantile_method='{quantile_method}' is not "
+                                "supported with numpy < 1.22"
+                            )
+                    else:
+                        percentile_kwargs["method"] = quantile_method
 
                 if sample_weight is None:
                     bin_edges[jj] = np.asarray(
 
@@ -59,6 +59,24 @@ def _create_expansion(X, interaction_only, deg, n_features, cumulative_size=0):
     needs_int64 = max(max_indices, max_indptr) > max_int32
     index_dtype = np.int64 if needs_int64 else np.int32
 
+    # This is a pretty specific bug that is hard for a user to work around,
+    # hence we do not detail the entire bug and all possible avoidance
+    # mechanisms. Instead we recommend upgrading scipy or shrinking the data.
+    cumulative_size += expanded_col
+    if (
+        sp_version < parse_version("1.8.0")
+        and cumulative_size - 1 > max_int32
+        and not needs_int64
+    ):
+        raise ValueError(
+            "In scipy versions `<1.8.0`, the function `scipy.sparse.hstack`"
+            " sometimes produces negative columns when the output shape contains"
+            " `n_cols` too large to be represented by a 32bit signed"
+            " integer. To avoid this error, either use a version"
+            " of scipy `>=1.8.0` or alter the `PolynomialFeatures`"
+            " transformer to produce fewer than 2^31 output features."
+        )
+
     # Result of the expansion, modified in place by the
     # `_csr_polynomial_expansion` routine.
     expanded_data = np.empty(shape=total_nnz, dtype=X.data.dtype)
@@ -639,7 +657,8 @@ class SplineTransformer(TransformerMixin, BaseEstimator):
         may slow down subsequent estimators.
 
     sparse_output : bool, default=False
-        Will return sparse CSR matrix if set True else will return an array.
+        Will return sparse CSR matrix if set True else will return an array. This
+        option is only available with `scipy>=1.8`.
 
         .. versionadded:: 1.2
 
@@ -851,6 +870,12 @@ def fit(self, X, y=None, sample_weight=None):
             elif not np.all(np.diff(base_knots, axis=0) > 0):
                 raise ValueError("knots must be sorted without duplicates.")
 
+        if self.sparse_output and sp_version < parse_version("1.8.0"):
+            raise ValueError(
+                "Option sparse_output=True is only available with scipy>=1.8.0, "
+                f"but here scipy=={sp_version} is used."
+            )
+
         # number of knots for base interval
         n_knots = base_knots.shape[0]
 
 
@@ -13,6 +13,7 @@
     assert_array_equal,
     ignore_warnings,
 )
+from sklearn.utils.fixes import np_version, parse_version
 
 X = [[-2, 1.5, -4, -1], [-1, 2.5, -3, -0.5], [0, 3.5, -2, 0.5], [1, 4.5, -1, 2]]
 
@@ -687,3 +688,18 @@ def test_KBD_inverse_transform_Xt_deprecation(strategy, quantile_method):
 
     with pytest.warns(FutureWarning, match="Xt was renamed X in version 1.5"):
         kbd.inverse_transform(Xt=X)
+
+
+# TODO: remove this test when numpy min version >= 1.22
+@pytest.mark.skipif(
+    condition=np_version >= parse_version("1.22"),
+    reason="newer numpy versions do support the 'method' parameter",
+)
+def test_invalid_quantile_method_on_old_numpy():
+    expected_msg = (
+        "quantile_method='closest_observation' is not supported with numpy < 1.22"
+    )
+    with pytest.raises(ValueError, match=expected_msg):
+        KBinsDiscretizer(
+            quantile_method="closest_observation", strategy="quantile"
+        ).fit(X)
@@ -15,6 +15,8 @@
     SplineTransformer,
 )
 from sklearn.preprocessing._csr_polynomial_expansion import (
+    _calc_expanded_nnz,
+    _calc_total_nnz,
     _get_sizeof_LARGEST_INT_t,
 )
 from sklearn.utils._testing import assert_array_almost_equal
@@ -397,6 +399,10 @@ def test_spline_transformer_kbindiscretizer(global_random_seed):
     assert_allclose(splines, kbins, rtol=1e-13)
 
 
+@pytest.mark.skipif(
+    sp_version < parse_version("1.8.0"),
+    reason="The option `sparse_output` is available as of scipy 1.8.0",
+)
 @pytest.mark.parametrize("degree", range(1, 3))
 @pytest.mark.parametrize("knots", ["uniform", "quantile"])
 @pytest.mark.parametrize(
@@ -451,6 +457,17 @@ def test_spline_transformer_sparse_output(
         )
 
 
+@pytest.mark.skipif(
+    sp_version >= parse_version("1.8.0"),
+    reason="The option `sparse_output` is available as of scipy 1.8.0",
+)
+def test_spline_transformer_sparse_output_raise_error_for_old_scipy():
+    """Test that SplineTransformer with sparse_output=True raises for scipy<1.8.0."""
+    X = [[1], [2]]
+    with pytest.raises(ValueError, match="scipy>=1.8.0"):
+        SplineTransformer(sparse_output=True).fit(X)
+
+
 @pytest.mark.parametrize("n_knots", [5, 10])
 @pytest.mark.parametrize("include_bias", [True, False])
 @pytest.mark.parametrize("degree", [3, 4])
@@ -462,6 +479,9 @@ def test_spline_transformer_n_features_out(
     n_knots, include_bias, degree, extrapolation, sparse_output
 ):
     """Test that transform results in n_features_out_ features."""
+    if sparse_output and sp_version < parse_version("1.8.0"):
+        pytest.skip("The option `sparse_output` is available as of scipy 1.8.0")
+
     splt = SplineTransformer(
         n_knots=n_knots,
         degree=degree,
@@ -1078,6 +1098,25 @@ def test_csr_polynomial_expansion_index_overflow(
             pf.fit(X)
         return
 
+    # In SciPy < 1.8, a bug occurs when an intermediate matrix in
+    # `to_stack` in `hstack` fits within int32 however would require int64 when
+    # combined with all previous matrices in `to_stack`.
+    if sp_version < parse_version("1.8.0"):
+        has_bug = False
+        max_int32 = np.iinfo(np.int32).max
+        cumulative_size = n_features + include_bias
+        for deg in range(2, degree + 1):
+            max_indptr = _calc_total_nnz(X.indptr, interaction_only, deg)
+            max_indices = _calc_expanded_nnz(n_features, interaction_only, deg) - 1
+            cumulative_size += max_indices + 1
+            needs_int64 = max(max_indices, max_indptr) > max_int32
+            has_bug |= not needs_int64 and cumulative_size > max_int32
+        if has_bug:
+            msg = r"In scipy versions `<1.8.0`, the function `scipy.sparse.hstack`"
+            with pytest.raises(ValueError, match=msg):
+                X_trans = pf.fit_transform(X)
+            return
+
     # When `n_features>=65535`, `scipy.sparse.hstack` may not use the right
     # dtype for representing indices and indptr if `n_features` is still
     # small enough so that each block matrix's indices and indptr arrays
 
@@ -50,6 +50,8 @@
     _IS_32BIT,
     VisibleDeprecationWarning,
     _in_unstable_openblas_configuration,
+    parse_version,
+    sp_version,
 )
 from sklearn.utils.multiclass import check_classification_targets
 from sklearn.utils.validation import (
@@ -1014,6 +1016,11 @@ def _convert_container(
             # https://github.com/scipy/scipy/pull/18530#issuecomment-1878005149
             container = np.atleast_2d(container)
 
+        if "array" in constructor_name and sp_version < parse_version("1.8"):
+            raise ValueError(
+                f"{constructor_name} is only available with scipy>=1.8.0, got "
+                f"{sp_version}"
+            )
         if constructor_name in ("sparse", "sparse_csr"):
             # sparse and sparse_csr are equivalent for legacy reasons
             return sp.sparse.csr_matrix(container, dtype=dtype)