scikit-learn
diff --git a/doc/whats_new/upcoming_changes/sklearn.preprocessing/31227.fix.rst b/doc/whats_new/upcoming_changes/sklearn.preprocessing/31227.fix.rst
Lines changed: 6 additions & 0 deletions
diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py
Lines changed: 4 additions & 3 deletions
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
Lines changed: 51 additions & 0 deletions
diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py
Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,6 @@
+- :class:`preprocessing.PowerTransformer` now uses ``scipy.stats.yeojohnson`` instead of
+  its own implementation of the Yeo-Johnson transform. This fixes numerical stability
+  issues (mostly overflows) with ``method="yeo-johnson"`` when SciPy is ``>= 1.12``.
+  Initial PR by :user:`Xuefeng Xu <xuefeng-xu>` completed by :user:`Mohamed Yaich <yaichm>`,
+  :user:`Oussama Er-rabie <eroussama>`, :user:`Mohammed Yaslam Dlimi <Dlimim>`,
+  :user:`Hamza Zaroual <HamzaLuffy>`, :user:`Amine Hannoun <AmineHannoun>` and :user:`Sylvain Marié <smarie>`.
@@ -6,7 +6,7 @@
 from numbers import Integral, Real
 
 import numpy as np
-from scipy import optimize, sparse, stats
+from scipy import sparse, stats
 from scipy.special import boxcox, inv_boxcox
 
 from sklearn.utils import metadata_routing
@@ -28,6 +28,7 @@
 )
 from ..utils._param_validation import Interval, Options, StrOptions, validate_params
 from ..utils.extmath import _incremental_mean_and_var, row_norms
+from ..utils.fixes import _yeojohnson_lambda
 from ..utils.sparsefuncs import (
     incr_mean_variance_axis,
     inplace_column_scale,
@@ -3542,8 +3543,8 @@ def _neg_log_likelihood(lmbda):
         # the computation of lambda is influenced by NaNs so we need to
         # get rid of them
         x = x[~np.isnan(x)]
-        # choosing bracket -2, 2 like for boxcox
-        return optimize.brent(_neg_log_likelihood, brack=(-2, 2))
+
+        return _yeojohnson_lambda(_neg_log_likelihood, x)
 
     def _check_input(self, X, in_fit, check_positive=False, check_shape=False):
         """Validate the input before fit and transform.
 
@@ -12,6 +12,7 @@
 from sklearn import config_context, datasets
 from sklearn.base import clone
 from sklearn.exceptions import NotFittedError
+from sklearn.externals._packaging.version import parse as parse_version
 from sklearn.metrics.pairwise import linear_kernel
 from sklearn.model_selection import cross_val_predict
 from sklearn.pipeline import Pipeline
@@ -62,6 +63,7 @@
     CSC_CONTAINERS,
     CSR_CONTAINERS,
     LIL_CONTAINERS,
+    sp_version,
 )
 from sklearn.utils.sparsefuncs import mean_variance_axis
 
@@ -2640,3 +2642,52 @@ def test_power_transformer_constant_feature(standardize):
             assert_allclose(Xt_, np.zeros_like(X))
         else:
             assert_allclose(Xt_, X)
+
+
+@pytest.mark.skipif(
+    sp_version < parse_version("1.12"),
+    reason="scipy version 1.12 required for stable yeo-johnson",
+)
+def test_power_transformer_no_warnings():
+    """Verify that PowerTransformer operates without raising any warnings on valid data.
+
+    This test addresses numerical issues with floating point numbers (mostly
+    overflows) with the Yeo-Johnson transform, see
+    https://github.com/scikit-learn/scikit-learn/issues/23319#issuecomment-1464933635
+    """
+    x = np.array(
+        [
+            2003.0,
+            1950.0,
+            1997.0,
+            2000.0,
+            2009.0,
+            2009.0,
+            1980.0,
+            1999.0,
+            2007.0,
+            1991.0,
+        ]
+    )
+
+    def _test_no_warnings(data):
+        """Internal helper to test for unexpected warnings."""
+        with warnings.catch_warnings(record=True) as caught_warnings:
+            warnings.simplefilter("always")  # Ensure all warnings are captured
+            PowerTransformer(method="yeo-johnson", standardize=True).fit_transform(data)
+
+        assert not caught_warnings, "Unexpected warnings were raised:\n" + "\n".join(
+            str(w.message) for w in caught_warnings
+        )
+
+    # Full dataset: Should not trigger overflow in variance calculation.
+    _test_no_warnings(x.reshape(-1, 1))
+
+    # Subset of data: Should not trigger overflow in power calculation.
+    _test_no_warnings(x[:5].reshape(-1, 1))
+
+
+def test_yeojohnson_for_different_scipy_version():
+    """Check that the results are consistent across different SciPy versions."""
+    pt = PowerTransformer(method="yeo-johnson").fit(X_1col)
+    assert pt.lambdas_[0] == pytest.approx(0.99546157, rel=1e-7)
@@ -14,6 +14,7 @@
 import scipy
 import scipy.sparse.linalg
 import scipy.stats
+from scipy import optimize
 
 try:
     import pandas as pd
@@ -80,6 +81,38 @@ def _sparse_linalg_cg(A, b, **kwargs):
         return scipy.sparse.linalg.cg(A, b, **kwargs)
 
 
+# TODO : remove this when required minimum version of scipy >= 1.9.0
+def _yeojohnson_lambda(_neg_log_likelihood, x):
+    """Return the Yeo-Johnson lambda estimate, dispatching on the SciPy version.
+
+    With SciPy >= 1.9.0 the estimate is the second value returned by
+    `scipy.stats.yeojohnson(x, lmbda=None)`. With older SciPy it is the
+    result of running `scipy.optimize.brent` on `_neg_log_likelihood`.
+
+    Parameters
+    ----------
+    _neg_log_likelihood : callable
+        Objective evaluated at candidate lambda values. It is only called on
+        the SciPy < 1.9.0 code path.
+
+    x : array-like
+        Data passed to `scipy.stats.yeojohnson`. It is only read on the
+        SciPy >= 1.9.0 code path.
+
+    Returns
+    -------
+    lmbda : float
+        The estimated lambda parameter for the Yeo-Johnson transformation.
+    """
+    if sp_version >= parse_version("1.9.0"):
+        # The transformed data is discarded; only the fitted lambda is needed.
+        _transformed, fitted_lmbda = scipy.stats.yeojohnson(x, lmbda=None)
+        return fitted_lmbda
+
+    # Older SciPy: choosing bracket -2, 2 like for boxcox
+    return optimize.brent(_neg_log_likelihood, brack=(-2, 2))
+
+
 # TODO: Fuse the modern implementations of _sparse_min_max and _sparse_nan_min_max
 # into the public min_max_axis function when Scipy 1.11 is the minimum supported
 # version and delete the backport in the else branch below.