diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
index bb35a1db224b4..5a8cdafbf3c08 100644
--- a/doc/whats_new/v1.3.rst
+++ b/doc/whats_new/v1.3.rst
@@ -632,6 +632,11 @@ Changelog
   using `method="box-cox"` on data with a constant `np.nan` column.
   :pr:`26400` by :user:`Yao Xiao <Charlie-XIAO>`.
 
+- |Fix| :class:`preprocessing.PowerTransformer` with `method="yeo-johnson"` now leaves
+  constant features unchanged instead of transforming with an arbitrary value for
+  the `lambdas_` fitted parameter.
+  :pr:`26566` by :user:`Jérémie du Boisberranger <jeremiedbb>`.
+
 :mod:`sklearn.svm`
 ..................
 
diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py
index 013f1f57e9373..469948e129d49 100644
--- a/sklearn/preprocessing/_data.py
+++ b/sklearn/preprocessing/_data.py
@@ -3150,21 +3150,34 @@ def _fit(self, X, y=None, force_transform=False):
         if not self.copy and not force_transform:  # if call from fit()
             X = X.copy()  # force copy so that fit does not change X inplace
 
+        n_samples = X.shape[0]
+        mean = np.mean(X, axis=0, dtype=np.float64)
+        var = np.var(X, axis=0, dtype=np.float64)
+
         optim_function = {
             "box-cox": self._box_cox_optimize,
             "yeo-johnson": self._yeo_johnson_optimize,
         }[self.method]
+
+        transform_function = {
+            "box-cox": boxcox,
+            "yeo-johnson": self._yeo_johnson_transform,
+        }[self.method]
+
         with np.errstate(invalid="ignore"):  # hide NaN warnings
-            self.lambdas_ = np.array([optim_function(col) for col in X.T])
-
-        if self.standardize or force_transform:
-            transform_function = {
-                "box-cox": boxcox,
-                "yeo-johnson": self._yeo_johnson_transform,
-            }[self.method]
-            for i, lmbda in enumerate(self.lambdas_):
-                with np.errstate(invalid="ignore"):  # hide NaN warnings
-                    X[:, i] = transform_function(X[:, i], lmbda)
+            self.lambdas_ = np.empty(X.shape[1], dtype=X.dtype)
+            for i, col in enumerate(X.T):
+                # For yeo-johnson, leave constant features unchanged
+                # lambda=1 corresponds to the identity transformation
+                is_constant_feature = _is_constant_feature(var[i], mean[i], n_samples)
+                if self.method == "yeo-johnson" and is_constant_feature:
+                    self.lambdas_[i] = 1.0
+                    continue
+
+                self.lambdas_[i] = optim_function(col)
+
+                if self.standardize or force_transform:
+                    X[:, i] = transform_function(X[:, i], self.lambdas_[i])
 
         if self.standardize:
             self._scaler = StandardScaler(copy=False)
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index 2e6fd810fedac..c00de906a7dbb 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -2669,3 +2669,22 @@ def test_kernel_centerer_feature_names_out():
     names_out = centerer.get_feature_names_out()
     samples_out2 = X_pairwise.shape[1]
     assert_array_equal(names_out, [f"kernelcenterer{i}" for i in range(samples_out2)])
+
+
+@pytest.mark.parametrize("standardize", [True, False])
+def test_power_transformer_constant_feature(standardize):
+    """Check that PowerTransformer leaves constant features unchanged."""
+    X = [[-2, 0, 2], [-2, 0, 2], [-2, 0, 2]]
+
+    pt = PowerTransformer(method="yeo-johnson", standardize=standardize).fit(X)
+
+    assert_allclose(pt.lambdas_, [1, 1, 1])
+
+    Xft = pt.fit_transform(X)
+    Xt = pt.transform(X)
+
+    for Xt_ in [Xft, Xt]:
+        if standardize:
+            assert_allclose(Xt_, np.zeros_like(X))
+        else:
+            assert_allclose(Xt_, X)
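
The inline comment in the new `_fit` loop notes that lambda=1 corresponds to the identity transformation. That follows from both branches of the Yeo-Johnson formula and can be checked numerically with `scipy.stats.yeojohnson` (a quick sketch, not part of the patch):

    import numpy as np
    from scipy.stats import yeojohnson

    x = np.array([-2.0, 0.0, 2.0])
    # At lambda=1 both branches collapse to x itself:
    #   x >= 0: ((x + 1)**1 - 1) / 1                 = x
    #   x <  0: -(((1 - x)**(2 - 1) - 1)) / (2 - 1)  = x
    np.testing.assert_allclose(yeojohnson(x, lmbda=1.0), x)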
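
`_is_constant_feature`, called in the new loop, is an existing private helper in `sklearn/preprocessing/_data.py`: it flags a feature as constant when its computed variance falls within the floating-point error bound of the two-pass variance algorithm (Chan, Golub and LeVeque). A sketch of that check, paraphrased rather than quoted from the helper:

    import numpy as np

    def is_constant_feature(var, mean, n_samples):
        # A computed variance below the theoretical error bound of the
        # two-pass algorithm is indistinguishable from an exactly constant
        # feature in float64 arithmetic, so treat the feature as constant.
        eps = np.finfo(np.float64).eps
        upper_bound = n_samples * eps * var + (n_samples * mean * eps) ** 2
        return var <= upper_bound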
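
End to end, the user-visible behavior looks like this (a minimal sketch assuming a scikit-learn build that includes this fix; the toy data is illustrative, not taken from the test suite):

    import numpy as np
    from sklearn.preprocessing import PowerTransformer

    # First feature constant, second feature not.
    X = np.array([[3.0, 1.0], [3.0, 2.0], [3.0, 5.0]])

    pt = PowerTransformer(method="yeo-johnson", standardize=False).fit(X)

    # The constant feature now gets lambda=1 (identity) instead of an
    # arbitrary value from the likelihood optimization, so it passes
    # through transform() unchanged.
    print(pt.lambdas_[0])         # 1.0
    print(pt.transform(X)[:, 0])  # [3. 3. 3.]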