Fix PowerTransformer leaves constant feature unchanged by jeremiedbb · Pull Request #26566 · scikit-learn/scikit-learn

Merged
5 changes: 5 additions & 0 deletions doc/whats_new/v1.3.rst
@@ -632,6 +632,11 @@ Changelog
   using `method="box-cox"` on data with a constant `np.nan` column.
   :pr:`26400` by :user:`Yao Xiao <Charlie-XIAO>`.
 
+- |Fix| :class:`preprocessing.PowerTransformer` with `method="yeo-johnson"` now leaves
+  constant features unchanged instead of transforming them with an arbitrary value for
+  the `lambdas_` fitted parameter.
+  :pr:`26566` by :user:`Jérémie du Boisberranger <jeremiedbb>`.
+
 :mod:`sklearn.svm`
 ..................
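A minimal sketch of the behavior this entry describes (the array values below are illustrative, not taken from the PR):

    import numpy as np
    from sklearn.preprocessing import PowerTransformer

    # One constant column next to a varying one.
    X = np.array([[2.0, 1.0], [2.0, 5.0], [2.0, 10.0]])

    pt = PowerTransformer(method="yeo-johnson", standardize=False).fit(X)
    print(pt.lambdas_[0])         # now exactly 1.0: the identity transformation
    print(pt.transform(X)[:, 0])  # the constant column passes through unchanged: [2. 2. 2.]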
33 changes: 23 additions & 10 deletions sklearn/preprocessing/_data.py
@@ -3150,21 +3150,34 @@ def _fit(self, X, y=None, force_transform=False):
         if not self.copy and not force_transform:  # if call from fit()
             X = X.copy()  # force copy so that fit does not change X inplace
 
+        n_samples = X.shape[0]
+        mean = np.mean(X, axis=0, dtype=np.float64)
+        var = np.var(X, axis=0, dtype=np.float64)
+
         optim_function = {
             "box-cox": self._box_cox_optimize,
             "yeo-johnson": self._yeo_johnson_optimize,
         }[self.method]
+
+        transform_function = {
+            "box-cox": boxcox,
+            "yeo-johnson": self._yeo_johnson_transform,
+        }[self.method]
+
         with np.errstate(invalid="ignore"):  # hide NaN warnings
-            self.lambdas_ = np.array([optim_function(col) for col in X.T])
-
-        if self.standardize or force_transform:
-            transform_function = {
-                "box-cox": boxcox,
-                "yeo-johnson": self._yeo_johnson_transform,
-            }[self.method]
-            for i, lmbda in enumerate(self.lambdas_):
-                with np.errstate(invalid="ignore"):  # hide NaN warnings
-                    X[:, i] = transform_function(X[:, i], lmbda)
+            self.lambdas_ = np.empty(X.shape[1], dtype=X.dtype)
+            for i, col in enumerate(X.T):
+                # For yeo-johnson, leave constant features unchanged
+                # lambda=1 corresponds to the identity transformation
+                is_constant_feature = _is_constant_feature(var[i], mean[i], n_samples)
+                if self.method == "yeo-johnson" and is_constant_feature:
+                    self.lambdas_[i] = 1.0
+                    continue
+
+                self.lambdas_[i] = optim_function(col)
+
+                if self.standardize or force_transform:
+                    X[:, i] = transform_function(X[:, i], self.lambdas_[i])
 
         if self.standardize:
             self._scaler = StandardScaler(copy=False)
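Why `lambdas_[i] = 1.0` is the right sentinel: the Yeo-Johnson transform with lambda = 1 reduces to the identity on both of its branches, so pinning the fitted parameter to 1 for a numerically constant feature guarantees that `transform` leaves it untouched. (The constancy test itself is delegated to the private `_is_constant_feature` helper, which allows for floating-point error in the computed variance rather than requiring it to be exactly zero.) A quick standalone check of the lambda = 1 identity, re-implementing the transform for illustration only (the real code path is the private `_yeo_johnson_transform` method):

    import numpy as np

    def yeo_johnson(x, lmbda):
        # Piecewise Yeo-Johnson transform (illustration only).
        out = np.empty_like(x, dtype=float)
        pos = x >= 0
        if abs(lmbda) < np.spacing(1.0):
            out[pos] = np.log1p(x[pos])
        else:
            out[pos] = ((x[pos] + 1) ** lmbda - 1) / lmbda
        if abs(lmbda - 2) < np.spacing(1.0):
            out[~pos] = -np.log1p(-x[~pos])
        else:
            out[~pos] = -((-x[~pos] + 1) ** (2 - lmbda) - 1) / (2 - lmbda)
        return out

    x = np.array([-2.0, 0.0, 2.0])
    print(np.allclose(yeo_johnson(x, 1.0), x))  # True: lambda=1 maps x to x on both branches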
19 changes: 19 additions & 0 deletions sklearn/preprocessing/tests/test_data.py
@@ -2669,3 +2669,22 @@ def test_kernel_centerer_feature_names_out():
     names_out = centerer.get_feature_names_out()
     samples_out2 = X_pairwise.shape[1]
     assert_array_equal(names_out, [f"kernelcenterer{i}" for i in range(samples_out2)])
+
+
+@pytest.mark.parametrize("standardize", [True, False])
+def test_power_transformer_constant_feature(standardize):
+    """Check that PowerTransformer leaves constant features unchanged."""
+    X = [[-2, 0, 2], [-2, 0, 2], [-2, 0, 2]]
+
+    pt = PowerTransformer(method="yeo-johnson", standardize=standardize).fit(X)
+
+    assert_allclose(pt.lambdas_, [1, 1, 1])
+
+    Xft = pt.fit_transform(X)
+    Xt = pt.transform(X)
+
+    for Xt_ in [Xft, Xt]:
+        if standardize:
+            assert_allclose(Xt_, np.zeros_like(X))
+        else:
+            assert_allclose(Xt_, X)
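A note on the `standardize=True` case asserted above: the constant column survives the Yeo-Johnson step unchanged, and the subsequent standardization maps it to zeros because scikit-learn leaves the scale of a zero-variance feature at 1 and only subtracts the mean. A minimal sketch of that behavior in isolation:

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    X = np.full((3, 1), 2.0)  # a single constant feature
    print(StandardScaler().fit_transform(X))  # all zeros: (2.0 - 2.0) / 1.0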