Fix PowerTransformer leaves constant feature unchanged (#26566) · scikit-learn/scikit-learn@e5df5fe

Commit e5df5fe

jeremiedbb and ogrisel authored
Fix PowerTransformer leaves constant feature unchanged (#26566)
Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
1 parent 83cb686 commit e5df5fe

File tree: 3 files changed (+47, -10 lines)

doc/whats_new/v1.3.rst (5 additions & 0 deletions)

@@ -640,6 +640,11 @@ Changelog
   using `method="box-cox"` on data with a constant `np.nan` column.
   :pr:`26400` by :user:`Yao Xiao <Charlie-XIAO>`.
 
+- |Fix| :class:`preprocessing.PowerTransformer` with `method="yeo-johnson"` now leaves
+  constant features unchanged instead of transforming with an arbitrary value for
+  the `lambdas_` fitted parameter.
+  :pr:`26566` by :user:`Jérémie du Boisberranger <jeremiedbb>`.
+
 :mod:`sklearn.svm`
 ..................
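
For illustration, here is a minimal sketch of the fixed behavior (the constant data is made up; the expected values follow from the test added in this commit):

    import numpy as np
    from sklearn.preprocessing import PowerTransformer

    # A constant feature: every sample holds the same value.
    X = np.full((5, 1), 3.0)

    pt = PowerTransformer(method="yeo-johnson", standardize=False)
    Xt = pt.fit_transform(X)

    print(pt.lambdas_)         # [1.] -- identity, not an arbitrary fitted value
    print(np.allclose(Xt, X))  # True -- the constant feature passes through unchanged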

sklearn/preprocessing/_data.py (23 additions & 10 deletions)

@@ -3150,21 +3150,34 @@ def _fit(self, X, y=None, force_transform=False):
         if not self.copy and not force_transform:  # if call from fit()
             X = X.copy()  # force copy so that fit does not change X inplace
 
+        n_samples = X.shape[0]
+        mean = np.mean(X, axis=0, dtype=np.float64)
+        var = np.var(X, axis=0, dtype=np.float64)
+
         optim_function = {
             "box-cox": self._box_cox_optimize,
             "yeo-johnson": self._yeo_johnson_optimize,
         }[self.method]
+
+        transform_function = {
+            "box-cox": boxcox,
+            "yeo-johnson": self._yeo_johnson_transform,
+        }[self.method]
+
         with np.errstate(invalid="ignore"):  # hide NaN warnings
-            self.lambdas_ = np.array([optim_function(col) for col in X.T])
-
-        if self.standardize or force_transform:
-            transform_function = {
-                "box-cox": boxcox,
-                "yeo-johnson": self._yeo_johnson_transform,
-            }[self.method]
-            for i, lmbda in enumerate(self.lambdas_):
-                with np.errstate(invalid="ignore"):  # hide NaN warnings
-                    X[:, i] = transform_function(X[:, i], lmbda)
+            self.lambdas_ = np.empty(X.shape[1], dtype=X.dtype)
+            for i, col in enumerate(X.T):
+                # For yeo-johnson, leave constant features unchanged
+                # lambda=1 corresponds to the identity transformation
+                is_constant_feature = _is_constant_feature(var[i], mean[i], n_samples)
+                if self.method == "yeo-johnson" and is_constant_feature:
+                    self.lambdas_[i] = 1.0
+                    continue
+
+                self.lambdas_[i] = optim_function(col)
+
+                if self.standardize or force_transform:
+                    X[:, i] = transform_function(X[:, i], self.lambdas_[i])
 
         if self.standardize:
             self._scaler = StandardScaler(copy=False)
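
Two notes on the change. First, `_is_constant_feature(var[i], mean[i], n_samples)` is a scikit-learn private helper that, judging from its arguments, flags features whose variance is indistinguishable from zero at floating-point precision rather than requiring exact equality. Second, `lambdas_[i] = 1.0` is safe because lambda = 1 is the identity on both branches of the piecewise Yeo-Johnson definition, so skipping the transform with `continue` cannot change the output. A standalone sketch verifying this (a simplified re-implementation for illustration, not scikit-learn's private `_yeo_johnson_transform`):

    import numpy as np

    def yeo_johnson(x, lmbda):
        # Piecewise Yeo-Johnson transform (simplified; no NaN handling).
        x = np.asarray(x, dtype=np.float64)
        out = np.empty_like(x)
        pos = x >= 0
        if lmbda != 0:
            out[pos] = ((x[pos] + 1) ** lmbda - 1) / lmbda  # x >= 0 branch
        else:
            out[pos] = np.log1p(x[pos])
        if lmbda != 2:
            out[~pos] = -((-x[~pos] + 1) ** (2 - lmbda) - 1) / (2 - lmbda)  # x < 0 branch
        else:
            out[~pos] = -np.log1p(-x[~pos])
        return out

    x = np.array([-2.0, 0.0, 2.0])
    # With lmbda=1: ((x + 1)**1 - 1) / 1 == x for x >= 0, and
    # -((-x + 1)**(2 - 1) - 1) / (2 - 1) == x for x < 0, i.e. the identity.
    print(np.allclose(yeo_johnson(x, lmbda=1.0), x))  # True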

sklearn/preprocessing/tests/test_data.py (19 additions & 0 deletions)

@@ -2669,3 +2669,22 @@ def test_kernel_centerer_feature_names_out():
     names_out = centerer.get_feature_names_out()
     samples_out2 = X_pairwise.shape[1]
     assert_array_equal(names_out, [f"kernelcenterer{i}" for i in range(samples_out2)])
+
+
+@pytest.mark.parametrize("standardize", [True, False])
+def test_power_transformer_constant_feature(standardize):
+    """Check that PowerTransformer leaves constant features unchanged."""
+    X = [[-2, 0, 2], [-2, 0, 2], [-2, 0, 2]]
+
+    pt = PowerTransformer(method="yeo-johnson", standardize=standardize).fit(X)
+
+    assert_allclose(pt.lambdas_, [1, 1, 1])
+
+    Xft = pt.fit_transform(X)
+    Xt = pt.transform(X)
+
+    for Xt_ in [Xft, Xt]:
+        if standardize:
+            assert_allclose(Xt_, np.zeros_like(X))
+        else:
+            assert_allclose(Xt_, X)
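
The `standardize=True` branch expects all zeros because the Yeo-Johnson step now passes each constant column through unchanged and the subsequent `StandardScaler` centers it at its mean; scikit-learn's scaler leaves zero-variance columns with a unit scale, so no division blow-up occurs.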
