8000 FIX YeoJohnson transform lambda bounds (#12522) · jnothman/scikit-learn@b71d7c8 · GitHub
[go: up one dir, main page]

Skip to content

Commit b71d7c8

Browse files
NicolasHug authored and jnothman committed
FIX YeoJohnson transform lambda bounds (scikit-learn#12522)
1 parent a1c835a commit b71d7c8

File tree

3 files changed

+25
-25
lines changed

3 files changed

+25
-25
lines changed

doc/whats_new/v0.20.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,10 @@ Changelog
149149
:issue:`12317` by :user:`Eric Chang <chang>`.
150150

151151

152+
- |Fix| Fixed a bug in :class:`preprocessing.PowerTransformer` where the
153+
Yeo-Johnson transform was incorrect for lambda parameters outside of `[0, 2]`.
154+
:issue:`12522` by :user:`Nicolas Hug <NicolasHug>`.
155+
152156
:mod:`sklearn.utils`
153157
........................
154158

sklearn/preprocessing/data.py

Lines changed: 11 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -2528,7 +2528,7 @@ class PowerTransformer(BaseEstimator, TransformerMixin):
25282528
>>> print(pt.fit(data))
25292529
PowerTransformer(copy=True, method='yeo-johnson', standardize=True)
25302530
>>> print(pt.lambdas_)
2531-
[1.38668178e+00 5.93926346e-09]
2531+
[ 1.38668178 -3.10053309]
25322532
>>> print(pt.transform(data))
25332533
[[-1.31616039 -0.70710678]
25342534
[ 0.20998268 -0.70710678]
@@ -2709,23 +2709,18 @@ def _box_cox_inverse_tranform(self, x, lmbda):
27092709
def _yeo_johnson_inverse_transform(self, x, lmbda):
27102710
"""Return inverse-transformed input x following Yeo-Johnson inverse
27112711
transform with parameter lambda.
2712-
2713-
Notes
2714-
-----
2715-
We're comparing lmbda to 1e-19 instead of strict equality to 0. See
2716-
scipy/special/_boxcox.pxd for a rationale behind this
27172712
"""
2718-
x_inv = np.zeros(x.shape, dtype=x.dtype)
2713+
x_inv = np.zeros_like(x)
27192714
pos = x >= 0
27202715

27212716
# when x >= 0
2722-
if lmbda < 1e-19:
2717+
if abs(lmbda) < np.spacing(1.):
27232718
x_inv[pos] = np.exp(x[pos]) - 1
27242719
else: # lmbda != 0
27252720
x_inv[pos] = np.power(x[pos] * lmbda + 1, 1 / lmbda) - 1
27262721

27272722
# when x < 0
2728-
if lmbda < 2 - 1e-19:
2723+
if abs(lmbda - 2) > np.spacing(1.):
27292724
x_inv[~pos] = 1 - np.power(-(2 - lmbda) * x[~pos] + 1,
27302725
1 / (2 - lmbda))
27312726
else: # lmbda == 2
@@ -2736,27 +2731,22 @@ def _yeo_johnson_inverse_transform(self, x, lmbda):
27362731
def _yeo_johnson_transform(self, x, lmbda):
27372732
"""Return transformed input x following Yeo-Johnson transform with
27382733
parameter lambda.
2739-
2740-
Notes
2741-
-----
2742-
We're comparing lmbda to 1e-19 instead of strict equality to 0. See
2743-
scipy/special/_boxcox.pxd for a rationale behind this
27442734
"""
27452735

2746-
out = np.zeros(shape=x.shape, dtype=x.dtype)
2736+
out = np.zeros_like(x)
27472737
pos = x >= 0 # binary mask
27482738

27492739
# when x >= 0
2750-
if lmbda < 1e-19:
2751-
out[pos] = np.log(x[pos] + 1)
2740+
if abs(lmbda) < np.spacing(1.):
2741+
out[pos] = np.log1p(x[pos])
27522742
else: # lmbda != 0
27532743
out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda
27542744

27552745
# when x < 0
2756-
if lmbda < 2 - 1e-19:
2746+
if abs(lmbda - 2) > np.spacing(1.):
27572747
out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
27582748
else: # lmbda == 2
2759-
out[~pos] = -np.log(-x[~pos] + 1)
2749+
out[~pos] = -np.log1p(-x[~pos])
27602750

27612751
return out
27622752

@@ -2785,12 +2775,8 @@ def _neg_log_likelihood(lmbda):
27852775
x_trans = self._yeo_johnson_transform(x, lmbda)
27862776
n_samples = x.shape[0]
27872777

2788-
# Estimated mean and variance of the normal distribution
2789-
est_mean = x_trans.sum() / n_samples
2790-
est_var = np.power(x_trans - est_mean, 2).sum() / n_samples
2791-
2792-
loglike = -n_samples / 2 * np.log(est_var)
2793-
loglike += (lmbda - 1) * (np.sign(x) * np.log(np.abs(x) + 1)).sum()
2778+
loglike = -n_samples / 2 * np.log(x_trans.var())
2779+
loglike += (lmbda - 1) * (np.sign(x) * np.log1p(np.abs(x))).sum()
27942780

27952781
return -loglike
27962782

sklearn/preprocessing/tests/test_data.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2207,6 +2207,16 @@ def test_optimization_power_transformer(method, lmbda):
22072207
assert_almost_equal(1, X_inv_trans.std(), decimal=1)
22082208

22092209

2210+
def test_yeo_johnson_darwin_example():
2211+
# test from original paper "A new family of power transformations to
2212+
# improve normality or symmetry" by Yeo and Johnson.
2213+
X = [6.1, -8.4, 1.0, 2.0, 0.7, 2.9, 3.5, 5.1, 1.8, 3.6, 7.0, 3.0, 9.3,
2214+
7.5, -6.0]
2215+
X = np.array(X).reshape(-1, 1)
2216+
lmbda = PowerTransformer(method='yeo-johnson').fit(X).lambdas_
2217+
assert np.allclose(lmbda, 1.305, atol=1e-3)
2218+
2219+
22102220
@pytest.mark.parametrize('method', ['box-cox', 'yeo-johnson'])
22112221
def test_power_transformer_nans(method):
22122222
# Make sure lambda estimation is not influenced by NaN values

0 commit comments

Comments (0)