FIX YeoJohnson transform lambda bounds (#12522) · xhluca/scikit-learn@2585410 · GitHub
[go: up one dir, main page]

Skip to content

Commit 2585410

Browse files
NicolasHug
authored and
Xing
committed
FIX YeoJohnson transform lambda bounds (scikit-learn#12522)
1 parent c657b03 commit 2585410

File tree

3 files changed

+25
-25
lines changed

3 files changed

+25
-25
lines changed

doc/whats_new/v0.20.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,10 @@ Changelog
149149
:issue:`12317` by :user:`Eric Chang <chang>`.
150150

151151

152+
- |Fix| Fixed a bug in :class:`preprocessing.PowerTransformer` where the
153+
Yeo-Johnson transform was incorrect for lambda parameters outside of `[0, 2]`.
154+
:issue:`12522` by :user:`Nicolas Hug<NicolasHug>`.
155+
152156
:mod:`sklearn.utils`
153157
........................
154158

sklearn/preprocessing/data.py

Lines changed: 11 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -2537,7 +2537,7 @@ class PowerTransformer(BaseEstimator, TransformerMixin):
25372537
>>> print(pt.fit(data))
25382538
PowerTransformer(copy=True, method='yeo-johnson', standardize=True)
25392539
>>> print(pt.lambdas_)
2540-
[1.38668178e+00 5.93926346e-09]
2540+
[ 1.38668178 -3.10053309]
25412541
>>> print(pt.transform(data))
25422542
[[-1.31616039 -0.70710678]
25432543
[ 0.20998268 -0.70710678]
@@ -2718,23 +2718,18 @@ def _box_cox_inverse_tranform(self, x, lmbda):
27182718
def _yeo_johnson_inverse_transform(self, x, lmbda):
27192719
"""Return inverse-transformed input x following Yeo-Johnson inverse
27202720
transform with parameter lambda.
2721-
2722-
Notes
2723-
-----
2724-
We're comparing lmbda to 1e-19 instead of strict equality to 0. See
2725-
scipy/special/_boxcox.pxd for a rationale behind this
27262721
"""
2727-
x_inv = np.zeros(x.shape, dtype=x.dtype)
2722+
x_inv = np.zeros_like(x)
27282723
pos = x >= 0
27292724

27302725
# when x >= 0
2731-
if lmbda < 1e-19:
2726+
if abs(lmbda) < np.spacing(1.):
27322727
x_inv[pos] = np.exp(x[pos]) - 1
27332728
else: # lmbda != 0
27342729
x_inv[pos] = np.power(x[pos] * lmbda + 1, 1 / lmbda) - 1
27352730

27362731
# when x < 0
2737-
if lmbda < 2 - 1e-19:
2732+
if abs(lmbda - 2) > np.spacing(1.):
27382733
x_inv[~pos] = 1 - np.power(-(2 - lmbda) * x[~pos] + 1,
27392734
1 / (2 - lmbda))
27402735
else: # lmbda == 2
[diff line elided by page-transfer artifact]
@@ -2745,27 +2740,22 @@ def _yeo_johnson_inverse_transform(self, x, lmbda):
27452740
def _yeo_johnson_transform(self, x, lmbda):
27462741
"""Return transformed input x following Yeo-Johnson transform with
27472742
parameter lambda.
2748-
2749-
Notes
2750-
-----
2751-
We're comparing lmbda to 1e-19 instead of strict equality to 0. See
2752-
scipy/special/_boxcox.pxd for a rationale behind this
27532743
"""
27542744

2755-
out = np.zeros(shape=x.shape, dtype=x.dtype)
2745+
out = np.zeros_like(x)
27562746
pos = x >= 0 # binary mask
27572747

27582748
# when x >= 0
2759-
if lmbda < 1e-19:
2760-
out[pos] = np.log(x[pos] + 1)
2749+
if abs(lmbda) < np.spacing(1.):
2750+
out[pos] = np.log1p(x[pos])
27612751
else: # lmbda != 0
27622752
out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda
27632753

27642754
# when x < 0
2765-
if lmbda < 2 - 1e-19:
2755+
if abs(lmbda - 2) > np.spacing(1.):
27662756
out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
27672757
else: # lmbda == 2
2768-
out[~pos] = -np.log(-x[~pos] + 1)
2758+
out[~pos] = -np.log1p(-x[~pos])
27692759

27702760
return out
27712761

@@ -2794,12 +2784,8 @@ def _neg_log_likelihood(lmbda):
27942784
x_trans = self._yeo_johnson_transform(x, lmbda)
27952785
n_samples = x.shape[0]
27962786

2797-
# Estimated mean and variance of the normal distribution
2798-
est_mean = x_trans.sum() / n_samples
2799-
est_var = np.power(x_trans - est_mean, 2).sum() / n_samples
2800-
2801-
loglike = -n_samples / 2 * np.log(est_var)
2802-
loglike += (lmbda - 1) * (np.sign(x) * np.log(np.abs(x) + 1)).sum()
2787+
loglike = -n_samples / 2 * np.log(x_trans.var())
2788+
loglike += (lmbda - 1) * (np.sign(x) * np.log1p(np.abs(x))).sum()
28032789

28042790
return -loglike
28052791

sklearn/preprocessing/tests/test_data.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2345,6 +2345,16 @@ def test_optimization_power_transformer(method, lmbda):
23452345
assert_almost_equal(1, X_inv_trans.std(), decimal=1)
23462346

23472347

2348+
def test_yeo_johnson_darwin_example():
2349+
# test from original paper "A new family of power transformations to
2350+
# improve normality or symmetry" by Yeo and Johnson.
2351+
X = [6.1, -8.4, 1.0, 2.0, 0.7, 2.9, 3.5, 5.1, 1.8, 3.6, 7.0, 3.0, 9.3,
2352+
7.5, -6.0]
2353+
X = np.array(X).reshape(-1, 1)
2354+
lmbda = PowerTransformer(method='yeo-johnson').fit(X).lambdas_
2355+
assert np.allclose(lmbda, 1.305, atol=1e-3)
2356+
2357+
23482358
@pytest.mark.parametrize('method', ['box-cox', 'yeo-johnson'])
23492359
def test_power_transformer_nans(method):
23502360
# Make sure lambda estimation is not influenced by NaN values

0 commit comments

Comments (0)