Revert "FIX YeoJohnson transform lambda bounds (#12522)" · xhluca/scikit-learn@3695472 · GitHub
[go: up one dir, main page]

Skip to content

Commit 3695472

Browse files
author
Xing
authored
Revert "FIX YeoJohnson transform lambda bounds (scikit-learn#12522)"
This reverts commit 2585410.
1 parent 1a894b4 commit 3695472

File tree

3 files changed

+25
-25
lines changed

3 files changed

+25
-25
lines changed

doc/whats_new/v0.20.rst

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -149,10 +149,6 @@ Changelog
149149
:issue:`12317` by :user:`Eric Chang <chang>`.
150150

151151

152-
- |Fix| Fixed a bug in :class:`preprocessing.PowerTransformer` where the
153-
Yeo-Johnson transform was incorrect for lambda parameters outside of `[0, 2]`
154-
:issue:`12522` by :user:`Nicolas Hug<NicolasHug>`.
155-
156152
:mod:`sklearn.utils`
157153
........................
158154

sklearn/preprocessing/data.py

Lines changed: 25 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2537,7 +2537,7 @@ class PowerTransformer(BaseEstimator, TransformerMixin):
25372537
>>> print(pt.fit(data))
25382538
PowerTransformer(copy=True, method='yeo-johnson', standardize=True)
25392539
>>> print(pt.lambdas_)
2540-
[ 1.38668178 -3.10053309]
2540+
[1.38668178e+00 5.93926346e-09]
25412541
>>> print(pt.transform(data))
25422542
[[-1.31616039 -0.70710678]
25432543
[ 0.20998268 -0.70710678]
@@ -2718,18 +2718,23 @@ def _box_cox_inverse_tranform(self, x, lmbda):
27182718
def _yeo_johnson_inverse_transform(self, x, lmbda):
27192719
"""Return inverse-transformed input x following Yeo-Johnson inverse
27202720
transform with parameter lambda.
2721+
2722+
Notes
2723+
-----
2724+
We're comparing lmbda to 1e-19 instead of strict equality to 0. See
2725+
scipy/special/_boxcox.pxd for a rationale behind this
27212726
"""
2722-
x_inv = np.zeros_like(x)
2727+
x_inv = np.zeros(x.shape, dtype=x.dtype)
27232728
pos = x >= 0
27242729

27252730
# when x >= 0
2726-
if abs(lmbda) < np.spacing(1.):
2731+
if lmbda < 1e-19:
27272732
x_inv[pos] = np.exp(x[pos]) - 1
27282733
else: # lmbda != 0
27292734
x_inv[pos] = np.power(x[pos] * lmbda + 1, 1 / lmbda) - 1
27302735

27312736
# when x < 0
2732-
if abs(lmbda - 2) > np.spacing(1.):
2737+
if lmbda < 2 - 1e-19:
27332738
x_inv[~pos] = 1 - np.power(-(2 - lmbda) * x[~pos] + 1,
27342739
1 / (2 - lmbda))
27352740
else: # lmbda == 2
@@ -2740,22 +2745,27 @@ def _yeo_johnson_inverse_transform(self, x, lmbda):
27402745
def _yeo_johnson_transform(self, x, lmbda):
27412746
"""Return transformed input x following Yeo-Johnson transform with
27422747
parameter lambda.
2748+
2749+
Notes
2750+
-----
2751+
We're comparing lmbda to 1e-19 instead of strict equality to 0. See
2752+
scipy/special/_boxcox.pxd for a rationale behind this
27432753
"""
27442754

2745-
out = np.zeros_like(x)
2755+
out = np.zeros(shape=x.shape, dtype=x.dtype)
27462756
pos = x >= 0 # binary mask
27472757

27482758
# when x >= 0
2749-
if abs(lmbda) < np.spacing(1.):
2750-
out[pos] = np.log1p(x[pos])
2759+
if lmbda < 1e-19:
2760+
out[pos] = np.log(x[pos] + 1)
27512761
else: # lmbda != 0
27522762
out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda
27532763

27542764
# when x < 0
2755-
if abs(lmbda - 2) > np.spacing(1.):
2765+
if lmbda < 2 - 1e-19:
27562766
out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
27572767
else: # lmbda == 2
2758-
out[~pos] = -np.log1p(-x[~pos])
2768+
out[~pos] = -np.log(-x[~pos] + 1)
27592769

27602770
return out
27612771

@@ -2784,8 +2794,12 @@ def _neg_log_likelihood(lmbda):
27842794
x_trans = self._yeo_johnson_transform(x, lmbda)
27852795
n_samples = x.shape[0]
27862796

2787-
loglike = -n_samples / 2 * np.log(x_trans.var())
2788-
loglike += (lmbda - 1) * (np.sign(x) * np.log1p(np.abs(x))).sum()
2797+
# Estimated mean and variance of the normal distribution
2798+
est_mean = x_trans.sum() / n_samples
2799+
est_var = np.power(x_trans - est_mean, 2).sum() / n_samples
2800+
2801+
loglike = -n_samples / 2 * np.log(est_var)
2802+
loglike += (lmbda - 1) * (np.sign(x) * np.log(np.abs(x) + 1)).sum()
27892803

27902804
return -loglike
27912805

sklearn/preprocessing/tests/test_data.py

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2345,16 +2345,6 @@ def test_optimization_power_transformer(method, lmbda):
23452345
assert_almost_equal(1, X_inv_trans.std(), decimal=1)
23462346

23472347

2348-
def test_yeo_johnson_darwin_example():
2349-
# test from original paper "A new family of power transformations to
2350-
# improve normality or symmetry" by Yeo and Johnson.
2351-
X = [6.1, -8.4, 1.0, 2.0, 0.7, 2.9, 3.5, 5.1, 1.8, 3.6, 7.0, 3.0, 9.3,
2352-
7.5, -6.0]
2353-
X = np.array(X).reshape(-1, 1)
2354-
lmbda = PowerTransformer(method='yeo-johnson').fit(X).lambdas_
2355-
assert np.allclose(lmbda, 1.305, atol=1e-3)
2356-
2357-
23582348
@pytest.mark.parametrize('method', ['box-cox', 'yeo-johnson'])
23592349
def test_power_transformer_nans(method):
23602350
# Make sure lambda estimation is not influenced by NaN values

0 commit comments

Comments (0)