[WIP] EHN/TST advance TTR by glemaitre · Pull Request #30 · amueller/scikit-learn · GitHub

[WIP] EHN/TST advance TTR #30


Closed

Changes from all commits
172 changes: 126 additions & 46 deletions sklearn/preprocessing/label.py
@@ -13,7 +13,7 @@
import numpy as np
import scipy.sparse as sp

from ..base import (BaseEstimator, TransformerMixin, clone, is_classifier,
from ..base import (BaseEstimator, TransformerMixin, RegressorMixin, clone,
is_regressor)

from ..utils.fixes import np_version
@@ -29,6 +29,8 @@

from ..externals import six

from ._function_transformer import FunctionTransformer

zip = six.moves.zip
map = six.moves.map

@@ -37,6 +39,7 @@
'LabelBinarizer',
'LabelEncoder',
'MultiLabelBinarizer',
'TransformedTargetRegressor'
]


@@ -54,69 +57,117 @@ def _check_numpy_unicode_bug(labels):
" NumPy to use LabelEncoder with unicode inputs.")


class TargetTransformer(BaseEstimator):
"""Meta-estimator to apply a transformation to the target before fitting
class TransformedTargetRegressor(BaseEstimator, RegressorMixin):
"""Meta-estimator to apply a transformation to the target before fitting.

Useful for applying a non-linear transformation like np.log
to the target in a regression problem.
Useful for applying a non-linear transformation in regression
problems. This transformation can be given as a Transformer such as the
QuantileTransformer or as a function and its inverse such as ``np.log`` and
``np.exp``.

The computation during ``fit`` is::

estimator.fit(X, func(y))

or::

estimator.fit(X, transformer.transform(y))

The computation during ``predict`` is::

inverse_func(estimator.predict(X))

or::

transformer.inverse_transform(estimator.predict(X))

Parameters
----------
estimator : object, (default=LinearRegression())
Estimator object derived from ``RegressorMixin``.

transformer : object, (default=None)
Estimator object derived from ``TransformerMixin``. Cannot be set at
the same time as ``func`` and ``inverse_func``. If ``None`` and
``func`` and ``inverse_func`` are ``None`` as well, the transformer will
be an identity transformer.

Parameter
---------
estimator : object
Estimator object to wrap.
func : function, (default=None)
Function to apply to ``y`` before passing to ``fit``. Cannot be set at
the same time as ``transformer``. If ``None`` and ``transformer`` is
``None`` as well, the function used will be the identity function.

func : function
Function to apply to y before passing to fit.
inverse_func : function, (default=None)
Function to apply to the prediction of the estimator. Cannot be set at
the same time as ``transformer``. If ``None`` and ``transformer`` is
``None`` as well, the function used will be the identity function.

Review comment (Owner): same

inverse_func : function
Function apply to the prediction of the estimator.
check_invertible : bool, (default=True)
Whether to check that ``transform`` followed by ``inverse_transform``
or ``func`` followed by ``inverse_func`` leads back to the original data.

Attributes
----------
estimator_ : object
Fitted estimator.

transformer_ : object
Transformer used in ``fit`` and ``predict``.
Review comment (Owner): Only present if transformer is not None? Or always?

Review comment (Author): always since we create a transformer from the functions

Examples
--------
>>> import numpy as np
>>> from sklearn.linear_model import LinearRegression
>>> from sklearn.preprocessing.label import TargetTransformer
>>> tt = TargetTransformer(LinearRegression(), func=np.log,
... inverse_func=np.exp)
>>> from sklearn.preprocessing.label import TransformedTargetRegressor
>>> tt = TransformedTargetRegressor(estimator=LinearRegression(),
... func=np.log, inverse_func=np.exp)
>>> X = np.arange(4).reshape(-1, 1)
>>> y = np.exp(2 * X).ravel()
>>> tt.fit(X, y)
... #doctest: +NORMALIZE_WHITESPACE
TargetTransformer(estimator=LinearRegression(copy_X=True,
fit_intercept=True, n_jobs=1, normalize=False), func=<ufunc 'log'>,
inverse_func=<ufunc 'exp'>)
TransformedTargetRegressor(check_invertible=True,
estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1,
normalize=False),
func=<ufunc 'log'>, inverse_func=<ufunc 'exp'>, transformer=None)
>>> tt.score(X, y)
1.0
>>> tt.estimator_.coef_
array([ 2.])


Notes
-----
It is not checked whether func and inverse_func are actually
inverse to each other.
"""
def __init__(self, estimator, func, inverse_func):
def __init__(self, estimator, transformer=None,
             func=None, inverse_func=None, check_invertible=True):

Review comment (Owner): Didn't you say LinearRegression is the default estimator? That might be good for now before we get better common tests.
self.estimator = estimator
self.transformer = transformer
self.func = func
self.inverse_func = inverse_func
self.check_invertible = check_invertible
# we probably need to change this once we have tags
self._estimator_type = estimator._estimator_type

def _validate_transformer(self, y):
if (self.transformer is not None and
(self.func is not None or self.inverse_func is not None)):
raise ValueError("Both 'transformer' and functions 'func'/"
"'inverse_func' cannot be set at the same time.")
elif self.transformer is not None:
self.transformer_ = clone(self.transformer)
else:
self.transformer_ = FunctionTransformer(
func=self.func, inverse_func=self.inverse_func, validate=False)
self.transformer_.fit(y)
# XXX: is it a necessary test? one might not want an invertible
# function.

Review comment (Owner): I don't think you need the comment as we have the flag now.
if self.check_invertible:
n_subsample = min(1000, y.shape[0])
subsample_idx = np.random.permutation(y.shape[0])[:n_subsample]
diff = np.abs((y[subsample_idx] -
self.transformer_.inverse_transform(
self.transformer_.transform(y[subsample_idx]))))
if np.sum(diff) > 1e-7:
raise ValueError("The provided functions or transformer are"
" not strictly inverse of each other.")

def fit(self, X, y, sample_weight=None):
"""Fit the model according to the given training data.

@@ -138,12 +189,23 @@ def fit(self, X, y, sample_weight=None):
self : object
Returns self.
"""
# memorize whether y is multi-output so predict can restore its shape
self.y_ndim_ = y.ndim
if y.ndim == 1:
y_2d = y.reshape(-1, 1)
else:
y_2d = y
self._validate_transformer(y_2d)
self.estimator_ = clone(self.estimator)
self.estimator_.fit(X, self.func(y), sample_weight=sample_weight)
self.estimator_.fit(X, self.transformer_.transform(y_2d),
sample_weight=sample_weight)
return self
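The reshape above matters because scikit-learn transformers expect 2-d, column-shaped input. A quick illustration with StandardScaler (an example, not part of this diff):

import numpy as np
from sklearn.preprocessing import StandardScaler

y = np.array([1.0, 2.0, 3.0])
y_2d = y.reshape(-1, 1)                      # (3,) -> (3, 1), as in fit above
y_t = StandardScaler().fit_transform(y_2d)  # fine; the 1-d y itself would
                                            # trip the transformer's input
                                            # validation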

def predict(self, X):
"""Predict using the base model, apply inverse_func.
"""Predict using the base estimator, applying inverse.

The estimator is used to predict and the ``inverse_func`` or
``inverse_transform`` is applied before returning the prediction.

Parameters
----------
@@ -154,38 +216,56 @@ def predict(self, X):
-------
y_hat : array, shape = (n_samples,)
Predicted values.

"""
check_is_fitted(self, "estimator_")
return self.inverse_func(self.estimator_.predict(X))
pred = self.transformer_.inverse_transform(self.estimator_.predict(X))
# if y was not multi-output, ravel the prediction back to 1-d
if self.y_ndim_ == 1:
return pred.ravel()
else:
return pred

def score(self, X, y, sample_weight=None):
"""Computes score for regression and classification models.
"""Returns the coefficient of determination R^2 of the prediction.

The coefficient R^2 is defined as (1 - u/v), where u is the residual
sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum
of squares ((y_true - y_true.mean()) ** 2).sum(). Best possible score
is 1.0 and it can be negative (because the model can be arbitrarily
worse). A constant model that always predicts the expected value of y,
disregarding the input features, would get a R^2 score of 0.0.

Parameters
----------
X : array-like, shape = (n_samples, n_features)
Test samples.

y : array-like, shape = (n_samples) or (n_samples, n_outputs)
True values for X.

sample_weight : array-like, shape = [n_samples], optional
Sample weights.

Returns
-------
score : float
R^2 of self.predict(X) wrt. y.

For regression, r2_score is used, for classification accuracy_score.
See the docstring of these functions for more details.
The ``r2_score`` uses variance-weighted averaging in the multi-output
case.
"""

check_is_fitted(self, "estimator_")
if is_classifier(self.estimator):
from ..metrics import accuracy_score
return accuracy_score(y, self.predict(X),
sample_weight=sample_weight)
elif is_regressor(self.estimator):
from ..metrics import r2_score
return r2_score(y, self.predict(X), sample_weight=sample_weight,
multioutput='variance_weighted')
else:
# I'm not sure if this is too many internals for an error message
if not is_regressor(self.estimator_):
Review comment (Owner): Maybe we should even remove this? I guess it prevents people from using this with a classifier and then being surprised by R2, so on second thought leave it in.
if not hasattr(self.estimator_, "_estimator_type"):
err = "estimator has declared no _estimator_type."
else:
err = "estimator has _estimator_type {}".format(
self.estimator_._estimator_type)
raise NotImplementedError(
"TargetTransformer only implements a score method if "
"estimator is a classifier or regressor, but " + err)
raise NotImplementedError("TransformedTargetRegressor should be a"
" regressor. This " + err)
else:
return super(TransformedTargetRegressor, self).score(X, y,
sample_weight)
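The R^2 described in the score docstring above, written out as a formula (the standard definition, nothing PR-specific):

    R^2 = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2}

where \hat{y}_i is the prediction for sample i and \bar{y} is the mean of the true targets; the numerator is u and the denominator is v from the docstring.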


class LabelEncoder(BaseEstimator, TransformerMixin):
48 changes: 48 additions & 0 deletions sklearn/preprocessing/tests/test_label.py
@@ -14,19 +14,25 @@
from sklearn.utils.testing import assert_true
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_raise_message
from sklearn.utils.testing import assert_raises_regex
from sklearn.utils.testing import ignore_warnings

from sklearn.preprocessing.label import LabelBinarizer
from sklearn.preprocessing.label import MultiLabelBinarizer
from sklearn.preprocessing.label import LabelEncoder
from sklearn.preprocessing.label import label_binarize
from sklearn.preprocessing.label import TransformedTargetRegressor

from sklearn.preprocessing.label import _inverse_binarize_thresholding
from sklearn.preprocessing.label import _inverse_binarize_multiclass

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

from sklearn import datasets

iris = datasets.load_iris()
friedman = datasets.make_friedman1(random_state=0)


def toarray(a):
@@ -511,3 +517,45 @@ def test_inverse_binarize_multiclass():
[0, 0, 0]]),
np.arange(3))
assert_array_equal(got, np.array([1, 1, 0]))


def test_transformed_target_regressor_error_kwargs():
X = friedman[0]
y = friedman[1]
# provide a transformer and functions at the same time
clf = TransformedTargetRegressor(estimator=LinearRegression(),
transformer=StandardScaler(),
func=np.exp, inverse_func=np.log)
assert_raises_regex(ValueError, "Both 'transformer' and functions"
" 'func'/'inverse_func' cannot be set at the"
" same time.", clf.fit, X, y)


def test_transformed_target_regressor_invertible():
X = friedman[0]
y = friedman[1]
clf = TransformedTargetRegressor(estimator=LinearRegression(),
func=np.exp, inverse_func=np.exp,
check_invertible=True)
assert_raise_message(ValueError, "The provided functions or transformer"
" are not strictly inverse of each other.",
clf.fit, X, y)
clf = TransformedTargetRegressor(estimator=LinearRegression(),
func=np.exp, inverse_func=np.exp,
check_invertible=False)
# the transformer/functions are not checked to be invertible, so the
# fitting should pass
clf.fit(X, y)


def test_target_transformer_friedman():
Review comment (Owner): There is no test that the functions are actually applied, right?

Review comment (Owner): That, and as you said, you should have one with y.shape = (n_samples, 1) and one with y.shape = (n_samples, 3) or something.

X = friedman[0]
y = friedman[1]
clf = TransformedTargetRegressor(estimator=LinearRegression(),
func=np.log, inverse_func=np.exp)
pred = clf.fit(X, y).predict(X)
assert_equal(y.shape, pred.shape)
clf = TransformedTargetRegressor(estimator=LinearRegression(),
transformer=StandardScaler())
pred = clf.fit(X, y).predict(X)
assert_equal(y.shape, pred.shape)
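

Following the two review comments above, a hedged sketch of what the missing tests could look like: one asserting the functions are actually applied (by comparing against an estimator fitted on np.log(y) directly), one exercising a 2-d target. assert_array_almost_equal is assumed imported from sklearn.utils.testing; everything else is already imported in this file.

def test_transformed_target_regressor_functions_applied():
    X = friedman[0]
    y = friedman[1]
    clf = TransformedTargetRegressor(estimator=LinearRegression(),
                                     func=np.log, inverse_func=np.exp)
    clf.fit(X, y)
    # the inner estimator must have been fitted on log(y), not y
    inner = LinearRegression().fit(X, np.log(y).reshape(-1, 1))
    assert_array_almost_equal(clf.estimator_.coef_.ravel(),
                              inner.coef_.ravel())


def test_transformed_target_regressor_multioutput():
    X = friedman[0]
    y = np.vstack([friedman[1], friedman[1] ** 2 + 1]).T  # (n_samples, 2)
    clf = TransformedTargetRegressor(estimator=LinearRegression(),
                                     transformer=StandardScaler())
    pred = clf.fit(X, y).predict(X)
    assert_equal(y.shape, pred.shape)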