[WIP] EHN/TST advance TTR by glemaitre · Pull Request #30 · amueller/scikit-learn · GitHub

[WIP] EHN/TST advance TTR #30


Closed

Changes from all commits
172 changes: 126 additions & 46 deletions sklearn/preprocessing/label.py
@@ -13,7 +13,7 @@
import numpy as np
import scipy.sparse as sp

from ..base import (BaseEstimator, TransformerMixin, clone, is_classifier,
from ..base import (BaseEstimator, TransformerMixin, RegressorMixin, clone,
is_regressor)

from ..utils.fixes import np_version
@@ -29,6 +29,8 @@

from ..externals import six

from ._function_transformer import FunctionTransformer

zip = six.moves.zip
map = six.moves.map

@@ -37,6 +39,7 @@
'LabelBinarizer',
'LabelEncoder',
'MultiLabelBinarizer',
'TransformedTargetRegressor'
]


@@ -54,69 +57,117 @@ def _check_numpy_unicode_bug(labels):
" NumPy to use LabelEncoder with unicode inputs.")


class TargetTransformer(BaseEstimator):
"""Meta-estimator to apply a transformation to the target before fitting
class TransformedTargetRegressor(BaseEstimator, RegressorMixin):
"""Meta-estimator to apply a transformation to the target before fitting.

Useful for applying a non-linear transformation like np.log
to the target in a regression problem.
Useful for applying a non-linear transformation in regression
problems. This transformation can be given as a Transformer such as the
QuantileTransformer or as a function and its inverse such as ``np.log`` and
``np.exp``.

The computation during ``fit`` is::

estimator.fit(X, func(y))

or::

estimator.fit(X, transformer.transform(y))

The computation during ``predict`` is::

inverse_func(estimator.predict(X))

or::

transformer.inverse_transform(estimator.predict(X))

Parameters
----------
estimator : object, (default=LinearRegression())
Estimator object derived from ``RegressorMixin``.

transformer : object, (default=None)
Estimator object derived from ``TransformerMixin``. Cannot be set at
the same time as ``func`` and ``inverse_func``. If ``None`` and
``func`` and ``inverse_func`` are ``None`` as well, the transformer will
be an identity transformer.

Parameter
---------
estimator : object
Estimator object to wrap.
func : function, (default=None)
Function to apply to ``y`` before passing to ``fit``. Cannot be set at
the same time as ``transformer``. If ``None`` and ``transformer`` is
``None`` as well, the function used will be the identity function.

func : function
Function to apply to y before passing to fit.
inverse_func : function, (default=None)
Function to apply to the prediction of the estimator. Cannot be set at
the same time as ``transformer``. If ``None`` and ``transformer`` is
``None`` as well, the function used will be the identity function.

Review comment (Owner): same

inverse_func : function
Function apply to the prediction of the estimator.
check_invertible : bool, (default=True)
Whether to check that ``transform`` followed by ``inverse_transform``
or ``func`` followed by ``inverse_func`` leads back to the original data.

Attributes
----------
estimator_ : object
Fitted estimator.

transformer_ : object
Transformer used in ``fit`` and ``predict``.
Review comment (Owner): Only present if transformer is not None? Or always?

Review comment (Author): always since we create a transformer from the functions

Examples
--------
>>> import numpy as np
>>> from sklearn.linear_model import LinearRegression
>>> from sklearn.preprocessing.label import TargetTransformer
>>> tt = TargetTransformer(LinearRegression(), func=np.log,
... inverse_func=np.exp)
>>> from sklearn.preprocessing.label import TransformedTargetRegressor
>>> tt = TransformedTargetRegressor(estimator=LinearRegression(),
... func=np.log, inverse_func=np.exp)
>>> X = np.arange(4).reshape(-1, 1)
>>> y = np.exp(2 * X).ravel()
>>> tt.fit(X, y)
... #doctest: +NORMALIZE_WHITESPACE
TargetTransformer(estimator=LinearRegression(copy_X=True,
fit_intercept=True, n_jobs=1, normalize=False), func=<ufunc 'log'>,
inverse_func=<ufunc 'exp'>)
TransformedTargetRegressor(check_invertible=True,
estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1,
normalize=False),
func=<ufunc 'log'>, inverse_func=<ufunc 'exp'>, transformer=None)
>>> tt.score(X, y)
1.0
>>> tt.estimator_.coef_
array([ 2.])


Notes
-----
It is not checked whether func and inverse_func are actually
inverse to each other.
"""
def __init__(self, estimator, func, inverse_func):
def __init__(self, estimator, transformer=None,
             func=None, inverse_func=None, check_invertible=True):

Review comment (Owner): Didn't you say LinearRegression is the default estimator? That might be good for now before we get better common tests.
self.estimator = estimator
self.transformer = transformer
self.func = func
self.inverse_func = inverse_func
self.check_invertible = check_invertible
# we probably need to change this once we have tags
self._estimator_type = estimator._estimator_type

def _validate_transformer(self, y):
if (self.transformer is not None and
(self.func is not None or self.inverse_func is not None)):
raise ValueError("Both 'transformer' and functions 'func'/"
"'inverse_func' cannot be set at the same time.")
elif self.transformer is not None:
self.transformer_ = clone(self.transformer)
else:
self.transformer_ = FunctionTransformer(
func=self.func, inverse_func=self.inverse_func, validate=False)
self.transformer_.fit(y)
# XXX: is it a necessary test? one might not want an invertible
# function.

Review comment (Owner): I don't think you need the comment as we have the flag now.
if self.check_invertible:
n_subsample = min(1000, y.shape[0])
subsample_idx = np.random.permutation(y.shape[0])[:n_subsample]
diff = np.abs((y[subsample_idx] -
self.transformer_.inverse_transform(
self.transformer_.transform(y[subsample_idx]))))
if np.sum(diff) > 1e-7:
raise ValueError("The provided functions or transformer are"
" not strictly inverse of each other.")

def fit(self, X, y, sample_weight=None):
"""Fit the model according to the given training data.

@@ -138,12 +189,23 @@ def fit(self, X, y, sample_weight=None):
self : object
Returns self.
"""
# memorize whether y is multi-output so predict can restore its shape
self.y_ndim_ = y.ndim
if y.ndim == 1:
y_2d = y.reshape(-1, 1)
else:
y_2d = y
self._validate_transformer(y_2d)
self.estimator_ = clone(self.estimator)
self.estimator_.fit(X, self.func(y), sample_weight=sample_weight)
self.estimator_.fit(X, self.transformer_.transform(y_2d),
sample_weight=sample_weight)
return self
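The reshape above matters because scikit-learn transformers expect 2-d, column-shaped input. A quick illustration with StandardScaler (an example, not part of this diff):

import numpy as np
from sklearn.preprocessing import StandardScaler

y = np.array([1.0, 2.0, 3.0])
y_2d = y.reshape(-1, 1)                      # (3,) -> (3, 1), as in fit above
y_t = StandardScaler().fit_transform(y_2d)  # fine; the 1-d y itself would
                                            # trip the transformer's input
                                            # validation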

def predict(self, X):
"""Predict using the base model, apply inverse_func.
"""Predict using the base estimator, applying inverse.

The estimator is used to predict and the ``inverse_func`` or
``inverse_transform`` is applied before returning the prediction.

Parameters
----------
@@ -154,38 +216,56 @@ def predict(self, X):
-------
y_hat : array, shape = (n_samples,)
Predicted values.

"""
check_is_fitted(self, "estimator_")
return self.inverse_func(self.estimator_.predict(X))
pred = self.transformer_.inverse_transform(self.estimator_.predict(X))
# if y was not multi-output, ravel the prediction back to 1-d
if self.y_ndim_ == 1:
return pred.ravel()
else:
return pred

def score(self, X, y, sample_weight=None):
"""Computes score for regression and classification models.
"""Returns the coefficient of determination R^2 of the prediction.

The coefficient R^2 is defined as (1 - u/v), where u is the residual
sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum
of squares ((y_true - y_true.mean()) ** 2).sum(). Best possible score
is 1.0 and it can be negative (because the model can be arbitrarily
worse). A constant model that always predicts the expected value of y,
disregarding the input features, would get a R^2 score of 0.0.

Parameters
----------
X : array-like, shape = (n_samples, n_features)
Test samples.

y : array-like, shape = (n_samples) or (n_samples, n_outputs)
True values for X.

sample_weight : array-like, shape = [n_samples], optional
Sample weights.

Returns
-------
score : float
R^2 of self.predict(X) wrt. y.

For regression, r2_score is used, for classification accuracy_score.
See the docstring of these functions for more details.
The ``r2_score`` uses variance-weighted averaging in the multi-output
case.
"""

check_is_fitted(self, "estimator_")
if is_classifier(self.estimator):
from ..metrics import accuracy_score
return accuracy_score(y, self.predict(X),
sample_weight=sample_weight)
elif is_regressor(self.estimator):
from ..metrics import r2_score
return r2_score(y, self.predict(X), sample_weight=sample_weight,
multioutput='variance_weighted')
else:
# I'm not sure if this is too many internals for an error message
if not is_regressor(self.estimator_):
Review comment (Owner): Maybe we should even remove this? I guess it prevents people from using this with a classifier and then being surprised by R2, so on second thought leave it in.
if not hasattr(self.estimator_, "_estimator_type"):
err = "estimator has declared no _estimator_type."
else:
err = "estimator has _estimator_type {}".format(
self.estimator_._estimator_type)
raise NotImplementedError(
"TargetTransformer only implements a score method if "
"estimator is a classifier or regressor, but " + err)
raise NotImplementedError("TransformedTargetRegressor should be a"
" regressor. This " + err)
else:
return super(TransformedTargetRegressor, self).score(X, y,
sample_weight)
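The R^2 described in the score docstring above, written out as a formula (the standard definition, nothing PR-specific):

    R^2 = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2}

where \hat{y}_i is the prediction for sample i and \bar{y} is the mean of the true targets; the numerator is u and the denominator is v from the docstring.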


class LabelEncoder(BaseEstimator, TransformerMixin):
48 changes: 48 additions & 0 deletions sklearn/preprocessing/tests/test_label.py
@@ -14,19 +14,25 @@
from sklearn.utils.testing import assert_true
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_raise_message
from sklearn.utils.testing import assert_raises_regex
from sklearn.utils.testing import ignore_warnings

from sklearn.preprocessing.label import LabelBinarizer
from sklearn.preprocessing.label import MultiLabelBinarizer
from sklearn.preprocessing.label import LabelEncoder
from sklearn.preprocessing.label import label_binarize
from sklearn.preprocessing.label import TransformedTargetRegressor

from sklearn.preprocessing.label import _inverse_binarize_thresholding
from sklearn.preprocessing.label import _inverse_binarize_multiclass

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

from sklearn import datasets

iris = datasets.load_iris()
friedman = datasets.make_friedman1(random_state=0)


def toarray(a):
@@ -511,3 +517,45 @@ def test_inverse_binarize_multiclass():
[0, 0, 0]]),
np.arange(3))
assert_array_equal(got, np.array([1, 1, 0]))


def test_transformed_target_regressor_error_kwargs():
X = friedman[0]
y = friedman[1]
# provide a transformer and functions at the same time
clf = TransformedTargetRegressor(estimator=LinearRegression(),
transformer=StandardScaler(),
func=np.exp, inverse_func=np.log)
assert_raises_regex(ValueError, "Both 'transformer' and functions"
" 'func'/'inverse_func' cannot be set at the"
" same time.", clf.fit, X, y)


def test_transformed_target_regressor_invertible():
X = friedman[0]
y = friedman[1]
clf = TransformedTargetRegressor(estimator=LinearRegression(),
func=np.exp, inverse_func=np.exp,
check_invertible=True)
assert_raise_message(ValueError, "The provided functions or transformer"
" are not strictly inverse of each other.",
clf.fit, X, y)
clf = TransformedTargetRegressor(estimator=LinearRegression(),
func=np.exp, inverse_func=np.exp,
check_invertible=False)
# the transformer/functions are not checked to be invertible, so the
# fitting should pass
clf.fit(X, y)


def test_target_transformer_friedman():
Review comment (Owner): There is no test that the functions are actually applied, right?

Review comment (Owner): That, and as you said, you should have one with y.shape = (n_samples, 1) and one with y.shape = (n_samples, 3) or something.

X = friedman[0]
y = friedman[1]
clf = TransformedTargetRegressor(estimator=LinearRegression(),
func=np.log, inverse_func=np.exp)
pred = clf.fit(X, y).predict(X)
assert_equal(y.shape, pred.shape)
clf = TransformedTargetRegressor(estimator=LinearRegression(),
transformer=StandardScaler())
pred = clf.fit(X, y).predict(X)
assert_equal(y.shape, pred.shape)
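

Following the two review comments above, a hedged sketch of what the missing tests could look like: one asserting the functions are actually applied (by comparing against an estimator fitted on np.log(y) directly), one exercising a 2-d target. assert_array_almost_equal is assumed imported from sklearn.utils.testing; everything else is already imported in this file.

def test_transformed_target_regressor_functions_applied():
    X = friedman[0]
    y = friedman[1]
    clf = TransformedTargetRegressor(estimator=LinearRegression(),
                                     func=np.log, inverse_func=np.exp)
    clf.fit(X, y)
    # the inner estimator must have been fitted on log(y), not y
    inner = LinearRegression().fit(X, np.log(y).reshape(-1, 1))
    assert_array_almost_equal(clf.estimator_.coef_.ravel(),
                              inner.coef_.ravel())


def test_transformed_target_regressor_multioutput():
    X = friedman[0]
    y = np.vstack([friedman[1], friedman[1] ** 2 + 1]).T  # (n_samples, 2)
    clf = TransformedTargetRegressor(estimator=LinearRegression(),
                                     transformer=StandardScaler())
    pred = clf.fit(X, y).predict(X)
    assert_equal(y.shape, pred.shape)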