-
Notifications
You must be signed in to change notification settings - Fork 21
[WIP] EHN/TST advance TTR #30
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,7 +13,7 @@ | |
import numpy as np | ||
import scipy.sparse as sp | ||
|
||
from ..base import (BaseEstimator, TransformerMixin, clone, is_classifier, | ||
from ..base import (BaseEstimator, TransformerMixin, RegressorMixin, clone, | ||
is_regressor) | ||
|
||
from ..utils.fixes import np_version | ||
|
@@ -29,6 +29,8 @@ | |
|
||
from ..externals import six | ||
|
||
from ._function_transformer import FunctionTransformer | ||
|
||
zip = six.moves.zip | ||
map = six.moves.map | ||
|
||
|
@@ -37,6 +39,7 @@ | |
'LabelBinarizer', | ||
'LabelEncoder', | ||
'MultiLabelBinarizer', | ||
'TransformedTargetRegressor' | ||
] | ||
|
||
|
||
|
@@ -54,69 +57,117 @@ def _check_numpy_unicode_bug(labels): | |
" NumPy to use LabelEncoder with unicode inputs.") | ||
|
||
|
||
class TargetTransformer(BaseEstimator): | ||
"""Meta-estimator to apply a transformation to the target before fitting | ||
class TransformedTargetRegressor(BaseEstimator, RegressorMixin): | ||
"""Meta-estimator to apply a transformation to the target before fitting. | ||
|
||
Useful for applying a non-linear transformation like np.log | ||
to the target in a regression problem. | ||
Useful for applying a non-linear transformation in regression | ||
problems. This transformation can be given as a Transformer such as the | ||
QuantileTransformer or as a function and its inverse such as ``np.log`` and | ||
``np.exp``. | ||
|
||
The computation during ``fit`` is:: | ||
|
||
estimator.fit(X, func(y)) | ||
|
||
or:: | ||
|
||
estimator.fit(X, transformer.transform(y)) | ||
|
||
The computation during ``predict`` is:: | ||
|
||
inverse_func(estimator.predict(X)) | ||
|
||
or:: | ||
|
||
transformer.inverse_transform(estimator.predict(X)) | ||
|
||
Parameters | ||
---------- | ||
estimator : object, (default=LinearRegression()) | ||
Estimator object derived from ``RegressorMixin``. | ||
|
||
transformer : object, (default=None) | ||
Estimator object derived from ``TransformerMixin``. Cannot be set at | ||
the same time as ``func`` and ``inverse_func``. If ``None`` and | ||
``func`` and ``inverse_func`` are ``None`` as well, the transformer will | ||
be an identity transformer. | ||
|
||
Parameter | ||
--------- | ||
estimator : object | ||
Estimator object to wrap. | ||
func : function, (default=None) | ||
Function to apply to ``y`` before passing to ``fit``. Cannot be set at | ||
the same time as ``transformer``. If ``None`` and ``transformer`` is | ||
``None`` as well, the function used will be the identity function. | ||
|
||
func : function | ||
Function to apply to y before passing to fit. | ||
inverse_func : function, (default=None) | ||
Function to apply to the prediction of the estimator. Cannot be set at | ||
the same time as ``transformer``. If ``None`` and ``transformer`` is | ||
``None`` as well, the function used will be the identity function. | ||
|
||
inverse_func : function | ||
Function apply to the prediction of the estimator. | ||
check_invertible : bool, (default=True) | ||
Whether to check that ``transform`` followed by ``inverse_transform`` | ||
or ``func`` followed by ``inverse_func`` lead to the original data. | ||
|
||
Attributes | ||
---------- | ||
estimator_ : object | ||
Fitted estimator. | ||
|
||
transformer_ : object | ||
Used transformer in ``fit`` and ``predict``. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Only present if … — There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Always, since we create a transformer from the functions. |
||
|
||
Examples | ||
-------- | ||
>>> import numpy as np | ||
>>> from sklearn.linear_model import LinearRegression | ||
>>> from sklearn.preprocessing.label import TargetTransformer | ||
>>> tt = TargetTransformer(LinearRegression(), func=np.log, | ||
... inverse_func=np.exp) | ||
>>> from sklearn.preprocessing.label import TransformedTargetRegressor | ||
>>> tt = TransformedTargetRegressor(estimator=LinearRegression(), | ||
... func=np.log, inverse_func=np.exp) | ||
>>> X = np.arange(4).reshape(-1, 1) | ||
>>> y = np.exp(2 * X).ravel() | ||
>>> tt.fit(X, y) | ||
... #doctest: +NORMALIZE_WHITESPACE | ||
TargetTransformer(estimator=LinearRegression(copy_X=True, | ||
fit_intercept=True, n_jobs=1, normalize=False), func=<ufunc 'log'>, | ||
inverse_func=<ufunc 'exp'>) | ||
TransformedTargetRegressor(check_invertible=True, | ||
estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, | ||
normalize=False), | ||
func=<ufunc 'log'>, inverse_func=<ufunc 'exp'>, transformer=None) | ||
>>> tt.score(X, y) | ||
1.0 | ||
>>> tt.estimator_.coef_ | ||
array([ 2.]) | ||
|
||
|
||
Notes | ||
----- | ||
It is not checked whether func and inverse_func are actually | ||
inverse to each other. | ||
""" | ||
def __init__(self, estimator, func, inverse_func): | ||
def __init__(self, estimator, transformer=None,
             func=None, inverse_func=None, check_invertible=True):
    """Store the constructor parameters on the instance.

    Parameters
    ----------
    estimator : object
        Estimator to wrap; ``fit`` works on a clone of it.

    transformer : object, (default=None)
        Transformer applied to the target.

    func : function, (default=None)
        Function applied to the target before fitting.

    inverse_func : function, (default=None)
        Function applied to the predictions of the estimator.

    check_invertible : bool, (default=True)
        Whether ``fit`` checks that the transformation round-trips.
    """
    # NOTE(review): the class docstring advertises LinearRegression() as the
    # default estimator, but this signature requires one to be passed.
    self.estimator = estimator
    self.transformer = transformer
    self.func = func
    self.inverse_func = inverse_func
    self.check_invertible = check_invertible
    # We probably need to change this once we have estimator tags.
    # Only mirror the wrapped estimator's type when it declares one: an
    # unconditional attribute read raised AttributeError here, which made
    # the "declared no _estimator_type" error path in ``score`` unreachable.
    if hasattr(estimator, "_estimator_type"):
        self._estimator_type = estimator._estimator_type
|
||
def _validate_transformer(self, y):
    """Create ``self.transformer_``, fit it on ``y`` and optionally check it.

    When ``transformer`` is given it is cloned; otherwise a
    ``FunctionTransformer`` is built from ``func`` and ``inverse_func``.

    Parameters
    ----------
    y : ndarray
        Target values; ``fit`` passes a 2D array.

    Raises
    ------
    ValueError
        If ``transformer`` is set together with ``func`` or
        ``inverse_func``, or if ``check_invertible`` is true and applying
        the transformation followed by its inverse does not give back the
        sampled targets.
    """
    functions_given = (self.func is not None or
                       self.inverse_func is not None)
    if self.transformer is not None and functions_given:
        raise ValueError("Both 'transformer' and functions 'func'/"
                         "'inverse_func' cannot be set at the same time.")
    if self.transformer is not None:
        self.transformer_ = clone(self.transformer)
    else:
        self.transformer_ = FunctionTransformer(
            func=self.func, inverse_func=self.inverse_func, validate=False)
    self.transformer_.fit(y)
    if self.check_invertible:
        n_samples = y.shape[0]
        n_subsample = min(1000, n_samples)
        # np.random.permutation() takes no ``size`` argument (passing one
        # is a TypeError), so shuffle every index and keep the first
        # ``n_subsample`` of them.
        subsample_idx = np.random.permutation(n_samples)[:n_subsample]
        y_sel = y[subsample_idx]
        round_trip = self.transformer_.inverse_transform(
            self.transformer_.transform(y_sel))
        diff = np.abs(y_sel - round_trip)
        if np.sum(diff) > 1e-7:
            raise ValueError("The provided functions or transformer are"
                             " not strictly inverse of each other.")
|
||
def fit(self, X, y, sample_weight=None): | ||
"""Fit the model according to the given training data. | ||
|
||
|
@@ -138,12 +189,23 @@ def fit(self, X, y, sample_weight=None): | |
self : object | ||
Returns self. | ||
""" | ||
# memorize if y should be a multi-output | ||
self.y_ndim_ = y.ndim | ||
if y.ndim == 1: | ||
y_2d = y.reshape(-1, 1) | ||
else: | ||
y_2d = y | ||
self._validate_transformer(y_2d) | ||
self.estimator_ = clone(self.estimator) | ||
self.estimator_.fit(X, self.func(y), sample_weight=sample_weight) | ||
self.estimator_.fit(X, self.transformer_.transform(y_2d), | ||
sample_weight=sample_weight) | ||
return self | ||
|
||
def predict(self, X): | ||
"""Predict using the base model, apply inverse_func. | ||
"""Predict using the base estimator, applying inverse. | ||
|
||
The estimator is used to predict and the ``inverse_func`` or | ||
``inverse_transform`` is applied before returning the prediction. | ||
|
||
Parameters | ||
---------- | ||
|
@@ -154,38 +216,56 @@ def predict(self, X): | |
------- | ||
y_hat : array, shape = (n_samples,) | ||
Predicted values. | ||
|
||
""" | ||
check_is_fitted(self, "estimator_") | ||
return self.inverse_func(self.estimator_.predict(X)) | ||
pred = self.transformer_.inverse_transform(self.estimator_.predict(X)) | ||
# if y is not a multi-output, it should be ravel | ||
if self.y_ndim_ == 1: | ||
return pred.ravel() | ||
else: | ||
return pred | ||
|
||
def score(self, X, y, sample_weight=None):
    """Returns the coefficient of determination R^2 of the prediction.

    The coefficient R^2 is defined as (1 - u/v), where u is the residual
    sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum
    of squares ((y_true - y_true.mean()) ** 2).sum(). Best possible score
    is 1.0 and it can be negative (because the model can be arbitrarily
    worse). A constant model that always predicts the expected value of y,
    disregarding the input features, would get a R^2 score of 0.0.

    Parameters
    ----------
    X : array-like, shape = (n_samples, n_features)
        Test samples.

    y : array-like, shape = (n_samples) or (n_samples, n_outputs)
        True values for X.

    sample_weight : array-like, shape = [n_samples], optional
        Sample weights.

    Returns
    -------
    score : float
        R^2 of self.predict(X) wrt. y.

    Raises
    ------
    NotImplementedError
        If the fitted estimator is not reported as a regressor by
        ``is_regressor``.

    Notes
    -----
    The ``r2_score`` uses variance-weighted averaging in the multi-output
    case.
    """
    check_is_fitted(self, "estimator_")
    if is_regressor(self.estimator_):
        return super(TransformedTargetRegressor, self).score(
            X, y, sample_weight)

    # Kept on purpose (review): without this guard a wrapped classifier
    # would silently be scored with R^2.
    if not hasattr(self.estimator_, "_estimator_type"):
        err = "estimator has declared no _estimator_type."
    else:
        err = "estimator has _estimator_type {}".format(
            self.estimator_._estimator_type)
    raise NotImplementedError("TransformedTargetRegressor should be a"
                              " regressor. This " + err)
|
||
|
||
class LabelEncoder(BaseEstimator, TransformerMixin): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,19 +14,25 @@ | |
from sklearn.utils.testing import assert_true | ||
from sklearn.utils.testing import assert_raises | ||
from sklearn.utils.testing import assert_raise_message | ||
from sklearn.utils.testing import assert_raises_regex | ||
from sklearn.utils.testing import ignore_warnings | ||
|
||
from sklearn.preprocessing.label import LabelBinarizer | ||
from sklearn.preprocessing.label import MultiLabelBinarizer | ||
from sklearn.preprocessing.label import LabelEncoder | ||
from sklearn.preprocessing.label import label_binarize | ||
from sklearn.preprocessing.label import TransformedTargetRegressor | ||
|
||
from sklearn.preprocessing.label import _inverse_binarize_thresholding | ||
from sklearn.preprocessing.label import _inverse_binarize_multiclass | ||
|
||
from sklearn.linear_model import LinearRegression | ||
from sklearn.preprocessing import StandardScaler | ||
|
||
from sklearn import datasets | ||
|
||
iris = datasets.load_iris() | ||
friedman = datasets.make_friedman1(random_state=0) | ||
|
||
|
||
def toarray(a): | ||
|
@@ -511,3 +517,45 @@ def test_inverse_binarize_multiclass(): | |
[0, 0, 0]]), | ||
np.arange(3)) | ||
assert_array_equal(got, np.array([1, 1, 0])) | ||
|
||
|
||
def test_transformed_target_regressor_error_kwargs():
    """``fit`` rejects a transformer combined with ``func``/``inverse_func``."""
    X, y = friedman
    expected_msg = ("Both 'transformer' and functions 'func'/'inverse_func'"
                    " cannot be set at the same time.")
    # A transformer and a function pair are both supplied on purpose.
    regr = TransformedTargetRegressor(
        estimator=LinearRegression(),
        transformer=StandardScaler(),
        func=np.exp,
        inverse_func=np.log,
    )
    assert_raises_regex(ValueError, expected_msg, regr.fit, X, y)
|
||
|
||
def test_transformed_target_regressor_invertible():
    """``check_invertible`` decides whether a non-inverse pair is rejected."""
    X, y = friedman
    expected_msg = ("The provided functions or transformer are not strictly"
                    " inverse of each other.")

    # np.exp is not its own inverse, so the check must fail when enabled.
    strict = TransformedTargetRegressor(
        estimator=LinearRegression(),
        func=np.exp,
        inverse_func=np.exp,
        check_invertible=True,
    )
    assert_raise_message(ValueError, expected_msg, strict.fit, X, y)

    # With the check disabled the same pair is accepted and fitting
    # should pass.
    lenient = TransformedTargetRegressor(
        estimator=LinearRegression(),
        func=np.exp,
        inverse_func=np.exp,
        check_invertible=False,
    )
    lenient.fit(X, y)
|
||
|
||
def test_target_transformer_friedman():
    """Predictions keep the target's shape and the transformation is applied.

    Each configuration is compared against a plain ``LinearRegression``
    trained on the manually transformed target, so the test fails if
    ``func``/``inverse_func`` or the transformer are silently ignored.
    """
    # TODO(review): targets of shape (n_samples, 1) and (n_samples, 3) are
    # still not covered.
    X, y = friedman

    # Function pair: train on log(y), map predictions back with exp.
    regr = TransformedTargetRegressor(estimator=LinearRegression(),
                                      func=np.log, inverse_func=np.exp)
    pred = regr.fit(X, y).predict(X)
    assert_equal(y.shape, pred.shape)
    reference = LinearRegression().fit(X, np.log(y))
    np.testing.assert_allclose(regr.estimator_.coef_.ravel(),
                               reference.coef_)
    np.testing.assert_allclose(pred, np.exp(reference.predict(X)))

    # Transformer object: train on the standardised target.
    regr = TransformedTargetRegressor(estimator=LinearRegression(),
                                      transformer=StandardScaler())
    pred = regr.fit(X, y).predict(X)
    assert_equal(y.shape, pred.shape)
    y_2d = y.reshape(-1, 1)
    scaler = StandardScaler().fit(y_2d)
    reference = LinearRegression().fit(X, scaler.transform(y_2d))
    np.testing.assert_allclose(regr.estimator_.coef_, reference.coef_)
    np.testing.assert_allclose(
        pred, scaler.inverse_transform(reference.predict(X)).ravel())
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
same