10000 [MRG] Added check for idempotence of fit() by NicolasHug · Pull Request #12328 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content
8000

[MRG] Added check for idempotence of fit() #12328

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Oct 29, 2018
6 changes: 6 additions & 0 deletions doc/whats_new/v0.21.rst
Original file line number Diff line number Diff line change
Expand Up @@ -90,3 +90,9 @@ Changes to estimator checks
---------------------------

These changes mostly affect library developers.

- Add ``check_fit_idempotent`` to
:func:`~utils.estimator_checks.check_estimator`, which checks that
when `fit` is called twice with the same data, the output of
`predict`, `predict_proba`, `transform`, and `decision_function` does not
change. :issue:`12328` by :user:`Nicolas Hug<NicolasHug>`
50 changes: 50 additions & 0 deletions sklearn/utils/estimator_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@
from sklearn.exceptions import DataConversionWarning
from sklearn.exceptions import SkipTestWarning
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection._validation import _safe_split
from sklearn.metrics.pairwise import (rbf_kernel, linear_kernel,
pairwise_distances)

Expand Down Expand Up @@ -266,6 +268,7 @@ def _yield_all_checks(name, estimator):
yield check_set_params
yield check_dict_unchanged
yield check_dont_overwrite_parameters
yield check_fit_idempotent


def check_estimator(Estimator):
Expand Down Expand Up @@ -2345,3 +2348,50 @@ def check_outliers_fit_predict(name, estimator_orig):
for contamination in [-0.5, 2.3]:
estimator.set_params(contamination=contamination)
assert_raises(ValueError, estimator.fit_predict, X)


def check_fit_idempotent(name, estimator_orig):
    # Verify that refitting on the same data is a no-op for predictions:
    # est.fit(X).fit(X) should behave exactly like est.fit(X). Comparing
    # the fitted attributes themselves (e.g. coef_) would require a
    # universal comparison routine full of edge cases, so we instead
    # compare the outputs of predict(), predict_proba(),
    # decision_function() and transform() before and after the refit.

    methods_to_check = ("predict", "transform", "decision_function",
                        "predict_proba")
    rng = np.random.RandomState(0)

    estimator = clone(estimator_orig)
    set_random_state(estimator)
    # warm_start=True would make the second fit deliberately path-dependent,
    # which is not what this check is about.
    if 'warm_start' in estimator.get_params():
        estimator.set_params(warm_start=False)

    n_samples = 100
    X = rng.normal(loc=100, size=(n_samples, 2))
    X = pairwise_estimator_convert_X(X, estimator)
    if is_regressor(estimator_orig):
        y = rng.normal(size=n_samples)
    else:
        y = rng.randint(low=0, high=2, size=n_samples)
    y = multioutput_estimator_convert_y_2d(estimator, y)

    train, test = next(ShuffleSplit(test_size=.2, random_state=rng).split(X))
    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    # First fit: record the test-set output of every method the estimator
    # actually provides.
    estimator.fit(X_train, y_train)
    first_results = {method: getattr(estimator, method)(X_test)
                     for method in methods_to_check
                     if hasattr(estimator, method)}

    # Second fit on identical data: each recorded output must be unchanged.
    estimator.fit(X_train, y_train)

    for method, expected in first_results.items():
        refit_result = getattr(estimator, method)(X_test)
        assert_allclose_dense_sparse(expected, refit_result)
0