[WIP] API: allow cross validation to work with partial_fit by stsievert · Pull Request #11266 · scikit-learn/scikit-learn
Open · wants to merge 7 commits into base: main
6 changes: 6 additions & 0 deletions doc/whats_new/v0.20.rst
@@ -109,6 +109,12 @@ Model evaluation
  ``'balanced_accuracy'`` scorer for binary classification.
  :issue:`8066` by :user:`xyguo` and :user:`Aman Dalmia <dalmia>`.

- Keyword arguments have been added to :func:`model_selection.cross_validate`
  to allow calling ``estimator.partial_fit`` before scoring and to allow an
  explicit validation set to be specified. These are the keyword arguments
  ``partial_fit``, ``X_test`` and ``y_test``. :issue:`11266` by
  :user:`Scott Sievert <stsievert>`.

Decomposition, manifold learning and clustering

- :class:`cluster.AgglomerativeClustering` now supports Single Linkage
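To make the changelog entry above concrete, here is a minimal sketch of the new keywords in use (the call pattern follows this PR's docstrings and tests, so it runs only against this branch):

```python
# Sketch based on this PR's docstrings; SGDClassifier needs the full class
# list on the first partial_fit call, hence fit_params={'classes': ...}.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_validate

X, y = make_classification(n_samples=100, random_state=0)
clf = SGDClassifier(random_state=0)

# partial_fit=5: call estimator.partial_fit five times per split
# instead of a single fit call.
scores = cross_validate(clf, X, y, partial_fit=5,
                        fit_params={'classes': np.unique(y)})

# X_test/y_test: score every split against one fixed validation set;
# training then uses all of X and y rather than per-split train indices.
X_val, y_val = make_classification(n_samples=100, random_state=1)
scores_val = cross_validate(clf, X, y, X_test=X_val, y_test=y_val)
```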
1 change: 1 addition & 0 deletions sklearn/model_selection/_split.py
@@ -2058,6 +2058,7 @@ def train_test_split(*arrays, **options):
# Tell nose that train_test_split is not a test
train_test_split.__test__ = False


def _build_repr(self):
    # XXX This is copied from BaseEstimator's get_params
    cls = self.__class__
82 changes: 64 additions & 18 deletions sklearn/model_selection/_validation.py
@@ -40,15 +40,18 @@
def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None,
                   n_jobs=1, verbose=0, fit_params=None,
                   pre_dispatch='2*n_jobs', return_train_score="warn",
-                  return_estimator=False):
+                  return_estimator=False, partial_fit=False, X_test=None,
+                  y_test=None):
"""Evaluate metric(s) by cross-validation and also record fit/score times.

Read more in the :ref:`User Guide <multimetric_cross_validation>`.

Parameters
----------
estimator : estimator object implementing 'fit'
The object to use to fit the data.
estimator : estimator object implementing 'fit', list of estimators
The object to use to fit the data. If a list, do not clone each
estimator and it must be the same length as the number of cross
validation splits.

X : array-like
The data to fit. Can be for example a list, or an array.
@@ -133,6 +136,21 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None,
    return_estimator : boolean, default False
        Whether to return the estimators fitted on each split.

    partial_fit : boolean or integer, default False
        If False (default), call ``fit``. If True, call
        ``estimator.partial_fit`` once. If an integer, call ``partial_fit``
        that many times. ``estimator`` is assumed to be pickleable if
        ``partial_fit`` is not True.

    X_test : array-like, optional
        If present, use this as the validation set and treat ``X`` and ``y``
        as the training set.

    y_test : array-like, optional
        If present, use this as the validation targets and treat ``X`` and
        ``y`` as the training set.

    Returns
    -------
    scores : dict of float arrays of shape=(n_splits,)
@@ -205,18 +223,18 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None,
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
-    scorers, _ = _check_multimetric_scoring(estimator, scoring=scoring)
+    ests = _get_estimators(estimator, cv.get_n_splits(X, y, groups))
+    scorers, _ = _check_multimetric_scoring(ests[0], scoring=scoring)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    scores = parallel(
        delayed(_fit_and_score)(
-            clone(estimator), X, y, scorers, train, test, verbose, None,
-            fit_params, return_train_score=return_train_score,
-            return_times=True, return_estimator=return_estimator)
-        for train, test in cv.split(X, y, groups))
+            est, X, y, scorers, train, test, verbose, None, fit_params,
+            return_train_score=return_train_score, return_times=True,
+            return_estimator=return_estimator, partial_fit=partial_fit,
+            X_test=X_test, y_test=y_test)
+        for est, (train, test) in zip(ests, cv.split(X, y, groups)))

    zipped_scores = list(zip(*scores))
    if return_train_score:
@@ -252,6 +270,21 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None,
    return ret


def _get_estimators(estimator, n):
    # A list or tuple of estimators is used as-is (one per split, without
    # cloning); a single estimator is cloned once per split.
    if isinstance(estimator, tuple):
        estimator = list(estimator)
    if isinstance(estimator, list):
        ret = estimator
        if len(ret) != n:
            msg = ('the number of estimators ({n}) being fit is not equal to '
                   'the number of splits for cross validation ({n_splits}).')
            raise ValueError(msg.format(n=len(ret), n_splits=n))
        return ret
    else:
        ret = [clone(estimator) for _ in range(n)]
        return ret


def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None,
                    n_jobs=1, verbose=0, fit_params=None,
                    pre_dispatch='2*n_jobs'):
@@ -369,7 +402,8 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
                   parameters, fit_params, return_train_score=False,
                   return_parameters=False, return_n_test_samples=False,
                   return_times=False, return_estimator=False,
-                  error_score='raise-deprecating'):
+                  error_score='raise-deprecating', partial_fit=False,
+                  X_test=None, y_test=None):
"""Fit estimator and compute scores for a given dataset split.

Parameters
@@ -431,6 +465,12 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
    return_estimator : boolean, optional, default: False
        Whether to return the fitted estimator.

    partial_fit : boolean or integer, default False
        If False (default), call ``fit``. If True, call
        ``estimator.partial_fit`` once. If an integer, call ``partial_fit``
        that many times. ``estimator`` is assumed to be pickleable if
        ``partial_fit`` is not True.

    Returns
    -------
    train_scores : dict of scorer name -> float, optional
@@ -469,23 +509,29 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
                           for k, v in fit_params.items()])

    train_scores = {}
-    if parameters is not None:
+    if parameters is not None and not partial_fit:
        estimator.set_params(**parameters)

    start_time = time.time()

-    X_train, y_train = _safe_split(estimator, X, y, train)
-    X_test, y_test = _safe_split(estimator, X, y, test, train)
+    if X_test is None and y_test is None:
+        X_train, y_train = _safe_split(estimator, X, y, train)
+        X_test, y_test = _safe_split(estimator, X, y, test, train)
+    else:
+        X_train, y_train = X, y
+    train_data = (X_train, y_train) if y_train is not None else (X_train, )

    is_multimetric = not callable(scorer)
    n_scorers = len(scorer.keys()) if is_multimetric else 1

    try:
-        if y_train is None:
-            estimator.fit(X_train, **fit_params)
+        if not isinstance(partial_fit, (bool, int)):
+            raise ValueError('partial_fit must be a boolean or an integer')
+        if not partial_fit:
+            estimator.fit(*train_data, **fit_params)
        else:
-            estimator.fit(X_train, y_train, **fit_params)
+            for _ in range(int(partial_fit)):
+                estimator.partial_fit(*train_data, **fit_params)
    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
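The heart of the change above is the ``fit``/``partial_fit`` dispatch at the end of ``_fit_and_score``. Pulled out as a standalone sketch (the helper name ``_do_fit`` is hypothetical, not part of the PR):

```python
def _do_fit(estimator, train_data, fit_params, partial_fit=False):
    # Mirrors the dispatch in _fit_and_score: False -> a single fit call;
    # True -> one partial_fit call (int(True) == 1); integer n -> n calls.
    if not isinstance(partial_fit, (bool, int)):
        raise ValueError('partial_fit must be a boolean or an integer')
    if not partial_fit:
        estimator.fit(*train_data, **fit_params)
    else:
        for _ in range(int(partial_fit)):
            estimator.partial_fit(*train_data, **fit_params)
    return estimator
```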
88 changes: 88 additions & 0 deletions sklearn/model_selection/tests/test_validation.py
@@ -618,6 +618,94 @@ def assert_fit_params(clf):
    cross_val_score(clf, X, y, fit_params=fit_params)


@pytest.mark.parametrize("partial_fit", [True, 4, 8])
def test_cross_validate_fit_kwarg(partial_fit):
X, y = make_classification(n_samples=20, n_classes=2, random_state=0)
classes = np.unique(y)

tol = -np.inf
clf_p_fit = SGDClassifier(random_state=0, tol=tol, max_iter=10)
clf_fit = SGDClassifier(random_state=0, tol=tol, max_iter=100)

cv = 2
# scores_partial_fit
scores_p_fit = cross_validate(clf_p_fit, X, y, partial_fit=partial_fit,
fit_params={'classes': classes},
return_estimator=True, cv=cv)
# score_fit
scores_fit = cross_validate(clf_fit, X, y, return_estimator=True, cv=cv)
assert_true(set(scores_p_fit.keys()) == set(scores_fit.keys()))

clfs_p_fit = scores_p_fit['estimator']
clfs_fit = scores_fit['estimator']
for clf_fit, clf_p_fit in zip(clfs_fit, clfs_p_fit):
assert_true(clf_p_fit.t_ * 10 < clf_fit.t_)
assert_true(clf_p_fit.t_ - 1 ==
int(partial_fit * X.shape[0] * (cv - 1) / cv))


@pytest.mark.parametrize("partial_fit", ['foo', 1.0, 1, True])
def test_cross_validate_fit_kwarg_raises(partial_fit):
clf = SGDClassifier(random_state=0)
X, y = make_classification(n_samples=20, n_classes=2, random_state=0)
classes = np.unique(y)

if isinstance(partial_fit, (bool, int)):
cross_validate(clf, X, y, partial_fit=partial_fit,
fit_params={'classes': classes})
else:
with pytest.raises(ValueError, match='partial_fit must be'):
cross_validate(clf, X, y, partial_fit=partial_fit,
fit_params={'classes': classes})
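A note on the parametrization above: ``bool`` is a subclass of ``int`` in Python, so both ``True`` and ``1`` pass the ``isinstance(partial_fit, (bool, int))`` check, while ``1.0`` and ``'foo'`` raise. A quick illustration:

```python
# bool subclasses int, so True and 1 are accepted; 1.0 and 'foo' are not.
for value in ['foo', 1.0, 1, True]:
    print(repr(value), isinstance(value, (bool, int)))
# 'foo' False / 1.0 False / 1 True / True True
```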


def test_cross_validate_val_set():
    n, d = 100, 2
    X_train, y_train = make_classification(n_samples=n, n_classes=2,
                                           n_features=d, random_state=0,
                                           n_redundant=0, n_informative=d)
    rng = np.random.RandomState(0)
    X_test = rng.randn(n, d)
    y_test = (np.sign(rng.randn(n)) + 1) / 2

    clf = SGDClassifier(random_state=0)
    r = cross_validate(clf, X_train, y_train, X_test=X_test, y_test=y_test)

    assert_true(r['test_score'].mean() < 0.48 < 0.85 < r['train_score'].mean())


def test_cross_validate_repeated_call():
    n, d = 100, 80
    cv = 3
    X, y = make_classification(n_samples=n, n_features=d, n_classes=2,
                               random_state=0, n_redundant=0,
                               n_informative=2)
    classes = np.unique(y)
    one_epoch = X.shape[0] * (cv - 1) / cv

    clf = SGDClassifier(random_state=0)
    ret1 = cross_validate(clf, X, y, fit_params={'classes': classes},
                          return_estimator=True, partial_fit=True, cv=cv)
    iters1 = [(est.t_ - 1) / one_epoch for est in ret1['estimator']]
    assert isinstance(ret1['estimator'], tuple)
    assert all([isinstance(e, BaseEstimator) for e in ret1['estimator']])

    ret2 = cross_validate(ret1['estimator'], X, y, return_estimator=True,
                          partial_fit=True, cv=cv)

    assert set(ret1.keys()) == set(ret2.keys())
    for k, v1 in ret1.items():
        v2 = ret2[k]
        assert len(v1) == len(v2)
        if k == 'train_score':
            assert v1.mean() < 0.90 < 0.93 < v2.mean()
        if k == 'test_score':
            assert v1.mean() < 0.73 < 0.75 < v2.mean()

    iters2 = [(est.t_ - 1) / one_epoch for est in ret2['estimator']]
    assert sum(iters1) / cv == 1 and sum(iters2) / cv == 2
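One subtlety behind the final assertion: with 100 samples and 3 folds the training folds differ in size (e.g. 66/67/67), so ``(t_ - 1) / one_epoch`` is not exactly 1 for each fold; only the sum over folds works out, because every sample lands in exactly ``cv - 1`` training sets. A quick check of the arithmetic (fold sizes here are an assumed even 3-way split):

```python
# Every sample appears in cv - 1 training folds, so per-pass t_ increments
# sum to n * (cv - 1) even though individual folds differ in size.
n, cv = 100, 3
train_sizes = [66, 67, 67]              # an even 3-way split of 100 samples
assert sum(train_sizes) == n * (cv - 1)
one_epoch = n * (cv - 1) / cv           # 66.67, as in the test above
print(sum(size / one_epoch for size in train_sizes) / cv)  # ~1.0
```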


def test_cross_val_score_score_func():
    clf = MockClassifier()
    _score_func_args = []