8000 add partial_fit to multioupt module · scikit-learn/scikit-learn@7b42573 · GitHub
[go: up one dir, main page]

Skip to content

Commit 7b42573

Browse files
committed
add partial_fit to multioupt module
1 parent 3162f98 commit 7b42573

File tree

2 files changed

+167
-11
lines changed

2 files changed

+167
-11
lines changed

sklearn/multioutput.py

Lines changed: 110 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,15 @@
1515
# License: BSD 3 clause
1616

1717
import numpy as np
18+
import copy
1819

1920
from abc import ABCMeta
2021
from .base import BaseEstimator, clone
2122
from .base import RegressorMixin, ClassifierMixin
2223
from .utils import check_array, check_X_y
2324
from .utils.fixes import parallel_helper
2425
from .utils.validation import check_is_fitted, has_fit_parameter
26+
from .utils.metaestimators import if_delegate_has_method
2527
from .externals.joblib import Parallel, delayed
2628
from .externals import six
2729

@@ -37,12 +39,87 @@ def _fit_estimator(estimator, X, y, sample_weight=None):
3739
return estimator
3840

3941

42+
def _partial_fit_estimator(estimator, X, y, classes=None, sample_weight=None,
                           first_time=True):
    """Run one ``partial_fit`` step on a copy of ``estimator`` and return it.

    When ``first_time`` is true the estimator is passed through ``clone``;
    otherwise a shallow copy (``copy.copy``) of the given estimator is
    updated. ``classes`` and ``sample_weight`` are forwarded to
    ``partial_fit`` as keyword arguments only when they are not None.
    """
    worker = clone(estimator) if first_time else copy.copy(estimator)

    # Forward only the optional keyword arguments that were actually given.
    optional = {}
    if classes is not None:
        optional['classes'] = classes
    if sample_weight is not None:
        optional['sample_weight'] = sample_weight

    worker.partial_fit(X, y, **optional)
    return worker
61+
62+
4063
class MultiOutputEstimator(six.with_metaclass(ABCMeta, BaseEstimator)):
4164

4265
def __init__(self, estimator, n_jobs=1):
    """Store the base estimator and the ``n_jobs`` setting unchanged."""
    self.n_jobs = n_jobs
    self.estimator = estimator
4568

69+
@if_delegate_has_method('estimator')
def partial_fit(self, X, y, classes=None, sample_weight=None):
    """Incrementally fit the model to data.

    Fit a separate model for each output variable, calling the
    ``partial_fit`` method of one estimator per column of ``y``.

    Parameters
    ----------
    X : (sparse) array-like, shape (n_samples, n_features)
        Data.

    y : (sparse) array-like, shape (n_samples, n_outputs)
        Multi-output targets.

    classes : array, shape (n_classes, n_outputs), optional
        Classes across all calls to partial_fit, one column per output.
        Column ``i`` is forwarded to the estimator of output ``i``, so
        this must be a 2-D array. When every output has the same number
        of classes it can be obtained via
        ``np.column_stack([np.unique(y[:, i])
        for i in range(y.shape[1])])``, where y is the target matrix of
        the entire dataset.
        Not forwarded to the underlying estimator when None.

    sample_weight : array-like, shape = (n_samples) or None
        Sample weights. Forwarded only when not None.
        Only supported if the underlying estimator supports sample
        weights.

    Returns
    -------
    self : object
        Returns self.

    Raises
    ------
    ValueError
        If ``y`` is one-dimensional, or if ``sample_weight`` is given
        but the underlying estimator does not accept it.
    """
    X, y = check_X_y(X, y,
                     multi_output=True,
                     accept_sparse=True)

    if y.ndim == 1:
        raise ValueError("y must have at least two dimensions for "
                         "multi target regression but has only one.")

    if (sample_weight is not None and
            not has_fit_parameter(self.estimator, 'sample_weight')):
        raise ValueError("Underlying regressor does not support"
                         " sample weights.")

    # The first call starts from the unfitted base estimator; later calls
    # continue from the per-output estimators fitted so far.
    first_time = not hasattr(self, 'estimators_')

    # ``range`` (not ``xrange``) so the loop also runs on Python 3.
    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
        delayed(_partial_fit_estimator)(
            self.estimator if first_time else self.estimators_[i],
            X, y[:, i],
            classes[:, i] if classes is not None else None,
            sample_weight, first_time)
        for i in range(y.shape[1]))
    return self
122+
46123
def fit(self, X, y, sample_weight=None):
47124
""" Fit the model to data.
48125
Fit a separate model for each output variable.
@@ -68,7 +145,8 @@ def fit(self, X, y, sample_weight=None):
68145
"""
69146

70147
if not hasattr(self.estimator, "fit"):
71-
raise ValueError("The base estimator should implement a fit method")
148+
raise ValueError(
149+
"The base estimator should implement a fit method")
72150

73151
X, y = check_X_y(X, y,
74152
multi_output=True,
@@ -84,7 +162,7 @@ def fit(self, X, y, sample_weight=None):
84162
" sample weights.")
85163

86164
self.estimators_ = Parallel(n_jobs=self.n_jobs)(delayed(_fit_estimator)(
87-
self.estimator, X, y[:, i], sample_weight) for i in range(y.shape[1]))
165+
self.estimator, X, y[:, i], sample_weight) for i in xrange(y.shape[1]))
88166
return self
89167

90168
def predict(self, X):
@@ -104,7 +182,8 @@ def predict(self, X):
104182
"""
105183
check_is_fitted(self, 'estimators_')
106184
if not hasattr(self.estimator, "predict"):
107-
raise ValueError("The base estimator should implement a predict method")
185+
raise ValueError(
186+
"The base estimator should implement a predict method")
108187

109188
X = check_array(X, accept_sparse=True)
110189

@@ -133,9 +212,36 @@ class MultiOutputRegressor(MultiOutputEstimator, RegressorMixin):
133212
using `n_jobs>1` can result in slower performance due
134213
to the overhead of spawning processes.
135214
"""
215+
136216
def __init__(self, estimator, n_jobs=1):
    """Delegate storage of ``estimator`` and ``n_jobs`` to the parent."""
    super(MultiOutputRegressor, self).__init__(estimator=estimator,
                                               n_jobs=n_jobs)
138218

219+
def partial_fit(self, X, y, sample_weight=None):
    """Incrementally fit the model to data.

    Fit a separate model for each output variable.

    Parameters
    ----------
    X : (sparse) array-like, shape (n_samples, n_features)
        Data.

    y : (sparse) array-like, shape (n_samples, n_outputs)
        Multi-output targets.

    sample_weight : array-like, shape = (n_samples) or None
        Sample weights. If None, then samples are equally weighted.
        Only supported if the underlying regressor supports sample
        weights.

    Returns
    -------
    self : object
        Returns self.
    """
    # The parent implementation returns ``self``; pass it on so that the
    # documented return value holds and calls can be chained.
    return super(MultiOutputRegressor, self).partial_fit(
        X, y, sample_weight=sample_weight)
244+
139245
def score(self, X, y, sample_weight=None):
140246
"""Returns the coefficient of determination R^2 of the prediction.
141247
@@ -223,7 +329,7 @@ def predict_proba(self, X):
223329
"predict_proba method")
224330

225331
results = np.dstack([estimator.predict_proba(X) for estimator in
226-
self.estimators_])
332+
self.estimators_])
227333
return results
228334

229335
def score(self, X, y):

sklearn/tests/test_multioutput.py

Lines changed: 57 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from sklearn import datasets
1111
from sklearn.base import clone
1212
from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier
13-
from sklearn.linear_model import Lasso
13+
from sklearn.linear_model import Lasso, SGDClassifier, SGDRegressor
1414
from sklearn.svm import LinearSVC
1515
from sklearn.multiclass import OneVsRestClassifier
1616
from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier
@@ -25,7 +25,7 @@ def test_multi_target_regression():
2525
for n in range(3):
2626
rgr = GradientBoostingRegressor(random_state=0)
2727
rgr.fit(X_train, y_train[:, n])
28-
references[:,n] = rgr.predict(X_test)
28+
references[:, n] = rgr.predict(X_test)
2929

3030
rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
3131
rgr.fit(X_train, y_train)
@@ -34,6 +34,28 @@ def test_multi_target_regression():
3434
assert_almost_equal(references, y_pred)
3535

3636

37+
def test_multi_target_regression_partial_fit():
    # Two partial_fit calls on MultiOutputRegressor must give the same
    # predictions as two partial_fit calls on one SGDRegressor per target.
    X, y = datasets.make_regression(n_targets=3)
    X_train, y_train = X[:50], y[:50]
    X_test, y_test = X[50:], y[50:]

    half_index = 25
    X_first, X_second = X_train[:half_index], X_train[half_index:]
    y_first, y_second = y_train[:half_index], y_train[half_index:]

    # reference predictions: one independent regressor per target column
    references = np.zeros_like(y_test)
    for n in range(3):
        single = SGDRegressor(random_state=0)
        single.partial_fit(X_first, y_first[:, n])
        single.partial_fit(X_second, y_second[:, n])
        references[:, n] = single.predict(X_test)

    sgr = MultiOutputRegressor(SGDRegressor(random_state=0))
    sgr.partial_fit(X_first, y_first)
    sgr.partial_fit(X_second, y_second)

    y_pred = sgr.predict(X_test)
    assert_almost_equal(references, y_pred)
57+
58+
3759
def test_multi_target_regression_one_target():
3860
# Test multi target regression raises
3961
X, y = datasets.make_regression(n_targets=1)
@@ -57,11 +79,12 @@ def test_multi_target_sparse_regression():
5779
rgr.fit(X_train, y_train)
5880
rgr_sparse.fit(sparse(X_train), y_train)
5981

60-
assert_almost_equal(rgr.predict(X_test), rgr_sparse.predict(sparse(X_test)))
82+
assert_almost_equal(rgr.predict(X_test),
83+
rgr_sparse.predict(sparse(X_test)))
6184

6285

6386
def test_multi_target_sample_weights_api():
64-
X = [[1,2,3], [4,5,6]]
87+
X = [[1, 2, 3], [4, 5, 6]]
6588
y = [[3.141, 2.718], [2.718, 3.141]]
6689
w = [0.8, 0.6]
6790

@@ -76,19 +99,19 @@ def test_multi_target_sample_weights_api():
7699

77100
def test_multi_target_sample_weights():
78101
# weighted regressor
79-
Xw = [[1,2,3], [4,5,6]]
102+
Xw = [[1, 2, 3], [4, 5, 6]]
80103
yw = [[3.141, 2.718], [2.718, 3.141]]
81104
w = [2., 1.]
82105
rgr_w = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
83106
rgr_w.fit(Xw, yw, w)
84107

85108
# unweighted, but with repeated samples
86-
X = [[1,2,3], [1,2,3], [4,5,6]]
109+
X = [[1, 2, 3], [1, 2, 3], [4, 5, 6]]
87110
y = [[3.141, 2.718], [3.141, 2.718], [2.718, 3.141]]
88111
rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
89112
rgr.fit(X, y)
90113

91-
X_test = [[1.5,2.5,3.5], [3.5,4.5,5.5]]
114+
X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]
92115
assert_almost_equal(rgr.predict(X_test), rgr_w.predict(X_test))
93116

94117
# Import the data
@@ -102,6 +125,33 @@ def test_multi_target_sample_weights():
102125
n_samples, n_features = X.shape
103126
n_outputs = y.shape[1]
104127
n_classes = len(np.unique(y1))
128+
# Build a real list: on Python 3 ``map`` returns a lazy iterator, while
# np.column_stack is documented to take a sequence of arrays.
classes = np.column_stack([np.unique(col) for col in (y1, y2, y3)])
129+
130+
131+
def test_multi_output_classification_partial_fit():
    # test that MultiOutputClassifier.partial_fit over two batches yields
    # predictions of the expected shape, equal to those of one
    # SGDClassifier per output trained with the same two batches

    sgd_linear_clf = SGDClassifier(loss='log', random_state=1)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)

    # first batch is given together with the class table, second without
    half_index = int(X.shape[0] / 2)
    X_first, X_second = X[:half_index], X[half_index:]
    y_first, y_second = y[:half_index], y[half_index:]
    multi_target_linear.partial_fit(X_first, y_first, classes=classes)
    multi_target_linear.partial_fit(X_second, y_second)

    predictions = multi_target_linear.predict(X)
    assert_equal((n_samples, n_outputs), predictions.shape)

    # train one SGD classifier per output column and assert that its
    # predictions equal the matching column of the multi-output predictions
    for i in range(3):
        # work on a clone of the base classifier for this column
        sgd_linear_clf = clone(sgd_linear_clf)
        sgd_linear_clf.partial_fit(
            X_first, y_first[:, i], classes=classes[:, i])
        sgd_linear_clf.partial_fit(X_second, y_second[:, i])
        assert_equal(list(sgd_linear_clf.predict(X)), list(predictions[:, i]))
105155

106156

107157
def test_multi_output_classification():

0 commit comments

Comments
 (0)
0