8000 [MRG + 1] add partial_fit to multioutput module (#8054) · scikit-learn/scikit-learn@8695ff5 · GitHub
[go: up one dir, main page]

Skip to content

Commit 8695ff5

Browse files
yupbank authored and raghavrv committed
[MRG + 1] add partial_fit to multioutput module (#8054)
* add partial_fit to multioutput module * fix range in python3 * fix flake8 * fix the comments * fix according to comments * fix lint * remove pytest * fix ValueException message * py 3.5 compatible classes * fix stuff * fix according the comments * remove used copy * flake8.. * fix docs * eventually, i use deepcopy to ensure the parallel * lint.. * address final comment * fix addressing the comments * update confirmed separate estimators * finally remove copy * compact test
1 parent 0b02125 commit 8695ff5

File tree

3 files changed

+240
-17
lines changed

3 files changed

+240
-17
lines changed

doc/whats_new.rst

+4
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,10 @@ New features
4949
Enhancements
5050
............
5151

52+
- :class:`multioutput.MultiOutputRegressor` and :class:`multioutput.MultiOutputClassifier`
53+
now support online learning using ``partial_fit``.
54+
:issue:`8053` by :user:`Peng Yu <yupbank>`.
55+
5256
- :class:`decomposition.PCA`, :class:`decomposition.IncrementalPCA` and
5357
:class:`decomposition.TruncatedSVD` now expose the singular values
5458
from the underlying SVD. They are stored in the attribute

sklearn/multioutput.py

+110-6
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from .utils import check_array, check_X_y
2323
from .utils.fixes import parallel_helper
2424
from .utils.validation import check_is_fitted, has_fit_parameter
25+
from .utils.metaestimators import if_delegate_has_method
2526
from .externals.joblib import Parallel, delayed
2627
from .externals import six
2728

@@ -37,12 +38,86 @@ def _fit_estimator(estimator, X, y, sample_weight=None):
3738
return estimator
3839

3940

41+
def _partial_fit_estimator(estimator, X, y, classes=None, sample_weight=None,
42+
first_time=True):
43+
if first_time:
44+
estimator = clone(estimator)
45+
46+
if sample_weight is not None:
47+
if classes is not None:
48+
estimator.partial_fit(X, y, classes=classes,
49+
sample_weight=sample_weight)
50+
else:
51+
estimator.partial_fit(X, y, sample_weight=sample_weight)
52+
else:
53+
if classes is not None:
54+
estimator.partial_fit(X, y, classes=classes)
55+
else:
56+
estimator.partial_fit(X, y)
57+
return estimator
58+
59+
4060
class MultiOutputEstimator(six.with_metaclass(ABCMeta, BaseEstimator)):
4161

4262
def __init__(self, estimator, n_jobs=1):
4363
self.estimator = estimator
4464
self.n_jobs = n_jobs
4565

66+
@if_delegate_has_method('estimator')
def partial_fit(self, X, y, classes=None, sample_weight=None):
    """Incrementally fit the model to data.
    Fit a separate model for each output variable.

    Parameters
    ----------
    X : (sparse) array-like, shape (n_samples, n_features)
        Data.

    y : (sparse) array-like, shape (n_samples, n_outputs)
        Multi-output targets.

    classes : list of numpy arrays, shape (n_outputs)
        Each array holds the unique classes (str or int) of one output.
        Can be obtained via
        ``[np.unique(y[:, i]) for i in range(y.shape[1])]``, where y is the
        target matrix of the entire dataset.
        This argument is required for the first call to partial_fit
        and can be omitted in the subsequent calls.
        Note that y doesn't need to contain all labels in `classes`.

    sample_weight : array-like, shape = (n_samples) or None
        Sample weights. If None, then samples are equally weighted.
        Only supported if the underlying estimator supports sample
        weights.

    Returns
    -------
    self : object
        Returns self.

    Raises
    ------
    ValueError
        If ``y`` is one-dimensional, if ``sample_weight`` is given but the
        underlying estimator does not support it, if ``classes`` has fewer
        entries than ``y`` has columns, or if a later call passes a ``y``
        whose number of columns differs from the number of estimators
        fitted so far.
    """
    X, y = check_X_y(X, y,
                     multi_output=True,
                     accept_sparse=True)

    if y.ndim == 1:
        raise ValueError("y must have at least two dimensions for "
                         "multi-output regression but has only one.")

    if (sample_weight is not None and
            not has_fit_parameter(self.estimator, 'sample_weight')):
        raise ValueError("Underlying estimator does not support"
                         " sample weights.")

    n_outputs = y.shape[1]
    first_time = not hasattr(self, 'estimators_')

    # ``estimators_`` is rebuilt from ``range(n_outputs)`` below. Without
    # this guard a narrower ``y`` would silently drop already-fitted
    # estimators and a wider one would fail with a bare IndexError.
    if not first_time and len(self.estimators_) != n_outputs:
        raise ValueError("y has %d outputs but %d estimators were fitted "
                         "by previous calls to partial_fit."
                         % (n_outputs, len(self.estimators_)))

    # ``classes[i]`` is looked up for every output column, so a list that
    # is too short must be rejected with a clear message.
    if classes is not None and len(classes) < n_outputs:
        raise ValueError("classes has %d entries but y has %d outputs."
                         % (len(classes), n_outputs))

    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
        delayed(_partial_fit_estimator)(
            self.estimators_[i] if not first_time else self.estimator,
            X, y[:, i],
            classes[i] if classes is not None else None,
            sample_weight, first_time) for i in range(n_outputs))
    return self
120+
46121
def fit(self, X, y, sample_weight=None):
47122
""" Fit the model to data.
48123
Fit a separate model for each output variable.
@@ -76,15 +151,17 @@ def fit(self, X, y, sample_weight=None):
76151

77152
if y.ndim == 1:
78153
raise ValueError("y must have at least two dimensions for "
79-
"multi target regression but has only one.")
154+
"multi-output regression but has only one.")
80155

81156
if (sample_weight is not None and
82157
not has_fit_parameter(self.estimator, 'sample_weight')):
83-
raise ValueError("Underlying regressor does not support"
158+
raise ValueError("Underlying estimator does not support"
84159
" sample weights.")
85160

86-
self.estimators_ = Parallel(n_jobs=self.n_jobs)(delayed(_fit_estimator)(
87-
self.estimator, X, y[:, i], sample_weight) for i in range(y.shape[1]))
161+
self.estimators_ = Parallel(n_jobs=self.n_jobs)(
162+
delayed(_fit_estimator)(
163+
self.estimator, X, y[:, i], sample_weight)
164+
for i in range(y.shape[1]))
88165
return self
89166

90167
def predict(self, X):
@@ -108,8 +185,9 @@ def predict(self, X):
108185

109186
X = check_array(X, accept_sparse=True)
110187

111-
y = Parallel(n_jobs=self.n_jobs)(delayed(parallel_helper)(e, 'predict', X)
112-
for e in self.estimators_)
188+
y = Parallel(n_jobs=self.n_jobs)(
189+
delayed(parallel_helper)(e, 'predict', X)
190+
for e in self.estimators_)
113191

114192
return np.asarray(y).T
115193

@@ -133,9 +211,35 @@ class MultiOutputRegressor(MultiOutputEstimator, RegressorMixin):
133211
using `n_jobs>1` can result in slower performance due
134212
to the overhead of spawning processes.
135213
"""
214+
136215
def __init__(self, estimator, n_jobs=1):
137216
super(MultiOutputRegressor, self).__init__(estimator, n_jobs)
138217

218+
@if_delegate_has_method('estimator')
def partial_fit(self, X, y, sample_weight=None):
    """Incrementally fit the model to data.
    Fit a separate model for each output variable.

    Parameters
    ----------
    X : (sparse) array-like, shape (n_samples, n_features)
        Data.

    y : (sparse) array-like, shape (n_samples, n_outputs)
        Multi-output targets.

    sample_weight : array-like, shape = (n_samples) or None
        Sample weights. If None, then samples are equally weighted.
        Only supported if the underlying regressor supports sample
        weights.

    Returns
    -------
    self : object
        Returns self.
    """
    # Propagate the parent's return value: the documented contract is to
    # return ``self`` so that calls can be chained.
    return super(MultiOutputRegressor, self).partial_fit(
        X, y, sample_weight=sample_weight)
242+
139243
def score(self, X, y, sample_weight=None):
140244
"""Returns the coefficient of determination R^2 of the prediction.
141245

sklearn/tests/test_multioutput.py

+126-11
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,23 @@
1+
from __future__ import division
12
import numpy as np
23
import scipy.sparse as sp
34
from sklearn.utils import shuffle
45
from sklearn.utils.testing import assert_almost_equal
56
from sklearn.utils.testing import assert_raises
7+
from sklearn.utils.testing import assert_false
68
from sklearn.utils.testing import assert_raises_regex
79
from sklearn.utils.testing import assert_array_equal
810
from sklearn.utils.testing import assert_equal
11+
from sklearn.utils.testing import assert_not_equal
12+
from sklearn.utils.testing import assert_array_almost_equal
913
from sklearn.exceptions import NotFittedError
1014
from sklearn import datasets
1115
from sklearn.base import clone
1216
from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier
13-
from sklearn.linear_model import Lasso, LogisticRegression
17+
from sklearn.linear_model import Lasso
18+
from sklearn.linear_model import SGDClassifier
19+
from sklearn.linear_model import SGDRegressor
20+
from sklearn.linear_model import LogisticRegression
1421
from sklearn.svm import LinearSVC
1522
from sklearn.multiclass import OneVsRestClassifier
1623
from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier
@@ -25,7 +32,7 @@ def test_multi_target_regression():
2532
for n in range(3):
2633
rgr = GradientBoostingRegressor(random_state=0)
2734
rgr.fit(X_train, y_train[:, n])
28-
references[:,n] = rgr.predict(X_test)
35+
references[:, n] = rgr.predict(X_test)
2936

3037
rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
3138
rgr.fit(X_train, y_train)
@@ -34,20 +41,40 @@ def test_multi_target_regression():
3441
assert_almost_equal(references, y_pred)
3542

3643

44+
def test_multi_target_regression_partial_fit():
    """Check two-batch ``partial_fit`` against per-column ``SGDRegressor``s.

    The multi-output wrapper must predict the same values as three
    stand-alone regressors, each trained on one target column with the
    same two batches.
    """
    X, y = datasets.make_regression(n_targets=3)
    X_train, X_test = X[:50], X[50:]
    y_train, y_test = y[:50], y[50:]

    split = 25
    batches = [(X_train[:split], y_train[:split]),
               (X_train[split:], y_train[split:])]

    # Reference predictions: one independent regressor per target column.
    expected = np.zeros_like(y_test)
    for col in range(3):
        single = SGDRegressor(random_state=0)
        for X_batch, y_batch in batches:
            single.partial_fit(X_batch, y_batch[:, col])
        expected[:, col] = single.predict(X_test)

    multi = MultiOutputRegressor(SGDRegressor(random_state=0))
    for X_batch, y_batch in batches:
        multi.partial_fit(X_batch, y_batch)

    assert_almost_equal(expected, multi.predict(X_test))
64+
65+
3766
def test_multi_target_regression_one_target():
3867
# Test multi target regression raises
3968
X, y = datasets.make_regression(n_targets=1)
40-
X_train, y_train = X[:50], y[:50]
41-
X_test, y_test = X[50:], y[50:]
4269

4370
rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
44-
assert_raises(ValueError, rgr.fit, X_train, y_train)
71+
assert_raises(ValueError, rgr.fit, X, y)
4572

4673

4774
def test_multi_target_sparse_regression():
4875
X, y = datasets.make_regression(n_targets=3)
4976
X_train, y_train = X[:50], y[:50]
50-
X_test, y_test = X[50:], y[50:]
77+
X_test = X[50:]
5178

5279
for sparse in [sp.csr_matrix, sp.csc_matrix, sp.coo_matrix, sp.dok_matrix,
5380
sp.lil_matrix]:
@@ -57,11 +84,12 @@ def test_multi_target_sparse_regression():
5784
rgr.fit(X_train, y_train)
5885
rgr_sparse.fit(sparse(X_train), y_train)
5986

60-
assert_almost_equal(rgr.predict(X_test), rgr_sparse.predict(sparse(X_test)))
87+
assert_almost_equal(rgr.predict(X_test),
88+
rgr_sparse.predict(sparse(X_test)))
6189

6290

6391
def test_multi_target_sample_weights_api():
64-
X = [[1,2,3], [4,5,6]]
92+
X = [[1, 2, 3], [4, 5, 6]]
6593
y = [[3.141, 2.718], [2.718, 3.141]]
6694
w = [0.8, 0.6]
6795

@@ -74,23 +102,40 @@ def test_multi_target_sample_weights_api():
74102
rgr.fit(X, y, w)
75103

76104

105+
def test_multi_target_sample_weight_partial_fit():
    """Different sample weights given to ``partial_fit`` must change the
    resulting prediction."""
    X = [[1, 2, 3], [4, 5, 6]]
    y = [[3.141, 2.718], [2.718, 3.141]]

    def first_prediction(weights):
        # Train a fresh wrapper with the given weights and return the
        # prediction for the first sample / first output.
        model = MultiOutputRegressor(SGDRegressor(random_state=0))
        model.partial_fit(X, y, weights)
        return model.predict(X)[0][0]

    # weighted regressor
    uneven = first_prediction([2., 1.])
    # weighted with different weights
    even = first_prediction([2., 2.])

    assert_not_equal(even, uneven)
119+
120+
77121
def test_multi_target_sample_weights():
78122
# weighted regressor
79-
Xw = [[1,2,3], [4,5,6]]
123+
Xw = [[1, 2, 3], [4, 5, 6]]
80124
yw = [[3.141, 2.718], [2.718, 3.141]]
81125
w = [2., 1.]
82126
rgr_w = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
83127
rgr_w.fit(Xw, yw, w)
84128

85129
# unweighted, but with repeated samples
86-
X = [[1,2,3], [1,2,3], [4,5,6]]
130+
X = [[1, 2, 3], [1, 2, 3], [4, 5, 6]]
87131
y = [[3.141, 2.718], [3.141, 2.718], [2.718, 3.141]]
88132
rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
89133
rgr.fit(X, y)
90134

91-
X_test = [[1.5,2.5,3.5], [3.5,4.5,5.5]]
135+
X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]
92136
assert_almost_equal(rgr.predict(X_test), rgr_w.predict(X_test))
93137

138+
94139
# Import the data
95140
iris = datasets.load_iris()
96141
# create a multiple targets by randomized shuffling and concatenating y.
@@ -102,6 +147,57 @@ def test_multi_target_sample_weights():
102147
n_samples, n_features = X.shape
103148
n_outputs = y.shape[1]
104149
n_classes = len(np.unique(y1))
150+
classes = list(map(np.unique, (y1, y2, y3)))
151+
152+
153+
def test_multi_output_classification_partial_fit_parallelism():
    """With ``n_jobs=-1`` a second ``partial_fit`` call must leave a
    different object in ``estimators_[0]`` than the first call did."""
    wrapper = MultiOutputClassifier(
        SGDClassifier(loss='log', random_state=1), n_jobs=-1)

    wrapper.partial_fit(X, y, classes)
    before = wrapper.estimators_[0]

    wrapper.partial_fit(X, y)
    after = wrapper.estimators_[0]

    # parallelism requires this to be the case for a sane implementation
    assert_false(before is after)
162+
163+
164+
def test_multi_output_classification_partial_fit():
    """Check ``MultiOutputClassifier.partial_fit`` over two batches.

    After each batch the wrapper's predictions must have shape
    ``(n_samples, n_outputs)`` and, column by column, equal those of a
    stand-alone classifier trained on the same batches.
    """
    base_clf = SGDClassifier(loss='log', random_state=1)
    multi = MultiOutputClassifier(base_clf)
    expected_shape = (n_samples, n_outputs)

    # First half of the data; the classes of every output are given up
    # front.
    half = X.shape[0] // 2
    multi.partial_fit(X[:half], y[:half], classes=classes)
    after_first = multi.predict(X)
    assert_equal(expected_shape, after_first.shape)

    # Second half of the data.
    multi.partial_fit(X[half:], y[half:])
    after_second = multi.predict(X)
    assert_equal(expected_shape, after_second.shape)

    # Train one plain classifier per output column and compare its
    # predictions with the wrapper's after each of the two steps.
    for col in range(3):
        # unfitted copy with the same parameters
        single = clone(base_clf)
        single.partial_fit(X[:half], y[:half, col], classes=classes[col])
        assert_array_equal(single.predict(X), after_first[:, col])
        single.partial_fit(X[half:], y[half:, col])
        assert_array_equal(single.predict(X), after_second[:, col])
193+
194+
195+
def test_mutli_output_classifiation_partial_fit_no_first_classes_exception():
    """A first ``partial_fit`` call without ``classes`` must raise a
    ``ValueError`` whose message asks for them."""
    multi = MultiOutputClassifier(SGDClassifier(loss='log', random_state=1))
    expected_message = ("classes must be passed on the first call "
                        "to partial_fit.")
    assert_raises_regex(ValueError, expected_message,
                        multi.partial_fit, X, y)
105201

106202

107203
def test_multi_output_classification():
@@ -209,6 +305,25 @@ def test_multi_output_classification_sample_weights():
209305
assert_almost_equal(clf.predict(X_test), clf_w.predict(X_test))
210306

211307

308+
def test_multi_output_classification_partial_fit_sample_weights():
    """A sample weighted 2 must predict like the same sample repeated twice.

    NOTE(review): despite the test name, both models below are trained
    with ``fit`` rather than ``partial_fit`` -- confirm this is intended.
    """
    # weighted classifier
    Xw = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    yw = [[3, 2], [2, 3], [3, 2]]
    weights = np.asarray([2., 1., 1.])
    clf_w = MultiOutputClassifier(SGDClassifier(random_state=1))
    clf_w.fit(Xw, yw, weights)

    # unweighted, but with repeated samples
    X = [[1, 2, 3], [1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    y = [[3, 2], [3, 2], [2, 3], [3, 2]]
    clf = MultiOutputClassifier(SGDClassifier(random_state=1))
    clf.fit(X, y)

    X_test = [[1.5, 2.5, 3.5]]
    assert_array_almost_equal(clf.predict(X_test), clf_w.predict(X_test))
325+
326+
212327
def test_multi_output_exceptions():
213328
# NotFittedError when fit is not done but score, predict and
214329
# and predict_proba are called

0 commit comments

Comments
 (0)
0