8000 Allowing optional list of Parallel keyworded parameters · Ircama/scikit-learn@afec926 · GitHub
[go: up one dir, main page]

Skip to content

Commit afec926

Browse files
committed
Allowing optional list of Parallel keyworded parameters
Change the "OneVsRestClassifier", "OneVsOneClassifier" and "OutputCodeClassifier" multiclass learning algorithms within multiclass.py by replacing the "n_jobs" parameter with a keyworded, variable-length argument list, in order to allow any "Parallel" parameter to be passed, as well as to support the "parallel_backend" context manager. "n_jobs" remains one of the possible parameters, but other ones can be added, including "max_nbytes", which might be useful in order to avoid a ValueError when dealing with a large training set processed by concurrently running jobs defined by *n_jobs* > 1 or by *n_jobs* = -1. More specifically, in parallel computing of large arrays with the "loky" backend, [Parallel](https://joblib.readthedocs.io/en/latest/parallel.html#parallel-reference-documentation) sets a default 1-megabyte [threshold](https://joblib.readthedocs.io/en/latest/parallel.html#automated-array-to-memmap-conversion) on the size of arrays passed to the workers. This default threshold may not be enough for large arrays and could break jobs with the exception **ValueError: UPDATEIFCOPY base is read-only**. *Parallel* uses *max_nbytes* to control this threshold. Through this fix, the multiclass classifiers offer the optional possibility to customize the maximum size of arrays. Fixes scikit-learn#6614. See also scikit-learn#4597. Changed _get_args in _testing.py in order to also accept the 'parallel_params' variable keyword argument.
1 parent 308a54e commit afec926

File tree

2 files changed

+39
-30
lines changed

2 files changed

+39
-30
lines changed

sklearn/multiclass.py

Lines changed: 36 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -157,11 +157,13 @@ class OneVsRestClassifier(MultiOutputMixin, ClassifierMixin,
157157
An estimator object implementing :term:`fit` and one of
158158
:term:`decision_function` or :term:`predict_proba`.
159159
160-
n_jobs : int or None, optional (default=None)
161-
The number of jobs to use for the computation.
162-
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
163-
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
164-
for more details.
160+
parallel_params : keyworded, variable-length argument list
161+
Optional list of keyworded parameters to be passed to
162+
:class:`joblib.Parallel`. For instance, ``n_jobs`` (the number of jobs
163+
to use for the computation, see :term:`Glossary <n_jobs>`),
164+
``max_nbytes``, ``backend``, etc. (see also :class:`joblib.Parallel`
165+
reference documentation
166+
(https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html)).
165167
166168
Attributes
167169
----------
@@ -200,9 +202,9 @@ class OneVsRestClassifier(MultiOutputMixin, ClassifierMixin,
200202
array([2, 0, 1])
201203
202204
"""
203-
def __init__(self, estimator, n_jobs=None):
205+
def __init__(self, estimator, **parallel_params):
204206
self.estimator = estimator
205-
self.n_jobs = n_jobs
207+
self.parallel_params = parallel_params
206208

207209
def fit(self, X, y):
208210
"""Fit underlying estimators.
@@ -232,10 +234,11 @@ def fit(self, X, y):
232234
# In cases where individual estimators are very fast to train setting
233235
# n_jobs > 1 can result in slower performance due to the overhead
234236
# of spawning threads. See joblib issue #112.
235-
self.estimators_ = Parallel(n_jobs=self.n_jobs)(delayed(_fit_binary)(
236-
self.estimator, X, column, classes=[
237-
"not %s" % self.label_binarizer_.classes_[i],
238-
self.label_binarizer_.classes_[i]])
237+
self.estimators_ = Parallel(**self.parallel_params)(
238+
delayed(_fit_binary)(
239+
self.estimator, X, column, classes=[
240+
"not %s" % self.label_binarizer_.classes_[i],
241+
self.label_binarizer_.classes_[i]])
239242
for i, column in enumerate(columns))
240243

241244
return self
@@ -290,7 +293,7 @@ def partial_fit(self, X, y, classes=None):
290293
Y = Y.tocsc()
291294
columns = (col.toarray().ravel() for col in Y.T)
292295

293-
self.estimators_ = Parallel(n_jobs=self.n_jobs)(
296+
self.estimators_ = Parallel(**self.parallel_params)(
294297
delayed(_partial_fit_binary)(estimator, X, column)
295298
for estimator, column in zip(self.estimators_, columns))
296299

@@ -480,11 +483,13 @@ class OneVsOneClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
480483
An estimator object implementing :term:`fit` and one of
481484
:term:`decision_function` or :term:`predict_proba`.
482485
483-
n_jobs : int or None, optional (default=None)
484-
The number of jobs to use for the computation.
485-
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
486-
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
487-
for more details.
486+
parallel_params : keyworded, variable-length argument list
487+
Optional list of keyworded parameters to be passed to
488+
:class:`joblib.Parallel`. For instance, ``n_jobs`` (the number of jobs
489+
to use for the computation, see :term:`Glossary <n_jobs>`),
490+
``max_nbytes``, ``backend``, etc. (see also :class:`joblib.Parallel`
491+
reference documentation
492+
(https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html)).
488493
489494
Attributes
490495
----------
@@ -502,9 +507,9 @@ class OneVsOneClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
502507
``None`` when ``estimator`` does not have ``_pairwise`` attribute.
503508
"""
504509

505-
def __init__(self, estimator, n_jobs=None):
510+
def __init__(self, estimator, **parallel_params):
506511
self.estimator = estimator
507-
self.n_jobs = n_jobs
512+
self.parallel_params = parallel_params
508513

509514
def fit(self, X, y):
510515
"""Fit underlying estimators.
@@ -529,7 +534,7 @@ def fit(self, X, y):
529534
raise ValueError("OneVsOneClassifier can not be fit when only one"
530535
" class is present.")
531536
n_classes = self.classes_.shape[0]
532-
estimators_indices = list(zip(*(Parallel(n_jobs=self.n_jobs)(
537+
estimators_indices = list(zip(*(Parallel(**self.parallel_params)(
533538
delayed(_fit_ovo_binary)
534539
(self.estimator, X, y, self.classes_[i], self.classes_[j])
535540
for i in range(n_classes) for j in range(i + 1, n_classes)))))
@@ -582,7 +587,7 @@ def partial_fit(self, X, y, classes=None):
582587
check_classification_targets(y)
583588
combinations = itertools.combinations(range(self.n_classes_), 2)
584589
self.estimators_ = Parallel(
585-
n_jobs=self.n_jobs)(
590+
**self.parallel_params)(
586591
delayed(_partial_fit_ovo_binary)(
587592
estimator, X, y, self.classes_[i], self.classes_[j])
588593
for estimator, (i, j) in zip(self.estimators_,
@@ -690,11 +695,13 @@ class OutputCodeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
690695
random_state is the random number generator; If None, the random number
691696
generator is the RandomState instance used by `np.random`.
692697
693-
n_jobs : int or None, optional (default=None)
694-
The number of jobs to use for the computation.
695-
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
696-
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
697-
for more details.
698+
parallel_params : keyworded, variable-length argument list
699+
Optional list of keyworded parameters to be passed to
700+
:class:`joblib.Parallel`. For instance, ``n_jobs`` (the number of jobs
701+
to use for the computation, see :term:`Glossary <n_jobs>`),
702+
``max_nbytes``, ``backend``, etc. (see also :class:`joblib.Parallel`
703+
reference documentation
704+
(https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html)).
698705
699706
Attributes
700707
----------
@@ -741,11 +748,11 @@ class OutputCodeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
741748
"""
742749

743750
def __init__(self, estimator, code_size=1.5, random_state=None,
744-
n_jobs=None):
751+
**parallel_params):
745752
self.estimator = estimator
746753
self.code_size = code_size
747754
self.random_state = random_state
748-
self.n_jobs = n_jobs
755+
self.parallel_params = parallel_params
749756

750757
def fit(self, X, y):
751758
"""Fit underlying estimators.
@@ -790,7 +797,7 @@ def fit(self, X, y):
790797
Y = np.array([self.code_book_[classes_index[y[i]]]
791798
for i in range(X.shape[0])], dtype=np.int)
792799

793-
self.estimators_ = Parallel(n_jobs=self.n_jobs)(
800+
self.estimators_ = Parallel(**self.parallel_params)(
794801
delayed(_fit_binary)(self.estimator, X, Y[:, i])
795802
for i in range(Y.shape[1]))
796803

sklearn/utils/_testing.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -698,7 +698,9 @@ def _get_args(function, varargs=False):
698698
# Error on builtin C function
699699
return []
700700
args = [key for key, param in params.items()
701-
if param.kind not in (param.VAR_POSITIONAL, param.VAR_KEYWORD)]
701+
if (key == 'parallel_params' and
702+
param.kind == param.VAR_KEYWORD) or
703+
param.kind not in (param.VAR_POSITIONAL, param.VAR_KEYWORD)]
702704
if varargs:
703705
varargs = [param.name for param in params.values()
704706
if param.kind == param.VAR_POSITIONAL]

0 commit comments

Comments
 (0)
0