Avoid ValueError in parallel computing of large arrays · Ircama/scikit-learn@311d1ba · GitHub

Commit 311d1ba

Avoid ValueError in parallel computing of large arrays
This PR introduces the optional *max_nbytes* parameter on the *OneVsRestClassifier*, *OneVsOneClassifier* and *OutputCodeClassifier* multiclass learning algorithms in *multiclass.py*. The parameter complements the existing *n_jobs* one and is useful when a large training set is processed by concurrently running jobs, i.e. *n_jobs* > 1 or *n_jobs* = -1 (which sets the number of jobs to the number of CPU cores). In that case, [Parallel](https://joblib.readthedocs.io/en/latest/parallel.html#parallel-reference-documentation) is called with the default "loky" backend, which [implements multi-processing](https://joblib.readthedocs.io/en/latest/parallel.html#thread-based-parallelism-vs-process-based-parallelism); *Parallel* also applies a default 1-megabyte [threshold](https://joblib.readthedocs.io/en/latest/parallel.html#automated-array-to-memmap-conversion) on the size of arrays passed to the workers, above which arrays are memory-mapped. That default may be too low for large arrays and can break the job with the exception **ValueError: UPDATEIFCOPY base is read-only**. *Parallel* exposes *max_nbytes* to control this threshold, and forwarding it lets the multiclass classifiers optionally customize the maximum array size. Fixes scikit-learn#6614. Expected to also fix scikit-learn#4597.
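For context, the joblib behavior the message refers to can be exercised directly with *Parallel*'s own *max_nbytes* argument; below is a minimal sketch (the array size and the '50M' value are illustrative, not part of this commit):

```python
import numpy as np
from joblib import Parallel, delayed

X = np.random.rand(2000, 1000)  # ~16 MB of float64, well above the 1 MB default

# With the default max_nbytes='1M', X is dumped to a temporary file and each
# worker receives it as a read-only numpy memmap.
sums = Parallel(n_jobs=2)(delayed(np.sum)(X) for _ in range(4))

# Raising the threshold keeps X below max_nbytes, so it is pickled to the
# workers instead of being converted to a read-only memmap.
sums = Parallel(n_jobs=2, max_nbytes='50M')(delayed(np.sum)(X) for _ in range(4))
```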
1 parent 1c546cd commit 311d1ba

File tree

1 file changed (+38, -9 lines)


sklearn/multiclass.py

Lines changed: 38 additions & 9 deletions
```diff
@@ -163,6 +163,13 @@ class OneVsRestClassifier(MultiOutputMixin, ClassifierMixin,
         ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
         for more details.
 
+    max_nbytes : int, str, or None, optional, 1M by default
+        Threshold on the size of arrays passed to the workers that triggers
+        automated memory mapping in temp_folder. Can be an int in Bytes, or
+        a human-readable string, e.g., '1M' for 1 megabyte. Use None to disable
+        memmapping of large arrays. Only active when backend="loky" or
+        "multiprocessing".
+
     Attributes
     ----------
     estimators_ : list of `n_classes` estimators
@@ -200,9 +207,10 @@ class OneVsRestClassifier(MultiOutputMixin, ClassifierMixin,
     array([2, 0, 1])
 
     """
-    def __init__(self, estimator, n_jobs=None):
+    def __init__(self, estimator, n_jobs=None, max_nbytes='1M'):
         self.estimator = estimator
         self.n_jobs = n_jobs
+        self.max_nbytes = max_nbytes
 
     def fit(self, X, y):
         """Fit underlying estimators.
@@ -232,7 +240,9 @@ def fit(self, X, y):
         # In cases where individual estimators are very fast to train setting
         # n_jobs > 1 in can results in slower performance due to the overhead
         # of spawning threads. See joblib issue #112.
-        self.estimators_ = Parallel(n_jobs=self.n_jobs)(delayed(_fit_binary)(
+        self.estimators_ = Parallel(n_jobs=self.n_jobs,
+                                    max_nbytes=self.max_nbytes
+                                    )(delayed(_fit_binary)(
             self.estimator, X, column, classes=[
                 "not %s" % self.label_binarizer_.classes_[i],
                 self.label_binarizer_.classes_[i]])
@@ -290,7 +300,8 @@ def partial_fit(self, X, y, classes=None):
         Y = Y.tocsc()
         columns = (col.toarray().ravel() for col in Y.T)
 
-        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
+        self.estimators_ = Parallel(n_jobs=self.n_jobs,
+                                    max_nbytes=self.max_nbytes)(
             delayed(_partial_fit_binary)(estimator, X, column)
             for estimator, column in zip(self.estimators_, columns))
 
@@ -486,6 +497,13 @@ class OneVsOneClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
         ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
         for more details.
 
+    max_nbytes : int, str, or None, optional, 1M by default
+        Threshold on the size of arrays passed to the workers that triggers
+        automated memory mapping in temp_folder. Can be an int in Bytes, or
+        a human-readable string, e.g., '1M' for 1 megabyte. Use None to disable
+        memmapping of large arrays. Only active when backend="loky" or
+        "multiprocessing".
+
     Attributes
     ----------
     estimators_ : list of ``n_classes * (n_classes - 1) / 2`` estimators
@@ -502,9 +520,10 @@ class OneVsOneClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
         ``None`` when ``estimator`` does not have ``_pairwise`` attribute.
     """
 
-    def __init__(self, estimator, n_jobs=None):
+    def __init__(self, estimator, n_jobs=None, max_nbytes='1M'):
         self.estimator = estimator
         self.n_jobs = n_jobs
+        self.max_nbytes = max_nbytes
 
     def fit(self, X, y):
         """Fit underlying estimators.
@@ -529,7 +548,8 @@ def fit(self, X, y):
             raise ValueError("OneVsOneClassifier can not be fit when only one"
                              " class is present.")
         n_classes = self.classes_.shape[0]
-        estimators_indices = list(zip(*(Parallel(n_jobs=self.n_jobs)(
+        estimators_indices = list(zip(*(Parallel(n_jobs=self.n_jobs,
+                                                 max_nbytes=self.max_nbytes)(
             delayed(_fit_ovo_binary)
             (self.estimator, X, y, self.classes_[i], self.classes_[j])
             for i in range(n_classes) for j in range(i + 1, n_classes)))))
@@ -581,8 +601,8 @@ def partial_fit(self, X, y, classes=None):
         X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
         check_classification_targets(y)
         combinations = itertools.combinations(range(self.n_classes_), 2)
-        self.estimators_ = Parallel(
-            n_jobs=self.n_jobs)(
+        self.estimators_ = Parallel(n_jobs=self.n_jobs,
+                                    max_nbytes=self.max_nbytes)(
             delayed(_partial_fit_ovo_binary)(
                 estimator, X, y, self.classes_[i], self.classes_[j])
             for estimator, (i, j) in zip(self.estimators_,
@@ -696,6 +716,13 @@ class OutputCodeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
         ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
         for more details.
 
+    max_nbytes : int, str, or None, optional, 1M by default
+        Threshold on the size of arrays passed to the workers that triggers
+        automated memory mapping in temp_folder. Can be an int in Bytes, or
+        a human-readable string, e.g., '1M' for 1 megabyte. Use None to disable
+        memmapping of large arrays. Only active when backend="loky" or
+        "multiprocessing".
+
     Attributes
     ----------
     estimators_ : list of `int(n_classes * code_size)` estimators
@@ -741,11 +768,12 @@ class OutputCodeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
     """
 
     def __init__(self, estimator, code_size=1.5, random_state=None,
-                 n_jobs=None):
+                 n_jobs=None, max_nbytes='1M'):
         self.estimator = estimator
         self.code_size = code_size
         self.random_state = random_state
         self.n_jobs = n_jobs
+        self.max_nbytes = max_nbytes
 
     def fit(self, X, y):
         """Fit underlying estimators.
@@ -790,7 +818,8 @@ def fit(self, X, y):
         Y = np.array([self.code_book_[classes_index[y[i]]]
                       for i in range(X.shape[0])], dtype=np.int)
 
-        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
+        self.estimators_ = Parallel(n_jobs=self.n_jobs,
+                                    max_nbytes=self.max_nbytes)(
             delayed(_fit_binary)(self.estimator, X, Y[:, i])
             for i in range(Y.shape[1]))
 
```
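For illustration, a hypothetical usage sketch of the patched API (the '200M' value and array shapes are illustrative, not taken from this commit):

```python
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

X = np.random.rand(100_000, 200)            # ~160 MB of float64 features
y = np.random.randint(0, 10, size=100_000)  # 10 classes

# max_nbytes is forwarded to Parallel, so X stays below the threshold and is
# pickled to the workers instead of being converted to a read-only memmap.
ovr = OneVsRestClassifier(LinearSVC(), n_jobs=-1, max_nbytes='200M')
ovr.fit(X, y)
```

Passing `max_nbytes=None` would instead disable memmapping entirely, at the cost of duplicating the training set in every worker process.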