Closed
Description
Hi,
First, thanks for your awesome work!
I have an issue with GridSearchCV and n_jobs for an ExtraTreesClassifier model.
platform.platform(): 'Linux-3.13.0-74-generic-x86_64-with-debian-jessie-sid'
cpu_count(): 8
RAM: 32 GB (never exceeds 6 GB during execution)
Python 2.7.11 :: Anaconda 2.4.1 (64-bit)
sklearn.__version__: '0.17'
numpy.__version__: '1.10.4'
scipy.__version__: '0.16.1'
pandas.__version__: '0.17.1'
joblib.__version__: '0.9.3'
Failing code:
model = ExtraTreesClassifier(class_weight='balanced')
parameters = {'criterion': ['gini', 'entropy'],
'max_depth': [4, 10, 20],
'min_samples_split' : [2, 4, 8],
'max_depth' : [3, 10, 20]}
clf = GridSearchCV(model, parameters, verbose=3, scoring='roc_auc',
cv=StratifiedKFold(y_train, n_folds=5, shuffle=True),
n_jobs=4)
clf.fit(X_train.values, y_train.values)
Traceback (most recent call last):
File "create_extratrees.py", line 305, in <module>
clf.fit(X_train.values, y_train.values)
File "/home/gillesa/anaconda2/lib/python2.7/site-packages/sklearn/grid_search.py", line 804, in fit
return self._fit(X, y, ParameterGrid(self.param_grid))
File "/home/gillesa/anaconda2/lib/python2.7/site-packages/sklearn/grid_search.py", line 553, in _fit
for parameters in parameter_iterable
File "/home/gillesa/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 812, in __call__
self.retrieve()
File "/home/gillesa/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 762, in retrieve
raise exception
sklearn.externals.joblib.my_exceptions.JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
/home/gillesa/github/mailling/create_extratrees.py in <module>()
300 'max_depth' : [3, 10, 20]}
301
302 clf = GridSearchCV(model, parameters,
303 cv=StratifiedKFold(y_train, n_folds=5, shuffle=True), verbose=3, scoring='roc_auc', n_jobs=4)
304
--> 305 clf.fit(X_train.values, y_train.values)
306
307 best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
308 print(clf.scoring + ' score : ', score)
309 for param_name in sorted(best_parameters.keys()):
...........................................................................
/home/gillesa/anaconda2/lib/python2.7/site-packages/sklearn/grid_search.py in fit(self=GridSearchCV(cv=sklearn.cross_validation.Stratif..._jobs', refit=True, scoring='roc_auc', verbose=3), X=array([[ 0., 9., 56., ..., 1., 0., 0.]... [ 0., 7., 68., ..., 0., 0., 0.]]), y=array([0, 0, 0, ..., 1, 0, 0]))
799 y : array-like, shape = [n_samples] or [n_samples, n_output], optional
800 Target relative to X for classification or regression;
801 None for unsupervised learning.
802
803 """
--> 804 return self._fit(X, y, ParameterGrid(self.param_grid))
self._fit = <bound method GridSearchCV._fit of GridSearchCV(...jobs', refit=True, scoring='roc_auc', verbose=3)>
X = array([[ 0., 9., 56., ..., 1., 0., 0.]... [ 0., 7., 68., ..., 0., 0., 0.]])
y = a
8000
rray([0, 0, 0, ..., 1, 0, 0])
self.param_grid = {'criterion': ['gini', 'entropy'], 'max_depth': [3, 10, 20], 'min_samples_split': [2, 4, 8]}
805
806
807 class RandomizedSearchCV(BaseSearchCV):
808 """Randomized search on hyper parameters.
...........................................................................
/home/gillesa/anaconda2/lib/python2.7/site-packages/sklearn/grid_search.py in _fit(self=GridSearchCV(cv=sklearn.cross_validation.Stratif..._jobs', refit=True, scoring='roc_auc', verbose=3), X=array([[ 0., 9., 56., ..., 1., 0., 0.]... [ 0., 7., 68., ..., 0., 0., 0.]]), y=array([0, 0, 0, ..., 1, 0, 0]), parameter_iterable=<sklearn.grid_search.ParameterGrid object>)
548 )(
549 delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
550 train, test, self.verbose, parameters,
551 self.fit_params, return_parameters=True,
552 error_score=self.error_score)
--> 553 for parameters in parameter_iterable
parameters = undefined
parameter_iterable = <sklearn.grid_search.ParameterGrid object>
554 for train, test in cv)
555
556 # Out is a list of triplet: score, estimator, n_test_samples
557 n_fits = len(out)
...........................................................................
/home/gillesa/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=4), iterable=<generator object <genexpr>>)
807 if pre_dispatch == "all" or n_jobs == 1:
808 # The iterable was consumed all at once by the above for loop.
809 # No need to wait for async callbacks to trigger to
810 # consumption.
811 self._iterating = False
--> 812 self.retrieve()
self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=4)>
813 # Make sure that we get a last message telling us we are done
814 elapsed_time = time.time() - self._start_time
815 self._print('Done %3i out of %3i | elapsed: %s finished',
816 (len(self._output), len(self._output),
---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError Sat Jan 9 16:42:09 2016
PID: 18076 Python 2.7.11: /home/gillesa/anaconda2/bin/python
...........................................................................
/home/gillesa/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
67 def __init__(self, iterator_slice):
68 self.items = list(iterator_slice)
69 self._size = len(self.items)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
75 return self._size
76
...........................................................................
/home/gillesa/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _fit_and_score(estimator=ExtraTreesClassifier(bootstrap=False, class_weig..., random_state=None, verbose=0, warm_start=False), X=memmap([[ 0., 9., 56., ..., 1., 0., 0.... [ 0., 7., 68., ..., 0., 0., 0.]]), y=memmap([0, 0, 0, ..., 1, 0, 0]), scorer=make_scorer(roc_auc_score, needs_threshold=True), train=memmap([ 0, 1, 2, ..., 1217841, 1217842, 1217843]), test=memmap([ 0, 2, 3, ..., 1217824, 1217825, 1217833]), verbose=3, parameters={'criterion': 'gini', 'max_depth': 3, 'min_samples_split': 4}, fit_params={}, return_train_score=False, return_parameters=True, error_score='raise')
1545 " numeric value. (Hint: if using 'raise', please"
1546 " make sure that it has been spelled correctly.)"
1547 )
1548
1549 else:
-> 1550 test_score = _score(estimator, X_test, y_test, scorer)
1551 if return_train_score:
1552 train_score = _score(estimator, X_train, y_train, scorer)
1553
1554 scoring_time = time.time() - start_time
...........................................................................
/home/gillesa/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _score(estimator=ExtraTreesClassifier(bootstrap=False, class_weig..., random_state=None, verbose=0, warm_start=False), X_test=memmap([[ 0., 9., 56., ..., 1., 0., 0.... [ 0., 6., 57., ..., 1., 0., 0.]]), y_test=memmap([0, 0, 0, ..., 0, 0, 0]), scorer=make_scorer(roc_auc_score, needs_threshold=True))
1604 score = scorer(estimator, X_test)
1605 else:
1606 score = scorer(estimator, X_test, y_test)
1607 if not isinstance(score, numbers.Number):
1608 raise ValueError("scoring must return a number, got %s (%s) instead."
-> 1609 % (str(score), type(score)))
1610 return score
1611
1612
1613 def _permutation_test_score(estimator, X, y, cv, scorer):
ValueError: scoring must return a number, got 0.671095795498 (<class 'numpy.core.memmap.memmap'>) instead.
If I instead set the model's n_jobs to 8 and GridSearchCV's n_jobs to 1, it works fine:
model = ExtraTreesClassifier(class_weight='balanced', n_jobs=8)
parameters = {'criterion': ['gini', 'entropy'],
'max_depth': [4, 10, 20],
'min_samples_split' : [2, 4, 8],
'max_depth' : [3, 10, 20]}
clf = GridSearchCV(model, parameters, verbose=3, scoring='roc_auc',
cv=StratifiedKFold(y_train, n_folds=5, shuffle=True),
n_jobs=1)
clf.fit(X_train.values, y_train.values)
I tried different setups, but whenever GridSearchCV's n_jobs > 1 it fails.
I would like to make full use of my CPU, and I think setting n_jobs > 1 on GridSearchCV is better than setting n_jobs on the model itself. Maybe someone has feedback?
Possible relation with #6023