I am using a Pipeline as the estimator for GridSearchCV, which works fine on its own. However, if I enable caching with the memory parameter and set n_jobs to more than one, the score columns in cv_results_ are all NaN and the search finishes in a few seconds instead of several minutes.
Is the caching functionality simply not usable with GridSearchCV, or am I doing something wrong?
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# FastTextTransformer, AverageWordVectorTransformer, CV, VERBOSE and N_JOBS
# are defined elsewhere in the notebook.
gsCV = GridSearchCV(
    estimator=Pipeline(
        # memory='../Cache/AW1MP_N10_DIN276_Pipeline',  # not working if enabled
        steps=[
            ('we', FastTextTransformer()),
            ('se', AverageWordVectorTransformer()),
            ('rf', RandomForestClassifier())
        ]
    ),
    param_grid=[
        {
            'we__min_count': [5],
            'we__size': [64],
            'we__window': [5],
            'we__min_n': [3],
            'we__max_n': [6],
            'rf__n_estimators': [1, 2, 3, 4, 5, 10],  # 25, 64, 128 — number of trees in the forest
            'rf__criterion': ['gini'],                # 'entropy' — split criterion
            'rf__max_features': ['auto'],             # number of features per tree
            'rf__max_depth': [4, 8, 16]               # 32, 64, 128
        }
    ],
    cv=CV,
    verbose=VERBOSE,
    n_jobs=N_JOBS,
    return_train_score=True,
    scoring=None
)
gsCV.fit(X_train, label_encoder.inverse_transform(Y_train).reshape(-1))
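For completeness, this is how I would enable the caching (the path is the one from the commented-out line above; as far as I understand, Pipeline also accepts a joblib.Memory object instead of a plain path string):

from joblib import Memory

cache = Memory(location='../Cache/AW1MP_N10_DIN276_Pipeline', verbose=0)
pipeline = Pipeline(
    memory=cache,  # same effect as passing the path string directly
    steps=[
        ('we', FastTextTransformer()),
        ('se', AverageWordVectorTransformer()),
        ('rf', RandomForestClassifier())
    ]
)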
Output without the memory parameter
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done 1 tasks | elapsed: 28.9s
[Parallel(n_jobs=6)]: Done 6 tasks | elapsed: 29.4s
[Parallel(n_jobs=6)]: Done 13 tasks | elapsed: 1.5min
[Parallel(n_jobs=6)]: Done 20 tasks | elapsed: 2.0min
[Parallel(n_jobs=6)]: Done 29 tasks | elapsed: 2.5min
[Parallel(n_jobs=6)]: Done 38 tasks | elapsed: 3.5min
[Parallel(n_jobs=6)]: Done 49 tasks | elapsed: 4.5min
[Parallel(n_jobs=6)]: Done 60 tasks | elapsed: 5.1min
[Parallel(n_jobs=6)]: Done 73 tasks | elapsed: 6.6min
[Parallel(n_jobs=6)]: Done 90 out of 90 | elapsed: 7.6min finished
Output with the memory parameter set to a path
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done 1 tasks | elapsed: 3.3s
[Parallel(n_jobs=6)]: Done 6 tasks | elapsed: 3.3s
[Parallel(n_jobs=6)]: Done 13 tasks | elapsed: 3.7s
[Parallel(n_jobs=6)]: Done 20 tasks | elapsed: 4.0s
[Parallel(n_jobs=6)]: Done 29 tasks | elapsed: 4.3s
[Parallel(n_jobs=6)]: Done 38 tasks | elapsed: 4.7s
[Parallel(n_jobs=6)]: Done 49 tasks | elapsed: 5.0s
[Parallel(n_jobs=6)]: Done 60 tasks | elapsed: 5.4s
[Parallel(n_jobs=6)]: Done 73 tasks | elapsed: 5.9s
[Parallel(n_jobs=6)]: Done 90 out of 90 | elapsed: 6.4s finished
C:\Users\username\anaconda3\envs\SDaC\lib\site-packages\sklearn\pipeline.py:296: UserWarning: Persisting input arguments took 1.40s to run.
If this happens often in your code, it can cause performance problems
(results will be correct in all cases).
The reason for this is probably some large input arguments for a wrapped
function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
example so that they can fix the problem.
**fit_params_steps[name])
C:\Users\username\anaconda3\envs\SDaC\lib\site-packages\sklearn\pipeline.py:296: UserWarning: Persisting input arguments took 5.32s to run.
If this happens often in your code, it can cause performance problems
(results will be correct in all cases).
The reason for this is probably some large input arguments for a wrapped
function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
example so that they can fix the problem.
**fit_params_steps[name])
Output with error_score='raise'
The above exception was the direct cause of the following exception:
PicklingError Traceback (most recent call last)
<ipython-input-247-f1d887547f42> in <module>
19 )
20
---> 21 gsCV_clf.fit(X_train, label_encoder.inverse_transform(Y_train).reshape(-1)) # use class because of Random Forest Classifier
22 print('hi')
~\anaconda3\envs\SDaC\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
~\anaconda3\envs\SDaC\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
734 return results
735
--> 736 self._run_search(evaluate_candidates)
737
738 # For multi-metric evaluation, store the best_index_, best_params_ and
~\anaconda3\envs\SDaC\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1186 def _run_search(self, evaluate_candidates):
1187 """Search all candidates in param_grid"""
-> 1188 evaluate_candidates(ParameterGrid(self.param_grid))
1189
1190
~\anaconda3\envs\SDaC\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
713 for parameters, (train, test)
714 in product(candidate_params,
--> 715 cv.split(X, y, groups)))
716
717 if len(out) < 1:
~\anaconda3\envs\SDaC\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1052
1053 with self._backend.retrieval_context():
-> 1054 self.retrieve()
1055 # Make sure that we get a last message telling us we are done
1056 elapsed_time = time.time() - self._start_time
~\anaconda3\envs\SDaC\lib\site-packages\joblib\parallel.py in retrieve(self)
931 try:
932 if getattr(self._backend, 'supports_timeout', False):
--> 933 self._output.extend(job.get(timeout=self.timeout))
934 else:
935 self._output.extend(job.get())
~\anaconda3\envs\SDaC\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
540 AsyncResults.get from multiprocessing."""
541 try:
--> 542 return future.result(timeout=timeout)
543 except CfTimeoutError as e:
544 raise TimeoutError from e
~\anaconda3\envs\SDaC\lib\concurrent\futures\_base.py in result(self, timeout)
433 raise CancelledError()
434 elif self._state == FINISHED:
--> 435 return self.__get_result()
436 else:
437 raise TimeoutError()
~\anaconda3\envs\SDaC\lib\concurrent\futures\_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
PicklingError: ("Can't pickle <class '__main__.CustomTokenizer'>: it's not found as __main__.CustomTokenizer", 'PicklingError while hashing {\'transformer\': CustomTokenizer(), \'X\': kostenposition_bau_nr_komplett ... text\n12862 326 ... Fenster Holzfenster AQ 1\n17556 326 ... Scheiben verkratzt Holzfenster AQ 7, 8.1-8.2\n11648 314 ... Boden am Übergang zwischen Naturstein und Beto...\n2344 300 ... Farbverschmutzung Decke (Lampe) Farbverschmutz...\n13097 326 ... Sonnenschutz einstellen linkes Fenster klapper...\n... ... ... ...\n17213 327 ... 105 Küche Fuge Arbeitsplatte Rückwand fehlt Ti...\n4200 300 ... offene Hartverfugung Boden (Dusche) offene Har...\n12443 327 ... Leichter Versatzder verkleidungsteile am Lich...\n14023 324 ... Fuge mit Lücken Bad GU AQ 4, 5, 6, 8.1, 8.2\n3635 300 ... reinigen Glashalteleiste (WC) reinigen Glashal...\n\n[9731 rows x 3 columns], \'y\': array([\'326\', \'326\', \'314\', ..., \'327\', \'324\', \'300\'], dtype=\'<U3\'), \'weight\': None, \'message_clsname\': \'Pipeline\', \'message\': None, \'**\': {}}: PicklingError("Can\'t pickle <class \'__main__.CustomTokenizer\'>: it\'s not found as __main__.CustomTokenizer")')
Additional Information
- scikit-learn 0.23.2