I'm having this issue both on windows and ubuntu:
Fitting 10 folds for each of 12 candidates, totalling 120 fits
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
exception calling callback for <Future at 0x7f45139d8580 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
File "/home/vhviveiros/anaconda3/envs/tf-gpu/lib/python3.8/site-packages/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
callback(self)
File "/home/vhviveiros/anaconda3/envs/tf-gpu/lib/python3.8/site-packages/joblib/parallel.py", line 347, in __call__
self.parallel.dispatch_next()
File "/home/vhviveiros/anaconda3/envs/tf-gpu/lib/python3.8/site-packages/joblib/parallel.py", line 780, in dispatch_next
if not self.dispatch_one_batch(self._original_iterator):
File "/home/vhviveiros/anaconda3/envs/tf-gpu/lib/python3.8/site-packages/joblib/parallel.py", line 847, in dispatch_one_batch
self._dispatch(tasks)
File "/home/vhviveiros/anaconda3/envs/tf-gpu/lib/python3.8/site-packages/joblib/parallel.py", line 765, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/home/vhviveiros/anaconda3/envs/tf-gpu/lib/python3.8/site-packages/joblib/_parallel_backends.py", line 529, in apply_async
future = self._workers.submit(SafeFunction(func))
File "/home/vhviveiros/anaconda3/envs/tf-gpu/lib/python3.8/site-packages/joblib/externals/loky/reusable_executor.py", line 177, in submit
return super(_ReusablePoolExecutor, self).submit(
File "/home/vhviveiros/anaconda3/envs/tf-gpu/lib/python3.8/site-packages/joblib/externals/loky/process_executor.py", line 1102, in submit
raise self._flags.broken
joblib.externals.loky.process_executor.TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.
The exit codes of the workers are {SIGABRT(-6)}
Traceback (most recent call last):
File "/home/vhviveiros/GitHub/trabalho_covid/classify.py", line 18, in <module>
cf.validation(batch_size=[32, 16, 24], epochs=[100, 250, 200, 500])
File "/home/vhviveiros/GitHub/trabalho_covid/classifier.py", line 67, in validation
grid_search = grid_search.fit(self.X_train, self.y_train)
File "/home/vhviveiros/anaconda3/envs/tf-gpu/lib/python3.8/site-packages/sklearn/utils/validation.py", line 73, in inner_f
return f(**kwargs)
File "/home/vhviveiros/anaconda3/envs/tf-gpu/lib/python3.8/site-packages/sklearn/model_selection/_search.py", line 736, in fit
self._run_search(evaluate_candidates)
File "/home/vhviveiros/anaconda3/envs/tf-gpu/lib/python3.8/site-packages/sklearn/model_selection/_search.py", line 1188, in _run_search
evaluate_candidates(ParameterGrid(self.param_grid))
File "/home/vhviveiros/anaconda3/envs/tf-gpu/lib/python3.8/site-packages/sklearn/model_selection/_search.py", line 708, in evaluate_candidates
out = parallel(delayed(_fit_and_score)(clone(base_estimator),
File "/home/vhviveiros/anaconda3/envs/tf-gpu/lib/python3.8/site-packages/joblib/parallel.py", line 1042, in __call__
self.retrieve()
File "/home/vhviveiros/anaconda3/envs/tf-gpu/lib/python3.8/site-packages/joblib/parallel.py", line 921, in retrieve
self._output.extend(job.get(timeout=self.timeout))
File "/home/vhviveiros/anaconda3/envs/tf-gpu/lib/python3.8/site-packages/joblib/_parallel_backends.py", line 540, in wrap_future_result
return future.result(timeout=timeout)
File "/home/vhviveiros/anaconda3/envs/tf-gpu/lib/python3.8/concurrent/futures/_base.py", line 439, in result
return self.__get_result()
File "/home/vhviveiros/anaconda3/envs/tf-gpu/lib/python3.8/concurrent/futures/_base.py", line 388, in __get_result
raise self._exception
File "/home/vhviveiros/anaconda3/envs/tf-gpu/lib/python3.8/site-packages/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
callback(self)
File "/home/vhviveiros/anaconda3/envs/tf-gpu/lib/python3.8/site-packages/joblib/parallel.py", line 347, in __call__
self.parallel.dispatch_next()
File "/home/vhviveiros/anaconda3/envs/tf-gpu/lib/python3.8/site-packages/joblib/parallel.py", line 780, in dispatch_next
if not self.dispatch_one_batch(self._original_iterator):
File "/home/vhviveiros/anaconda3/envs/tf-gpu/lib/python3.8/site-packages/joblib/parallel.py", line 847, in dispatch_one_batch
self._dispatch(tasks)
File "/home/vhviveiros/anaconda3/envs/tf-gpu/lib/python3.8/site-packages/joblib/parallel.py", line 765, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/home/vhviveiros/anaconda3/envs/tf-gpu/lib/python3.8/site-packages/joblib/_parallel_backends.py", line 529, in apply_async
future = self._workers.submit(SafeFunction(func))
File "/home/vhviveiros/anaconda3/envs/tf-gpu/lib/python3.8/site-packages/joblib/externals/loky/reusable_executor.py", line 177, in submit
return super(_ReusablePoolExecutor, self).submit(
File "/home/vhviveiros/anaconda3/envs/tf-gpu/lib/python3.8/site-packages/joblib/externals/loky/process_executor.py", line 1102, in submit
raise self._flags.broken
TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.
The exit codes of the workers are {SIGABRT(-6)}
What already been done
- changed intel-openmp version
- reinstalled conda and tf-gpu env
- tried in both windows 10 and ubuntu 20.04
- How do I fix/debug this Multi-Process terminated worker error thrown in scikit learn
OBS: When running this code at windows, instead it fills the entire both ram and gpu memory freezing the system. With n_jobs=1, the process used average 2 GB ram (with other parameters). The input file is just a 524x254 .csv.
Environment: Conda
- conda 4.8.3
- intel-openmp 2020.1
- scikit-learn 0.23.1
- tensorflow 2.2.0
- tensorflow-base 2.2.0
- tensorflow-estimator 2.2.0
- tensorflow-gpu 2.2.0
- keras 2.4.3
- keras-base 2.4.3
- keras-preprocessing 1.1.0
Hardware
- Ryzen 3600
- 16GB Ram
- RTX 2060 S
Code example:
validation(batch_size=[32, 16, 24], epochs=[100, 250, 200, 500])
Different file
from keras.wrappers.scikit_learn import KerasClassifier
from models import classifier_model
from sklearn.model_selection import GridSearchCV, train_test_split
import pandas as pd
from sklearn.preprocessing import StandardScaler
import datetime
from utils import check_folder
import tensorflow as tf
from sklearn.metrics import confusion_matrix
def validation(self, cv=10, batch_size=-1, epochs=-1):
classifier = KerasClassifier(build_fn=classifier_model)
parameters = {'batch_size': batch_size,
'epochs': epochs,
'optimizer': ['adam'],
'activation': ['relu'],
'activationOutput': ['sigmoid']}
self.metrics = ['accuracy', 'roc_auc', 'precision', 'recall']
grid_search = GridSearchCV(estimator=classifier,
verbose=2,
param_grid=parameters,
n_jobs=2,
scoring=self.metrics,
refit='precision',
return_train_score=False,
cv=cv)
grid_search = grid_search.fit(self.X_train, self.y_train)
return grid_search
Different file
from tensorflow.keras.layers import Dropout, Dense
from tensorflow.keras.metrics import AUC, Precision, Recall
from tensorflow.keras.models import Sequential
def classifier_model(optimizer, activation, activationOutput):
classifier = Sequential()
classifier.add(Dense(units=200, activation=activation, input_shape=(254,)))
classifier.add(Dropout(rate=0.2))
classifier.add(Dense(units=200, activation=activation))
classifier.add(Dropout(rate=0.2))
classifier.add(Dense(units=200, activation=activation))
classifier.add(Dropout(rate=0.2))
classifier.add(Dense(units=200, activation=activation))
classifier.add(Dropout(rate=0.2))
classifier.add(Dense(units=200, activation=activation))
classifier.add(Dropout(rate=0.2))
classifier.add(Dense(units=200, activation=activation))
classifier.add(Dropout(rate=0.2))
classifier.add(Dense(units=1, activation=activationOutput))
classifier.compile(optimizer=optimizer,
loss='binary_crossentropy', metrics=['accuracy', AUC(), Precision(), Recall()])
return classifier