I am running Ray RLlib
on SageMaker with an 8-core CPU using the sagemaker_rl
library, and I set num_workers to 7.
After a long execution I get the error "The actor died unexpectedly before finishing this task". My launcher is:
class MyLauncher(SageMakerRayLauncher):
    """Launcher that registers the "RiveRL-v1" environment and supplies its PPO experiment config."""

    def register_env_creator(self):
        """Register the environment factory under the name "RiveRL-v1"."""

        def _build_env(env_config):
            # Forward the env config to the project's factory unchanged.
            return create_env(env_config)

        register_env("RiveRL-v1", _build_env)

    def get_experiment_config(self):
        """Return the experiment dictionary with a single "training" entry.

        The entry runs "PPO" on "RiveRL-v1" and requests a checkpoint
        frequency of 2. The worker and GPU counts are read from
        ``self.num_cpus`` and ``self.num_gpus``.
        """
        env_settings = {
            "window_size": 25,
            "max_allowed_loss": 0.2,
        }

        trainer_settings = {
            "ignore_worker_failures": True,
            "gamma": 0.6,
            "num_sgd_iter": 5,
            "lr": 0.0001,
            "sgd_minibatch_size": 32768,
            "train_batch_size": 100000,
            "use_gae": False,
            # One fewer worker than the CPU count; presumably this leaves a
            # core for the driver process -- TODO confirm.
            "num_workers": self.num_cpus - 1,
            "num_gpus": self.num_gpus,
            "batch_mode": "complete_episodes",
            "env_config": env_settings,
            "observation_filter": "MeanStdFilter",
            "entropy_coeff": 0.01,
        }

        training_spec = {
            "env": "RiveRL-v1",
            "run": "PPO",
            "config": trainer_settings,
            "checkpoint_freq": 2,
        }

        return {"training": training_spec}
Failure # 1 (occurred at 2021-10-20_18-35-15)
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/ray/tune/trial_runner.py", line 467, in _process_trial
    result = self.trial_executor.fetch_result(trial)
  File "/usr/local/lib/python3.6/dist-packages/ray/tune/ray_trial_executor.py", line 431, in fetch_result
    result = ray.get(trial_future[0], DEFAULT_GET_TIMEOUT)
  File "/usr/local/lib/python3.6/dist-packages/ray/worker.py", line 1517, in get
    raise value
ray.exceptions.RayActorError: The actor died unexpectedly before finishing this task.
But whenever I change num_workers
to 1,
the problem goes away. Any idea how I can fix this issue?