2

I am running Ray RLlib on SageMaker on an 8-core CPU instance using the sagemaker_rl library, with num_workers set to 7.

After a long execution, I get the following error: "The actor died unexpectedly before finishing this task".


class MyLauncher(SageMakerRayLauncher):
    """Launcher that trains PPO on the "RiveRL-v1" environment.

    Supplies the two hooks visible here: one that registers the
    environment factory and one that returns the experiment configuration.
    How the base class consumes them is not shown in this file.
    """

    def register_env_creator(self):
        """Register the environment factory under the name "RiveRL-v1"."""

        def _build_env(env_config):
            # Thin wrapper so `create_env` is looked up when the
            # environment is actually built, not at registration time.
            return create_env(env_config)

        register_env("RiveRL-v1", _build_env)

    def get_experiment_config(self):
        """Return the experiment definition as a nested dict.

        The result has a single "training" entry holding the environment
        name, the algorithm name, the algorithm config and the checkpoint
        frequency.
        """
        # One fewer worker than self.num_cpus
        # (presumably to leave a core for the driver -- TODO confirm).
        rollout_workers = self.num_cpus - 1

        env_settings = {
            "window_size": 25,
            "max_allowed_loss": 0.2,
        }

        ppo_settings = {
            "ignore_worker_failures": True,
            "gamma": 0.6,
            "num_sgd_iter": 5,
            "lr": 0.0001,
            "sgd_minibatch_size": 32768,
            "train_batch_size": 100000,
            "use_gae": False,
            "num_workers": rollout_workers,
            "num_gpus": self.num_gpus,
            "batch_mode": "complete_episodes",
            "env_config": env_settings,
            "observation_filter": "MeanStdFilter",
            "entropy_coeff": 0.01,
        }

        training_spec = {
            "env": "RiveRL-v1",
            "run": "PPO",
            "config": ppo_settings,
            "checkpoint_freq": 2,
        }
        return {"training": training_spec}

Failure # 1 (occurred at 2021-10-20_18-35-15)
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/ray/tune/trial_runner.py", line 467, in _process_trial
    result = self.trial_executor.fetch_result(trial)
  File "/usr/local/lib/python3.6/dist-packages/ray/tune/ray_trial_executor.py", line 431, in fetch_result
    result = ray.get(trial_future[0], DEFAULT_GET_TIMEOUT)
  File "/usr/local/lib/python3.6/dist-packages/ray/worker.py", line 1517, in get
    raise value
ray.exceptions.RayActorError: The actor died unexpectedly before finishing this task.

But whenever I change num_workers to 1, the problem goes away. Any idea how I can fix this issue?

Abhishek Dutt
  • 1,308
  • 7
  • 14
  • 24

0 Answers0