Error while using offline experiences for DDPG. The custom environment's dimensions (action space and state space) seem to be inconsistent with what the RLlib DDPG trainer expects.
Ubuntu, Ray 0.7 version (latest ray), DDPG example, offline dataset. Used sampler builder for offline dataset.
Estimated DQN with this experience data and it ran through. After changing the environment's action space to be continuous (a `Box` with shape `(1,)`), DDPG did not work.
from ray.tune.registry import register_env
# Passed to the trainer as the "train_batch_size" entry of the config below.
TRAIN_BATCH_SIZE = 512
class mmt_ctns_offline_logs(gym.Env):
    """Environment stub that declares the spaces of the offline experience logs.

    Only ``action_space`` and ``observation_space`` are defined here; no
    ``reset``/``step`` logic is implemented in this class.
    """

    def __init__(self):
        # One-dimensional continuous action, values range from 0 to 50.
        # A one-element tuple is written ``(1,)``; ``(,1)`` is a syntax error.
        self.action_space = Box(0, 50, shape=(1,), dtype=np.float32)
        # 58 columns in the state space.
        self.observation_space = Box(
            -100000, 100000, shape=(58,), dtype=np.float32
        )
        # NOTE(review): the reported failure feeds an action batch of shape
        # (512,) into a placeholder of shape (?, 1). The logged actions
        # presumably need shape (1,) each to match ``action_space`` --
        # confirm how the offline experiences were written.
# Register the custom environment under the name the config refers to.
register_env("mmt_env_ctnaction", lambda _env_config: mmt_ctns_offline_logs())

# Overrides layered on top of the DDPG defaults. Some entries repeat the
# defaults; they are spelled out explicitly for clarity within the team.
config_dict = {
    "env": "mmt_env_ctnaction",
    "evaluation_num_episodes": 50,
    "num_workers": 11,
    "sample_batch_size": 512,
    "train_batch_size": TRAIN_BATCH_SIZE,
    # Offline experience input and output locations.
    "input": "<experience_replay_folder>/",
    "output": "<any_folder>",
    "gamma": 0.99,
    "horizon": None,
    "optimizer_class": "SyncReplayOptimizer",
    "optimizer": {"prioritized_replay": True},
    # Actor / critic network layout.
    "actor_hiddens": [128, 64],
    "actor_hidden_activation": "relu",
    "critic_hiddens": [64, 64],
    "critic_hidden_activation": "relu",
    "n_step": 1,
    "target_network_update_freq": 500,
    "input_evaluation": [],
    "ignore_worker_failures": True,
    "log_level": "DEBUG",
    # Replay buffer settings.
    "buffer_size": 50000,
    "prioritized_replay": True,
    "prioritized_replay_alpha": 0.6,
    "prioritized_replay_beta": 0.4,
    "prioritized_replay_eps": 1e-6,
    "compress_observations": False,
    # Optimisation settings.
    "lr": 1e-3,
    "actor_loss_coeff": 0.1,
    "critic_loss_coeff": 1.0,
    "use_huber": False,
    "huber_threshold": 1.0,
    "l2_reg": 1e-6,
    "grad_norm_clipping": True,
    "learning_starts": 1500,
}

# Start from a copy of the DDPG defaults and apply every override in one step.
# NOTE: the original carried `dqn.DEFAULT_CONFIG.copy()` as a commented-out
# alternative on this line.
config = ddpg.DEFAULT_CONFIG.copy()
config.update(config_dict)
config_ddpg = config
config_ddpg  # bare expression kept from the original (echoes the dict in a REPL/notebook)
# Launch one experiment that runs the DDPG trainer with the assembled config.
run_experiments(
    {
        "NM_testing_DDPG_offpolicy_noIS": dict(
            run="DDPG",
            env="mmt_env_ctnaction",
            config=config_ddpg,
            local_dir="/oxygen/narasimham/ray/tmp/mmt/mmt_user_27_DDPG/",
        ),
    }
)
Expected: results from DDPG training iterations.
Actual: the following error:
ray.exceptions.RayTaskError: ray_DDPGTrainer:train() (pid=89635, host=ip-10-114-53-179)
File "/home/ubuntu/anaconda3/envs/tf_p36n/lib/python3.6/site-packages/ray/rllib/utils/tf_run_builder.py", line 49, in get
self.feed_dict, os.environ.get("TF_TIMELINE_DIR"))
File "/home/ubuntu/anaconda3/envs/tf_p36n/lib/python3.6/site-packages/ray/rllib/utils/tf_run_builder.py", line 91, in run_timeline
fetches = sess.run(ops, feed_dict=feed_dict)
File "/home/ubuntu/anaconda3/envs/tf_p36n/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 877, in run
run_metadata_ptr)
File "/home/ubuntu/anaconda3/envs/tf_p36n/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1076, in _run
str(subfeed_t.get_shape())))
ValueError: Cannot feed value of shape (512,) for Tensor 'default_policy/action:0', which has shape '(?, 1)'
During handling of the above exception, another exception occurred: