I wanted to try the DDPG algorithm on the MuJoCo task 'Reacher-v2', which can be seen in my code below. I ran the code, but the performance (the episode rewards during training) didn't go up. Some of my code mimics the cleanrl package, mainly the part that adds exploration noise to the actions.
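For reference, my understanding of that exploration scheme is: take the deterministic action from the actor, add zero-mean Gaussian noise whose standard deviation is the exploration-noise coefficient times half of the action range, and clip the result to the action bounds. Below is a rough standalone sketch of that idea (my own paraphrase, not cleanrl's actual code; `actor` is assumed to be a network like the one defined further down):

# A minimal sketch of the exploration pattern I am imitating, NOT cleanrl's actual code.
# It assumes a 'Reacher-v2' environment and an `actor` network like the one defined below.
import gym
import torch

device = 'cuda'
noise_std = 0.1                                                       # plays the role of NOISE below
env = gym.make('Reacher-v2')
obs = env.reset()
action_scale = (env.action_space.high - env.action_space.low) / 2.0  # half of the action range

with torch.no_grad():
    a = actor(torch.tensor(obs, dtype=torch.float32, device=device))   # deterministic policy output
    noise = torch.normal(mean=torch.zeros_like(a),                     # zero-mean Gaussian noise,
                         std=torch.tensor(action_scale * noise_std,    # scaled per action dimension
                                          dtype=torch.float32, device=device))
    a = a + noise
action = a.cpu().numpy().clip(env.action_space.low, env.action_space.high)  # clip to the action bounds

My full code: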
import time
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from cherry.experience_replay import ExperienceReplay
DEVICE = 'cuda'
DETYPE = torch.float32
ENV_NAME = 'Reacher-v2'
LEARNINGRATE = 3e-4
TOTAL_TIMESTEPS = 10000
LEARNING_STARTS = 2500
NOISE = 0.1
SAMPLE_SIZE = 256
GAMMA = 0.99
POLICY_FREQUENCY = 2
TAU = 0.005
class Actor(nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.input = nn.Linear(self.state_dim, 64)
        self.layer1 = nn.Linear(64, 64)
        self.output = nn.Linear(64, self.action_dim)
        self.to(DEVICE)

    def forward(self, x):
        x = F.relu(self.input(x))
        x = F.relu(self.layer1(x))
        x = F.tanh(self.output(x))
        return x
class Critic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.input = nn.Linear(self.state_dim + self.action_dim, 64)
        self.layer1 = nn.Linear(64, 64)
        self.output = nn.Linear(64, 1)
        self.to(DEVICE)

    def forward(self, s, a):
        x = torch.cat([s, a], dim=1)
        x = F.relu(self.input(x))
        x = F.relu(self.layer1(x))
        x = F.sigmoid(self.output(x))
        return x
env = gym.make(ENV_NAME)
actionDim = env.action_space.shape[0]
observationDim = env.observation_space.shape[0]
actionLowBound = env.action_space.low
actionHighBound = env.action_space.high
action_scale = (actionLowBound - actionHighBound) / 2.0
action_bias = (actionLowBound + actionHighBound) / 2.0
observationLowBound = env.observation_space.low
observationHighBound = env.observation_space.high
actor = Actor(state_dim=observationDim, action_dim=actionDim)
target_actor = Actor(state_dim=observationDim, action_dim=actionDim)
critic = Critic(state_dim=observationDim, action_dim=actionDim)
target_critic = Critic(state_dim=observationDim, action_dim=actionDim)
critic_optimizer = torch.optim.Adam(critic.parameters(), lr=LEARNINGRATE)
actor_optimizer = torch.optim.Adam(actor.parameters(), lr=LEARNINGRATE)
buffer = ExperienceReplay(device=DEVICE)
start = time.time()
reward_sum = []
average_reward_s = 0.0
for epoches in range(TOTAL_TIMESTEPS):
    state = env.reset()
    obs = state
    done = False
    reward_s = 0.0
    while not done:
        # random actions before LEARNING_STARTS, then the actor's output plus Gaussian noise
        if epoches < LEARNING_STARTS:
            action = env.action_space.sample()
        else:
            with torch.no_grad():
                obs_ = torch.autograd.Variable(torch.tensor(obs, device=DEVICE, dtype=torch.float32))
                action_ = actor(obs_)
                action_ += torch.normal(mean=torch.zeros_like(action_).to(DEVICE),
                                        std=torch.tensor(action_scale * NOISE, device=DEVICE))
            action = action_.cpu().numpy().clip(env.action_space.low, env.action_space.high)
        nextstate, reward, done, info = env.step(action)
        reward_s += reward
        done_value = 1 if done else 0
        buffer.append(obs, action, reward, nextstate, done_value)
        state = nextstate
        if epoches > LEARNING_STARTS:
            # sample a minibatch from the replay buffer and update the critic
            datas = buffer.sample(size=SAMPLE_SIZE)
            states = torch.tensor(datas.state(), dtype=torch.float32, device=DEVICE)
            actions = torch.tensor(datas.action(), dtype=DETYPE, device=DEVICE)
            next_states = torch.tensor(datas.next_state(), dtype=DETYPE, device=DEVICE)
            dones = torch.tensor(datas.done(), dtype=DETYPE, device=DEVICE)
            rewards = torch.tensor(datas.reward(), dtype=DETYPE, device=DEVICE)
            with torch.no_grad():
                next_states_actions = target_actor(next_states)
                next_critic_value = target_critic(next_states, next_states_actions)
                next_Q_value = rewards + (1 - dones) * GAMMA * next_critic_value.view(-1, 1)
            current_Q_value = critic(states, actions)
            current_Q_loss = F.mse_loss(current_Q_value, next_Q_value)
            critic_optimizer.zero_grad()
            current_Q_loss.backward()
            critic_optimizer.step()
            if epoches % POLICY_FREQUENCY == 0:
                # update the actor
                actor_loss = -1 * critic(states, actor(states)).mean()
                actor_optimizer.zero_grad()
                actor_loss.backward()
                actor_optimizer.step()
                # update the target networks (soft update with TAU)
                for param, target_param in zip(actor.parameters(), target_actor.parameters()):
                    target_param.data.copy_(TAU * param.data + (1 - TAU) * target_param.data)
                for param, target_param in zip(critic.parameters(), target_critic.parameters()):
                    target_param.data.copy_(TAU * param.data + (1 - TAU) * target_param.data)
    reward_sum.append(reward_s)
    average_reward_s += reward_s
    if epoches % 50 == 0:
        print("Epoch:{},\taverage_reward_s:{}\n".format(epoches, average_reward_s / 50))
        average_reward_s = 0.0
I tried running this, but the cumulative episode rewards didn't go up. Is there something wrong with the design of my code?