
I wanted to try the DDPG algorithm on the MuJoCo task 'Reacher-v2', as can be seen in the code below.

I ran my code, but the performance, i.e. the episode rewards during training, didn't go up.

Some of my code mimics the cleanrl package, mainly the way Gaussian exploration noise is added to the actions.
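
For reference, the noise injection I am mimicking looks roughly like this (a rough sketch from memory with assumed names: actor, a tensor obs, and action_scale as a tensor of per-dimension half-ranges; this is not the exact cleanrl code):

with torch.no_grad():
    action = actor(obs)
    # Gaussian exploration noise whose std is a fraction of the action range
    action = action + NOISE * action_scale * torch.randn_like(action)
    # Reacher-v2 actions are bounded in [-1, 1]
    action = action.clamp(-1.0, 1.0)

The full script is below.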

import time

import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from cherry.experience_replay import ExperienceReplay

DEVICE = 'cuda'
DETYPE = torch.float32
ENV_NAME = 'Reacher-v2'
LEARNINGRATE = 3e-4
TOTAL_TIMESTEPS = 10000
LEARNING_STARTS = 2500
NOISE = 0.1
SAMPLE_SIZE = 256
GAMMA = 0.99
POLICY_FREQUENCY = 2
TAU = 0.005


class Actor(nn.Module):
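    """Deterministic policy: two hidden layers of 64 units; tanh keeps the output in [-1, 1]."""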
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.input = nn.Linear(self.state_dim, 64)
        self.layer1 = nn.Linear(64, 64)
        self.output = nn.Linear(64, self.action_dim)
        self.to(DEVICE)

    def forward(self, x):
        x = F.relu(self.input(x))
        x = F.relu(self.layer1(x))
        x = torch.tanh(self.output(x))
        return x


class Critic(nn.Module):
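    """Q-value network: concatenates state and action and outputs a single value."""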
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.input = nn.Linear(self.state_dim + self.action_dim, 64)
        self.layer1 = nn.Linear(64, 64)
        self.output = nn.Linear(64, 1)
        self.to(DEVICE)

    def forward(self, s, a):
        x = torch.cat([s, a], dim=1)
        x = F.relu(self.input(x))
        x = F.relu(self.layer1(x))
        x = torch.sigmoid(self.output(x))
        return x


env = gym.make(ENV_NAME)

actionDim = env.action_space.shape[0]
observationDim = env.observation_space.shape[0]

actionLowBound = env.action_space.low
actionHighBound = env.action_space.high

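# action-range scale and offset; action_scale is used below as the scale of the exploration noise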
action_scale = (actionLowBound - actionHighBound) / 2.0
action_bias = (actionLowBound + actionHighBound) / 2.0

observationLowBound = env.observation_space.low
observationHighBound = env.observation_space.high

actor = Actor(state_dim=observationDim, action_dim=actionDim)
target_actor = Actor(state_dim=observationDim, action_dim=actionDim)
critic = Critic(state_dim=observationDim, action_dim=actionDim)
target_critic = Critic(state_dim=observationDim, action_dim=actionDim)

critic_optimizer = torch.optim.Adam(critic.parameters(), lr=LEARNINGRATE)
actor_optimizer = torch.optim.Adam(actor.parameters(), lr=LEARNINGRATE)

buffer = ExperienceReplay(device=DEVICE)

start = time.time()

reward_sum = []

average_reward_s = 0.0

for epoches in range(TOTAL_TIMESTEPS):
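    # each iteration of this loop runs one full episode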

    state = env.reset()

    obs = state

    done = False

    reward_s = 0.0

    while not done:

        if epoches < LEARNING_STARTS:
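            # warm-up phase: sample uniform random actions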
            action = env.action_space.sample()
        else:
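            # afterwards: actor output plus Gaussian exploration noise, then clipping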
            with torch.no_grad():

                obs_ = torch.tensor(obs, device=DEVICE, dtype=torch.float32)

                action_ = actor(obs_)

                action_ += torch.normal(mean=torch.zeros_like(action_).to(DEVICE),
                                        std=torch.tensor(action_scale * NOISE, device=DEVICE))

                action = action_.cpu().numpy().clip(env.action_space.low, env.action_space.high)

        nextstate, reward, done, info = env.step(action)

        reward_s += reward

        done_value = 1 if done else 0

        buffer.append(obs, action, reward, nextstate, done_value)

        state = nextstate

        if epoches > LEARNING_STARTS:
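            # sample a minibatch and do one gradient update per environment step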
            datas = buffer.sample(size=SAMPLE_SIZE)

            states = torch.tensor(datas.state(), dtype=torch.float32, device=DEVICE)
            actions = torch.tensor(datas.action(), dtype=DETYPE, device=DEVICE)
            next_states = torch.tensor(datas.next_state(), dtype=DETYPE, device=DEVICE)
            dones = torch.tensor(datas.done(), dtype=DETYPE, device=DEVICE)
            rewards = torch.tensor(datas.reward(), dtype=DETYPE, device=DEVICE)

            with torch.no_grad():
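                # TD target computed from the target networks (no gradient tracking)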
                next_states_actions = target_actor(next_states)
                next_critic_value = target_critic(next_states, next_states_actions)
                next_Q_value = rewards + (1 - dones) * GAMMA * next_critic_value.view(-1, 1)

            current_Q_value = critic(states, actions)
            current_Q_loss = F.mse_loss(current_Q_value, next_Q_value)

            critic_optimizer.zero_grad()
            current_Q_loss.backward()
            critic_optimizer.step()

            if epoches % POLICY_FREQUENCY == 0:
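                # delayed actor update, followed by soft (Polyak) updates of the target networks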
                actor_loss = -1 * critic(states, actor(states)).mean()
                actor_optimizer.zero_grad()
                actor_loss.backward()
                actor_optimizer.step()

                # update the target network
                for param, target_param in zip(actor.parameters(), target_actor.parameters()):
                    target_param.data.copy_(TAU * param.data + (1 - TAU) * target_param.data)
                for param, target_param in zip(critic.parameters(), target_critic.parameters()):
                    target_param.data.copy_(TAU * param.data + (1 - TAU) * target_param.data)

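    # end of episode: record the cumulative reward; every 50 episodes print the running average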
    reward_sum.append(reward_s)

    average_reward_s += reward_s
    if epoches % 50 == 0:
        print("Epoch:{},\taverage_reward_s:{}\n".format(epoches, average_reward_s / 50))
        average_reward_s = 0.0

When I run this code, the cumulative episode rewards don't go up. Is there something wrong with the design of my code?
