
I'm new to AI and want to get into the field. I have spent some time finishing a program that trains an agent in a simple customized environment, but after training in Colab for 10000 episodes it still does not perform well. I am not sure whether something is wrong with the customized env or with the training process.

Env: a helicopter tries to get through a continuous flow of birds (max number: 10). The birds move from right to left, and fuel tanks appear at random. As long as the helicopter is alive, i.e., it has not collided with a bird and still has fuel (initialized to 1000; when it collides with a fuel icon (max number: 2), fuel_left is reset to 1000), its reward increases by 1.

The environment is shown in the figure below: a simple customized environment. After 10000 episodes of DDPG/DQN, the agent still cannot play for more than 15 seconds; could you point out where the problem is?

Action space (1 dim): 0, 1, 2, 3, 4 -> the helicopter moves up, down, left, right, or stays still.

State space (28 dim): (x, y) for 10 birds, 2 fuel tanks, and 1 helicopter, plus the fuel left and the rewards obtained so far.

Rewards: if the helicopter is alive, the reward increases by 1 each step.
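
For clarity, the 28-dim observation vector is packed like this (it matches how step() fills it in the code below):

# Layout of the 28-dim observation (values are normalized at the end of step()):
# obs[0:20]  -> (x, y) for up to 10 birds       (x is divided by 800, y by 600)
# obs[20:24] -> (x, y) for up to 2 fuel tanks   (same scaling)
# obs[24:26] -> (x, y) of the helicopter        (same scaling)
# obs[26]    -> fuel_left / 1000
# obs[27]    -> ep_return / 100
obs = np.zeros(28)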

The env settings code is as follows (custom.py):

import numpy as np
import cv2
import matplotlib.pyplot as plt
import random
import math
import time

from gym import Env, spaces

font = cv2.FONT_HERSHEY_COMPLEX_SMALL

class ChopperScape(Env):
    def __init__(self):
        super(ChopperScape,self).__init__()
        self.maxbirdnum = 10
        self.maxfuelnum = 2
        self.observation_shape = (28,)
        self.canvas_shape = (600,800,3)
        self.action_space = spaces.Discrete(5,)
        self.last_action = 0
        self.obs = np.zeros(self.observation_shape)
        self.canvas = np.ones(self.canvas_shape) * 1
        self.elements = []
        self.maxfuel = 1000
        self.y_min = int (self.canvas_shape[0] * 0.1)
        self.x_min = 0
        self.y_max = int (self.canvas_shape[0] * 0.9)
        self.x_max = self.canvas_shape[1]
      
    def draw_elements_on_canvas(self): 
        self.canvas = np.ones(self.canvas_shape) * 1

        for elem in self.elements:
            elem_shape = elem.icon.shape
            x,y = elem.x, elem.y
            self.canvas[y : y + elem_shape[1], x:x + elem_shape[0]] = elem.icon

        text = 'Fuel Left: {} | Rewards: {}'.format(self.fuel_left, self.ep_return)
        self.canvas = cv2.putText(self.canvas, text, (10,20), font, 0.8, (0,0,0), 1, cv2.LINE_AA)

    def reset(self):
        
        self.fuel_left = self.maxfuel
        self.ep_return  = 0
        
        self.obs = np.zeros(self.observation_shape)
        self.obs[26] = self.maxfuel
        
        self.bird_count = 0
        self.fuel_count = 0

        x = random.randrange(int(self.canvas_shape[0] * 0.05), int(self.canvas_shape[0] * 0.90))
        y = random.randrange(int(self.canvas_shape[1] * 0.05), int(self.canvas_shape[1] * 0.90))
        self.chopper = Chopper("chopper", self.x_max, self.x_min, self.y_max, self.y_min)
        self.chopper.set_position(x,y)
        self.obs[24] = x
        self.obs[25] = y
        self.elements = [self.chopper]

        self.canvas = np.ones(self.canvas_shape) * 1

        self.draw_elements_on_canvas()

        return self.obs
  
    def get_action_meanings(self):
        return {0: "Right", 1: "Left", 2: "Down", 3: "Up", 4: "Do Nothing"}
  
    def has_collided(self, elem1, elem2):
        x_col = False
        y_col = False
        elem1_x, elem1_y = elem1.get_position()
        elem2_x, elem2_y = elem2.get_position()

        if 2 * abs(elem1_x - elem2_x) <= (elem1.icon_w + elem2.icon_w):
            x_col = True
        if 2 * abs(elem1_y - elem2_y) <= (elem1.icon_h + elem2.icon_h):
            y_col = True
        if x_col and y_col:
            return True
        return False

    def step(self, action):
        done = False
        
        reward = 1

        assert self.action_space.contains(action), "invalid action" 

        if action == 4:
            self.chopper.move(0,5)
        elif action == 1:
            self.chopper.move(0,-5)
        elif action == 2:
            self.chopper.move(5,0)
        elif action == 0:
            self.chopper.move(-5,0)
        elif action == 3:
            self.chopper.move(0,0)

        if random.random() < 0.1 and self.bird_count<self.maxbirdnum:
            spawned_bird = Bird("bird_{}".format(self.bird_count), self.x_max, self.x_min, self.y_max, self.y_min)
            self.bird_count += 1
            bird_y = random.randrange(self.y_min, self.y_max)
            spawned_bird.set_position(self.x_max, bird_y)
            self.elements.append(spawned_bird)    

        if random.random() < 0.05 and self.fuel_count<self.maxfuelnum:
            spawned_fuel = Fuel("fuel_{}".format(self.fuel_count), self.x_max, self.x_min, self.y_max, self.y_min)
            self.fuel_count += 1
            fuel_x = random.randrange(self.x_min, self.x_max)
            fuel_y = self.y_max
            spawned_fuel.set_position(fuel_x, fuel_y)
            self.elements.append(spawned_fuel)

        for elem in self.elements:
            if isinstance(elem, Bird):
                if elem.get_position()[0] <= self.x_min:
                    self.elements.remove(elem)
                    self.bird_count -= 1
                else:
                    elem.move(-5,0)
                
                if self.has_collided(self.chopper, elem):
                    done = True
                    reward = -100000.0*(1.0/self.ep_return+1)

            if isinstance(elem, Fuel):
                flag1 = False
                flag2 = False
                if self.has_collided(self.chopper, elem):
                    self.fuel_left = self.maxfuel
                    flag1 = True
                    reward += 2
                    # time.sleep(0.5)
                    
                if elem.get_position()[1] <= self.y_min:
                    flag2 = True
                    self.fuel_count -= 1
                else:
                    elem.move(0, -5)
                
                if flag1 == True or flag2 == True:
                    self.elements.remove(elem)
                    
                
        
        self.fuel_left -= 1
        if self.fuel_left == 0:
            done = True
        self.draw_elements_on_canvas()
        
        self.ep_return += 1
        
        birdnum = 0
        fuelnum = 0
        x_, y_ = self.chopper.get_position()
        dis = 0.0
        for elem in self.elements:
            x,y = elem.get_position()
            if isinstance(elem,Bird):
                self.obs[2*birdnum] = x
                self.obs[2*birdnum+1] = y
                birdnum += 1
                dis += math.hypot(x_-x,y_-y)
            if isinstance(elem,Fuel):
                base = self.maxbirdnum*2
                self.obs[base+2*fuelnum] = x
                self.obs[base+2*fuelnum+1] = y
                fuelnum += 1
        self.obs[24] = x_
        self.obs[25] = y_
        self.obs[26] = self.fuel_left
        self.obs[27] = self.ep_return
        
        if x_ == self.x_min or x_ == self.x_max or y_ == self.y_max or y_ == self.y_min:
            reward -= random.random()
        
        for i in range(26):
            if i%2 == 0:
                self.obs[i]/=800.0
            else:
                self.obs[i]/=600.0
        self.obs[26]/=1000.0
        self.obs[27]/=100.0
        
        # print('reward:',reward)
        # if done == True:
        #     time.sleep(1)
        return self.obs, reward, done, {}
    
    def render(self, mode = "human"):
        assert mode in ["human", "rgb_array"], "Invalid mode, must be either \"human\" or \"rgb_array\""
        if mode == "human":
            cv2.imshow("Game", self.canvas)
            cv2.waitKey(10)
        
        elif mode == "rgb_array":
            return self.canvas
      
    def close(self):
        cv2.destroyAllWindows()

class Point(object):
    def __init__(self, name, x_max, x_min, y_max, y_min):
        self.x = 0
        self.y = 0
        self.x_min = x_min
        self.x_max = x_max
        self.y_min = y_min
        self.y_max = y_max
        self.name = name
    
    def set_position(self, x, y):
        self.x = self.clamp(x, self.x_min, self.x_max - self.icon_w)
        self.y = self.clamp(y, self.y_min, self.y_max - self.icon_h)
    
    def get_position(self):
        return (self.x, self.y)
    
    def move(self, del_x, del_y):
        self.x += del_x
        self.y += del_y
        
        self.x = self.clamp(self.x, self.x_min, self.x_max - self.icon_w)
        self.y = self.clamp(self.y, self.y_min, self.y_max - self.icon_h)

    def clamp(self, n, minn, maxn):
        return max(min(maxn, n), minn)
class Chopper(Point):
    def __init__(self, name, x_max, x_min, y_max, y_min):
        super(Chopper, self).__init__(name, x_max, x_min, y_max, y_min)
        self.icon = cv2.imread("chopper1.jpg") / 255.0
        self.icon_w = 64
        self.icon_h = 64
        self.icon = cv2.resize(self.icon, (self.icon_h, self.icon_w))
        
class Bird(Point):
    def __init__(self, name, x_max, x_min, y_max, y_min):
        super(Bird, self).__init__(name, x_max, x_min, y_max, y_min)
        self.icon = cv2.imread("bird1.jpg") / 255.0
        self.icon_w = 32
        self.icon_h = 32
        self.icon = cv2.resize(self.icon, (self.icon_h, self.icon_w))
    
class Fuel(Point):
    def __init__(self, name, x_max, x_min, y_max, y_min):
        super(Fuel, self).__init__(name, x_max, x_min, y_max, y_min)
        self.icon = cv2.imread("fuel1.jpg") / 255.0
        self.icon_w = 32
        self.icon_h = 32
        self.icon = cv2.resize(self.icon, (self.icon_h, self.icon_w))

if __name__ == '__main__':
    from IPython import display

    env = ChopperScape()
    obs = env.reset()

    while True:
        # random agent
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        
        # Render the game
        env.render()
        
        if done == True:
            break

    env.close()
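
For completeness, the class above only stores observation_shape; the gym observation space I have in mind for the normalized 28-dim state would look roughly like the sketch below (this is not in my current custom.py):

# Sketch of a Box observation space matching the normalized 28-dim state (assumed, not in custom.py):
# positions and fuel are scaled to [0, 1]; ep_return / 100 can exceed 1, hence the open upper bound.
from gym import spaces
import numpy as np

observation_space = spaces.Box(low=0.0, high=np.inf, shape=(28,), dtype=np.float32)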

The DDPG code used to train the agent is as follows (ddpg.py):

from custom import ChopperScape
import random
import collections
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

#Hyperparameters
lr_mu        = 0.005
lr_q         = 0.01
gamma        = 0.99
batch_size   = 32
buffer_limit = 50000
tau          = 0.005 # for target network soft update

class ReplayBuffer():
    def __init__(self):
        self.buffer = collections.deque(maxlen=buffer_limit)

    def put(self, transition):
        self.buffer.append(transition)
    
    def sample(self, n):
        mini_batch = random.sample(self.buffer, n)
        s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], []

        for transition in mini_batch:
            s, a, r, s_prime, done = transition
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append(r)
            s_prime_lst.append(s_prime)
            done_mask = 0.0 if done else 1.0
            done_mask_lst.append(done_mask)
        return torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst, dtype=torch.float), \
                torch.tensor(r_lst, dtype=torch.float), torch.tensor(s_prime_lst, dtype=torch.float), \
                torch.tensor(done_mask_lst, dtype=torch.float)
    
    def size(self):
        return len(self.buffer)

class MuNet(nn.Module):
    def __init__(self):
        super(MuNet, self).__init__()
        self.fc1 = nn.Linear(28, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc_mu = nn.Linear(64, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        mu = torch.tanh(self.fc_mu(x))
        return mu

class QNet(nn.Module):
    def __init__(self):
        super(QNet, self).__init__()
        self.fc_s = nn.Linear(28, 64)
        self.fc_a = nn.Linear(1,64)
        self.fc_q = nn.Linear(128, 32)
        self.fc_out = nn.Linear(32,1)

    def forward(self, x, a):
        h1 = F.relu(self.fc_s(x))
        h2 = F.relu(self.fc_a(a))
        cat = torch.cat([h1,h2], dim=1)
        q = F.relu(self.fc_q(cat))
        q = self.fc_out(q)
        return q

class OrnsteinUhlenbeckNoise:
    def __init__(self, mu):
        self.theta, self.dt, self.sigma = 0.1, 0.01, 0.1
        self.mu = mu
        self.x_prev = np.zeros_like(self.mu)

    def __call__(self):
        x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \
                self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = x
        return x
      
def train(mu, mu_target, q, q_target, memory, q_optimizer, mu_optimizer):
    s,a,r,s_prime,done_mask  = memory.sample(batch_size)
    core = q_target(s_prime, mu_target(s_prime)) * done_mask
    target = r + gamma * core
    
    q_loss = F.smooth_l1_loss(q(s,a), target.detach())
    q_optimizer.zero_grad()
    q_loss.backward()
    q_optimizer.step()
    
    mu_loss = -q(s,mu(s)).mean() # That's all for the policy loss.
    mu_optimizer.zero_grad()
    mu_loss.backward()
    mu_optimizer.step()
    
def soft_update(net, net_target):
    for param_target, param in zip(net_target.parameters(), net.parameters()):
        param_target.data.copy_(param_target.data * (1.0 - tau) + param.data * tau)
    
def main():
    env = ChopperScape()
    memory = ReplayBuffer()

    q, q_target = QNet(), QNet()
    q_target.load_state_dict(q.state_dict())
    mu, mu_target = MuNet(), MuNet()
    mu_target.load_state_dict(mu.state_dict())

    score = 0.0
    print_interval = 20

    mu_optimizer = optim.Adam(mu.parameters(), lr=lr_mu)
    q_optimizer  = optim.Adam(q.parameters(), lr=lr_q)
    ou_noise = OrnsteinUhlenbeckNoise(mu=np.zeros(1))

    for n_epi in range(10000):
        s = env.reset()
        done = False
        
        while not done:
            a = mu(torch.from_numpy(s).float())
            a = a.item() + ou_noise()[0]
            print('action:',a)
            s_prime, r, done, info = env.step(a)
            env.render()
            memory.put((s,a,r/100.0,s_prime,done))
            score += r
            s = s_prime
                
        if memory.size()>20000:
            for _ in range(10):
                train(mu, mu_target, q, q_target, memory, q_optimizer, mu_optimizer)
                soft_update(mu, mu_target)
                soft_update(q,  q_target)
        
        if n_epi%print_interval==0 and n_epi!=0:
            print("# of episode :{}, avg score : {:.1f}".format(n_epi, score/print_interval))
            score = 0.0

    env.close()

if __name__ == '__main__':
    main()

And the DQN code is as follows (dqn.py):

import gym
import collections
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from custom import ChopperScape

#Hyperparameters
learning_rate = 0.0005
gamma         = 0.98
buffer_limit  = 50000
batch_size    = 32

class ReplayBuffer():
    def __init__(self):
        self.buffer = collections.deque(maxlen=buffer_limit)
    
    def put(self, transition):
        self.buffer.append(transition)
    
    def sample(self, n):
        mini_batch = random.sample(self.buffer, n)
        s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], []
        
        for transition in mini_batch:
            s, a, r, s_prime, done_mask = transition
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            done_mask_lst.append([done_mask])

        return torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
               torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \
               torch.tensor(done_mask_lst)
    
    def size(self):
        return len(self.buffer)

class Qnet(nn.Module):
    def __init__(self):
        super(Qnet, self).__init__()
        self.fc1 = nn.Linear(28, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 5)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
      
    def sample_action(self, obs, epsilon):
        out = self.forward(obs)
        # coin = random.random()
        # if coin < epsilon:
        #     return random.randint(0,1)
        # else : 
        #     return out.argmax().item()
        return out.argmax().item()
            
def train(q, q_target, memory, optimizer):
    for _ in range(10):
        s,a,r,s_prime,done_mask = memory.sample(batch_size)

        q_out = q(s)
        q_a = q_out.gather(1,a)
        max_q_prime = q_target(s_prime).max(1)[0].unsqueeze(1)
        target = r + gamma * max_q_prime * done_mask
        loss = F.smooth_l1_loss(q_a, target)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

def main():
    env = ChopperScape()
    q = torch.load('10000_dqn_3.pt')
    q_target = torch.load('10000_dqn_3_qtarget.pt')
    # q_target.load_state_dict(q.state_dict())
    memory = ReplayBuffer()

    print_interval = 20
    score = 0.0  
    optimizer = optim.Adam(q.parameters(), lr=learning_rate)

    for n_epi in range(10000):
        epsilon = max(0.01, 0.08 - 0.01*(n_epi/200)) #Linear annealing from 8% to 1%
        s = env.reset()
        done = False

        while not done:
            a = q.sample_action(torch.from_numpy(s).float(), epsilon)      
            s_prime, r, done, info = env.step(a)
            env.render()
            done_mask = 0.0 if done else 1.0
            memory.put((s,a,r,s_prime, done_mask))
            s = s_prime

            if done:
                break
            score += r
            
        if memory.size()>20000:
            train(q, q_target, memory, optimizer)

        if n_epi%print_interval==0 and n_epi!=0:
            q_target.load_state_dict(q.state_dict())
            print("n_episode :{}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format(n_epi, score/print_interval, memory.size(), epsilon*100))
            score = 0.0
    env.close()

def test():
    env = ChopperScape()
    q = torch.load('10000_dqn_q.pt')
    done = False
    s = env.reset()
    while not done:
        a = q.sample_action(torch.from_numpy(s).float(), 1)      
        s_prime, r, done, info = env.step(a)
        env.render()
        s = s_prime
        if done:
            break

if __name__ == '__main__':
    main()
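
The two checkpoints loaded at the top of main() come from an earlier run (as far as I remember they were written with torch.save on the whole Qnet module). To start DQN training from scratch instead, I would replace the two torch.load lines with something like this sketch:

# Start from freshly initialized networks instead of loading old checkpoints (sketch):
q = Qnet()
q_target = Qnet()
q_target.load_state_dict(q.state_dict())

# ... and after training, save the whole modules so main()/test() can torch.load them again:
# torch.save(q, '10000_dqn_3.pt')
# torch.save(q_target, '10000_dqn_3_qtarget.pt')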

Note: when running DQN, please comment out the action-conversion part in custom.py / class ChopperScape / step (it is only needed for DDPG's continuous actions).
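
That action-conversion part is not included in the custom.py listing above; it is a few lines at the top of step() that bin DDPG's continuous output (tanh plus OU noise, so roughly in [-1, 1]) into the five discrete actions, along the lines of this sketch (the exact thresholds in my code may differ):

# Rough sketch of the DDPG action conversion at the top of step() (the exact binning in my code may differ):
if not isinstance(action, (int, np.integer)):
    a = float(np.clip(action, -1.0, 1.0))             # tanh output plus OU noise
    action = int(np.clip((a + 1.0) / 2.0 * 5, 0, 4))  # map [-1, 1] onto {0, ..., 4}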

After 10000 episodes of DDPG/DQN training, the agent still cannot play for more than 15 seconds. Could you point out where the problem is?
