
I've recently been learning deep reinforcement learning, and I wanted to apply what I learned to a problem from Gym using Keras.

During training I realized that it is too slow; after checking the reason, I saw that the "fit" function takes most of the time.

Running each episode takes 3-4 minutes.

Is there something wrong with what I'm doing? Or can you suggest an improvement?

import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.optimizers import Adam
from collections import deque
import random
import gym
import datetime

class DQN():
    def __init__(self, env):
        self.env = env
        self.memory = deque(maxlen=2000)

        self.gamma = 0.98
        self.epsilon = 1
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.998
        self.learning_rate = 0.001

        self.model = self.create_model()
        self.target_model = self.create_model()

    def create_model(self):
        model = keras.Sequential()
        state_shape = self.env.observation_space.shape
        model.add(keras.layers.Dense(48, activation="relu", input_dim=state_shape[0]))
        model.add(keras.layers.Dense(24, activation="relu"))
        model.add(keras.layers.Dense(self.env.action_space.n, activation="relu"))
        model.compile(loss="mse", optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, new_state, done):
        self.memory.append([state, action, reward, new_state, done])
    
    def replay(self):
        batch_size = 32
        if len(self.memory) < batch_size:
            return
        
        samples = random.sample(self.memory, batch_size)
        # states, actions, rewards, states_, dones = samples
        # targets = self.target_model.predict(states)
        # _states = [i for i in range(len(samples))]
        # targets = [[0 for j in range(self.env.action_space.n)] for i in range(len(samples))]
        _states = np.zeros((len(samples), 8))
        targets = np.zeros((len(samples), self.env.action_space.n))

        for i, sample in enumerate(samples):
            state, action, reward, new_state, done = sample
            _states[i] = state
            # target = self.target_model.predict(state)
            if done:
                targets[i][action] = reward
            else:
                Q_future = max(self.target_model.predict(new_state)[0])
                targets[i][action] = reward + Q_future*self.gamma

        self.model.fit(_states, targets, epochs=1, verbose=0)

        # for sample in samples:
        #     state, action, reward, new_state, done = sample
        #     target = self.target_model.predict(state)
        #     if done:
        #         target[0][action] = reward
        #     else:
        #         Q_future = max(self.target_model.predict(new_state)[0])
        #         target[0][action] = reward + Q_future*self.gamma

        #         start_time = datetime.datetime.now()
        #         self.model.fit(state, target, epochs=1, verbose=0)
        #         end_time = datetime.datetime.now()
        #         print("--fit--")
        #         print(end_time-start_time)

    def target_train(self):
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        for i in range(len(target_weights)):
            target_weights[i] = weights[i]
        self.target_model.set_weights(target_weights)
    
    def act(self, state):
        self.epsilon *= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.model.predict(state)[0])

    def save_model(self, fn):
        self.model.save(fn)

    def act_eval(self, state):
        return np.argmax(self.model.predict(state)[0])

    def evaluation(self, n_eval=10):
        total_reward = 0
        for _ in range(n_eval):
            cur_state = self.env.reset().reshape(1,8)
            done = False
            while not done:
                action = self.act_eval(cur_state)
                new_state, reward, done, _ = self.env.step(action)
                total_reward += reward
                cur_state = new_state.reshape(1,8)
        
        return total_reward / n_eval



def main():
    save_path = "policies/"
    env = gym.make("LunarLander-v2")
    
    trials = 2000
    trial_len = 500

    update_target_network = 500
    agent = DQN(env=env)
    for trial in range(trials):
        cur_state = env.reset().reshape(1,8)
        time_step_cntr = 0


        # check execution durations
        dur_replay = 0
        dur_step = 0
        dur_act = 0


        for step in range(trial_len):
            print("Trial {0}, step {1}".format(trial, step))
            action = agent.act(cur_state)

            new_state, reward, done, _ = env.step(action)

            new_state = new_state.reshape(1,8)
            agent.remember(cur_state, action, reward, new_state, done)

            # learn from experience
            agent.replay()

            # after "update_target_network" steps, update target network
            if time_step_cntr % update_target_network == 0:
                agent.target_train()
            time_step_cntr += 1

            cur_state = new_state
            if done:
                break
        
        # print("Duration replay {0}, duration act {1}, duration step {2}".format(dur_replay, dur_act, dur_step))
        
        # evaluate after each trial
        print("Evaluation over 10 episodes", agent.evaluation())

        
        print("Trial #{0} completed.".format(trial))
        # # print the progress
        # if trial % 100 == 0:
        #     print("Trial #{0} completed.".format(trial))

        # save the model
        # if trial % 20 == 0:
        agent.save_model(save_path + str(trial) + "__.model")

    agent.save_model(save_path + "_final" + "__.model")

if __name__ == "__main__":
    main()
1 Answer


Your problem is not in the fit call itself, but in the loop you have in the replay() method. In cases like this, try to replace Python loops with NumPy operations; batching the work makes it much faster.

Replace your replay method with the following one and let me know if it works faster for you:

def replay(self):
    batch_size = 32
    if len(self.memory) >= batch_size:
        # Draw a sample
        samples = random.sample(self.memory, batch_size)

        # Prepare the batch as arrays instead of looping over single samples
        state, action, reward, new_state, done = zip(*samples)
        state = np.concatenate(state)
        next_state = np.concatenate(new_state)
        action = np.array(action)
        reward = np.array(reward)
        done = np.array(done, dtype=np.float32)

        # One batched prediction for all 32 transitions instead of 32 single calls
        q_future = self.target_model.predict(next_state)
        q_update = reward + self.gamma * np.max(q_future, axis=1) * (1 - done)

        # Only the taken action's target changes; the other actions keep the
        # model's current estimates so they produce no error signal
        targets = self.model.predict(state)
        targets[np.arange(batch_size), action] = q_update

        # Fit the model on the whole batch in a single call
        self.model.fit(state, targets, epochs=1, verbose=0)
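
Most of the saving comes from issuing one batched predict/fit per replay instead of 32 per-sample calls: most of the cost of predicting a single 8-dimensional state is fixed per-call overhead, not the actual math. As a rough standalone check, here is a minimal timing sketch (the layer sizes mirror create_model above, the random states are only placeholders, and exact numbers depend on your hardware and TensorFlow version):

import time
import numpy as np
import tensorflow.keras as keras

# Stand-in for the question's network: 8 inputs (LunarLander-v2 observation), 4 actions
model = keras.Sequential([
    keras.layers.Dense(48, activation="relu", input_dim=8),
    keras.layers.Dense(24, activation="relu"),
    keras.layers.Dense(4),
])
model.compile(loss="mse", optimizer="adam")

batch = np.random.rand(32, 8).astype(np.float32)  # placeholder states

# 32 separate predict calls, as the original per-sample loop effectively does
start = time.time()
for row in batch:
    model.predict(row[None, :], verbose=0)
print("per-sample predicts:", time.time() - start)

# one predict call over the whole batch, as in the replay above
start = time.time()
model.predict(batch, verbose=0)
print("batched predict:", time.time() - start)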
ivallesp
  • Thanks for your answer. After posting, I've changed my code to work on numpy arrays as you suggested, but in a different way. The problem seems to be solved. – m.a.a. Dec 21 '20 at 10:52
  • Could you post the code you used, if you still have it, please? – Ness May 28 '22 at 17:57