I've recently been learning deep reinforcement learning and wanted to apply it to a problem from gym using Keras.
During training I noticed it is very slow; after checking where the time goes, I found that the call to the "fit" function takes most of it.
Running each episode takes 3-4 minutes.
Is there something wrong with what I'm doing, or can you suggest an improvement?
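Here is how I measured it (a minimal sketch of the timing I wrapped around the fit() call inside replay()):

import datetime

start_time = datetime.datetime.now()
self.model.fit(_states, targets, epochs=1, verbose=0)
end_time = datetime.datetime.now()
print("--fit--")
print(end_time - start_time)

The full code: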
import numpy as np
import tensorflow.keras as keras
from tensorflow.keras.optimizers import Adam
from collections import deque
import random
import gym

class DQN:
    def __init__(self, env):
        self.env = env
        self.memory = deque(maxlen=2000)
        self.gamma = 0.98
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.998
        self.learning_rate = 0.001
        self.model = self.create_model()
        self.target_model = self.create_model()

    def create_model(self):
        model = keras.Sequential()
        state_shape = self.env.observation_space.shape
        model.add(keras.layers.Dense(48, activation="relu", input_dim=state_shape[0]))
        model.add(keras.layers.Dense(24, activation="relu"))
        # linear output: Q-values are unbounded, so no relu on the last layer
        model.add(keras.layers.Dense(self.env.action_space.n, activation="linear"))
        model.compile(loss="mse", optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, new_state, done):
        self.memory.append([state, action, reward, new_state, done])

    def replay(self):
        batch_size = 32
        if len(self.memory) < batch_size:
            return
        samples = random.sample(self.memory, batch_size)
        # preallocate the batch of states and TD targets
        _states = np.zeros((len(samples), self.env.observation_space.shape[0]))
        targets = np.zeros((len(samples), self.env.action_space.n))
        for i, sample in enumerate(samples):
            state, action, reward, new_state, done = sample
            _states[i] = state
            # only the taken action gets a TD target; the other entries stay 0
            if done:
                targets[i][action] = reward
            else:
                Q_future = max(self.target_model.predict(new_state)[0])
                targets[i][action] = reward + Q_future * self.gamma
        # a single fit() call on the whole batch
        self.model.fit(_states, targets, epochs=1, verbose=0)

    def target_train(self):
        # hard update: copy the online weights into the target network
        self.target_model.set_weights(self.model.get_weights())

    def act(self, state):
        # epsilon-greedy with decay on every call
        self.epsilon *= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.model.predict(state)[0])

    def save_model(self, fn):
        self.model.save(fn)

    def act_eval(self, state):
        # greedy action for evaluation (no exploration)
        return np.argmax(self.model.predict(state)[0])

    def evaluation(self, n_eval=10):
        # average total reward over n_eval greedy episodes
        total_reward = 0
        for _ in range(n_eval):
            cur_state = self.env.reset().reshape(1, -1)
            done = False
            while not done:
                action = self.act_eval(cur_state)
                new_state, reward, done, _ = self.env.step(action)
                total_reward += reward
                cur_state = new_state.reshape(1, -1)
        return total_reward / n_eval

def main():
    save_path = "policies/"
    env = gym.make("LunarLander-v2")
    trials = 2000
    trial_len = 500
    update_target_network = 500  # sync the target network every N steps
    agent = DQN(env=env)
    for trial in range(trials):
        cur_state = env.reset().reshape(1, -1)
        time_step_cntr = 0
        for step in range(trial_len):
            print("Trial {0}, step {1}".format(trial, step))
            action = agent.act(cur_state)
            new_state, reward, done, _ = env.step(action)
            new_state = new_state.reshape(1, -1)
            agent.remember(cur_state, action, reward, new_state, done)
            # learn from experience at every step
            agent.replay()
            # after "update_target_network" steps, update the target network
            if time_step_cntr % update_target_network == 0:
                agent.target_train()
            time_step_cntr += 1
            cur_state = new_state
            if done:
                break
        # evaluate after every trial
        print("Evaluation over 10 episodes", agent.evaluation())
        print("Trial #{0} completed.".format(trial))
        # save the model after every trial
        agent.save_model(save_path + str(trial) + "__.model")
    agent.save_model(save_path + "_final" + "__.model")

if __name__ == "__main__":
    main()
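For reference, here is my first version of the inner loop of replay(), which called predict() and fit() once per sample; the batched version above replaced it:

for sample in samples:
    state, action, reward, new_state, done = sample
    target = self.target_model.predict(state)
    if done:
        target[0][action] = reward
    else:
        Q_future = max(self.target_model.predict(new_state)[0])
        target[0][action] = reward + Q_future * self.gamma
    self.model.fit(state, target, epochs=1, verbose=0)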