I am trying to familiarize myself with Q-learning and deep neural networks, and am currently trying to implement Playing Atari with Deep Reinforcement Learning.
To test my implementation and play around with it, I thought I'd try a simple gridworld: an N x N grid where the agent starts in the top-left corner and finishes at the bottom-right. The possible actions are: left, up, right, down.
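To make the setup concrete, the environment logic boils down to something like the following (names such as GRID, reset and step are only for illustration; in the training code further down the same logic is inlined, just with 1-based position arithmetic):

import numpy as np

GRID = 10

def reset():
    # one-hot position vector of length GRID*GRID, agent starts in the top-left corner
    state = np.zeros(GRID * GRID)
    state[0] = 1.0
    return state

def step(state, action):
    # actions: 0 = left, 1 = up, 2 = right, 3 = down, 4 = noop
    pos = state.argmax()
    row, col = divmod(pos, GRID)
    if action == 0 and col > 0:
        pos -= 1
    elif action == 1 and row > 0:
        pos -= GRID
    elif action == 2 and col < GRID - 1:
        pos += 1
    elif action == 3 and row < GRID - 1:
        pos += GRID
    new_state = np.zeros(GRID * GRID)
    new_state[pos] = 1.0
    done = (pos == GRID * GRID - 1)  # bottom-right corner reached
    return new_state, done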
Even though my implementation has become very similar to this (I hope it's a good one), it doesn't seem to learn anything. Looking at the total number of steps it needs to finish (I guess the average would be around 500 with a grid size of 10x10, but there are also very low and very high values), it seems more random than anything else to me.
I tried it with and without convolutional layers and played around with all the parameters, but to be honest I have no idea whether something in my implementation is wrong, whether it just needs to train longer (I let it train for quite a while), or something else entirely. At least it seems to converge; here is a plot of the loss value from one training session:
So what is the problem in this case?
But also, and maybe more importantly, how can I "debug" deep Q-nets? In supervised training there are training, test and validation sets, and with metrics such as precision and recall it is possible to evaluate them. What options do I have with deep Q-nets, so that next time maybe I can fix it myself?
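For instance, would it make sense to periodically play a few episodes greedily (epsilon = 0) and log the number of steps to the goal, and to track the average maximum predicted Q-value on a fixed hold-out set of states, the way the Atari paper does? A rough sketch of what I have in mind, in terms of the readout network and replay memory D from the code below and the reset/step helpers from above:

# fixed set of states, collected once (e.g. from the first transitions of random play);
# the DQN paper tracks the average max-Q on such a hold-out set as a training metric
holdout_states = [d[0] for d in list(D)[:200]]

def greedy_episode_steps():
    # play one episode greedily (no exploration) and return the step count
    state = reset()
    frame = state.reshape(10, 10)
    s = np.stack((frame, frame, frame, frame), axis=2)
    done, steps = False, 0
    while not done and steps < MAXSTEPS:
        q = readout.eval(feed_dict={x: [s]})[0]
        state, done = step(state, np.argmax(q))
        s = np.append(s[:, :, 1:], state.reshape(10, 10, 1), axis=2)
        steps += 1
    return steps

def average_max_q():
    # average predicted max-Q over the fixed hold-out states
    q_values = readout.eval(feed_dict={x: holdout_states})
    return np.mean(np.max(q_values, axis=1))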
Finally, here is the code:
This is the network:
import tensorflow as tf

ACTIONS = 5

# Inputs: 4 stacked 10x10 grid frames, the Q-learning targets and the one-hot chosen actions
x = tf.placeholder('float', shape=[None, 10, 10, 4])
y = tf.placeholder('float', shape=[None])
a = tf.placeholder('float', shape=[None, ACTIONS])
# Layer 1 Conv1 - input
with tf.name_scope('Layer1'):
    W_conv1 = weight_variable([8, 8, 4, 8])
    b_conv1 = bias_variable([8])
    h_conv1 = tf.nn.relu(conv2d(x, W_conv1, 5) + b_conv1)
# Layer 2 Conv2 - hidden1
with tf.name_scope('Layer2'):
    W_conv2 = weight_variable([2, 2, 8, 8])
    b_conv2 = bias_variable([8])
    h_conv2 = tf.nn.relu(conv2d(h_conv1, W_conv2, 1) + b_conv2)
    h_conv2_max_pool = max_pool_2x2(h_conv2)
# Layer 3 fc1 - hidden 2
with tf.name_scope('Layer3'):
    W_fc1 = weight_variable([8, 32])
    b_fc1 = bias_variable([32])
    # with 'SAME' padding the feature map is 1x1x8 at this point, hence the reshape to 8
    h_conv2_flat = tf.reshape(h_conv2_max_pool, [-1, 8])
    h_fc1 = tf.nn.relu(tf.matmul(h_conv2_flat, W_fc1) + b_fc1)
# Layer 4 fc2 - readout
with tf.name_scope('Layer4'):
    W_fc2 = weight_variable([32, ACTIONS])
    b_fc2 = bias_variable([ACTIONS])
    readout = tf.matmul(h_fc1, W_fc2) + b_fc2
# Training: Q(s, a) of the chosen action vs. the Q-learning target y
with tf.name_scope('training'):
    readout_action = tf.reduce_sum(tf.mul(readout, a), reduction_indices=1)
    loss = tf.reduce_mean(tf.square(y - readout_action))
    train = tf.train.AdamOptimizer(1e-6).minimize(loss)
    loss_summ = tf.scalar_summary('loss', loss)
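The helpers weight_variable, bias_variable, conv2d and max_pool_2x2 are essentially the ones from the TensorFlow tutorials, except that conv2d takes the stride as a parameter; this is roughly what they look like (the exact initialization values shouldn't matter much here):

def weight_variable(shape):
    return tf.Variable(tf.truncated_normal(shape, stddev=0.01))

def bias_variable(shape):
    return tf.Variable(tf.constant(0.01, shape=shape))

def conv2d(x, W, stride):
    # 'SAME' padding, so e.g. a 10x10 input with stride 5 gives a 2x2 feature map
    return tf.nn.conv2d(x, W, strides=[1, stride, stride, 1], padding='SAME')

def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')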
And here is the training loop:
import random
from collections import deque

import numpy as np

# 0 => left
# 1 => up
# 2 => right
# 3 => down
# 4 => noop
ACTIONS = 5
GAMMA = 0.95
BATCH = 50
TRANSITIONS = 2000   # max size of the replay memory
OBSERVATIONS = 1000  # transitions to collect before training starts
MAXSTEPS = 1000

# an InteractiveSession is assumed so that .eval() / .run() below work
sess = tf.InteractiveSession()
sess.run(tf.initialize_all_variables())

D = deque()
epsilon = 1
average = 0
for episode in xrange(1000):
    step_count = 0
    game_ended = False
    # start state: one-hot position vector, agent in the top-left corner
    state = np.array([0.0]*100, float).reshape(100)
    state[0] = 1
    rsh_state = state.reshape(10, 10)
    s = np.stack((rsh_state, rsh_state, rsh_state, rsh_state), axis=2)
    while step_count < MAXSTEPS and not game_ended:
        reward = 0
        step_count += 1
        # epsilon-greedy action selection
        read = readout.eval(feed_dict={x: [s]})[0]
        act = np.zeros(ACTIONS)
        action = random.randint(0, 4)
        if len(D) > OBSERVATIONS and random.random() > epsilon:
            action = np.argmax(read)
        act[action] = 1
        # play the game
        pos_idx = state.argmax(axis=0)
        pos = pos_idx + 1
        state[pos_idx] = 0
        if action == 0 and pos % 10 != 1:    # left
            state[pos_idx-1] = 1
        elif action == 1 and pos > 10:       # up
            state[pos_idx-10] = 1
        elif action == 2 and pos % 10 != 0:  # right
            state[pos_idx+1] = 1
        elif action == 3 and pos < 91:       # down
            state[pos_idx+10] = 1
        else:                                # noop
            state[pos_idx] = 1
        # rewards
        if state.argmax(axis=0) == pos_idx and reward > 0:
            reward -= 0.0001
        if step_count == MAXSTEPS:
            reward -= 100
        elif state[99] == 1:  # goal reached, episode finished
            reward += 100
            game_ended = True
        else:
            reward -= 1
        # store the transition in the replay memory
        s_old = np.copy(s)
        s = np.append(s[:, :, 1:], state.reshape(10, 10, 1), axis=2)
        D.append((s_old, act, reward, s))
        if len(D) > TRANSITIONS:
            D.popleft()
        # train on a random minibatch from the replay memory
        if len(D) > OBSERVATIONS:
            minibatch = random.sample(D, BATCH)
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]
            readout_j1_batch = readout.eval(feed_dict={x: s_j1_batch})
            y_batch = []
            for i in xrange(0, len(minibatch)):
                # Q-learning target: r + GAMMA * max_a' Q(s', a')
                y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))
            train.run(feed_dict={x: s_j_batch, y: y_batch, a: a_batch})
    if epsilon > 0.05:
        epsilon -= 0.01
I appreciate any help or ideas you may have!