So I'm trying to create a very simple neural network with no hidden layers: just an input layer (3 elements) and a linear output layer (2 elements).
I then define some variables to store the configuration and the weights:
import numpy as np
import tensorflow as tf

# some configs
input_size = 3
action_size = 2
min_delta, max_delta = -1, 1
learning_rate_op = 0.5
w = {}  # weights
I then create the training network:
# training network
with tf.variable_scope('prediction'):
    state_tensor = tf.placeholder('float32', [None, input_size], name='state_tensor')
    w['q_w'] = tf.get_variable('Matrix', [state_tensor.get_shape().as_list()[1], action_size],
                               tf.float32, tf.random_normal_initializer(stddev=0.02))
    w['q_b'] = tf.get_variable('bias', [action_size], initializer=tf.constant_initializer(0))
    q = tf.nn.bias_add(tf.matmul(state_tensor, w['q_w']), w['q_b'])
I define the optimizer to minimize the squared difference between the target values and the output of the training network:
# weight optimizer
with tf.variable_scope('optimizer'):
    # tensor to hold the target values
    # eg, target_q_tensor=[10;11]
    target_q_tensor = tf.placeholder('float32', [None], name='target_q_tensor')
    # tensors for the actions, the one-hot action matrix and the value deltas
    # eg, action_tensor=[0;1], action_one_hot=[[1,0];[0,1]], q_acted=[Q_0,Q_1]
    action_tensor = tf.placeholder('int64', [None], name='action_tensor')
    action_one_hot = tf.one_hot(action_tensor, action_size, 1.0, 0.0, name='action_one_hot')
    q_acted = tf.reduce_sum(q * action_one_hot, reduction_indices=1, name='q_acted')
    # delta
    delta = target_q_tensor - q_acted
    clipped_delta = tf.clip_by_value(delta, min_delta, max_delta, name='clipped_delta')
    # error function
    loss = tf.reduce_mean(tf.square(clipped_delta), name='loss')
    # optimizer
    # optim = tf.train.AdamOptimizer(learning_rate_op).minimize(loss)
    optim = tf.train.GradientDescentOptimizer(learning_rate_op).minimize(loss)
Finally, I run some values through the network in an infinite loop. However, the weights are never updated; they keep the random values they were initialized with:
with tf.Session() as sess:
    tf.initialize_all_variables().run()

    s_t = np.array([[1,0,0],[1,0,1],[1,1,0],[1,0,0]])
    action = np.array([0, 1, 0, 1])
    target_q = np.array([10, -11, -12, 13])

    counter = 0
    while True:
        if counter % 10000 == 0:
            q_values = q.eval({state_tensor: s_t})
            for i in range(len(s_t)):
                print("q", q_values[i])
            print("w", sess.run(w['q_w']), '\nb', sess.run(w['q_b']))
        sess.run(optim, {target_q_tensor: target_q, action_tensor: action, state_tensor: s_t})
        counter += 1
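One way to see whether the optimizer has anything to work with at all is to evaluate the raw gradients with tf.gradients. This is a quick diagnostic I sketched separately (it reuses the graph plus the s_t, action and target_q arrays from above, run without the training loop):

# diagnostic sketch: inspect the gradients the optimizer would apply
grads = tf.gradients(loss, [w['q_w'], w['q_b']])
with tf.Session() as sess:
    tf.initialize_all_variables().run()
    g_w, g_b = sess.run(grads, {target_q_tensor: target_q,
                                action_tensor: action,
                                state_tensor: s_t})
    # if these print as all zeros, the weights can never move
    print("grad w\n", g_w)
    print("grad b\n", g_b)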
I took the code from a working DQN implementation, so I figure I'm doing something blatantly wrong. The network should converge to:
        #   0  |   1
####################
1,0,0   #  10  |  13
1,0,1   #   x  | -11
1,1,0   # -12  |   x
But they do not change at all. Any pointers?
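Just to rule out the targets being unreachable, here is a quick numpy least-squares sanity check (separate from the network, same states/actions/targets as above) confirming that a single linear layer can represent that table exactly:

import numpy as np

# sanity check: can a single linear layer (weights + bias per action) hit the targets?
s = np.array([[1, 0, 0], [1, 0, 1], [1, 1, 0], [1, 0, 0]], dtype=float)
a = np.array([0, 1, 0, 1])
t = np.array([10, -11, -12, 13], dtype=float)
s_aug = np.hstack([s, np.ones((len(s), 1))])  # extra column acts as the bias
for act in (0, 1):
    sol, _, _, _ = np.linalg.lstsq(s_aug[a == act], t[a == act], rcond=None)
    print("action", act, "fit", s_aug[a == act].dot(sol), "targets", t[a == act])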
It turns out that clipping the delta (and hence the loss) is what causes the issue. However, I don't understand why...
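My best guess so far: tf.clip_by_value has zero gradient wherever its input lies outside [min_delta, max_delta]. The initial Q-values are close to zero (weights initialized with stddev 0.02, zero bias) while the targets are around ±10, so every delta starts far outside [-1, 1], the clipped loss is flat, and gradient descent has nothing to follow. What DQN implementations seem to do instead is keep the quadratic loss inside the window and switch to a linear penalty outside it (a Huber loss), so the gradient is capped but never vanishes. A minimal sketch of that variant (my own naming, reusing the graph above, not the original code):

# Huber-style loss: quadratic for |delta| <= max_delta, linear outside,
# so the gradient is bounded but never exactly zero
delta = target_q_tensor - q_acted
quadratic_part = tf.clip_by_value(tf.abs(delta), 0.0, max_delta)
linear_part = tf.abs(delta) - quadratic_part
loss = tf.reduce_mean(0.5 * tf.square(quadratic_part) + linear_part, name='loss')
optim = tf.train.GradientDescentOptimizer(learning_rate_op).minimize(loss)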