
I am trying to implement Asynchronous Methods for Deep Reinforcement Learning, and one of the steps requires accumulating the gradient over several steps and then applying it. What is the best way to achieve this in TensorFlow? I got as far as accumulating the gradient, but I don't think this is the fastest way to achieve it (lots of transfers from TensorFlow to Python and back). Any suggestions are welcome. This is the code of a toy NN. It does not model or compute anything; it just exercises the operations that I want to use.

import numpy as np
import tensorflow as tf

from model import *


graph = tf.Graph()

with graph.as_default():

    state = tf.placeholder(tf.float32, shape=[None, 80,80,1])

    with tf.variable_scope('layer1'):
        W = weight_variable([8, 8, 1, 32])
        variable_summaries(W, "layer1/W")
        b = bias_variable([32])
        variable_summaries(b, "layer1/b")
        h = conv2d(state, W, 4) + b
        activation = tf.nn.relu(h)
        pool1 = max_pool_2x2(activation)

    print(pool1.get_shape())
    pool1 = tf.reshape(pool1, [-1, 3200])

    with tf.variable_scope('readout'):
        W = weight_variable([3200, 3])
        b = bias_variable([3])
        logits = tf.matmul(pool1, W) + b
        variable_summaries(logits, "y")

    action_indexes = tf.placeholder(tf.int32, shape=[None], name="action_indexes")

    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, action_indexes)

    starter_learning_rate = 1e-6

    global_step = tf.Variable(0, trainable=False)

    # decay every 10000 steps with a base of 0.96:
    learning_rate = tf.train.exponential_decay(starter_learning_rate,
        global_step,
        10000, 0.96, staircase=True)

    optimizer = tf.train.RMSPropOptimizer(learning_rate)

    gradients_and_variables = optimizer.compute_gradients(loss, tf.trainable_variables())

    discounted_values = tf.placeholder(tf.float32, shape=[None, 1])

with tf.Session(graph=graph) as s:

    for v in tf.trainable_variables():
        print(v.name, v.dtype, v.get_shape())

    s.run(tf.initialize_all_variables())

    feed_dict= {
        state : np.zeros([1, 80, 80, 1]),
        action_indexes: [1],
    }


    var_to_grad = dict((var.name, grad) for grad, var in gradients_and_variables)
    keys = sorted(var_to_grad.keys())
    print(keys)

    name_to_var = dict((var.name, var) for _, var in gradients_and_variables)

    for i in range(10):

        gradients = s.run([ var_to_grad[k] for k in keys], feed_dict=feed_dict)

        for k,v in zip(keys, gradients):
            var_to_grad[k] += v

    for k in keys:
        print(var_to_grad[k])

    s.run( optimizer.apply_gradients( (g, name_to_var[v]) for v, g in var_to_grad.items()), feed_dict=feed_dict)

Updated code after @YaroslavBulatov's suggestion:

import numpy as np
import tensorflow as tf

from model import *


graph = tf.Graph()

with graph.as_default():

    minibatch = 32
    state = tf.placeholder(tf.float32, shape=[minibatch, 80,80,1], name="input")

    with tf.variable_scope('layer1'):
        W = weight_variable([8, 8, 1, 32])
        variable_summaries(W, "layer1/W")
        b = bias_variable([32])
        variable_summaries(b, "layer1/b")
        h = conv2d(state, W, 4) + b
        activation = tf.nn.relu(h)
        pool1 = max_pool_2x2(activation)

    print(pool1.get_shape())
    pool1 = tf.reshape(pool1, [-1, 3200])

    with tf.variable_scope('readout'):
        W = weight_variable([3200, 3])
        b = bias_variable([3])
        logits = tf.matmul(pool1, W) + b
        variable_summaries(logits, "y")

    action_indexes = tf.placeholder(tf.int32, shape=[minibatch], name="action_indexes")

    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, action_indexes)

    starter_learning_rate = 1e-6

    global_step = tf.Variable(0, trainable=False)

    # decay every 10000 steps with a base of 0.96:
    learning_rate = tf.train.exponential_decay(starter_learning_rate,
        global_step,
        10000, 0.96, staircase=True)

    optimizer = tf.train.RMSPropOptimizer(learning_rate)

    trainable_variables = tf.trainable_variables()
    varname_to_var = dict( (v.name, v) for v in trainable_variables )
    keys = sorted(varname_to_var.keys())

    gradients_and_variables = optimizer.compute_gradients(loss, [ varname_to_var[k] for k in keys])

    var_to_grad = dict((var.name, grad) for grad, var in gradients_and_variables)

    name_to_var = dict((var.name, var) for _, var in gradients_and_variables)

    # save the gradients in memory
    var_to_ref_grad = {}
    for k in keys:
        grad = var_to_grad[k]
        print(k, grad.get_shape())
        ref = tf.Variable(tf.zeros_like(grad))
        ref = ref.assign_add(grad)
        var_to_ref_grad[k] = ref

    discounted_values = tf.placeholder(tf.float32, shape=[None, 1], name='discounted_values')

    # control when to apply gradients
    compute_gradients_flag = tf.placeholder(tf.int32, name="compute_gradients")
    def fn1():
        var_grad_list = []
        for k in keys:
            grad = var_to_ref_grad[k]
            var  = varname_to_var[k]
            var_grad_list.append((grad,var))

        optimizer.apply_gradients(var_grad_list)
        return tf.no_op()

    fn2 = lambda : tf.no_op()

    last_op = tf.cond(tf.equal(compute_gradients_flag, 1), fn1, fn2)

with tf.Session(graph=graph) as s:

    feed_dict= {
        state : np.zeros([minibatch, 80, 80, 1]),
        action_indexes: np.zeros([minibatch], dtype=np.int32),
        compute_gradients_flag: 0,
    }

    s.run(tf.initialize_all_variables())

    for i in range(10):

        # accumulate gradients
        s.run(last_op, feed_dict=feed_dict)
  • You could keep everything in TF by running `assign` ops that save gradients in Variables instead of fetching the values, and then doing `assign_add` to accumulate – Yaroslav Bulatov Jun 08 '16 at 21:20
  • @YaroslavBulatov Do you think TF will add more interfaces for implementations like this? It would be nice if we can use TF for reinforcement-learning as well. – Sung Kim Jun 09 '16 at 02:11
  • @SungKim There's an immediate execution interface [in the works](https://github.com/tensorflow/tensorflow/pull/2595) so you could use standard Python constructs while keeping data on GPU – Yaroslav Bulatov Jun 09 '16 at 02:13
  • @YaroslavBulatov so I updated the code as you suggested, but now the interesting bit is that `tf.initialize_all_variables()` requires running the operations in the graph to initialize the variables. Maybe I am doing it wrong and would not have to use initialize_all_variables, just initializing those variables manually. – fabrizioM Jun 09 '16 at 19:27
  • Hm, I don't understand the problem, `initialize_all_variables` should just run the initializers to set the initial value for the vars, what does it run that it shouldn't? – Yaroslav Bulatov Jun 09 '16 at 22:01
  • I think you should add trainable=False in this line ref = tf.Variable(tf.zeros_like(grad)). If not, the trainable_variables you called before will have the initialisation problem. – LI Xuhong Dec 19 '16 at 18:53
  • But it seems that this code doesn't reset the accumulated gradients to 0 after applying them to variables. Am I wrong? Is there any new updated code for sharing? Thanks. – LI Xuhong Dec 19 '16 at 18:55

2 Answers


You don't really have to manually accumulate gradients. You can have TensorFlow accumulate them for you by applying the rollout update as a batch.

s_list = list_of_states_visited
a_list = list_of_actions_taken
R_list = list_of_value_targets

# The loss is reduced over the whole batch, so this single run call computes
# the summed gradients for the entire rollout and applies them in one update.
sess.run(local_net.update, feed_dict={
    local_net.input: s_list,
    local_net.a: a_list,
    local_net.R: R_list
})
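
For reference, here is a minimal sketch of what such a `local_net` could look like. The class, its attribute names (`input`, `a`, `R`, `update`), and the return-weighted loss are assumptions made to match the snippet above, not part of the original answer. The key point is that the loss is defined over the whole batch, so a single `update` step already sums the gradients across the rollout:

class LocalNet(object):
    # Hypothetical policy network; attribute names chosen to match the snippet above.
    def __init__(self, num_actions=3):
        self.input = tf.placeholder(tf.float32, shape=[None, 80, 80, 1], name="states")
        self.a = tf.placeholder(tf.int32, shape=[None], name="actions")
        self.R = tf.placeholder(tf.float32, shape=[None], name="value_targets")

        flat = tf.reshape(self.input, [-1, 80 * 80])
        W = tf.Variable(tf.truncated_normal([80 * 80, num_actions], stddev=0.01))
        b = tf.Variable(tf.zeros([num_actions]))
        logits = tf.matmul(flat, W) + b

        # Per-example cross entropy, weighted by the value targets (an assumption,
        # for a policy-gradient-style loss) and summed over the rollout batch.
        ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, self.a)
        self.loss = tf.reduce_sum(ce * self.R)

        # One minimize step over the batch: gradients for the whole rollout are
        # summed inside TensorFlow, with no Python-side accumulation needed.
        self.update = tf.train.RMSPropOptimizer(1e-6).minimize(self.loss)

local_net = LocalNet()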

Something like this might work to create ops for accumulating gradients, resetting the accumulated gradients, and applying the accumulated gradients (untested!):

def build_gradient_accumulators(optimizer, gradients_and_variables):
    accum_grads_and_vars = []
    accumulators = []
    resetters = []

    for grad, var in gradients_and_variables:
        # Non-trainable buffer that keeps the running sum of this gradient.
        accum = tf.Variable(tf.zeros_like(grad), trainable=False)
        accumulators.append(accum.assign_add(grad))
        accum_grads_and_vars.append((accum, var))
        resetters.append(accum.assign(tf.zeros_like(accum)))

    reset_op = tf.group(*resetters)
    accum_op = tf.group(*accumulators)
    apply_op = optimizer.apply_gradients(accum_grads_and_vars)
    return reset_op, accum_op, apply_op
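
A possible way to wire it up (this usage sketch is mine, not part of the original answer, and reuses `loss`, `optimizer`, and `feed_dict` from the question's graph): accumulate for a number of steps, apply the summed gradients once, then reset the buffers.

# Build these ops inside the same `with graph.as_default():` block as the model.
gradients_and_variables = optimizer.compute_gradients(loss, tf.trainable_variables())
reset_op, accum_op, apply_op = build_gradient_accumulators(optimizer, gradients_and_variables)

with tf.Session(graph=graph) as s:
    s.run(tf.initialize_all_variables())
    for step in range(100):
        # Add this step's gradients to the accumulator variables.
        s.run(accum_op, feed_dict=feed_dict)
        if (step + 1) % 10 == 0:
            s.run(apply_op)   # apply the accumulated gradients once
            s.run(reset_op)   # clear the accumulators for the next window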