I am experimenting with the learning process of a neural network by inspecting its gradients and loss values.
My assumption was that two models would go through an identical learning process (same loss values, same gradient values) if their learning conditions (model structure, weight initialization, batch input stream) are identical.
So, under the same initialization, I expected the gradients and losses computed for the two models to be the same, but they turned out to differ.
The code I ran is the most basic training loop, shown below.
(To remove the randomness introduced by dropout, I simply disabled the dropout layers.)
import tensorflow as tf
import numpy as np
#from pprint import pprint
from tensorflow.examples.tutorials.mnist import input_data
batch_size = 128
test_size = 256
def init_weights(shape):
    return tf.Variable(tf.random_normal(shape, stddev=0.01, seed=1))
def model(X, w, w2, w3, w4, w_o, p_keep_conv, p_keep_hidden):
    l1a = tf.nn.relu(tf.nn.conv2d(X, w,                        # l1a shape=(?, 28, 28, 32)
                                  strides=[1, 1, 1, 1], padding='SAME'))
    l1 = tf.nn.max_pool(l1a, ksize=[1, 2, 2, 1],               # l1 shape=(?, 14, 14, 32)
                        strides=[1, 2, 2, 1], padding='SAME')
    #l1 = tf.nn.dropout(l1, p_keep_conv)

    l2a = tf.nn.relu(tf.nn.conv2d(l1, w2,                      # l2a shape=(?, 14, 14, 64)
                                  strides=[1, 1, 1, 1], padding='SAME'))
    l2 = tf.nn.max_pool(l2a, ksize=[1, 2, 2, 1],               # l2 shape=(?, 7, 7, 64)
                        strides=[1, 2, 2, 1], padding='SAME')
    #l2 = tf.nn.dropout(l2, p_keep_conv)

    l3a = tf.nn.relu(tf.nn.conv2d(l2, w3,                      # l3a shape=(?, 7, 7, 128)
                                  strides=[1, 1, 1, 1], padding='SAME'))
    l3 = tf.nn.max_pool(l3a, ksize=[1, 2, 2, 1],               # l3 shape=(?, 4, 4, 128)
                        strides=[1, 2, 2, 1], padding='SAME')
    l3 = tf.reshape(l3, [-1, w4.get_shape().as_list()[0]])     # reshape to (?, 2048)
    #l3 = tf.nn.dropout(l3, p_keep_conv)

    l4 = tf.nn.relu(tf.matmul(l3, w4))
    #l4 = tf.nn.dropout(l4, p_keep_hidden)

    pyx = tf.matmul(l4, w_o)
    return pyx
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
trX, trY, teX, teY = mnist.train.images, mnist.train.labels, mnist.test.images, mnist.test.labels
trX = trX.reshape(-1, 28, 28, 1) # 28x28x1 input img
teX = teX.reshape(-1, 28, 28, 1) # 28x28x1 input img
X0 = tf.placeholder("float", [None, 28, 28, 1], name='input_0')
X1 = tf.placeholder("float", [None, 28, 28, 1], name='input_1')
Y = tf.placeholder("float", [None, 10])
with tf.variable_scope('00'):
    w0_1 = init_weights([3, 3, 1, 32])       # 3x3x1 conv, 32 outputs
    w0_2 = init_weights([3, 3, 32, 64])      # 3x3x32 conv, 64 outputs
    w0_3 = init_weights([3, 3, 64, 128])     # 3x3x64 conv, 128 outputs
    w0_4 = init_weights([128 * 4 * 4, 625])  # FC 128 * 4 * 4 inputs, 625 outputs
    w0_o = init_weights([625, 10])           # FC 625 inputs, 10 outputs (labels)
with tf.variable_scope('01'):
    w1_1 = init_weights([3, 3, 1, 32])       # 3x3x1 conv, 32 outputs
    w1_2 = init_weights([3, 3, 32, 64])      # 3x3x32 conv, 64 outputs
    w1_3 = init_weights([3, 3, 64, 128])     # 3x3x64 conv, 128 outputs
    w1_4 = init_weights([128 * 4 * 4, 625])  # FC 128 * 4 * 4 inputs, 625 outputs
    w1_o = init_weights([625, 10])           # FC 625 inputs, 10 outputs (labels)
p_keep_conv = tf.placeholder("float")
p_keep_hidden = tf.placeholder("float")
py_x0 = model(X0, w0_1, w0_2, w0_3, w0_4, w0_o, p_keep_conv, p_keep_hidden)
py_x1 = model(X1, w1_1, w1_2, w1_3, w1_4, w1_o, p_keep_conv, p_keep_hidden)
cost0 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=py_x0, labels=Y))
cost1 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=py_x1, labels=Y))
model_0_vars = [tensor for tensor in tf.trainable_variables() if '00/' in tensor.name]
model_1_vars = [tensor for tensor in tf.trainable_variables() if '01/' in tensor.name]
#pprint(model_0_vars)
#pprint(model_1_vars)
train_op_model_0_grad = tf.train.RMSPropOptimizer(0.001, 0.9).compute_gradients(cost0, var_list = model_0_vars)
train_op_model_0_apply= tf.train.RMSPropOptimizer(0.001, 0.9).apply_gradients(train_op_model_0_grad)
train_op_model_1_grad = tf.train.RMSPropOptimizer(0.001, 0.9).compute_gradients(cost1, var_list = model_1_vars)
train_op_model_1_apply= tf.train.RMSPropOptimizer(0.001, 0.9).apply_gradients(train_op_model_1_grad)
predict_op_0 = tf.argmax(py_x0, 1)
predict_op_1 = tf.argmax(py_x1, 1)
# Launch the graph in a session
with tf.Session() as sess:
    # you need to initialize all variables
    tf.initialize_all_variables().run()

    step = 0
    for i in range(2):
        #check_weight_tmp = sess.run([model_0_vars, model_1_vars])
        #check_weight.append(check_weight_tmp)
        training_batch = zip(range(0, len(trX), batch_size),
                             range(batch_size, len(trX) + 1, batch_size))
        for start, end in training_batch:
            step += 1
            grad0, grad1 = \
                sess.run([train_op_model_0_grad, train_op_model_1_grad],
                         feed_dict={X0: trX[start:end], X1: trX[start:end], Y: trY[start:end],
                                    p_keep_conv: 0.8, p_keep_hidden: 0.5})
            cost0_r, cost1_r = \
                sess.run([cost0, cost1],
                         feed_dict={X0: trX[start:end], X1: trX[start:end], Y: trY[start:end],
                                    p_keep_conv: 0.8, p_keep_hidden: 0.5})
            sess.run([train_op_model_0_apply, train_op_model_1_apply],
                     feed_dict={X0: trX[start:end], X1: trX[start:end], Y: trY[start:end],
                                p_keep_conv: 0.8, p_keep_hidden: 0.5})
            print("STEP: {0:5d}, {1:3.6f}, {2:3.6f}, {3:3.6f}".format(step, cost0_r, cost1_r, cost0_r - cost1_r))
        #test_indices = np.arange(len(teX))  # Get A Test Batch
        #np.random.shuffle(test_indices)
        #test_indices = test_indices[0:test_size]
        #print(i, np.mean(np.argmax(teY[test_indices], axis=1) ==
        #                 sess.run(predict_op_0,
        #                          feed_dict={X0: teX[test_indices],
        #                                     Y: teY[test_indices],
        #                                     p_keep_conv: 1.0,
        #                                     p_keep_hidden: 1.0})))
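As a sanity check on the premise that both scopes really start from identical weights (the commented-out check_weight lines above were a first attempt at this), one could compare the two variable sets right after initialization. Below is a minimal sketch of such a check, not part of the run above; it assumes it is placed inside the session immediately after the initializer and reuses model_0_vars / model_1_vars:

# Sketch (assumption, not part of the run above): verify that the '00' and '01'
# scopes were initialized to identical values, right after the initializer runs.
w0_vals, w1_vals = sess.run([model_0_vars, model_1_vars])
for var0, var1, a, b in zip(model_0_vars, model_1_vars, w0_vals, w1_vals):
    same = np.array_equal(a, b)
    print(var0.name, "vs", var1.name, "->", "identical" if same else "DIFFERENT")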
A noticeable difference between the two losses appears from around step 200.
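To narrow down where the two runs first diverge, one could also compare the raw gradients returned by compute_gradients for the two models at every step. This is a sketch of such a check, meant to sit inside the training loop right after the gradient fetch; grad0 and grad1 are the lists already fetched there, and the variable names are taken from model_0_vars:

# Sketch: elementwise comparison of the two models' gradients at the current step.
# grad0 / grad1 are lists of (gradient_value, variable_value) pairs from sess.run.
for (g0, _), (g1, _), var in zip(grad0, grad1, model_0_vars):
    diff = np.max(np.abs(g0 - g1))
    print("step {:5d}  {:40s}  max|grad0 - grad1| = {:.3e}".format(step, var.name, diff))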
I suspect I have missed something, but I cannot figure out what it is even after looking at the code for a long time.
I would appreciate it if you could point out what I am missing.