We ran into a strange problem: our relatively simple model converges on the CPU, but not on the server with a GPU. No modifications whatsoever were made to the code between the two runs, nor does the code contain any explicit conditional statements that change the workflow on different architectures.
What could possibly be the reason? How can this TensorFlow model converge on a CPU but not on a GPU? In the likely event that the code is too long for you to read, we would still be grateful for general speculation and hints.
#!/usr/bin/python
from __future__ import print_function
import tensorflow as tf
import os
import numpy as np
import input_data # copy from tensorflow/examples/tutorials/mnist/input_data.py
# wget https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/examples/tutorials/mnist/input_data.py if needed
# --- run configuration -------------------------------------------------------
force_gpu = False  # when True, the image and accuracy summary ops are not created
debug = True  # histogram_summary ...
# _cpu='/cpu:0'
default_learning_rate = 0.001
tensorboard_logs = '/tmp/tensorboard-logs/'
# $(sleep 5; open http://0.0.0.0:6006) & tensorboard --debug --logdir=/tmp/tensorboard-logs/

# --- data --------------------------------------------------------------------
# MNIST data sets read from /tmp/data/ with one-hot encoded labels.
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
class net(object):
    """Small TensorFlow graph builder and trainer for 28x28 MNIST inputs.

    A model function (for example ``alex`` or ``dense`` in this file) receives
    the instance and stacks layers on it through ``reshape`` / ``conv`` /
    ``dense`` / ``batchnorm``.  ``classifier`` then attaches the loss, the Adam
    optimizer and the accuracy op; ``train`` and ``test`` drive the session.
    """

    def __init__(self, model, data, name=0, learning_rate=default_learning_rate, batch_size=64):
        """Open a session and build the graph described by ``model``.

        model: callable taking this instance and adding layers to it.
        data: dataset object; ``data.train.next_batch`` and
            ``data.test.images`` / ``data.test.labels`` are read later.
        name: accepted for backward compatibility; not used.
        learning_rate: step size handed to the Adam optimizer.
        batch_size: number of training samples fetched per step.
        """
        self.session = tf.Session()
        self.model = model
        self.data = data  # fed into self.x / self.y batch by batch in train()
        self.batch_size = batch_size
        self.layers = []
        self.last_width = self.input_width(data)
        self.learning_rate = learning_rate
        self.generate_model(model)

    def generate_model(self, model, name=''):
        """Create the placeholders, run ``model`` on this instance, add the classifier.

        Returns ``self`` when ``model`` is falsy (nothing is built), otherwise
        ``None``.  ``name`` is accepted for compatibility and not used.
        """
        if not model:
            return self
        self.cost = None  # set by classifier(); lets us detect a model that built its own
        with tf.name_scope('state'):
            self.keep_prob = tf.placeholder(tf.float32)  # 1 for testing! else 1 - dropout
            self.train_phase = tf.placeholder(tf.bool, name='train_phase')
            # Never set or feed this by hand: it is passed to minimize() in
            # classifier(), which increments it once per optimizer step.
            self.global_step = tf.Variable(0, trainable=False)
        with tf.name_scope('data'):
            n_input = 28 * 28
            n_classes = 10
            self.x = x = self.input = tf.placeholder(tf.float32, [None, n_input])
            self.last_layer = x
            # Give conv()/dense() a shape to report even if the model never calls reshape().
            self.last_shape = x.get_shape()
            self.y = self.target = tf.placeholder(tf.float32, [None, n_classes])
            if not force_gpu:
                tf.image_summary("mnist", tf.reshape(self.x, [-1, 28, 28, 1], "mnist_images"))
        with tf.name_scope('model'):
            model(self)
        # Always end up with a loss/optimizer.  The earlier width-only check
        # skipped this step for models that already ended in a 10-wide layer,
        # leaving train() without self.cost / self.optimizer.
        if self.cost is None:
            self.classifier(n_classes)

    def input_width(self, data):
        """Return the flattened input width (fixed at 28*28; ``data`` is ignored)."""
        return 28 * 28

    def add(self, layer):
        """Register ``layer`` as the newest layer and remember its static shape."""
        self.layers.append(layer)
        self.last_layer = layer
        self.last_shape = layer.get_shape()

    def reshape(self, shape):
        """Reshape the current layer to ``shape``; its last entry becomes ``last_width``."""
        self.last_layer = tf.reshape(self.last_layer, shape)
        self.last_shape = shape
        self.last_width = shape[-1]

    def batchnorm(self):
        """Append batch normalisation that switches behaviour on ``train_phase``.

        Two batch_norm ops are built over the same scope (the second with
        ``reuse=True``); ``tf.cond`` picks the training or the inference one.
        """
        from tensorflow.contrib.layers.python.layers import batch_norm as batch_norm
        with tf.name_scope('batchnorm') as scope:
            incoming = self.last_layer
            train_op = batch_norm(incoming, is_training=True, center=False, updates_collections=None, scope=scope)
            test_op = batch_norm(incoming, is_training=False, updates_collections=None, center=False, scope=scope, reuse=True)
            self.add(tf.cond(self.train_phase, lambda: train_op, lambda: test_op))

    # Fully connected layer
    def dense(self, hidden=1024, depth=1, act=tf.nn.tanh, dropout=False, parent=-1):
        """Append ``depth`` fully connected layers of ``hidden`` units each.

        hidden: number of output units per layer.
        depth: how many identical layers to stack.
        act: activation applied to each layer's output; falsy for a linear layer.
        dropout: when truthy, apply dropout driven by the ``keep_prob`` placeholder.
        parent: tensor to connect to; -1 (default) means the current last layer.
        """
        if isinstance(parent, int) and parent == -1:
            parent = self.last_layer
        # Use the shape of the tensor we actually connect to, so an explicit
        # `parent` is flattened according to its own dimensions.
        shape = parent.get_shape()
        if shape and len(shape) > 2:
            flat_width = 1
            for dim in shape.as_list()[1:]:  # every dimension except the batch one
                flat_width *= dim
            self.last_width = int(flat_width)
            print("reshapeing ", shape, "to", self.last_width)
            parent = tf.reshape(parent, [-1, self.last_width])
        width = hidden
        for _ in range(depth):
            with tf.name_scope('Dense_{:d}'.format(hidden)):
                print("Dense ", self.last_width, width)
                nr = len(self.layers)
                # if self.last_width == width:
                #   M = closest_unitary(np.random.rand(self.last_width, width) / (self.last_width + width))
                #   weights = tf.Variable(m, name="weights_dense_" + str(nr))
                # else:
                weights = tf.Variable(tf.random_uniform([self.last_width, width], minval=-1. / width, maxval=1. / width), name="weights_dense")
                bias = tf.Variable(tf.random_uniform([width], minval=-1. / width, maxval=1. / width), name="bias_dense")
                dense1 = tf.matmul(parent, weights, name='dense_' + str(nr)) + bias
                tf.histogram_summary('dense_' + str(nr), dense1)
                tf.histogram_summary('weights_' + str(nr), weights)
                tf.histogram_summary('bias_' + str(nr), bias)
                tf.histogram_summary('dense_' + str(nr) + '/sparsity', tf.nn.zero_fraction(dense1))
                tf.histogram_summary('weights_' + str(nr) + '/sparsity', tf.nn.zero_fraction(weights))
                if act:
                    dense1 = act(dense1)
                # if norm: dense1 = self.norm(dense1,lsize=1) # SHAPE!
                if dropout:
                    dense1 = tf.nn.dropout(dense1, self.keep_prob)
                self.layers.append(dense1)
                self.last_layer = parent = dense1
                self.last_width = width
        self.last_shape = [-1, width]  # dense

    # Convolution Layer
    def conv(self, shape, act=tf.nn.relu, pool=True, dropout=False, norm=True, name=None):  # True why dropout bad in tensorflow??
        """Append a stride-1 'SAME' convolution with optional extras.

        shape: filter shape handed to ``tf.nn.conv2d``; its last entry also
            sizes the bias vector.
        act: activation applied after the bias; falsy to skip.
        pool: when truthy, 2x2 max-pooling with stride 2.
        dropout: when truthy, dropout driven by the ``keep_prob`` placeholder.
        norm: when truthy, local response normalisation.
        name: accepted for compatibility; not used.
        """
        with tf.name_scope('conv'):
            print("input shape ", self.last_shape)
            print("conv shape ", shape)
            width = shape[-1]
            # NOTE(review): filters and bias use tf.random_normal with its default
            # parameters; the smaller-scale uniform init is left commented out
            # below - worth trying if training diverges.
            filters = tf.Variable(tf.random_normal(shape))
            # filters = tf.Variable(tf.random_uniform(shape, minval=-1. / width, maxval=1. / width), name="filters")
            _bias = tf.Variable(tf.random_normal([width]))
            # # conv1 = conv2d('conv', _X, _weights, _bias)
            conv1 = tf.nn.bias_add(tf.nn.conv2d(self.last_layer, filter=filters, strides=[1, 1, 1, 1], padding='SAME'), _bias)
            if debug:
                tf.histogram_summary('conv_' + str(len(self.layers)), conv1)
            if act:
                conv1 = act(conv1)
            if pool:
                conv1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
            if norm:
                conv1 = tf.nn.lrn(conv1, depth_radius=4, bias=1.0, alpha=0.001 / 9.0, beta=0.75)
            if debug:
                tf.histogram_summary('norm_' + str(len(self.layers)), conv1)
            if dropout:
                conv1 = tf.nn.dropout(conv1, self.keep_prob)
            print("output shape ", conv1.get_shape())
            self.add(conv1)

    def classifier(self, classes=10):  # Define loss and optimizer
        """Attach the output layer (if needed), loss, Adam optimizer and accuracy.

        A linear dense layer of ``classes`` units is added first when the
        current width differs from ``classes``.  Sets ``self.cost``,
        ``self.optimizer`` and ``self.accuracy``.
        """
        with tf.name_scope('prediction'):  # prediction
            if self.last_width != classes:
                # print("Automatically adding dense prediction")
                self.dense(hidden=classes, act=False, dropout=False)
        # cross_entropy = -tf.reduce_sum(y_*y)
        with tf.name_scope('classifier'):
            y_ = self.target
            manual = False  # True
            if classes > 100:
                print("using sampled_softmax_loss")
                y = prediction = self.last_layer
                # NOTE(review): sampled_softmax_loss normally also takes weights,
                # biases, num_sampled and num_classes - this branch looks
                # incomplete; confirm before using more than 100 classes.
                self.cost = tf.reduce_mean(tf.nn.sampled_softmax_loss(y, y_))  # for big vocab
            elif manual:
                # prediction = y =self.last_layer=tf.nn.softmax(self.last_layer)
                # self.cost = cross_entropy = -tf.reduce_sum(y_ * tf.log(y+ 1e-10)) # against NaN!
                prediction = y = tf.nn.log_softmax(self.last_layer)
                self.cost = -tf.reduce_sum(y_ * y)
            else:
                y = prediction = self.last_layer
                self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y, y_))  # prediction, target
            # if not gpu:
            tf.scalar_summary('cost', self.cost)
            # self.cost = tf.Print(self.cost , [self.cost ], "debug cost : ")
            learning_scheme = self.learning_rate
            # learning_scheme=tf.train.exponential_decay(self.learning_rate, self.global_step, decay_steps, decay_size)
            # Passing global_step makes the optimizer increment it on every update.
            self.optimizer = tf.train.AdamOptimizer(learning_scheme).minimize(self.cost, global_step=self.global_step)
            # Evaluate model
            correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(self.target, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
            if not force_gpu:
                tf.scalar_summary('accuracy', self.accuracy)

    # Launch the graph
    def next_batch(self, batch_size=10):
        """Return the next ``(images, labels)`` training batch from the dataset."""
        return self.data.train.next_batch(batch_size)

    def train(self, steps=-1, dropout=None, display_step=10, test_step=200):  # epochs=-1,
        """Run the training loop.

        steps: number of optimizer steps; -1 means 9999999.
        dropout: value fed to ``keep_prob`` while training (a keep probability);
            falsy means keep everything (1.0).
        display_step: print batch loss/accuracy every this many steps.
        test_step: run ``test`` every this many steps.

        Stops early, after printing a message, as soon as the loss is NaN or
        infinite.  Returns None.
        """
        steps = 9999999 if steps == -1 else steps
        session = self.session
        # with tf.device(_cpu):
        # import tensorflow.contrib.layers as layers
        # t = tf.verify_tensor_all_finite(t, msg)
        # NOTE(review): the check op is built and kept, but deliberately not
        # fetched in the loop below, so it does not change how failures surface.
        self.check_numerics = tf.add_check_numerics_ops()
        self.summaries = tf.merge_all_summaries()
        self.summary_writer = tf.train.SummaryWriter(tensorboard_logs, session.graph)
        if not dropout:
            dropout = 1.  # keep all
        x = self.x
        y = self.y
        keep_prob = self.keep_prob
        session.run([tf.initialize_all_variables()])
        step = 0  # show first
        while step < steps:
            # print("step %d \r" % step)# end=' ')
            batch_xs, batch_ys = self.next_batch(self.batch_size)
            # tf.train.shuffle_batch_join(example_list, batch_size, capacity=min_queue_size + batch_size * 16, min_queue_size)
            # Fit training using batch data
            feed_dict = {x: batch_xs, y: batch_ys, keep_prob: dropout, self.train_phase: True}
            loss, _ = session.run([self.cost, self.optimizer], feed_dict=feed_dict)
            if step % test_step == 0:
                self.test(step)
            if step % display_step == 0:
                # Batch accuracy with dropout disabled.  Summaries are not
                # fetched here: only the test summaries are written (smoother curve).
                feed = {x: batch_xs, y: batch_ys, keep_prob: 1., self.train_phase: False}
                acc = session.run(self.accuracy, feed_dict=feed)
                print("\rStep {:d} Loss= {:.6f} Accuracy= {:.3f}".format(step, loss, acc), end=' ')
            if not np.isfinite(loss):
                # Checked on every step so divergence is caught regardless of display_step.
                print("\nLoss gradiant explosion, exiting!!!")  # restore!
                return
            step += 1
        print("\nOptimization Finished!")
        self.test(step, number=10000)  # final test

    def inputs(self, data):
        # NOTE(review): load_data is not defined in the visible source, and this
        # assignment replaces the bound method `inputs` on the instance - looks
        # like unfinished code; confirm before calling.
        self.inputs, self.labels = load_data()  # ...)

    def test(self, step, number=400):  # 256
        """Evaluate accuracy on the first ``number`` test images and log summaries.

        Relies on ``self.summaries`` and ``self.summary_writer``, both created
        in ``train``.  ``step`` is the global step recorded with the summary.
        """
        test_labels = self.data.test.labels[:number]
        test_images = self.data.test.images[:number]
        feed_dict = {self.x: test_images, self.y: test_labels, self.keep_prob: 1., self.train_phase: False}
        accuracy, summary = self.session.run([self.accuracy, self.summaries], feed_dict=feed_dict)
        # accuracy,summary = session.run([self.accuracy, self.summaries], feed_dict, run_options, run_metadata)
        print('\t' * 3 + "Test Accuracy:", accuracy)
        # self.summary_writer.add_run_metadata(run_metadata, 'step #%03d' % step)
        self.summary_writer.add_summary(summary, global_step=step)
        # Push events to disk now so nothing is lost if the process exits early.
        self.summary_writer.flush()
def dense(net): # best with lr ~0.001
    # type: (layer.net) -> None
    """Baseline model: a single 400-unit fully connected tanh layer."""
    # net.batchnorm() # start lower, else no effect
    # net.dense(400,act=None)# # ~95% we can do better:
    activation = tf.nn.tanh
    # 0.996 YAY only 0.985 on full set, Step 5000 flat
    # (0.957% without any model!!)
    net.dense(400, act=activation)
def alex(net):
    # type: (layer.net) -> None
    """Stack three 3x3 conv layers (64, 128, 256 filters) and two 1024-unit ReLU layers."""
    print("Building Alex-net")
    net.reshape(shape=[-1, 28, 28, 1]) # Reshape input pictures
    # net.batchnorm()
    channels = [1, 64, 128, 256]
    # Each conv layer maps the previous channel count to the next one.
    for fan_in, fan_out in zip(channels, channels[1:]):
        net.conv([3, 3, fan_in, fan_out])
    for _ in range(2):
        net.dense(1024, act=tf.nn.relu)
# Baseline alternative:
# net=layer.net(dense,data=mnist, learning_rate=0.01 )#,'mnist' baseline
# Build the convolutional model on MNIST, then train it while testing every 10 steps.
_net = net(alex, data=mnist, learning_rate=0.001)  # ,'mnist'
_net.train(steps=50000, dropout=0.6, display_step=1, test_step=10)