
We ran into the strange problem that our relatively simple model converges on the CPU, but not on the server with a GPU. No modifications are made to the code between the two runs, nor does the code contain any explicit conditionals that change the workflow on different architectures.

What could possibly be the reason? How can this TensorFlow model converge on a CPU but not on a GPU? In the likely event that the code is too long for you to read, we would still be thankful for general speculation and hints.

#!/usr/bin/python
from __future__ import print_function
import tensorflow as tf
import os
import numpy as np
import input_data # copy from tensorflow/examples/tutorials/mnist/input_data.py
# wget https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/examples/tutorials/mnist/input_data.py if needed

mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)

force_gpu = False
debug = True # histogram_summary ...
# _cpu='/cpu:0'
default_learning_rate=0.001
tensorboard_logs = '/tmp/tensorboard-logs/'


# $(sleep 5; open http://0.0.0.0:6006) & tensorboard --debug --logdir=/tmp/tensorboard-logs/

class net():

    def __init__(self,model,data,name=0,learning_rate=default_learning_rate,batch_size=64):
            self.session=sess=session=tf.Session()
            self.model=model
            self.data=data # assigned to self.x=net.input via train
            self.batch_size=batch_size
            self.layers=[]
            self.last_width=self.input_width(data)
            self.learning_rate=learning_rate

            self.generate_model(model)

    def generate_model(self,model, name=''):
        if not model: return self
        with tf.name_scope('state'):
            self.keep_prob = tf.placeholder(tf.float32)  # 1 for testing! else 1 - dropout
            self.train_phase = tf.placeholder(tf.bool, name='train_phase')
            self.global_step = tf.Variable(0)  # don't set, feed, or increment global_step; tensorflow will do it automatically
        with tf.name_scope('data'):
            n_input=28*28
            n_classes=10
            self.x = x = self.input  = tf.placeholder(tf.float32, [None, n_input])
            self.last_layer=x
            self.y = y = self.target = tf.placeholder(tf.float32, [None, n_classes])
            if not force_gpu: tf.image_summary("mnist", tf.reshape(self.x, [-1, 28, 28, 1], "mnist_images"))
        with tf.name_scope('model'):
            model(self)
        if(self.last_width!=n_classes): self.classifier()  # 10 classes auto


    def input_width(self,data):
        return 28*28


    def add(self, layer):
        self.layers.append(layer)
        self.last_layer = layer
        self.last_shape = layer.get_shape()

    def reshape(self,shape):
        self.last_layer = tf.reshape(self.last_layer,shape)
        self.last_shape = shape
        self.last_width = shape[-1]

    def batchnorm(self):
        from tensorflow.contrib.layers.python.layers import batch_norm as batch_norm
        with tf.name_scope('batchnorm') as scope:
            input = self.last_layer
            train_op=batch_norm(input, is_training=True, center=False, updates_collections=None, scope=scope)
            test_op=batch_norm(input, is_training=False, updates_collections=None, center=False,scope=scope, reuse=True)
            self.add(tf.cond(self.train_phase,lambda:train_op,lambda:test_op))

    # Fully connected layer
    def dense(self, hidden=1024, depth=1, act=tf.nn.tanh, dropout=False, parent=-1): #
        if parent==-1: parent=self.last_layer
        shape = self.last_layer.get_shape()
        if shape and len(shape)>2:
            self.last_width= int(shape[1]*shape[2]*shape[3])
            print("reshaping ",shape,"to",self.last_width)
            parent = tf.reshape(parent, [-1, self.last_width])

        width = hidden
        while depth>0:
            with tf.name_scope('Dense_{:d}'.format(hidden)) as scope:
                print("Dense ", self.last_width, width)
                nr = len(self.layers)
                # if self.last_width == width:
                #   M = closest_unitary(np.random.rand(self.last_width, width) / (self.last_width + width))
                #   weights = tf.Variable(m, name="weights_dense_" + str(nr))
                # else:
                weights = tf.Variable(tf.random_uniform([self.last_width, width], minval=-1. / width, maxval=1. / width), name="weights_dense")
                bias = tf.Variable(tf.random_uniform([width],minval=-1./width,maxval=1./width), name="bias_dense")
                dense1 = tf.matmul(parent, weights, name='dense_'+str(nr))+ bias
                tf.histogram_summary('dense_'+str(nr),dense1)
                tf.histogram_summary('weights_'+str(nr),weights)
                tf.histogram_summary('bias_'+str(nr),bias)
                tf.histogram_summary('dense_'+str(nr)+'/sparsity', tf.nn.zero_fraction(dense1))
                tf.histogram_summary('weights_'+str(nr)+'/sparsity', tf.nn.zero_fraction(weights))
                if act: dense1 = act(dense1)
                # if norm: dense1 = self.norm(dense1,lsize=1) # SHAPE!
                if dropout: dense1 = tf.nn.dropout(dense1, self.keep_prob)
                self.layers.append(dense1)
                self.last_layer = parent = dense1
                self.last_width = width
                depth=depth-1
                self.last_shape=[-1,width] # dense

    # Convolution Layer
    def conv(self,shape,act=tf.nn.relu,pool=True,dropout=False,norm=True,name=None): # True why dropout bad in tensorflow??
        with tf.name_scope('conv'):
            print("input  shape ",self.last_shape)
            print("conv   shape ",shape)
            width=shape[-1]
            filters=tf.Variable(tf.random_normal(shape))
            # filters = tf.Variable(tf.random_uniform(shape, minval=-1. / width, maxval=1. / width), name="filters")
            _bias=tf.Variable(tf.random_normal([shape[-1]]))

            # # conv1 = conv2d('conv', _X, _weights, _bias)
            conv1=tf.nn.bias_add(tf.nn.conv2d(self.last_layer,filter=filters, strides=[1, 1, 1, 1], padding='SAME'), _bias)
            if debug: tf.histogram_summary('conv_' + str(len(self.layers)), conv1)
            if act: conv1=act(conv1)
            if pool: conv1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
            if norm: conv1 = tf.nn.lrn(conv1, depth_radius=4, bias=1.0, alpha=0.001 / 9.0, beta=0.75)
            if debug: tf.histogram_summary('norm_' + str(len(self.layers)), conv1)
            if dropout: conv1 = tf.nn.dropout(conv1,self.keep_prob)
            print("output shape ",conv1.get_shape())
            self.add(conv1)

    def classifier(self,classes=10):  # Define loss and optimizer
        with tf.name_scope('prediction'):# prediction
            if self.last_width!=classes:
                # print("Automatically adding dense prediction")
                self.dense(hidden=classes, act= False, dropout = False)
            # cross_entropy = -tf.reduce_sum(y_*y)
        with tf.name_scope('classifier'):
            y_=self.target
            manual=False # True
            if classes>100:
                print("using sampled_softmax_loss")
                y=prediction=self.last_layer
                self.cost = tf.reduce_mean(tf.nn.sampled_softmax_loss(y, y_))  # for big vocab
            elif manual:
                # prediction = y =self.last_layer=tf.nn.softmax(self.last_layer)
                # self.cost = cross_entropy = -tf.reduce_sum(y_ * tf.log(y+ 1e-10)) # against NaN!
                prediction = y = tf.nn.log_softmax(self.last_layer)
                self.cost = cross_entropy = -tf.reduce_sum(y_ * y)
            else:
                y = prediction = self.last_layer
                self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y, y_)) # prediction, target

            # if not gpu:
            tf.scalar_summary('cost', self.cost)
            # self.cost = tf.Print(self.cost , [self.cost ], "debug cost : ")
            learning_scheme=self.learning_rate
            # learning_scheme=tf.train.exponential_decay(self.learning_rate, self.global_step, decay_steps, decay_size)
            self.optimizer = tf.train.AdamOptimizer(learning_scheme).minimize(self.cost)

            # Evaluate model
            correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(self.target, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
            if not force_gpu: tf.scalar_summary('accuracy', self.accuracy)
            # Launch the graph

    def next_batch(self,batch_size=10):
        return self.data.train.next_batch(batch_size)

    def train(self,steps=-1,dropout=None,display_step=10,test_step=200): #epochs=-1,
        steps = 9999999 if steps==-1 else steps
        session=self.session
        # with tf.device(_cpu):

        # import tensorflow.contrib.layers as layers
        # t = tf.verify_tensor_all_finite(t, msg)
        tf.add_check_numerics_ops()
        self.summaries = tf.merge_all_summaries()
        self.summary_writer = tf.train.SummaryWriter(tensorboard_logs, session.graph) #
        if not dropout:dropout=1. # keep all
        x=self.x
        y=self.y
        keep_prob=self.keep_prob
        session.run([tf.initialize_all_variables()])
        step = 0 # show first
        while step < steps:
            # print("step %d \r" % step)# end=' ')
            batch_xs, batch_ys = self.next_batch(self.batch_size)

            # tf.train.shuffle_batch_join(example_list, batch_size, capacity=min_queue_size + batch_size * 16, min_queue_size)
            # Fit training using batch data
            feed_dict = {x: batch_xs, y: batch_ys, keep_prob: dropout, self.train_phase: True}
            loss,_= session.run([self.cost,self.optimizer], feed_dict=feed_dict)
            if step % test_step == 0: self.test(step)
            if step % display_step == 0:
                # Calculate batch accuracy, loss
                feed = {x: batch_xs, y: batch_ys, keep_prob: 1., self.train_phase: False}
                acc , summary = session.run([self.accuracy,self.summaries], feed_dict=feed)
                # self.summary_writer.add_summary(summary, step) # only test summaries for smoother curve
                print("\rStep {:d} Loss= {:.6f} Accuracy= {:.3f}".format(step,loss,acc),end=' ')
                if str(loss)=="nan": return print("\nLoss gradient explosion, exiting!!!") #restore!
            step += 1
        print("\nOptimization Finished!")
        self.test(step,number=10000) # final test

    def inputs(self,data):
        self.inputs, self.labels = load_data()#...)

    def test(self,step,number=400):#256
        session=sess=self.session
        run_metadata = tf.RunMetadata()
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        # Calculate accuracy for 256 mnist test images
        test_labels = self.data.test.labels[:number]
        test_images = self.data.test.images[:number]
        feed_dict = {self.x: test_images, self.y: test_labels, self.keep_prob: 1., self.train_phase:False}
        accuracy,summary= self.session.run([self.accuracy, self.summaries], feed_dict=feed_dict)
        # accuracy,summary = session.run([self.accuracy, self.summaries], feed_dict, run_options, run_metadata)
        print('\t'*3+"Test Accuracy:",accuracy)
        # self.summary_writer.add_run_metadata(run_metadata, 'step #%03d' % step)
        self.summary_writer.add_summary(summary,global_step=step)


def dense(net): # best with lr ~0.001
    # type: (layer.net) -> None
    # net.batchnorm() # start lower, else no effect
    # net.dense(400,act=None)#  # ~95% we can do better:
    net.dense(400, act=tf.nn.tanh)# 0.996 YAY  only 0.985 on full set, Step 5000 flat
    return # 0.957% without any model!!

def alex(net):
    # type: (layer.net) -> None
    print("Building Alex-net")
    net.reshape(shape=[-1, 28, 28, 1])  # Reshape input pictures
    # net.batchnorm()
    net.conv([3, 3, 1, 64])
    net.conv([3, 3, 64, 128])
    net.conv([3, 3, 128, 256])
    net.dense(1024,act=tf.nn.relu)
    net.dense(1024,act=tf.nn.relu)


# net=layer.net(dense,data=mnist, learning_rate=0.01 )#,'mnist' baseline
_net=net(alex,data=mnist, learning_rate=0.001)#,'mnist'
_net.train(50000,dropout=0.6,display_step=1,test_step=10)
  • Does it still converge if you make optimization more stable by adding regularization (i.e., dropout, a lower learning rate, an L2 penalty)? GPU and CPU kernels give slightly different answers. – Yaroslav Bulatov Sep 05 '16 at 04:59
  • @YaroslavBulatov Thanks! "GPU and CPU kernels give slightly different answers" – interesting, why is that? You can turn your comment into an answer and I will accept it. – Anona112 Sep 05 '16 at 07:31
  • It depends on the GPU itself. Some have higher floating point error tolerance, and other GPUs even include fp correction. Basically, not all GPUs are created equal. – Steven Sep 06 '16 at 17:43
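
A minimal sketch of the kind of retuning suggested in the comments, reusing the net class and mnist data from the question; the learning rate and keep probability below are assumptions, not known-good values. Note that train(dropout=...) only feeds keep_prob, so a layer has to be built with dropout=True for it to have any effect:

# assumed retuning of the question's setup; the exact values are guesses
def dense_regularized(net):
    # type: (layer.net) -> None
    net.dense(400, act=tf.nn.tanh, dropout=True)  # enable dropout inside the layer

_net2 = net(dense_regularized, data=mnist, learning_rate=0.0001)  # 10x lower learning rate
_net2.train(50000, dropout=0.5, display_step=100, test_step=500)  # dropout arg is fed as keep_prob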

2 Answers


In general, floating point computations can be a little nondeterministic when it comes to adding many numbers (and some GPUs are buggy). Did you try retuning the hyperparameters (varying learning rates and whatnot) to account for this?
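
To make the "slightly different answers" point concrete, here is a rough sketch, written against the same TF 0.x-era API as the question, that runs the same reduction on both devices; allow_soft_placement makes it fall back to the CPU if no GPU is visible:

# sketch: compare the same matmul + sum on CPU and GPU (results usually differ slightly)
import numpy as np
import tensorflow as tf

data = np.random.rand(1000, 1000).astype(np.float32)
results = {}
for device in ['/cpu:0', '/gpu:0']:
    with tf.Graph().as_default():
        with tf.device(device):
            x = tf.constant(data)
            s = tf.reduce_sum(tf.matmul(x, x))
        with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            results[device] = sess.run(s)
print(results)  # the two sums typically differ in the last few digits

Such small differences normally stay small; if a model diverges only on the GPU, it usually means the optimization was already on the edge of instability, which is why retuning helps.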

Alexandre Passos

I faced the same problem in the past. I solved it by following the tutorial at this link: https://www.tensorflow.org/install/pip#windows-native

Apparently, conda installation of TensorFlow is not recommended.

As an extra tip, if you're working with a GPU, avoid TensorFlow 2.3. Apparently, it has some installation issues.
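
If you go the pip route, a quick sanity check (assuming a TensorFlow 2.x install) is to confirm that the build actually sees the GPU before comparing CPU and GPU runs:

# sanity check for a pip-installed TensorFlow 2.x build
import tensorflow as tf

print(tf.__version__)
print("Built with CUDA:", tf.test.is_built_with_cuda())
print("GPUs visible:", tf.config.list_physical_devices('GPU'))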