
I am training an autoencoder with TensorFlow, but the cost becomes nan. I have modified the learning rate and the optimizer, but it does not help. Some search results suggest that decreasing the learning rate may fix it, but even after changing the learning rate to 0.00001 it still does not work. Here is my code:

import tensorflow as tf
# Note: loadvector() and get_next_batch() are user-defined helpers that are not shown here.

learning_rate = 0.00001
training_epochs = 2
batch_size = 900
display_step = 1
examples_to_show = 10
nextbatch = 0

# network parameters
n_input = 500

# tf Graph input
X = tf.placeholder("float", [None, n_input])

# hidden layer setting                                                    
n_hidden_1 = 400 # 1st layer num features
n_hidden_2 = 300 # 2nd layer num features                                 
n_hidden_3 = 200 # 3rd layer num features
n_hidden_4 = 100 # 4th layer num features
weights = {
    'encoder_h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),    # 500 * 400
    'encoder_h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])), # 400 * 300
    'encoder_h3': tf.Variable(tf.random_normal([n_hidden_2, n_hidden_3])), # 300 * 200
    'encoder_h4': tf.Variable(tf.random_normal([n_hidden_3, n_hidden_4])), # 200 * 100

    'decoder_h1': tf.Variable(tf.random_normal([n_hidden_4, n_hidden_3])), # 100 * 200
    'decoder_h2': tf.Variable(tf.random_normal([n_hidden_3, n_hidden_2])), # 200 * 300 
    'decoder_h3': tf.Variable(tf.random_normal([n_hidden_2, n_hidden_1])), # 300 * 400
    'decoder_h4': tf.Variable(tf.random_normal([n_hidden_1, n_input])),    # 400 * 500 
}
biases = {
    'encoder_b1': tf.Variable(tf.random_normal([n_hidden_1])),  # 400
    'encoder_b2': tf.Variable(tf.random_normal([n_hidden_2])),  # 300
    'encoder_b3': tf.Variable(tf.random_normal([n_hidden_3])),  # 200
    'encoder_b4': tf.Variable(tf.random_normal([n_hidden_4])),  # 100

    'decoder_b1': tf.Variable(tf.random_normal([n_hidden_3])),  # 200
    'decoder_b2': tf.Variable(tf.random_normal([n_hidden_2])),  # 300
    'decoder_b3': tf.Variable(tf.random_normal([n_hidden_1])),  # 400
    'decoder_b4': tf.Variable(tf.random_normal([n_input])),     # 500
}

# Building the encoder
def encoder(x):
    print("i am encoder")
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['encoder_h1']), biases['encoder_b1']))
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['encoder_h2']), biases['encoder_b2']))
    layer_3 = tf.nn.sigmoid(tf.add(tf.matmul(layer_2, weights['encoder_h3']), biases['encoder_b3']))
    layer_4 = tf.nn.sigmoid(tf.add(tf.matmul(layer_3, weights['encoder_h4']), biases['encoder_b4']))
    return layer_4

# Building the decoder
def decoder(x):
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['decoder_h1']), biases['decoder_b1']))
    print("layer1:", layer_1)
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['decoder_h2']), biases['decoder_b2']))
    print("layer2:", layer_2)
    layer_3 = tf.nn.sigmoid(tf.add(tf.matmul(layer_2, weights['decoder_h3']), biases['decoder_b3']))
    print("layer3:", layer_3)
    layer_4 = tf.nn.sigmoid(tf.add(tf.matmul(layer_3, weights['decoder_h4']), biases['decoder_b4']))
    print("layer4:", layer_4)
    return layer_4

def normalize(x):
    amin, amax = x.min(), x.max()
    x = (x - amin) / (amax - amin)
    return x

def main():
    # Construct model
    encoder_op = encoder(X)
    print("encoder_op:", encoder_op)
    decoder_op = decoder(encoder_op)
    print("decoder_op:", decoder_op)

    # Prediction
    y_pred = decoder_op
    # Targets (Labels) are the input data.
    y_true = X

    # Define loss and optimizer, minimize the squared error
    #cost = tf.reduce_mean(tf.pow(y_true - y_pred, 2))
    #cost = tf.reduce_sum(tf.pow((y_true - y_pred), 2))
    cost = tf.reduce_mean(tf.squared_difference(y_true, y_pred))
    #optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
    #optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
    optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(cost)

    # Launch the graph
    with tf.Session() as sess:
        if int((tf.__version__).split('.')[1]) < 12 and int((tf.__version__).split('.')[0]) < 1:
            init = tf.initialize_all_variables()
        else:
            init = tf.global_variables_initializer()
        sess.run(init)
        lenx_train = len(loadvector("x", "train"))
        total_batch = int(lenx_train / batch_size)

        # Training cycle
        for epoch in range(training_epochs):
            # Loop over all batches
            for i in range(total_batch):
                batch_xs, batch_ys = get_next_batch(batch_size)  # max(x) = 1, min(x) = 0
                # Run optimization op (backprop) and cost op (to get loss value)
                op, c = sess.run([optimizer, cost], feed_dict={X: batch_xs})
            # Display logs per epoch step
            if epoch % display_step == 0:
                print("Epoch:", '%04d' % (epoch + 1), "cost=", c)
                print("op:", '%04d' % (epoch + 1), "op=", op)
        print("Optimization Finished!")
if __name__ == "__main__":
    x_train = loadvector("x", "train")
    #x_train = scale(x_train)
    x_train = normalize(x_train)
    y_train = loadvector("y", "train")
    main()

Here is the result when I run it:

Epoch: 0001 cost= 0.373359
op: 0001 op= None
Epoch: 0002 cost= nan
op: 0002 op= None
Optimization Finished!

1 Answer

It is hard to tell for sure without the data, but a possible reason is your implementation of normalize(). If the array you pass in happens to contain all-equal values, amax - amin is zero and the division produces a bunch of nans, as sketched below.
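For example, a minimal sketch of a guarded version (the eps threshold and the all-zeros fallback are illustrative choices, not taken from your code):

import numpy as np

def normalize(x, eps=1e-8):
    amin, amax = x.min(), x.max()
    rng = amax - amin
    # If every value is (almost) equal, the range is zero and the division
    # would produce nan/inf, so return a constant array instead.
    if rng < eps:
        return np.zeros_like(x, dtype=float)
    return (x - amin) / rng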

In general, you can take a look at the second answer to How does one debug NaN values in TensorFlow?, which suggests using tf.add_check_numerics_ops(). This catches numeric issues pretty quickly in most cases. You can also take a look at tfdbg.
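A rough sketch of how that could look with your graph (assuming the model is already constructed as in your main(), and batch_xs comes from your get_next_batch()):

# Build assertions that every floating-point tensor in the graph is finite
check_op = tf.add_check_numerics_ops()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Running check_op together with the training step makes TensorFlow raise
    # an error at the first op that produces inf/nan, pointing at the culprit.
    _, c, _ = sess.run([optimizer, cost, check_op], feed_dict={X: batch_xs})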

iga