
I am training a deep CNN for image augmentation and have run into a very odd issue.

My network architecture is fully convolutional and is built from several small "U-shaped" components, in which feature maps are downsampled, processed, and upsampled again before being merged back into a "top layer" that runs at full resolution. At several nodes in the top layer the network "guesses" the output image, and the output of the lower layers is then added to features derived from that guess. The loss function penalizes error in the final prediction as well as in these intermediate guesses.

The network is defined as follows:

def convNet(x, weights, biases):
    #TOP LAYER
    conv0_1        =  conv3dWrap(x, weights['wConv0_1'], biases['bConv0_1'],[1,1,1,1,1])
    conv0_2        =  conv3dWrap(conv0_1, weights['wConv0_2'], biases['bConv0_2'],[1,1,1,1,1])

    #MID LAYER DOWN SAMPLE
    conv1_1        =  conv3dWrap(conv0_2, weights['wConv1_1'], biases['bConv1_1'],[1,2,2,2,1])
    conv1_2        =  conv3dWrap(conv1_1, weights['wConv1_2'], biases['bConv1_2'],[1,1,1,1,1])

    #BOTTOM LAYER DOWN SAMPLE
    conv2_1        = conv3dWrap(conv1_2, weights['wConv2_1'], biases['bConv2_1'],[1,2,2,2,1])
    conv2_2        = conv3dWrap(conv2_1, weights['wConv2_2'], biases['bConv2_2'],[1,1,1,1,1])
    conv2_3        = conv3dWrap(conv2_2, weights['wConv2_3'], biases['bConv2_3'],[1,1,1,1,1])
    convTrans2_1   = conv3dTransWrap(conv2_3,weights['wTConv2_1'], biases['bTConv2_1'], [4,2,32,32,64],[1,2,2,2,1])

    #MID LAYER UPSAMPLE
    conv1_3        =  conv3dWrap(tf.add(convTrans2_1,conv1_2),weights['wConv1_3'], biases['bConv1_3'],[1,1,1,1,1])
    conv1_4        =  conv3dWrap(conv1_3, weights['wConv1_4'], biases['bConv1_4'],[1,1,1,1,1])
    convTrans1_1   =  conv3dTransWrap(conv1_4, weights['wTConv1_1'], biases['bTConv1_1'], [4,4,64,64,32],[1,2,2,2,1])

    #TOP LAYER AGAIN
    conv0_3        =  conv3dWrap(tf.add(conv0_2,convTrans1_1), weights['wConv0_3'], biases['bConv0_3'],[1,1,1,1,1])
    conv0_4        =  conv3dWrap(conv0_3, weights['wConv0_4'], biases['bConv0_4'],[1,1,1,1,1])
    recon0_1       =  reconWrap(conv0_3, weights['wReconDS0_1'], biases['bReconDS0_1'],[1,1,1,1,1])
    print(recon0_1.shape)
    catRecon0_1    =  tf.add(conv0_4,tf.contrib.keras.backend.repeat_elements(recon0_1,32,4))
    conv0_5        =  conv3dWrap(catRecon0_1, weights['wConv0_5'], biases['bConv0_5'],[1,1,1,1,1])

    #MID LAYER AGAIN
    conv1_5        =  conv3dWrap(conv0_5, weights['wConv1_5'], biases['bConv1_5'],[1,2,2,2,1])
    conv1_6        =  conv3dWrap(conv1_5, weights['wConv1_6'], biases['bConv1_6'],[1,1,1,1,1])

    #BOTTOM LAYER
    conv2_4        = conv3dWrap(conv1_6, weights['wConv2_4'], biases['bConv2_4'],[1,2,2,2,1])
    conv2_5        = conv3dWrap(conv2_4, weights['wConv2_5'], biases['bConv2_5'],[1,1,1,1,1])
    conv2_6        = conv3dWrap(conv2_5, weights['wConv2_6'], biases['bConv2_6'],[1,1,1,1,1])
    convTrans2_2   = conv3dTransWrap(conv2_6,weights['wTConv2_2'], biases['bTConv2_2'], [4,2,32,32,64],[1,2,2,2,1])

    #MID LAYER UPSAMPLE
    conv1_7        =  conv3dWrap(tf.add(convTrans2_2,conv1_6),weights['wConv1_7'], biases['bConv1_7'],[1,1,1,1,1])
    conv1_8        =  conv3dWrap(conv1_7, weights['wConv1_8'], biases['bConv1_8'],[1,1,1,1,1])
    convTrans1_2   =  conv3dTransWrap(conv1_8,weights['wTConv1_2'], biases['bTConv1_2'], [4,4,64,64,32],[1,2,2,2,1])

    #TOP LAYER
    conv0_6        =  conv3dWrap(tf.add(conv0_5,convTrans1_2), weights['wConv0_6'], biases['bConv0_6'],[1,1,1,1,1])
    recon0_2       =  reconWrap(conv0_6, weights['wReconDS0_2'], biases['bReconDS0_2'],[1,1,1,1,1])
    catRecon0_2    =  tf.add(conv0_6,tf.contrib.keras.backend.repeat_elements(recon0_2,32,4))
    conv0_7        =  conv3dWrap(catRecon0_2, weights['wConv0_7'], biases['bConv0_7'],[1,1,1,1,1])

    #MID LAYER
    conv1_9        =  conv3dWrap(conv0_7, weights['wConv1_9'], biases['bConv1_9'],[1,2,2,2,1]) 
    conv1_10       =  conv3dWrap(conv1_9, weights['wConv1_10'], biases['bConv1_10'],[1,1,1,1,1])

    #BOTTOM LAYER
    conv2_7        = conv3dWrap(conv1_10, weights['wConv2_7'], biases['bConv2_7'],[1,2,2,2,1])
    conv2_8        = conv3dWrap(conv2_7,  weights['wConv2_8'], biases['bConv2_8'],[1,1,1,1,1])
    conv2_9        = conv3dWrap(conv2_8,  weights['wConv2_9'], biases['bConv2_9'],[1,1,1,1,1])
    convTrans2_3   = conv3dTransWrap(conv2_9, weights['wTConv2_3'], biases['bTConv2_3'], [4,2,32,32,64],[1,2,2,2,1])

    #MID LAYER UPSAMPLE
    conv1_11        =  conv3dWrap(tf.add(convTrans2_3,conv1_10),weights['wConv1_11'], biases['bConv1_11'],[1,1,1,1,1])
    conv1_12        =  conv3dWrap(conv1_11, weights['wConv1_12'], biases['bConv1_12'],[1,1,1,1,1])
    convTrans1_3    =   conv3dTransWrap(conv1_12,weights['wTConv1_3'], biases['bTConv1_3'], [4,4,64,64,32],[1,2,2,2,1])

    #TOP LAYER
    conv0_8        =  conv3dWrap(tf.add(conv0_7,convTrans1_3), weights['wConv0_8'], biases['bConv0_8'],[1,1,1,1,1])
    recon0_3       =  reconWrap(conv0_8, weights['wReconDS0_3'], biases['bReconDS0_3'],[1,1,1,1,1])
    catRecon0_3    =  tf.add(conv0_8,tf.contrib.keras.backend.repeat_elements(recon0_3,32,4))
    conv0_9        =  conv3dWrap(catRecon0_3, weights['wConv0_9'], biases['bConv0_9'],[1,1,1,1,1])
    print(recon0_3.shape)

    #MID LAYER
    conv1_13        =  conv3dWrap(conv0_9, weights['wConv1_13'], biases['bConv1_13'],[1,2,2,2,1]) 
    conv1_14       =   conv3dWrap(conv1_13, weights['wConv1_14'], biases['bConv1_14'],[1,1,1,1,1])

    #BOTTOM LAYER
    conv2_10        = conv3dWrap(conv1_14, weights['wConv2_10'], biases['bConv2_10'],[1,2,2,2,1])
    conv2_11        = conv3dWrap(conv2_10,  weights['wConv2_11'], biases['bConv2_11'],[1,1,1,1,1])
    conv2_12        = conv3dWrap(conv2_11,  weights['wConv2_12'], biases['bConv2_12'],[1,1,1,1,1])
    convTrans2_4   = conv3dTransWrap(conv2_12, weights['wTConv2_4'], biases['bTConv2_4'], [4,2,32,32,64],[1,2,2,2,1])

    #MID LAYER UPSAMPLE
    conv1_15        =  conv3dWrap(tf.add(convTrans2_4,conv1_14),weights['wConv1_15'], biases['bConv1_15'],[1,1,1,1,1])
    conv1_16        =  conv3dWrap(conv1_15, weights['wConv1_16'], biases['bConv1_16'],[1,1,1,1,1])
    convTrans1_4    =  conv3dTransWrap(conv1_16,weights['wTConv1_4'], biases['bTConv1_4'], [4,4,64,64,32],[1,2,2,2,1])

    #TOP LAYER
    conv0_10        =  conv3dWrap(tf.add(conv0_9,convTrans1_4), weights['wConv0_10'], biases['bConv0_10'],[1,1,1,1,1])

    #OUTPUT
    convOUT        =  reconWrap(conv0_10, weights['wConvOUT'], biases['bConvOUT'],[1,1,1,1,1])
    print(convOUT.shape)

    return recon0_1, recon0_2, recon0_3, convOUT

Where all of the "wrappers" are as follows:

def conv3dWrap(x, W, b, strides):
    # 3-D convolution -> bias -> ReLU
    x = tf.nn.conv3d(x, W, strides, padding='SAME')
    x = tf.nn.bias_add(x, b)
    return tf.nn.relu(x)

def reconWrap(x, W, b, strides):
    # 3-D convolution -> bias, no activation (used for the intermediate "guesses" and the final output)
    x = tf.nn.conv3d(x, W, strides, padding='SAME')
    x = tf.nn.bias_add(x, b)
    return x

def conv3dTransWrap(x, W, b, shape, strides):
    # transposed 3-D convolution (upsampling) -> bias -> ReLU
    x = tf.nn.conv3d_transpose(x, W, shape, strides, padding='SAME')
    x = tf.nn.bias_add(x, b)
    return tf.nn.relu(x)

My weights and biases are stored in dictionaries that are defined before starting the training:

weights={
#TOP LAYER
'wConv0_1':      tf.Variable(tf.random_normal([4, 3, 3, 1, 5]),   name='wC0_1'),
'wConv0_2':      tf.Variable(tf.random_normal([4, 3, 3, 5, 32]),  name='wC0_2'),
'wConv0_3':      tf.Variable(tf.random_normal([4, 3, 3, 32, 32]), name='wC0_3'),
'wConv0_4':      tf.Variable(tf.random_normal([4, 3, 3, 32, 32]),  name='wC0_4'),
'wReconDS0_1':   tf.Variable(tf.random_normal([1, 1, 1, 32, 1]), name='wR0_1'),
# ... THIS CONTINUES FOR QUITE A WHILE
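
The biases dictionary is built the same way; a partial sketch (the name= strings here are only illustrative, with one bias per output channel of the matching kernel above):

biases={
#TOP LAYER
'bConv0_1':      tf.Variable(tf.random_normal([5]),  name='bC0_1'),
'bConv0_2':      tf.Variable(tf.random_normal([32]), name='bC0_2'),
'bConv0_3':      tf.Variable(tf.random_normal([32]), name='bC0_3'),
'bConv0_4':      tf.Variable(tf.random_normal([32]), name='bC0_4'),
'bReconDS0_1':   tf.Variable(tf.random_normal([1]),  name='bR0_1'),
# ... AND SO ON, MIRRORING THE WEIGHTS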

Then, I begin the training like this:

def train_cnn(x):
    epochLosses=[]
    print('Beginning Training!')
    print(NUM_EPOCHS)
    r1,r2,r3,pred = convNet(x, weights, biases)        
    cost = (tf.losses.mean_squared_error(y, pred)
            + 0.25 * (tf.losses.mean_squared_error(y, r1)
                      + tf.losses.mean_squared_error(y, r2)
                      + tf.losses.mean_squared_error(y, r3)))

    regularizer = (0.01*tf.nn.l2_loss(weights['wConv0_1'])+
                                    0.01*tf.nn.l2_loss(weights['wConv0_2'])+
                                    0.01*tf.nn.l2_loss(weights['wConv0_3'])+     
                                    0.01*tf.nn.l2_loss(weights['wConv0_4'])+    
                                    0.01*tf.nn.l2_loss(weights['wReconDS0_1'])+  
                                    0.01*tf.nn.l2_loss(weights['wConv0_5'])+ 
                                    0.01*tf.nn.l2_loss(weights['wConv0_6'])+     
                                    0.01*tf.nn.l2_loss(weights['wReconDS0_2'])+  
                                    0.01*tf.nn.l2_loss(weights['wReconDS0_3'])+  
                                    0.01*tf.nn.l2_loss(weights['wConv0_7'])+     
                                    0.01*tf.nn.l2_loss(weights['wConv0_8'])+      
                                    0.01*tf.nn.l2_loss(weights['wConv0_9'])+     
                                    0.01*tf.nn.l2_loss(weights['wConv0_10'])+     
                                    0.01*tf.nn.l2_loss(weights['wConvOUT'])+      
                                    0.01*tf.nn.l2_loss(weights['wConv1_1'])+   
                                    0.01*tf.nn.l2_loss(weights['wConv1_2'])+     
                                    0.01*tf.nn.l2_loss(weights['wConv1_3'])+     
                                    0.01*tf.nn.l2_loss(weights['wConv1_4'])+      
                                    0.01*tf.nn.l2_loss(weights['wConv1_5'])+      
                                    0.01*tf.nn.l2_loss(weights['wConv1_6'])+      
                                    0.01*tf.nn.l2_loss(weights['wConv1_7'])+      
                                    0.01*tf.nn.l2_loss(weights['wConv1_8'])+     
                                    0.01*tf.nn.l2_loss(weights['wConv1_9'])+      
                                    0.01*tf.nn.l2_loss(weights['wConv1_10'])+     
                                    0.01*tf.nn.l2_loss(weights['wConv1_11'])+      
                                    0.01*tf.nn.l2_loss(weights['wConv1_12'])+     
                                    0.01*tf.nn.l2_loss(weights['wConv1_13'])+      
                                    0.01*tf.nn.l2_loss(weights['wConv1_14'])+      
                                    0.01*tf.nn.l2_loss(weights['wConv1_15'])+      
                                    0.01*tf.nn.l2_loss(weights['wConv1_16'])+      
                                    0.01*tf.nn.l2_loss(weights['wTConv1_1'])+      
                                    0.01*tf.nn.l2_loss(weights['wTConv1_2'])+     
                                    0.01*tf.nn.l2_loss(weights['wTConv1_3'])+      
                                    0.01*tf.nn.l2_loss(weights['wTConv1_4'])+     
                                    0.01*tf.nn.l2_loss(weights['wConv2_1'])+     
                                    0.01*tf.nn.l2_loss(weights['wConv2_2'])+   
                                    0.01*tf.nn.l2_loss(weights['wConv2_3'])+     
                                    0.01*tf.nn.l2_loss(weights['wConv2_4'])+     
                                    0.01*tf.nn.l2_loss(weights['wConv2_5'])+    
                                    0.01*tf.nn.l2_loss(weights['wConv2_6'])+     
                                    0.01*tf.nn.l2_loss(weights['wConv2_7'])+     
                                    0.01*tf.nn.l2_loss(weights['wConv2_8'])+      
                                    0.01*tf.nn.l2_loss(weights['wConv2_9'])+      
                                    0.01*tf.nn.l2_loss(weights['wConv2_10'])+     
                                    0.01*tf.nn.l2_loss(weights['wConv2_11'])+     
                                    0.01*tf.nn.l2_loss(weights['wConv2_12'])+   
                                    0.01*tf.nn.l2_loss(weights['wTConv2_1'])+     
                                    0.01*tf.nn.l2_loss(weights['wTConv2_2'])+     
                                    0.01*tf.nn.l2_loss(weights['wTConv2_3'])+    
                                    0.01*tf.nn.l2_loss(weights['wTConv2_4']))
    cost=cost+regularizer
    optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE).minimize(cost)
    saver = tf.train.Saver()
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    valLosses=[]
    epochLosses=[]
    print('Beginning Session!')
    writer  =  tf.summary.FileWriter ( './GRAPH' ,  sess.graph)
    sess.run(tf.global_variables_initializer())
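
(As an aside, the long chain of L2 terms above could presumably be collapsed into something like the following, assuming every entry in weights should be regularized:)

regularizer = 0.01 * tf.add_n([tf.nn.l2_loss(w) for w in weights.values()])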

Finally, I do some work to load in the batches and, once they're ready, I run the following for each pass (I won't save on every pass once I have the weight importing working):

                _, c = sess.run([optimizer, cost], feed_dict = {x: inBatch,y: gsBatch})
                epoch_loss += c           
                save_path = saver.save(sess, "./CHKPT/model.cpkt")  
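
For context, the surrounding loop is roughly the following (a sketch; the batch-loading code is omitted and loadBatches() is just an illustrative placeholder):

for epoch in range(NUM_EPOCHS):
    epoch_loss = 0
    for inBatch, gsBatch in loadBatches():   # hypothetical batch generator
        _, c = sess.run([optimizer, cost], feed_dict={x: inBatch, y: gsBatch})
        epoch_loss += c
    # saving once per epoch (rather than on every pass) would also work:
    save_path = saver.save(sess, "./CHKPT/model.cpkt")
    epochLosses.append(epoch_loss)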

So when I go ahead and import this model:

sess = tf.Session()
x = tf.placeholder(dtype=tf.float32)
new_saver = tf.train.import_meta_graph('./CHKPT/model.cpkt.meta')
sess.run(tf.global_variables_initializer())
a,b,c,pred = convNet(x, weights, biases)

I am met with the following error:

ValueError: Tried to convert 'filter' to a tensor and failed. Error: None values not supported.

When I look at the imported weights and biases, each of them has the value None. Not only is this odd, but the network also "runs" incredibly quickly during training, far more quickly than I'd expect, and I am worried that no legitimate computations are occurring.

That can't be right, though, since I am almost positive I am following, verbatim, the saving/loading process I've used for many other networks. Can anyone shed some light on what might be happening here?

Edit: I'm also very new to TF, and it's likely there are non-idealities in my code. If you see anything outside of the saving/importing that isn't kosher, please let me know.

Karl

1 Answer


Running sess.run(tf.global_variables_initializer()) reinitializes every variable and overwrites any values that were loaded from the checkpoint. Skip calling tf.global_variables_initializer() when you load a model; the restore performed by the saver initializes the variables for you.

You are also missing the restore call (import_meta_graph() only rebuilds the graph and returns a Saver object; it does not restore the variable values):

new_saver = tf.train.import_meta_graph('./CHKPT/model.cpkt.meta')
new_saver.restore(sess, './CHKPT/model.cpkt')

Thereafter when you run:

a,b,c,pred = convNet(x, weights, biases)

you create an entirely new network and never use the loaded one. Instead, you have to find the tensors you need among tf.global_variables() (or in the restored graph) after restoring the model, for example by looking them up by name.
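
A minimal sketch of that lookup, using the variable names from the question (e.g. name='wC0_1'):

sess = tf.Session()
new_saver = tf.train.import_meta_graph('./CHKPT/model.cpkt.meta')
new_saver.restore(sess, './CHKPT/model.cpkt')

# build a name -> variable map from the restored graph
restored = {v.name: v for v in tf.global_variables()}
wConv0_1 = restored['wC0_1:0']        # the variable created with name='wC0_1'
print(sess.run(wConv0_1).shape)       # should be (4, 3, 3, 1, 5)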

BlueSun
  • I just removed the initializer call in the loading code and the issue remains unchanged. – Karl Oct 18 '17 at 17:22
  • @Karl you also skipped the restoring of the model. I edited the answer. – BlueSun Oct 18 '17 at 17:36
  • I just added this and am still having the same issue. I think the problem lies in the training and/or saving. When I look at the weights and biases I import, they are in fact 'None' valued: weights Out[40]: {'wConv0_1': None, 'wConv0_10': None, 'wConv0_2': None, 'wConv0_3': None, 'wConv0_4': None, 'wConv0_5': None, 'wConv0_6': None, 'wConv0_7': None, – Karl Oct 18 '17 at 17:41
  • @Karl I think you are looking at the wrong weights and biases. I edited the answer. – BlueSun Oct 18 '17 at 17:54
  • Thanks so much for the time you're putting into this. Unfortunately when I run the following command `sess = tf.Session() x = tf.placeholder(dtype=tf.float32) new_saver = tf.train.import_meta_graph('./CHKPT/model.cpkt.meta') new_saver.restore(sess, './CHKPT/model.cpkt') [print(v) for v in tf.global_variables()]` I end up just printing a list of 'None'. Am I seriously misunderstanding something here, or does this mean my variables are being saved as none. – Karl Oct 18 '17 at 18:00
  • @Karl tf.global_variables() being a list of None is strange. You can try looking what is actually in the checkpoint by using the checkpoint inspector: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/inspect_checkpoint.py. Run in the terminal: PATHTO/inspect_checkpoint.py --file_name ./CHKPT/model.cpkt – BlueSun Oct 18 '17 at 18:40
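
For reference, the same inspection can also be done from Python with the TF 1.x checkpoint reader (a sketch, using the checkpoint path from the question):

reader = tf.train.NewCheckpointReader('./CHKPT/model.cpkt')
for name, shape in sorted(reader.get_variable_to_shape_map().items()):
    print(name, shape)   # lists every variable actually stored in the checkpoint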