Summary: I have a Training routine that attempts to reload a saved graph for continued training, but instead it produces an IndexError: list index out of range when I try to load the optimizer with optimizer = tf.get_collection("optimizer")[0]. I ran into several other errors along the way, but ultimately this is the one that had me stuck. I finally figured it out, so I'll answer my own question in case it helps others.
The goal is simple: I spent 6+ hours training a model before saving it, and now I would like to reload it and train it some more. No matter what I do, however, I get an error.
I found a very simple example on GitHub that just created a saver = tf.train.Saver() operator, then called saver.save(sess, model_path) to save and saver.restore(sess, model_path) to load. When I attempt to do the same, I get At least two variables have the same name: decode/decoder/dense/kernel/Adam_1. I'm using the Adam optimizer, so I'm guessing that's related to the problem. I resolve this issue using the approach below.
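The bare-bones pattern from that GitHub example is essentially this (a minimal reconstruction of my own with placeholder names, not the repository's actual code):

import tensorflow as tf

model_path = './simple_model'   # placeholder path

# Build a trivial graph with a single variable
v = tf.Variable(3.0, name='v')
saver = tf.train.Saver()

# Save
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    saver.save(sess, model_path)

# Restore into the same, still-in-memory graph
with tf.Session() as sess:
    saver.restore(sess, model_path)
    print(sess.run(v))           # prints 3.0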
I know the model is good, because further down in my code (see bottom) I have a Prediction routine that loads the saved model, runs an input through it, and works. It uses loaded_graph = tf.Graph() and then loader = tf.train.import_meta_graph(checkpoint + '.meta') plus loader.restore(sess, checkpoint) to load the model. It then does a bunch of loaded_graph.get_tensor_by_name('input:0') calls.
When I try this approach (you can see the commented-out code), the "two variables" problem goes away, but now I get TypeError: Cannot interpret feed_dict key as Tensor: The name 'save/Const:0' refers to a Tensor which does not exist. The operation, 'save/Const', does not exist in the graph.
This post does a good job of explaining how to organize the code to avoid the ValueError: cannot add op with name <my weights variable name>/Adam as that name is already used, which I've done.
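My reading of the organization it describes, sketched with a toy graph of my own (placeholder names throughout, so this is an illustration of the idea rather than the post's code or my real model): build the Adam optimizer exactly once, stash the training op in a collection, and on later runs import the meta graph and pull that op back out instead of constructing a new optimizer.

import tensorflow as tf

sketch_checkpoint = './sketch_model'  # placeholder path

# --- First run: build the graph once, remember the train op, train a little, save ---
graph = tf.Graph()
with graph.as_default():
    x = tf.placeholder(tf.float32, name='x')
    y = tf.placeholder(tf.float32, name='y')
    w = tf.Variable(0.0, name='w')
    cost = tf.square(w * x - y, name='cost')
    train_op = tf.train.AdamOptimizer(0.01).minimize(cost)
    tf.add_to_collection('train_op', train_op)   # stash the op for later runs
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train_op, {x: 1.0, y: 2.0})
    saver.save(sess, sketch_checkpoint)

# --- Later run: import the meta graph instead of rebuilding, so no second
# --- '.../Adam' op is ever added; then retrieve the stored train op ---
restored_graph = tf.Graph()
with tf.Session(graph=restored_graph) as sess:
    loader = tf.train.import_meta_graph(sketch_checkpoint + '.meta')
    loader.restore(sess, sketch_checkpoint)
    train_op = tf.get_collection('train_op')[0]
    x = restored_graph.get_tensor_by_name('x:0')
    y = restored_graph.get_tensor_by_name('y:0')
    sess.run(train_op, {x: 1.0, y: 2.0})

Because nothing ever calls tf.train.AdamOptimizer a second time, no duplicate '.../Adam' variables get created in the restored graph.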
@mmry explains the TypeError over here, but I'm not understanding what he's saying and don't see how I can fix it.
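The way I currently understand that explanation: a tensor handle belongs to exactly one graph, so a feed_dict key (or fetch) taken from one graph can't be used with a session that owns a different graph. A toy example of my own that reproduces the same class of error:

import tensorflow as tf

old_graph = tf.Graph()
with old_graph.as_default():
    x_old = tf.placeholder(tf.float32, name='x')      # belongs to old_graph

new_graph = tf.Graph()
with new_graph.as_default():
    x_new = tf.placeholder(tf.float32, name='x')      # same name, but a different graph
    doubled = tf.multiply(x_new, 2.0, name='doubled')

with tf.Session(graph=new_graph) as sess:
    print(sess.run(doubled, {x_new: 3.0}))             # fine: x_new lives in new_graph
    # sess.run(doubled, {x_old: 3.0})                  # TypeError: Cannot interpret feed_dict
                                                       # key as Tensor: ... is not an element
                                                       # of this graph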
I've spent the entire day moving things around and getting different errors, and I have run out of ideas. Help would be appreciated.
This is the Training code:
import time

# Split data to training and validation sets
train_source = source_letter_ids[batch_size:]
train_target = target_letter_ids[batch_size:]
valid_source = source_letter_ids[:batch_size]
valid_target = target_letter_ids[:batch_size]
(valid_targets_batch, valid_sources_batch, valid_targets_lengths, valid_sources_lengths) = next(
    get_batches(valid_target, valid_source, batch_size,
                source_letter_to_int['<PAD>'],
                target_letter_to_int['<PAD>']))

if len(source_sentences) > 10000:
    display_step = 100  # Check training loss after each of this many batches with large data
else:
    display_step = 20   # Check training loss after each of this many batches with small data

# loader = tf.train.import_meta_graph(checkpoint + '.meta')
# loaded_graph = tf.get_default_graph()

# input_data = loaded_graph.get_tensor_by_name('input:0')
# targets = loaded_graph.get_tensor_by_name('targets:0')
# lr = loaded_graph.get_tensor_by_name('learning_rate:0')
# source_sequence_length = loaded_graph.get_tensor_by_name('source_sequence_length:0')
# target_sequence_length = loaded_graph.get_tensor_by_name('target_sequence_length:0')
# keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')

# loader = tf.train.Saver()
saver = tf.train.Saver()

with tf.Session(graph=train_graph) as sess:
    start = time.time()
    sess.run(tf.global_variables_initializer())
    # loader.restore(sess, checkpoint)
    # optimizer = tf.get_collection("optimization")[0]
    # gradients = optimizer.compute_gradients(cost)
    # capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
    # train_op = optimizer.apply_gradients(capped_gradients)

    for epoch_i in range(1, epochs + 1):
        for batch_i, (targets_batch, sources_batch, targets_lengths, sources_lengths) in enumerate(
                get_batches(train_target, train_source, batch_size,
                            source_letter_to_int['<PAD>'],
                            target_letter_to_int['<PAD>'])):

            # Training step
            _, loss = sess.run(
                [train_op, cost],
                {input_data: sources_batch,
                 targets: targets_batch,
                 lr: learning_rate,
                 target_sequence_length: targets_lengths,
                 source_sequence_length: sources_lengths,
                 keep_prob: keep_probability})

            # Debug message updating us on the status of the training
            if batch_i % display_step == 0 and batch_i > 0:
                # Calculate validation cost
                validation_loss = sess.run(
                    [cost],
                    {input_data: valid_sources_batch,
                     targets: valid_targets_batch,
                     lr: learning_rate,
                     target_sequence_length: valid_targets_lengths,
                     source_sequence_length: valid_sources_lengths,
                     keep_prob: 1.0})

                print('Epoch {:>3}/{} Batch {:>6}/{} Inputs (000) {:>7} - Loss: {:>6.3f} - Validation loss: {:>6.3f}'
                      .format(epoch_i, epochs, batch_i, len(train_source) // batch_size,
                              (((epoch_i - 1) * len(train_source)) + batch_i * batch_size) // 1000,
                              loss, validation_loss[0]))

    # Save model
    saver = tf.train.Saver()
    saver.save(sess, checkpoint)

    # Print time spent training the model
    end = time.time()
    seconds = end - start
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    print('Model Trained in {}h:{}m:{}s and Saved'.format(int(h), int(m), int(s)))
This is the key part of the Prediction code:
This code works, so I 'know' that the graph is being saved successfully.
loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)

    input_data = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    source_sequence_length = loaded_graph.get_tensor_by_name('source_sequence_length:0')
    target_sequence_length = loaded_graph.get_tensor_by_name('target_sequence_length:0')
    keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')

    # Multiply by batch_size to match the model's input parameters
    answer_logits = sess.run(logits, {input_data: [text] * batch_size,
                                      target_sequence_length: [len(text)] * batch_size,
                                      source_sequence_length: [len(text)] * batch_size,
                                      keep_prob: 1.0})[0]
Update - Another try at the Training code
Here's another crack at the training code, trying to follow a suggestion from @jie-zhou. This time the line optimizer = tf.get_collection("optimization")[0] gives me IndexError: list index out of range. That line only works when it comes after sess.run(tf.global_variables_initializer()), so I'm not seeing what I'm supposed to initialize.
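For what it's worth, here is my current understanding of when tf.get_collection comes back empty, as a toy sketch with a made-up collection name: it only returns things that were explicitly added to that collection in whatever graph is the current default, so an unknown key, or the wrong default graph, yields an empty list, and indexing [0] into it raises exactly this IndexError.

import tensorflow as tf

g = tf.Graph()
with g.as_default():
    opt = tf.train.AdamOptimizer(0.01)
    tf.add_to_collection('optimization', opt)        # present only because we add it here

with g.as_default():
    print(len(tf.get_collection('optimization')))    # 1
    print(len(tf.get_collection('no_such_key')))     # 0, so [0] would raise IndexError

print(len(tf.get_collection('optimization')))        # 0: outside g, a different default graph

The full second attempt follows.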
import time

# Split data to training and validation sets
train_source = source_letter_ids[batch_size:]
train_target = target_letter_ids[batch_size:]
valid_source = source_letter_ids[:batch_size]
valid_target = target_letter_ids[:batch_size]
(valid_targets_batch, valid_sources_batch, valid_targets_lengths, valid_sources_lengths) = next(
    get_batches(valid_target, valid_source, batch_size,
                source_letter_to_int['<PAD>'],
                target_letter_to_int['<PAD>']))

if len(source_sentences) > 10000:
    display_step = 100  # Check training loss after each of this many batches with large data
else:
    display_step = 20   # Check training loss after each of this many batches with small data

loader = tf.train.import_meta_graph(checkpoint + '.meta')
loaded_graph = tf.get_default_graph()

input_data = loaded_graph.get_tensor_by_name('input:0')
targets = loaded_graph.get_tensor_by_name('targets:0')
lr = loaded_graph.get_tensor_by_name('learning_rate:0')
source_sequence_length = loaded_graph.get_tensor_by_name('source_sequence_length:0')
target_sequence_length = loaded_graph.get_tensor_by_name('target_sequence_length:0')
keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')

with tf.Session(graph=train_graph) as sess:
    start = time.time()
    sess.run(tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()))
    loader.restore(sess, checkpoint)
    optimizer = tf.get_collection("optimization")[0]
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)

    for epoch_i in range(1, epochs + 1):
        for batch_i, (targets_batch, sources_batch, targets_lengths, sources_lengths) in enumerate(
                get_batches(train_target, train_source, batch_size,
                            source_letter_to_int['<PAD>'],
                            target_letter_to_int['<PAD>'])):

            # Training step
            _, loss = sess.run(
                [train_op, cost],
                {input_data: sources_batch,
                 targets: targets_batch,
                 lr: learning_rate,
                 target_sequence_length: targets_lengths,
                 source_sequence_length: sources_lengths,
                 keep_prob: keep_probability})

            # Debug message updating us on the status of the training
            if batch_i % display_step == 0 and batch_i > 0:
                # Calculate validation cost
                validation_loss = sess.run(
                    [cost],
                    {input_data: valid_sources_batch,
                     targets: valid_targets_batch,
                     lr: learning_rate,
                     target_sequence_length: valid_targets_lengths,
                     source_sequence_length: valid_sources_lengths,
                     keep_prob: 1.0})

                print('Epoch {:>3}/{} Batch {:>6}/{} Inputs (000) {:>7} - Loss: {:>6.3f} - Validation loss: {:>6.3f}'
                      .format(epoch_i, epochs, batch_i, len(train_source) // batch_size,
                              (((epoch_i - 1) * len(train_source)) + batch_i * batch_size) // 1000,
                              loss, validation_loss[0]))

    # Save model
    saver = tf.train.Saver()
    saver.save(sess, checkpoint)

    # Print time spent training the model
    end = time.time()
    seconds = end - start
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    print('Model Trained in {}h:{}m:{}s and Saved'.format(int(h), int(m), int(s)))
Update 2 - Yet another try at the Training code
Trying to follow this model more closely, I've added code to check for the existence of a graph and to do different things depending on whether I'm loading an existing graph or starting fresh. I also built it similarly to the Prediction code, which I know works. One important difference is that, unlike during Prediction, I need to load the optimizer for training.
It runs fine with a brand new graph, but it is still unable to load an existing graph. I am still getting IndexError: list index out of range at optimizer = tf.get_collection("optimization")[0].
I've cut out some bits of code that appear above in order to focus on the essentials.
# Test to see if graph already exists
if os.path.exists(checkpoint + ".meta"):
    print("Reloading existing graph to continue training.")
    brand_new = False
    train_graph = tf.Graph()
    # saver = tf.train.import_meta_graph(checkpoint + '.meta')
    # train_graph = tf.get_default_graph()
else:
    print("Starting with new graph.")
    brand_new = True
    with train_graph.as_default():
        saver = tf.train.Saver()

with tf.Session(graph=train_graph) as sess:
    start = time.time()
    if brand_new:
        sess.run(tf.global_variables_initializer())
    else:
        # sess.run(tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()))
        saver = tf.train.import_meta_graph(checkpoint + '.meta')
        saver.restore(sess, checkpoint)

        # Restore variables
        input_data = train_graph.get_tensor_by_name('input:0')
        targets = train_graph.get_tensor_by_name('targets:0')
        lr = train_graph.get_tensor_by_name('learning_rate:0')
        source_sequence_length = train_graph.get_tensor_by_name('source_sequence_length:0')
        target_sequence_length = train_graph.get_tensor_by_name('target_sequence_length:0')
        keep_prob = train_graph.get_tensor_by_name('keep_prob:0')

        # Load the optimizer
        # Commenting out this block gives 'ValueError: Operation name: "optimization/Adam"'
        # Leaving it gives 'IndexError: list index out of range' at 'optimizer = tf.get_collection("optimizer")[0]'
        optimizer = tf.get_collection("optimizer")[0]
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)

    for epoch_i in range(1, epochs + 1):
        for batch_i, (targets_batch, sources_batch, targets_lengths, sources_lengths) in enumerate(
                get_batches(train_target, train_source, batch_size,
                            source_letter_to_int['<PAD>'],
                            target_letter_to_int['<PAD>'])):

            # Training step
            _, loss = sess.run(...)

            # Debug message updating us on the status of the training
            if batch_i % display_step == 0 and batch_i > 0:
                # Calculate validation cost and output update to training

    # Save model
    # saver = tf.train.Saver()
    saver.save(sess, checkpoint)