0

I'm trying to compile and fit a model using the EpochModelCheckpoint class from this thread - I want the model to save regularly after each epoch.

But I get the following error which I absolutely don't understand:

Epoch 1/1000

Epoch 1: val_loss improved from inf to -0.86435, saving model to Models/HM0001/01
WARNING:absl:Found untraced functions such as _update_step_xla while saving (showing 1 of 1). These functions will not be directly callable after loading.
Traceback (most recent call last):
  File "/home/au/find/Ex.py", line 92, in <module>
    model = CompileFitModel (xTrain, yTrain, epochs, batchSize, optimizer, loss, activation1, activation2, verbose)
  File "/home/au/find/Ex.py", line 71, in CompileFitModel
    model.fit(xTrain, yTrain, epochs=epochs, verbose=verbose, batch_size=batchSize,validation_data=(xTrain, yTrain),
  File "/home/au/.local/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "/home/au/find/Ex.py", line 30, in on_epoch_end
    self._save_model(epoch=epoch, batch=None, logs=logs)
  File "/home/au/.local/lib/python3.10/site-packages/tensorflow/dtensor/python/d_variable.py", line 60, in __init__
    original_layout = api.fetch_layout(dvariable)
  File "/home/au/.local/lib/python3.10/site-packages/tensorflow/dtensor/python/api.py", line 353, in fetch_layout
    return _dtensor_device().fetch_layout(tensor)
  File "/home/au/.local/lib/python3.10/site-packages/tensorflow/dtensor/python/dtensor_device.py", line 312, in fetch_layout
    raise core._status_to_exception(e) from None  # pylint: disable=protected-access
tensorflow.python.framework.errors_impl.InvalidArgumentError: FetchLayout expects a tensor placed on the layout device.

Any idea? Full code follows. Source data - Meh100.npy (936K) https://github.com/velkyvont/velkyvont/blob/main/Meh100.npy

import numpy as np
import tensorflow as tf
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Sequential
from keras.layers import AlphaDropout, Dense
from keras import backend as K
from tensorflow.keras.utils import to_categorical

class EpochModelCheckpoint(tf.keras.callbacks.ModelCheckpoint):
    def __init__(self,
                 filepath,
                 frequency=1,
                 monitor='val_loss',
                 verbose=2,
                 save_best_only=True,
                 save_weights_only=False,
                 mode='auto',
                 loss='MyLoss.hdf5',
                 options=None,
                 **kwargs):
        super(EpochModelCheckpoint, self).__init__(filepath, monitor, verbose, save_best_only, save_weights_only, mode, "epoch", options)
        self.epochs_since_last_save = 0
        self.frequency = frequency

    def on_epoch_end(self, epoch, logs=None):
        self.epochs_since_last_save += 1
        # pylint: disable=protected-access
        if self.epochs_since_last_save % self.frequency == 0:
            self._save_model(epoch=epoch, batch=None, logs=logs)

    def on_train_batch_end(self, batch, logs=None):
        pass

def GetXYFromDataIncludingOdds(data):
    favOdds = data.T[6]
    dogOdds = data.T[7]
    drawOdds = data.T[8]
    odds = np.array(list(zip(favOdds, dogOdds, drawOdds)))
    y = data.T[1]
    y = np.asarray(y).astype('float32')
    y = to_categorical(y, 3)
    y = np.hstack([y ,odds])
    x = data.T[2:].T
    x = tf.keras.utils.normalize(x)
    return x, y
    
def MyLoss(yTrue, yPred):
    favWin = yTrue[:,1:2]
    dogWin = yTrue[:, 2:3]
    draw = yTrue[:, 0:1]
    favOdds = yTrue[:, 3:4]
    dogOdds = yTrue[:, 4:5]
    drawOdds = yTrue[:, 5:6]
    gainLossVector = K.concatenate([ 
        draw*drawOdds,
        favWin*favOdds,
        dogWin*dogOdds,
    ], axis=1)

    return -1 * K.mean(K.sum(gainLossVector * yPred, axis=1))
            
def CompileFitModel(xTrain, yTrain, epochs, batchSize, optimizer, loss, activation1, activation2, verbose):
    model = Sequential()
    model.add(AlphaDropout(2000, input_dim=1191))
    model.add(Dense(1000, activation=activation1))  
    model.add(AlphaDropout(500))
    model.add(Dense(3, activation=activation2)) 
    
    model.compile(optimizer=optimizer, loss = loss)
    model.fit(xTrain, yTrain, epochs=epochs, verbose=verbose, batch_size=batchSize,validation_data=(xTrain, yTrain),
    callbacks=[EarlyStopping(patience=100), EpochModelCheckpoint("Models/HM0001/{epoch:02d}", frequency=1)
    ])

    return model


learningRate = 0.00001
batchSize = 128
loss = MyLoss
activation1 = 'elu'
activation2 = 'softmax'
verbose = 2
epochs = 1000
batchSize = 128
optimizer = tf.keras.dtensor.experimental.optimizers.RMSprop(learning_rate=learningRate,rho=0.9,momentum=0.0, epsilon=1e-07, centered=False, gradients_clip_option=None, ema_option=None, jit_compile=True, name='RMSprop', mesh=None)

numpyFilename = "Meh100.npy"
data = np.load(numpyFilename, allow_pickle=True)
xTrain, yTrain = GetXYFromDataIncludingOdds (data)

model = CompileFitModel (xTrain, yTrain, epochs, batchSize, optimizer, loss, activation1, activation2, verbose)
pepazdepa
  • 117
  • 8
  • Seems like an issue with the optimizer, which is experimental. It works if you use `optimizer = 'adam'` – AndrzejO Sep 23 '22 at 08:27

0 Answers0