I'm going through some tutorials using the Keras functional API in TensorFlow 2, and I'm having some trouble including BatchNormalization layers when using the functional API.
Using roughly the same code:
- This network trains with the sequential API and batch normalization
- This network trains with the functional API, but commenting out the batch normalization layers
- This network does not train using the functional API and batch normalization layers
Am I missing a step somewhere? Do I need to set training=True or training=False somewhere in the code?
Working Sequential Code:
#subclassed layers in keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import SeparableConv2D
from tensorflow.keras.layers import BatchNormalization
import numpy as np
import logging
tf.get_logger().setLevel(logging.ERROR)
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
import matplotlib.pyplot as plt
%matplotlib inline
cifar_dataset = keras.datasets.cifar10
(train_images, train_labels), (test_images,
test_labels) = cifar_dataset.load_data()
EPOCHS = 128
BATCH_SIZE = 128
#standardize dataset
mean = np.mean(train_images)
stdev = np.std(train_images)
train_images = (train_images - mean)/stdev
test_images = (test_images - mean)/stdev
#change labels to one-hot
train_labels = to_categorical(train_labels, num_classes=10)
test_labels = to_categorical(test_labels, num_classes=10)
# Keras model subclassing: build your own layers
#CNN -> batch norm -> Relu
#create a class for this kind of block
class CNNBlock(layers.Layer): #inherits from layers.Layer - keeps track of what we need for backpropagation
    def __init__(self, out_channels, kernel_size=3, strides=(1,1)): #needs both __init__ and call methods
        super(CNNBlock, self).__init__() #initialize the parent layers.Layer class
        self.conv = layers.Conv2D(out_channels, kernel_size, strides=strides, padding='same',
                                  kernel_initializer='he_normal', bias_initializer='zeros') #initialize the conv portion of this block
        self.bn = layers.BatchNormalization() #initialize batch normalization in this block
    def call(self, input_tensor, training=False): #the forward pass (like forward in PyTorch); the training bool switches between training/evaluation behavior
        #take the input tensor and run it through the layers initialized in __init__
        x = self.conv(input_tensor) #run the convolution
        x = self.bn(x, training=training) #batch norm
        x = tf.nn.relu(x) #activation function for this block
        return x
class CNNBlock_init(layers.Layer): #same block, but takes an explicit input size for the first layer
    def __init__(self, out_channels, input_size, kernel_size=3):
        super(CNNBlock_init, self).__init__() #initialize the parent layers.Layer class - make sure the class name matches
        self.input_size = input_size
        self.conv = layers.Conv2D(out_channels, kernel_size,
                                  input_shape=input_size, #first layer needs the input shape to build properly
                                  padding='same') #initialize the conv portion of this block
        self.bn = layers.BatchNormalization() #initialize batch normalization in this block
    def call(self, input_tensor, training=False): #the forward pass; the training bool switches between training/evaluation behavior
        x = self.conv(input_tensor) #run the convolution (input_shape is a constructor argument, not a call argument)
        x = self.bn(x, training=training) #batch norm
        x = tf.nn.relu(x) #activation function for this block
        return x
#build the model with these blocks
model = keras.Sequential(
    [
        CNNBlock(64, kernel_size=4, strides=(2,2)),
        Dropout(0.2),
        CNNBlock(64, kernel_size=2, strides=(2,2)),
        Dropout(0.2),
        CNNBlock(32),
        Dropout(0.2),
        CNNBlock(32),
        MaxPooling2D(pool_size=(2,2), strides=2),
        Dropout(0.2),
        Flatten(),
        Dense(64, activation='relu', #dense layers to combine features
              kernel_initializer='he_normal',
              bias_initializer='zeros'),
        Dropout(0.2),
        Dense(10, activation='softmax', #softmax for classification
              kernel_initializer='glorot_uniform',
              bias_initializer='zeros')
    ])
#compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#model.build(input_shape=(32,32,3))
#model.summary()
#train model
history = model.fit(
    train_images, train_labels,
    validation_data=(test_images, test_labels),
    epochs=EPOCHS, batch_size=BATCH_SIZE,
    verbose=1, shuffle=True) #verbose=1 prints the time taken for each epoch
#evaluate model
import matplotlib.pyplot as plt
%matplotlib inline
def plot_error(history):
    history_dict_vals = history.history #Keras exposes the metrics dict as history.history
    history_x = history.epoch
    plt.plot(history_x, history_dict_vals['accuracy'], 'r-', label='training accuracy')
    plt.plot(history_x, history_dict_vals['val_accuracy'], 'g-', label='test accuracy')
    plt.axis([0, len(history_x), 0.0, 1])
    plt.xlabel('training epochs')
    plt.ylabel('accuracy')
    plt.legend()
    plt.show()
    print(f"Final test accuracy = {history_dict_vals['val_accuracy'][-1]}")
plot_error(history)
Working Functional Code:
# same convolutional structure but with the keras functional API
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import SeparableConv2D
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
import numpy as np
import logging
tf.get_logger().setLevel(logging.ERROR)
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
import matplotlib.pyplot as plt
%matplotlib inline
cifar_dataset = keras.datasets.cifar10
(train_images, train_labels), (test_images,
test_labels) = cifar_dataset.load_data()
EPOCHS = 128
BATCH_SIZE = 128
#standardize dataset
mean = np.mean(train_images)
stdev = np.std(train_images)
train_images = (train_images - mean)/stdev
test_images = (test_images - mean)/stdev
#change labels to one-hot
train_labels = to_categorical(train_labels, num_classes=10)
test_labels = to_categorical(test_labels, num_classes=10)
# Keras model subclassing: build your own layers
#CNN -> batch norm -> Relu
#create a class for this kind of block
class CNNBlock(layers.Layer): #inherits from layers.Layer - keeps track of what we need for backpropagation
    def __init__(self, out_channels, kernel_size=3, strides=(1,1)): #needs both __init__ and call methods
        super(CNNBlock, self).__init__() #initialize the parent layers.Layer class
        self.conv = layers.Conv2D(out_channels, kernel_size, strides=strides, padding='same',
                                  kernel_initializer='he_normal', bias_initializer='zeros') #initialize the conv portion of this block
        #self.bn = layers.BatchNormalization() #initialize batch normalization in this block
    def call(self, input_tensor, training=False): #the forward pass; the training bool switches between training/evaluation behavior
        x = self.conv(input_tensor) #run the convolution
        #x = self.bn(x, training=training) #batch norm
        x = tf.nn.relu(x) #activation function for this block
        return x
class CNNBlock_init(layers.Layer): #same block, but takes an explicit input size for the first layer
    def __init__(self, out_channels, input_size, kernel_size=3):
        super(CNNBlock_init, self).__init__() #initialize the parent layers.Layer class - make sure the class name matches
        self.input_size = input_size
        self.conv = layers.Conv2D(out_channels, kernel_size,
                                  input_shape=input_size, #first layer needs the input shape to build properly
                                  padding='same') #initialize the conv portion of this block
        #self.bn = layers.BatchNormalization() #initialize batch normalization in this block
    def call(self, input_tensor, training=False): #the forward pass; the training bool switches between training/evaluation behavior
        x = self.conv(input_tensor) #run the convolution (input_shape is a constructor argument, not a call argument)
        #x = self.bn(x, training=training) #batch norm
        x = tf.nn.relu(x) #activation function for this block
        return x
#build the model with the Keras functional API
input_shape = (32,32,3)
chanDim = -1
#define model with first inputs
inputs = Input(shape=input_shape)
#functional API passing layers through
x = CNNBlock(64,kernel_size=4,strides=(2,2))(inputs)
x = Dropout(0.2)(x)
x = CNNBlock(64,kernel_size=2,strides=(2,2))(x)
x = Dropout(0.2)(x)
x = CNNBlock(64)(x)
x = MaxPooling2D(pool_size=(2,2), strides=2)(x)
x = Dropout(0.2)(x)
x = Flatten()(x)
x = Dense(64, activation='relu', #dense layers to combine features
          kernel_initializer='he_normal',
          bias_initializer='zeros')(x)
x = Dropout(0.2)(x)
y = Dense(10, activation='softmax', #softmax for classification
          kernel_initializer='glorot_uniform',
          bias_initializer='zeros')(x)
#initialize model with inputs and outputs
model = Model(inputs, y, name='convnet_func')
#compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
#train model
history = model.fit(
    train_images, train_labels,
    validation_data=(test_images, test_labels),
    epochs=EPOCHS, batch_size=BATCH_SIZE,
    verbose=1, shuffle=True) #verbose=1 prints the time taken for each epoch
#evaluate model
import matplotlib.pyplot as plt
%matplotlib inline
def plot_error(history):
    history_dict_vals = history.history #Keras exposes the metrics dict as history.history
    history_x = history.epoch
    plt.plot(history_x, history_dict_vals['accuracy'], 'r-', label='training accuracy')
    plt.plot(history_x, history_dict_vals['val_accuracy'], 'g-', label='test accuracy')
    plt.axis([0, len(history_x), 0.0, 1])
    plt.xlabel('training epochs')
    plt.ylabel('accuracy')
    plt.legend()
    plt.show()
    print(f"Final test accuracy = {history_dict_vals['val_accuracy'][-1]}")
plot_error(history)
Unfortunately, the model does not train when I uncomment the batch normalization layers.