  1. The dataframe below contains image paths and the output columns (A0 to A6).
  2. The output columns indicate which category an image belongs to. There are seven different categories (A0 to A6), and an image can belong to more than one.
  3. The total image count is 30000.
  4. I want to train a model on this data using TensorFlow. I need help with a data generator for this data, as my resources are getting exhausted.
  5. Even for a batch size of 2 it gives the error 'ResourceExhaustedError'.
  6. My CPU has 13 GB of memory and my GPU has 15 GB.

DataFrame:

Image Path    A0  A1  A2  A3  A4  A5  A6
Img Path1      1   1   0   0   0   0   0
Img Path2      1   1   0   0   0   0   0
Img Path3      0   1   1   0   0   0   0
...
Img Pathn      0   0   0   0   0   0   1

My Code for model building:

def data_generator():
    for i, study_instance in enumerate(meta_seg.StudyInstanceUID.unique()):
        for dcm in os.listdir(DATA_DIR + f"/train_images/{study_instance}"):
            train_labels = []
            path = DATA_DIR + f"/train_images/{study_instance}/{dcm}"
            #print(path)
            img = load_dicom(path)
            img = np.resize(img, (512, 512))
            # normalize image
            img = img / 255.0
            img = tf.expand_dims(img, axis=-1)
            img = tf.image.grayscale_to_rgb(img)
            train_labels.extend([
                meta_seg.loc[i, "A0"],
                meta_seg.loc[i, "A1"],
                meta_seg.loc[i, "A2"],
                meta_seg.loc[i, "A3"],
                meta_seg.loc[i, "A4"],
                meta_seg.loc[i, "A5"],
                meta_seg.loc[i, "A6"]])
            yield img, train_labels

train_data = tf.data.Dataset.from_generator(data_generator, (tf.float32, tf.int8))

def configure_for_performance(data):
    data = data.cache()
    data = data.batch(2)
    data = data.prefetch(buffer_size=tf.data.AUTOTUNE)
    return data

train_data = configure_for_performance(train_data)
val_data = configure_for_performance(val_data)

def cnn_model():
    model = Sequential()
    # Layer 1...
    # Layer 2...
  • Have you tried without the cache `data = data.cache()` or use a file for caching? – AndrzejO Oct 06 '22 at 06:04
  • @AndrzejO tried without `data = data.cache()` but same error "ResourceExhaustedError: OOM when allocating tensor with shape[4032064,512] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:RandomUniform]" – Murphy Oct 06 '22 at 06:09
  • Have you tried the solutions here https://stackoverflow.com/questions/34199233/how-to-prevent-tensorflow-from-allocating-the-totality-of-a-gpu-memory – Djinn Oct 06 '22 at 06:33
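For reference, the two workarounds raised in the comments look roughly like this (a minimal sketch; the cache file path is only an illustration):

import tensorflow as tf

# Option discussed by AndrzejO: cache the pipeline to a file on disk instead
# of holding it in host memory (tf.data caches in RAM when no filename is given).
# data = data.cache("/tmp/train_cache")  # illustrative path

# Option from the linked question: let TensorFlow grow GPU memory on demand
# instead of pre-allocating the whole card.
for gpu in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)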

1 Answer


I have some sample code that may help with the dataset working-memory problem. In it the label is a single number; you can replace it with your data [A0, A1, A2, A3, A4, A5, A6], and you should change the loss and optimizer functions to match.
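For the seven columns A0 to A6, and since an image can carry several labels at once, the usual fit is a seven-unit sigmoid output trained with binary cross-entropy. A minimal sketch (the convolutional backbone and the learning rate are only placeholders, not part of the original post):

import tensorflow as tf

# Multi-label head: one independent sigmoid unit per column A0..A6.
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(512, 512, 3)),
    tf.keras.layers.Conv2D(16, 3, activation='relu'),    # placeholder backbone
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(7, activation='sigmoid'),      # A0..A6
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss=tf.keras.losses.BinaryCrossentropy(),           # multi-label, not categorical
    metrics=[tf.keras.metrics.BinaryAccuracy()],
)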

This is a sample I use with the "Street Fighter" game, with discrete outputs:

dataset = tf.data.Dataset.from_tensor_slices((
    tf.constant(np.reshape(output_picture[np.argmax(result)], (1, 1, 1, 60, 78, 3)), dtype=tf.float32),
    tf.constant(np.reshape(action, (1, 1, 2, 3, 2, 1)))))

Sample code using a database buffer:

import os
from os.path import exists

import tensorflow as tf
import h5py

"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
None
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
physical_devices = tf.config.experimental.list_physical_devices('GPU')
assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
config = tf.config.experimental.set_memory_growth(physical_devices[0], True)
print(physical_devices)
print(config)

"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Variables
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
filters = 32
kernel_size = (3, 3)
strides = 1

database_buffer = "F:\\models\\buffer\\" + os.path.basename(__file__).split('.')[0] + "\\TF_DataSets_01.h5"
database_buffer_dir = os.path.dirname(database_buffer)

checkpoint_path = "F:\\models\\checkpoint\\" + os.path.basename(__file__).split('.')[0] + "\\TF_DataSets_01.h5"
checkpoint_dir = os.path.dirname(checkpoint_path)

if not exists(checkpoint_dir) : 
    os.mkdir(checkpoint_dir)
    print("Create directory: " + checkpoint_dir)
    
if not exists(database_buffer_dir) : 
    os.mkdir(database_buffer_dir)
    print("Create directory: " + database_buffer_dir)
    
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
Functions
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""   
def conv_batchnorm_relu(filters, kernel_size, strides=1):
    
    model = tf.keras.models.Sequential([
        tf.keras.layers.InputLayer(input_shape=( 32, 32, 3 )),
        tf.keras.layers.Conv2D(filters=filters, kernel_size=kernel_size, strides=strides, padding = 'same'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.ReLU(),
    ])
        
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(64))
    model.add(tf.keras.layers.Dense(10))
    model.summary()
    
    return model

"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: DataSet
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.cifar10.load_data()
# Create hdf5 file
hdf5_file = h5py.File(database_buffer, mode='w')

# Train images
hdf5_file['x_train'] = train_images
hdf5_file['y_train'] = train_labels

# Test images
hdf5_file['x_test'] = test_images
hdf5_file['y_test'] = test_labels

hdf5_file.close()

# Read training and test samples back from the buffer
hdf5_file = h5py.File(database_buffer,  mode='r')

x_train = hdf5_file['x_train'][0: 10000]
x_test = hdf5_file['x_test'][0: 100]
y_train = hdf5_file['y_train'][0: 10000]
y_test = hdf5_file['y_test'][0: 100]

"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Optimizer
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
optimizer = tf.keras.optimizers.Nadam( learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, name='Nadam' )

"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Loss Fn
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""                               
lossfn = tf.keras.losses.MeanSquaredLogarithmicError(reduction=tf.keras.losses.Reduction.AUTO, name='mean_squared_logarithmic_error')

"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Model Summary
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
model = conv_batchnorm_relu(filters, kernel_size, strides=1)
model.compile(optimizer=optimizer, loss=lossfn, metrics=['accuracy'])

"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: FileWriter
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
if exists(checkpoint_path) :
    model.load_weights(checkpoint_path)
    print("model load: " + checkpoint_path)
    input("Press Any Key!")

"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Training
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
history = model.fit(x_train, y_train, epochs=1 ,validation_data=(x_train, y_train))
model.save_weights(checkpoint_path)

input('...')

It is playable with the famous retro game; I like his hurricane kick actions. Sample
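To carry the buffer idea over to the original 30000-image problem, the HDF5 file can also be read back lazily through a generator instead of slicing everything into memory. A sketch under the assumption that the images were written to the buffer as 512x512x3 float arrays and the labels as 7-element rows, reusing the database_buffer path from the code above:

import h5py
import tensorflow as tf

def hdf5_generator():
    # Yield one sample at a time from the buffer file, so only the current
    # sample (plus the prefetched batch) lives in memory.
    with h5py.File(database_buffer, 'r') as f:
        for i in range(f['x_train'].shape[0]):
            yield f['x_train'][i], f['y_train'][i]

train_data = tf.data.Dataset.from_generator(
    hdf5_generator,
    output_signature=(
        tf.TensorSpec(shape=(512, 512, 3), dtype=tf.float32),
        tf.TensorSpec(shape=(7,), dtype=tf.int8),
    ),
)
train_data = train_data.batch(2).prefetch(tf.data.AUTOTUNE)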
