  1. The dataframe below contains image paths and the output columns (A0 to A6).
  2. The output columns indicate which category an image belongs to. There are seven different categories (A0 to A6), and an image can belong to more than one.
  3. The total image count is 30000.
  4. I want to train a model on this data using TensorFlow. I need help with a data generator for this data, as my resources are getting exhausted.
  5. Even for a batch size of 2 it gives the error 'ResourceExhaustedError'.
  6. My CPU has 13 GB of memory and my GPU has 15 GB.

DataFrame:

Image Path    A0  A1  A2  A3  A4  A5  A6
Img Path1      1   1   0   0   0   0   0
Img Path2      1   1   0   0   0   0   0
Img Path3      0   1   1   0   0   0   0
...
Img Pathn      0   0   0   0   0   0   1

My Code for model building:

def data_generator():
    for i, study_instance in enumerate(meta_seg.StudyInstanceUID.unique()):
        for dcm in os.listdir(DATA_DIR + f"/train_images/{study_instance}"):
            train_labels = []
            path = DATA_DIR + f"/train_images/{study_instance}/{dcm}"
            #print(path)
            img = load_dicom(path)
            img = np.resize(img, (512, 512))
            # normalize image
            img = img / 255.0
            img = tf.expand_dims(img, axis=-1)
            img = tf.image.grayscale_to_rgb(img)
            train_labels.extend([
                meta_seg.loc[i, "A0"],
                meta_seg.loc[i, "A1"],
                meta_seg.loc[i, "A2"],
                meta_seg.loc[i, "A3"],
                meta_seg.loc[i, "A4"],
                meta_seg.loc[i, "A5"],
                meta_seg.loc[i, "A6"]])
            yield img, train_labels

train_data = tf.data.Dataset.from_generator(data_generator, (tf.float32, tf.int8))

def configure_for_performance(data):
    data = data.cache()
    data = data.batch(2)
    data = data.prefetch(buffer_size=tf.data.AUTOTUNE)
    return data

train_data = configure_for_performance(train_data)
val_data = configure_for_performance(val_data)

def cnn_model():
    model = Sequential()
    # Layer 1...
    # Layer 2...
  • Have you tried without the cache `data = data.cache()` or use a file for caching? – AndrzejO Oct 06 '22 at 06:04
  • @AndrzejO tried without `data = data.cache()` but same error "ResourceExhaustedError: OOM when allocating tensor with shape[4032064,512] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:RandomUniform]" – Murphy Oct 06 '22 at 06:09
  • Have you tried the solutions here https://stackoverflow.com/questions/34199233/how-to-prevent-tensorflow-from-allocating-the-totality-of-a-gpu-memory – Djinn Oct 06 '22 at 06:33
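For reference, the two workarounds raised in the comments look roughly like this (a minimal sketch; the cache file path is only an illustration):

import tensorflow as tf

# Option discussed by AndrzejO: cache the pipeline to a file on disk instead
# of holding it in host memory (tf.data caches in RAM when no filename is given).
# data = data.cache("/tmp/train_cache")  # illustrative path

# Option from the linked question: let TensorFlow grow GPU memory on demand
# instead of pre-allocating the whole card.
for gpu in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)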

1 Answer


I have some sample code that may help with the dataset working-memory problem. In it the label is a single number; you can replace it with your data [A0, A1, A2, A3, A4, A5, A6], and you should change the loss and optimizer functions to match.
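For the seven columns A0 to A6, and since an image can carry several labels at once, the usual fit is a seven-unit sigmoid output trained with binary cross-entropy. A minimal sketch (the convolutional backbone and the learning rate are only placeholders, not part of the original post):

import tensorflow as tf

# Multi-label head: one independent sigmoid unit per column A0..A6.
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(512, 512, 3)),
    tf.keras.layers.Conv2D(16, 3, activation='relu'),    # placeholder backbone
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(7, activation='sigmoid'),      # A0..A6
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss=tf.keras.losses.BinaryCrossentropy(),           # multi-label, not categorical
    metrics=[tf.keras.metrics.BinaryAccuracy()],
)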

This is a sample I use with the "Street Fighter" game, with discrete outputs:

dataset = tf.data.Dataset.from_tensor_slices((
    tf.constant(np.reshape(output_picture[np.argmax(result)], (1, 1, 1, 60, 78, 3)), dtype=tf.float32),
    tf.constant(np.reshape(action, (1, 1, 2, 3, 2, 1)))))

Sample code using a database buffer:

import os
from os.path import exists

import tensorflow as tf
import h5py

"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
None
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
physical_devices = tf.config.experimental.list_physical_devices('GPU')
assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
config = tf.config.experimental.set_memory_growth(physical_devices[0], True)
print(physical_devices)
print(config)

"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Variables
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
filters = 32
kernel_size = (3, 3)
strides = 1

database_buffer = "F:\\models\\buffer\\" + os.path.basename(__file__).split('.')[0] + "\\TF_DataSets_01.h5"
database_buffer_dir = os.path.dirname(database_buffer)

checkpoint_path = "F:\\models\\checkpoint\\" + os.path.basename(__file__).split('.')[0] + "\\TF_DataSets_01.h5"
checkpoint_dir = os.path.dirname(checkpoint_path)

if not exists(checkpoint_dir) : 
    os.mkdir(checkpoint_dir)
    print("Create directory: " + checkpoint_dir)
    
if not exists(database_buffer_dir) : 
    os.mkdir(database_buffer_dir)
    print("Create directory: " + database_buffer_dir)
    
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
Functions
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""   
def conv_batchnorm_relu(filters, kernel_size, strides=1):
    
    model = tf.keras.models.Sequential([
        tf.keras.layers.InputLayer(input_shape=( 32, 32, 3 )),
        tf.keras.layers.Conv2D(filters=filters, kernel_size=kernel_size, strides=strides, padding = 'same'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.ReLU(),
    ])
        
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(64))
    model.add(tf.keras.layers.Dense(10))
    model.summary()
    
    return model

"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: DataSet
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.cifar10.load_data()
# Create hdf5 file
hdf5_file = h5py.File(database_buffer, mode='w')

# Train images
hdf5_file['x_train'] = train_images
hdf5_file['y_train'] = train_labels

# Test images
hdf5_file['x_test'] = test_images
hdf5_file['y_test'] = test_labels

hdf5_file.close()

# Read training and test samples back from the buffer
hdf5_file = h5py.File(database_buffer,  mode='r')

x_train = hdf5_file['x_train'][0: 10000]
x_test = hdf5_file['x_test'][0: 100]
y_train = hdf5_file['y_train'][0: 10000]
y_test = hdf5_file['y_test'][0: 100]

"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Optimizer
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
optimizer = tf.keras.optimizers.Nadam( learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, name='Nadam' )

"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Loss Fn
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""                               
lossfn = tf.keras.losses.MeanSquaredLogarithmicError(reduction=tf.keras.losses.Reduction.AUTO, name='mean_squared_logarithmic_error')

"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Model Summary
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
model = conv_batchnorm_relu(filters, kernel_size, strides=1)
model.compile(optimizer=optimizer, loss=lossfn, metrics=['accuracy'])

"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: FileWriter
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
if exists(checkpoint_path) :
    model.load_weights(checkpoint_path)
    print("model load: " + checkpoint_path)
    input("Press Any Key!")

"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Training
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
history = model.fit(x_train, y_train, epochs=1 ,validation_data=(x_train, y_train))
model.save_weights(checkpoint_path)

input('...')

It is playable with the famous retro game; I like his hurricane kick actions. Sample
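To carry the buffer idea over to the original 30000-image problem, the HDF5 file can also be read back lazily through a generator instead of slicing everything into memory. A sketch under the assumption that the images were written to the buffer as 512x512x3 float arrays and the labels as 7-element rows, reusing the database_buffer path from the code above:

import h5py
import tensorflow as tf

def hdf5_generator():
    # Yield one sample at a time from the buffer file, so only the current
    # sample (plus the prefetched batch) lives in memory.
    with h5py.File(database_buffer, 'r') as f:
        for i in range(f['x_train'].shape[0]):
            yield f['x_train'][i], f['y_train'][i]

train_data = tf.data.Dataset.from_generator(
    hdf5_generator,
    output_signature=(
        tf.TensorSpec(shape=(512, 512, 3), dtype=tf.float32),
        tf.TensorSpec(shape=(7,), dtype=tf.int8),
    ),
)
train_data = train_data.batch(2).prefetch(tf.data.AUTOTUNE)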
