My computer has a GeForce 940MX GPU installed, with a memory bandwidth of 16.02 GB/s. I'm trying to train a U-Net model on the LUNA dataset using the following code.

from __future__ import print_function

import numpy as np
from keras.models import Model
from keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D
from keras.layers import concatenate
from keras.optimizers import Adam
from keras.optimizers import SGD
from keras.callbacks import ModelCheckpoint, LearningRateScheduler
from keras import backend as K


K.set_image_dim_ordering('th')  # Theano dimension ordering in this code

img_rows = 512
img_cols = 512

smooth = 1.


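# Soft Dice coefficient over the flattened masks; `smooth` keeps the
# ratio (and its gradient) defined when both masks are all zeros.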
def dice_coef(y_true, y_pred):
    y_true_f = K.flatten(y_true)
    y_pred_f = K.flatten(y_pred)
    intersection = K.sum(y_true_f * y_pred_f)
    return (2. * intersection + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + smooth)

def dice_coef_np(y_true, y_pred):
    y_true_f = y_true.flatten()
    y_pred_f = y_pred.flatten()
    intersection = np.sum(y_true_f * y_pred_f)
    return (2. * intersection + smooth) / (np.sum(y_true_f) + np.sum(y_pred_f) + smooth)

def dice_coef_loss(y_true, y_pred):
    return -dice_coef(y_true, y_pred)


def get_unet():
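    # Channels-first U-Net: four 2x2 max-pool downsampling steps, a
    # bottleneck, then four upsampling steps whose outputs are concatenated
    # with the matching encoder features along the channel axis (axis=1).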
    inputs = Input((1, img_rows, img_cols))
    conv1 = Conv2D(32, (3, 3), activation='relu', padding='same')(inputs)
    conv1 = Conv2D(32, (3, 3), activation='relu', padding='same')(conv1)
    pool1 = MaxPooling2D(pool_size=(2, 2))(conv1)

    conv2 = Conv2D(64, (3, 3), activation='relu', padding='same')(pool1)
    conv2 = Conv2D(64, (3, 3), activation='relu', padding='same')(conv2)
    pool2 = MaxPooling2D(pool_size=(2, 2))(conv2)

    conv3 = Conv2D(128, (3, 3), activation='relu', padding='same')(pool2)
    conv3 = Conv2D(128, (3, 3), activation='relu', padding='same')(conv3)
    pool3 = MaxPooling2D(pool_size=(2, 2))(conv3)

    conv4 = Conv2D(256, (3, 3), activation='relu', padding='same')(pool3)
    conv4 = Conv2D(256, (3, 3), activation='relu', padding='same')(conv4)
    pool4 = MaxPooling2D(pool_size=(2, 2))(conv4)

    conv5 = Conv2D(512, (3, 3), activation='relu', padding='same')(pool4)
    conv5 = Conv2D(512, (3, 3), activation='relu', padding='same')(conv5)

    #up6 = merge([UpSampling2D(size=(2, 2))(conv5), conv4], mode='concat', concat_axis=1)
    up6 = concatenate([UpSampling2D(size=(2, 2))(conv5), conv4], axis=1)
    conv6 = Conv2D(256, (3, 3), activation='relu', padding='same')(up6)
    conv6 = Conv2D(256, (3, 3), activation='relu', padding='same')(conv6)

    #up7 = merge([UpSampling2D(size=(2, 2))(conv6), conv3], mode='concat', concat_axis=1)
    up7 = concatenate([UpSampling2D(size=(2, 2))(conv6), conv3], axis=1)
    conv7 = Conv2D(128, (3, 3), activation='relu', padding='same')(up7)
    conv7 = Conv2D(128, (3, 3), activation='relu', padding='same')(conv7)

    #up8 = merge([UpSampling2D(size=(2, 2))(conv7), conv2], mode='concat', concat_axis=1)
    up8 = concatenate([UpSampling2D(size=(2, 2))(conv7), conv2], axis=1)
    conv8 = Conv2D(64, (3, 3), activation='relu', padding='same')(up8)
    conv8 = Conv2D(64, (3, 3), activation='relu', padding='same')(conv8)

    #up9 = merge([UpSampling2D(size=(2, 2))(conv8), conv1], mode='concat', concat_axis=1)
    up9 = concatenate([UpSampling2D(size=(2, 2))(conv8), conv1], axis=1)
    conv9 = Conv2D(32, (3, 3), activation='relu', padding='same')(up9)
    conv9 = Conv2D(32, (3, 3), activation='relu', padding='same')(conv9)

    conv10 = Conv2D(1, (1, 1), activation='sigmoid')(conv9)

    model = Model(inputs=inputs, outputs=conv10)

    model.compile(optimizer=Adam(lr=1.0e-5), loss=dice_coef_loss, metrics=[dice_coef])

    return model


def train_and_predict(use_existing):
    print('-'*30)
    print('Loading and preprocessing train data...')
    print('-'*30)
    data_path = "C:/Users/hirplk/Desktop/unet/Luna2016-Lung-Nodule-Detection-master_new/DATA_PROCESS/scratch/cse/dual/cs5130287/Luna2016/output_final/"
    imgs_train = np.load(data_path + "trainImages.npy").astype(np.float32)
    imgs_mask_train = np.load(data_path + "trainMasks.npy").astype(np.float32)

    imgs_test = np.load(data_path + "testImages.npy").astype(np.float32)
    imgs_mask_test_true = np.load(data_path + "testMasks.npy").astype(np.float32)

    mean = np.mean(imgs_train)  # mean for data centering
    std = np.std(imgs_train)  # std for data normalization

    imgs_train -= mean  # images should already be standardized, but just in case
    imgs_train /= std

    print('-'*30)
    print('Creating and compiling model...')
    print('-'*30)
    model = get_unet()
    # Saving weights to unet.hdf5 at checkpoints
    model_checkpoint = ModelCheckpoint('unet.hdf5', monitor='loss', save_best_only=True)
    #
    # Should we load existing weights? 
    # Set argument for call to train_and_predict to true at end of script
    if use_existing:
        model.load_weights('./unet.hdf5')

    # 
    # The final results for this tutorial were produced using a multi-GPU
    # machine using TitanX's.
    # For a home GPU computation benchmark, on my home set up with a GTX970 
    # I was able to run 20 epochs with a training set size of 320 and 
    # batch size of 2 in about an hour. I started getting reasonable masks
    # after about 3 hours of training. 
    #
    print('-'*30)
    print('Fitting model...')
    print('-'*30)
    model.fit(imgs_train, imgs_mask_train, batch_size=50, epochs=10, verbose=1, shuffle=True,
              callbacks=[model_checkpoint])

    # loading best weights from training session
    print('-'*30)
    print('Loading saved weights...')
    print('-'*30)
    model.load_weights('./unet.hdf5')

    print('-'*30)
    print('Predicting masks on test data...')
    print('-'*30)
    num_test = len(imgs_test)
    imgs_mask_test = np.ndarray([num_test, 1, 512, 512], dtype=np.float32)
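    # Predict one test image at a time (effective batch size of 1).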
    for i in range(num_test):
        imgs_mask_test[i] = model.predict([imgs_test[i:i+1]], verbose=0)[0]
    np.save('masksTestPredicted.npy', imgs_mask_test)
    mean = 0.0
    for i in range(num_test):
        mean += dice_coef_np(imgs_mask_test_true[i, 0], imgs_mask_test[i, 0])
    mean /= num_test
    print("Mean Dice Coeff : ", mean)

if __name__ == '__main__':
    train_and_predict(False)

But when I run it on the GPU, I get the following error:

Warning (from warnings module):
  File "C:\Research\Python_installation\lib\site-packages\h5py\__init__.py", line 36
    from ._conv import register_converters as _register_converters
FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
Using TensorFlow backend.
------------------------------
Loading and preprocessing train data...
------------------------------
------------------------------
Creating and compiling model...
------------------------------
------------------------------
Fitting model...
------------------------------
Epoch 1/10
Traceback (most recent call last):
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\client\session.py", line 1327, in _do_call
    return fn(*args)
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\client\session.py", line 1306, in _run_fn
    status, run_metadata)
  File "C:\Research\Python_installation\lib\contextlib.py", line 66, in __exit__
    next(self.gen)
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\framework\errors_impl.py", line 466, in raise_exception_on_not_ok_status
    pywrap_tensorflow.TF_GetCode(status))
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[50,32,512,512]
     [[Node: conv2d_1/convolution = Conv2D[T=DT_FLOAT, data_format="NCHW", padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/gpu:0"](_arg_input_1_0_2/_261, conv2d_1/kernel/read)]]
     [[Node: loss/mul/_273 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_3022_loss/mul", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\hirplk\Desktop\unet\DSB3Tutorial-master\tutorial_code\LUNA_train_unet.py", line 150, in <module>
    train_and_predict(False)
  File "C:\Users\hirplk\Desktop\unet\DSB3Tutorial-master\tutorial_code\LUNA_train_unet.py", line 127, in train_and_predict
    callbacks=[model_checkpoint])
  File "C:\Research\Python_installation\lib\site-packages\keras\engine\training.py", line 1657, in fit
    validation_steps=validation_steps)
  File "C:\Research\Python_installation\lib\site-packages\keras\engine\training.py", line 1213, in _fit_loop
    outs = f(ins_batch)
  File "C:\Research\Python_installation\lib\site-packages\keras\backend\tensorflow_backend.py", line 2357, in __call__
    **self.session_kwargs)
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\client\session.py", line 895, in run
    run_metadata_ptr)
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\client\session.py", line 1124, in _run
    feed_dict_tensor, options, run_metadata)
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\client\session.py", line 1321, in _do_run
    options, run_metadata)
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\client\session.py", line 1340, in _do_call
    raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[50,32,512,512]
     [[Node: conv2d_1/convolution = Conv2D[T=DT_FLOAT, data_format="NCHW", padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/gpu:0"](_arg_input_1_0_2/_261, conv2d_1/kernel/read)]]
     [[Node: loss/mul/_273 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_3022_loss/mul", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]

Caused by op 'conv2d_1/convolution', defined at:
  File "<string>", line 1, in <module>
  File "C:\Research\Python_installation\lib\idlelib\run.py", line 124, in main
    ret = method(*args, **kwargs)
  File "C:\Research\Python_installation\lib\idlelib\run.py", line 351, in runcode
    exec(code, self.locals)
  File "C:\Users\hirplk\Desktop\unet\DSB3Tutorial-master\tutorial_code\LUNA_train_unet.py", line 150, in <module>
    train_and_predict(False)
  File "C:\Users\hirplk\Desktop\unet\DSB3Tutorial-master\tutorial_code\LUNA_train_unet.py", line 106, in train_and_predict
    model = get_unet()
  File "C:\Users\hirplk\Desktop\unet\DSB3Tutorial-master\tutorial_code\LUNA_train_unet.py", line 39, in get_unet
    conv1 = Conv2D(32, (3, 3), activation='relu', padding='same')(inputs)
  File "C:\Research\Python_installation\lib\site-packages\keras\engine\topology.py", line 603, in __call__
    output = self.call(inputs, **kwargs)
  File "C:\Research\Python_installation\lib\site-packages\keras\layers\convolutional.py", line 164, in call
    dilation_rate=self.dilation_rate)
  File "C:\Research\Python_installation\lib\site-packages\keras\backend\tensorflow_backend.py", line 3195, in conv2d
    data_format=tf_data_format)
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\ops\nn_ops.py", line 672, in convolution
    op=op)
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\ops\nn_ops.py", line 338, in with_space_to_batch
    return op(input, num_spatial_dims, padding)
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\ops\nn_ops.py", line 664, in op
    name=name)
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\ops\nn_ops.py", line 131, in _non_atrous_convolution
    name=name)
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\ops\gen_nn_ops.py", line 397, in conv2d
    data_format=data_format, name=name)
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 767, in apply_op
    op_def=op_def)
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\framework\ops.py", line 2630, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\framework\ops.py", line 1204, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[50,32,512,512]
     [[Node: conv2d_1/convolution = Conv2D[T=DT_FLOAT, data_format="NCHW", padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/gpu:0"](_arg_input_1_0_2/_261, conv2d_1/kernel/read)]]
     [[Node: loss/mul/_273 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_3022_loss/mul", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]

Can someone please explain the reason behind this ResourceExhaustedError? Is it because the GPU memory is not enough for the dataset? The same code worked fine without the GPU, but it took around 6 hours to finish one epoch.
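
For reference, a rough back-of-the-envelope check of the single tensor named in the error message (a sketch only; actual usage is much higher, since the activations of every layer are kept in memory for the backward pass):

import numpy as np

# Memory for just the one activation tensor the OOM error names:
# shape [50, 32, 512, 512] in float32 (4 bytes per element).
batch, channels, rows, cols = 50, 32, 512, 512
size_bytes = batch * channels * rows * cols * np.dtype(np.float32).itemsize
print("{:.2f} GiB".format(size_bytes / 2.0**30))  # -> 1.56 GiB

So the output of the very first convolution alone needs about 1.6 GiB of GPU memory, before counting weights, gradients, or any other layer's activations.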

  • Memory bandwidth is irrelevant, only the amount of memory in your GPU. The most likely problem is not enough GPU RAM, simple as that. – Dr. Snoopy Jan 28 '18 at 01:51
  • Hi Matias, do you have any suggestions how I can avoid this issue? – user3789200 Jan 28 '18 at 02:01
  • Get a GPU with more RAM, or don't use that GPU :) The only thing you can do is to reduce batch size. – Dr. Snoopy Jan 28 '18 at 02:01
  • Thanks, Matias. I checked the 'Total Available Graphics Memory' and it showed 8252 MB. :( Thanks for the advice. – user3789200 Jan 28 '18 at 02:21
  • Reduce the batch size then, 50 is too much. – Dr. Snoopy Jan 28 '18 at 02:22
  • At which point do I need to reduce batch_size? Is it something like this: model.fit(imgs_train, imgs_mask_train, batch_size=2, epochs=10, verbose=1, shuffle=True, callbacks=[model_checkpoint]) – user3789200 Jan 28 '18 at 02:30
  • What do you mean "at which point"? Batch size is an argument in your `model.fit()` – desertnaut Jan 28 '18 at 12:14
  • [Very similar to the question.](https://stackoverflow.com/questions/59394947/how-to-fix-resourceexhaustederror-oom-when-allocating-tensor) Try changing the parameters to reduce resource utilisation – Swetha Magesh Mar 21 '21 at 10:48
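
A minimal sketch of the change suggested in the comments above: the same model.fit() call from the script, only with the batch size reduced. The value 2 is just the figure floated in the comments, not a verified optimum; raise it until the GPU runs out of memory again, then back off.

# Same fit call as in the script, with a much smaller batch size so
# that each batch's activations fit in the 940MX's memory.
model.fit(imgs_train, imgs_mask_train, batch_size=2, epochs=10,
          verbose=1, shuffle=True, callbacks=[model_checkpoint])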
