0

Hi, I was trying to train a model when I got the error message shown below. If I understand it correctly, it says I am out of VRAM, but I have an Asus GTX 1080 A8G Gaming, which should have enough VRAM. I ran this before and everything worked, but suddenly, out of nowhere, it won't work anymore. My deep net:

    # -*- coding: utf-8 -*-
"""
Created on Thu Jun 29 11:52:11 2017
@author: tobia
"""
#importing pre_processing libaries
import numpy as np
from keras.models import load_model
import os
#importing Deep Learning Libaries
from keras import layers
from keras.models import Sequential
from keras.callbacks import TensorBoard
from keras.layers import Flatten,Dense,Conv2D,MaxPooling2D,Dropout,BatchNormalization,Activation
def load_data():
    """Load all recorded chunks from disk and stack them into two arrays.

    Reads ``data/key_values/values_<i>.npy`` and ``data/video/video_<i>.npy``
    for ``i = 1 .. N``, where ``N`` is the number of directory entries in
    ``data/key_values``.

    Returns:
        tuple: ``(key_values, picture_data)``. ``key_values`` is the row-wise
        concatenation of every value chunk (8 columns). ``picture_data`` is
        the concatenation of every video chunk, reshaped to
        ``(len(key_values), 60, 80, 1)``.

    Raises:
        ValueError: If a chunk does not match the expected trailing shape, or
            if the total number of frames does not match the number of
            key-value rows (raised by ``reshape``).
    """
    # NOTE: an older, disabled loading path that read
    # data/Processed/train_data.npy was dead code and has been removed.

    # The empty seed arrays pin the expected trailing shape and make the
    # result well-defined (zero rows) when there are no chunks at all.
    key_chunks = [np.empty((0, 8), dtype='uint8')]
    frame_chunks = [np.empty((0, 60, 80), dtype='uint8')]

    chunk_count = len(os.listdir('data/key_values'))
    for index in range(1, chunk_count + 1):
        key_chunks.append(
            np.load('data/key_values/values_{0}.npy'.format(index)))
        frame_chunks.append(
            np.load('data/video/video_{}.npy'.format(index)))

    # Concatenate once at the end: calling np.append inside the loop copies
    # the whole accumulated array on every iteration.
    key_values = np.concatenate(key_chunks, axis=0)
    picture_data = np.concatenate(frame_chunks, axis=0)

    # Add the trailing single-channel axis; using len(key_values) here also
    # checks that frames and labels line up one-to-one.
    picture_data = picture_data.reshape((len(key_values), 60, 80, 1))
    return key_values, picture_data
class Network:
    """Convolutional classifier for 60x80 single-channel frames.

    The network maps each frame to a softmax over 8 output units, matching
    the 8 columns of the key-value labels it is trained on.
    """

    def __init__(self):
        # Set by start() after training, or by predict_key() when a model is
        # handed in from outside.
        self.model = None

    def model_1(self, picture_data, key_values):
        """Build and compile the network.

        Args:
            picture_data: Unused; kept so existing callers keep working.
            key_values: Unused; kept so existing callers keep working.

        Returns:
            The compiled Keras ``Sequential`` model.
        """
        model = Sequential()
        model.add(Conv2D(96, 11, input_shape=(60, 80, 1), activation="relu"))
        model.add(MaxPooling2D(pool_size=3, strides=1))
        # input_shape (60, 80, 1) puts the channel last, so the feature axis
        # is the last one. The previous axis=1 normalized along image height
        # instead of per feature map.
        model.add(BatchNormalization(axis=-1))
        model.add(Flatten())
        model.add(Dense(units=8, activation="softmax"))
        model.compile(optimizer='adam',
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])

        model.summary()
        return model

    def start(self, picture_data, key_values, batch_size=1000, epochs=10):
        """Build a fresh model and train it, logging to TensorBoard.

        Args:
            picture_data: Training frames, passed to ``model.fit`` as inputs.
            key_values: Training labels, passed to ``model.fit`` as targets.
            batch_size: Samples per gradient step. Defaults to the original
                hard-coded 1000; lower it if the GPU runs out of memory,
                since activation memory grows with the batch size.
            epochs: Number of passes over the training data.

        Returns:
            The trained model (also stored on ``self.model``), so the caller
            can save or reuse it instead of losing it when this returns.
        """
        model = self.model_1(picture_data, key_values)
        tb_callback = TensorBoard(log_dir="./logs",
                                  histogram_freq=0,
                                  write_graph=True,
                                  write_images=True)
        model.fit(picture_data,
                  key_values,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_split=0.1,
                  callbacks=[tb_callback])
        self.model = model
        return model

    def predict_key(self, live_image, model):
        """Run ``model`` on ``live_image`` and return its raw prediction.

        Args:
            live_image: Input passed straight to ``model.predict``.
            model: Object exposing ``predict(data, batch_size=...)``; it is
                also stored on ``self.model``.

        Returns:
            Whatever ``model.predict`` returns.
        """
        self.model = model
        return self.model.predict(live_image, batch_size=3)

# Entry point: either train a new model from scratch (N) or load the saved
# model and train it for one more epoch (C).
input_k = input("Start new Training press: N or to contiune learning press C")
# Normalize the answer so "n", " N " etc. are accepted; previously anything
# other than an exact "N" or "C" made the script exit without doing anything.
choice = input_k.strip().upper()
if choice == 'N':
    key_values, picture_data = load_data()
    test = Network()
    test.start(picture_data, key_values)
elif choice == 'C':
    # Resume from the checkpoint and write the updated weights back to it.
    model = load_model('Models/Modell.h5')
    visual = TensorBoard(log_dir="./logs",
                         histogram_freq=0,
                         write_graph=True,
                         write_images=True)
    key_values, picture_data = load_data()
    model.fit(picture_data,
              key_values,
              batch_size=1000,
              epochs=1,
              validation_split=0.1,
              callbacks=[visual])
    model.save("Models/Modell.h5")
else:
    # Tell the user why nothing happened instead of exiting silently.
    print("Unrecognized option {!r}; expected N or C.".format(input_k))

Error Message:

File "<ipython-input-1-73951c078cac>", line 1, in <module>
    runfile('C:/Users/tobia/Desktop/Ai_Star/ai_train.py', wdir='C:/Users/tobia/Desktop/Ai_Star')
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\spyder\utils\site\sitecustomize.py", line 880, in runfile
    execfile(filename, namespace)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile
    exec(compile(f.read(), filename, 'exec'), namespace)
  File "C:/Users/tobia/Desktop/Ai_Star/ai_train.py", line 81, in <module>
    test.start(picture_data,key_values)
  File "C:/Users/tobia/Desktop/Ai_Star/ai_train.py", line 66, in start
    model.fit(picture_data,key_values,batch_size = 1000,epochs =10,validation_split = 0.1,callbacks = [tbCallBack])
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\models.py", line 870, in fit
    initial_epoch=initial_epoch)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\engine\training.py", line 1507, in fit
    initial_epoch=initial_epoch)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\engine\training.py", line 1156, in _fit_loop
    outs = f(ins_batch)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\backend\tensorflow_backend.py", line 2269, in __call__
    **self.session_kwargs)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\client\session.py", line 789, in run
    run_metadata_ptr)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\client\session.py", line 997, in _run
    feed_dict_string, options, run_metadata)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\client\session.py", line 1132, in _do_run
    target_list, options, run_metadata)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\client\session.py", line 1152, in _do_call
    raise type(e)(node_def, op, message)
ResourceExhaustedError: OOM when allocating tensor with shape[313344,8]
 [[Node: gradients/dense_1/MatMul_grad/MatMul_1 = MatMul[T=DT_FLOAT, _class=["loc:@dense_1/MatMul"], transpose_a=true, transpose_b=false, _device="/job:localhost/replica:0/task:0/gpu:0"](flatten_1/Reshape, gradients/dense_1/Softmax_grad/mul_1)]]
Caused by op 'gradients/dense_1/MatMul_grad/MatMul_1', defined at:
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\spyder\utils\ipython\start_kernel.py", line 231, in <module>
    main()
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\spyder\utils\ipython\start_kernel.py", line 227, in main
    kernel.start()
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\ipykernel\kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\zmq\eventloop\ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tornado\ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\zmq\eventloop\zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\zmq\eventloop\zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\zmq\eventloop\zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\ipykernel\kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\ipykernel\kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\ipykernel\ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\ipykernel\zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\IPython\core\interactiveshell.py", line 2698, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\IPython\core\interactiveshell.py", line 2808, in run_ast_nodes
    if self.run_code(code, result):
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\IPython\core\interactiveshell.py", line 2862, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-1-73951c078cac>", line 1, in <module>
    runfile('C:/Users/tobia/Desktop/Ai_Star/ai_train.py', wdir='C:/Users/tobia/Desktop/Ai_Star')
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\spyder\utils\site\sitecustomize.py", line 880, in runfile
    execfile(filename, namespace)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile
    exec(compile(f.read(), filename, 'exec'), namespace)
  File "C:/Users/tobia/Desktop/Ai_Star/ai_train.py", line 81, in <module>
    test.start(picture_data,key_values)
  File "C:/Users/tobia/Desktop/Ai_Star/ai_train.py", line 66, in start
    model.fit(picture_data,key_values,batch_size = 1000,epochs =10,validation_split = 0.1,callbacks = [tbCallBack])
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\models.py", line 870, in fit
    initial_epoch=initial_epoch)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\engine\training.py", line 1490, in fit
    self._make_train_function()
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\engine\training.py", line 1014, in _make_train_function
    self.total_loss)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\optimizers.py", line 405, in get_updates
    grads = self.get_gradients(loss, params)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\optimizers.py", line 71, in get_gradients
    grads = K.gradients(loss, params)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\backend\tensorflow_backend.py", line 2307, in gradients
    return tf.gradients(loss, variables, colocate_gradients_with_ops=True)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\ops\gradients_impl.py", line 540, in gradients
    grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\ops\gradients_impl.py", line 346, in _MaybeCompile
    return grad_fn()  # Exit early
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\ops\gradients_impl.py", line 540, in <lambda>
    grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\ops\math_grad.py", line 825, in _MatMulGrad
    grad_b = math_ops.matmul(a, grad, transpose_a=True)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\ops\math_ops.py", line 1816, in matmul
    a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\ops\gen_math_ops.py", line 1217, in _mat_mul
    transpose_b=transpose_b, name=name)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 767, in apply_op
    op_def=op_def)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\framework\ops.py", line 2506, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\framework\ops.py", line 1269, in __init__
    self._traceback = _extract_stack()
...which was originally created as op 'dense_1/MatMul', defined at:
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\spyder\utils\ipython\start_kernel.py", line 231, in <module>
    main()
[elided 20 identical lines from previous traceback]
  File "C:/Users/tobia/Desktop/Ai_Star/ai_train.py", line 81, in <module>
    test.start(picture_data,key_values)
  File "C:/Users/tobia/Desktop/Ai_Star/ai_train.py", line 64, in start
    model = self.model_1(picture_data,key_values)
  File "C:/Users/tobia/Desktop/Ai_Star/ai_train.py", line 57, in model_1
    model.add(Dense(units = 8, activation ="softmax"))
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\models.py", line 476, in add
    output_tensor = layer(self.outputs[0])
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\engine\topology.py", line 596, in __call__
    output = self.call(inputs, **kwargs)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\layers\core.py", line 843, in call
    output = K.dot(inputs, self.kernel)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\keras\backend\tensorflow_backend.py", line 976, in dot
    out = tf.matmul(x, y)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\ops\math_ops.py", line 1816, in matmul
    a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\ops\gen_math_ops.py", line 1217, in _mat_mul
    transpose_b=transpose_b, name=name)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 767, in apply_op
    op_def=op_def)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\framework\ops.py", line 2506, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "C:\Users\tobia\Anaconda3\envs\StarCraft\lib\site-packages\tensorflow\python\framework\ops.py", line 1269, in __init__
    self._traceback = _extract_stack()
ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[313344,8]
 [[Node: gradients/dense_1/MatMul_grad/MatMul_1 = MatMul[T=DT_FLOAT, _class=["loc:@dense_1/MatMul"], transpose_a=true, transpose_b=false, _device="/job:localhost/replica:0/task:0/gpu:0"](flatten_1/Reshape, gradients/dense_1/Softmax_grad/mul_1)]]
DrDeep
  • 45
  • 1
  • 9
  • 2
    regarding this: "I tried it before everything worked but suddenly out of nowhere it wont work anymore." Maybe your GPU has zombie processes that are holding memory allocated. You can usually observe this with `nvidia-smi`. One possible solution in that case would be to reboot. – Robert Crovella Jun 29 '17 at 14:51

1 Answers1

0

Try again after restarting Python. GPU memory is not freed unless you request it explicitly in code. When a deep learning program is run again in the same Python shell without specifying what fraction of GPU memory it may use, this OOM error can occur. Refer to this post:

How to prevent tensorflow from allocating the totality of a GPU memory?

Azad
  • 71
  • 4