Got ResourceExhaustedError while training a deep learning algorithm on an NVIDIA GeForce RTX 3050 Ti Laptop GPU using TensorFlow with memory_limit: 1721342363
I am training a model on 2870 images, and it works well on the CPU, but on the GPU it seems to be restricted by a memory limit. Have I turned on a memory limit on my GPU, or do I have no option but to use my CPU? Training took 70 minutes on my CPU, which is why I chose to run it on my GPU. But while training with the following code:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(training_set, validation_data=test_set, epochs=20, batch_size=32)
I got this error:
Epoch 1/20
1/45 [..............................] - ETA: 9:46 - loss: 1.8638 - accuracy: 0.1667
---------------------------------------------------------------------------
ResourceExhaustedError Traceback (most recent call last)
Cell In [4], line 3
1 #Compile the model
2 model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
----> 3 history = model.fit(training_set, validation_data=test_set, epochs=20, batch_size=32)
File ~\anaconda3\envs\tf-gpu1\lib\site-packages\keras\engine\training.py:1184, in Model.fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
1177 with tf.profiler.experimental.Trace(
1178 'train',
1179 epoch_num=epoch,
1180 step_num=step,
1181 batch_size=batch_size,
1182 _r=1):
1183 callbacks.on_train_batch_begin(step)
-> 1184 tmp_logs = self.train_function(iterator)
1185 if data_handler.should_sync:
1186 context.async_wait()
File ~\anaconda3\envs\tf-gpu1\lib\site-packages\tensorflow\python\eager\def_function.py:885, in Function.__call__(self, *args, **kwds)
882 compiler = "xla" if self._jit_compile else "nonXla"
884 with OptionalXlaContext(self._jit_compile):
--> 885 result = self._call(*args, **kwds)
887 new_tracing_count = self.experimental_get_tracing_count()
888 without_tracing = (tracing_count == new_tracing_count)
File ~\anaconda3\envs\tf-gpu1\lib\site-packages\tensorflow\python\eager\def_function.py:917, in Function._call(self, *args, **kwds)
914 self._lock.release()
915 # In this case we have created variables on the first call, so we run the
916 # defunned version which is guaranteed to never create variables.
--> 917 return self._stateless_fn(*args, **kwds) # pylint: disable=not-callable
918 elif self._stateful_fn is not None:
919 # Release the lock early so that multiple threads can perform the call
920 # in parallel.
921 self._lock.release()
File ~\anaconda3\envs\tf-gpu1\lib\site-packages\tensorflow\python\eager\function.py:3039, in Function.__call__(self, *args, **kwargs)
3036 with self._lock:
3037 (graph_function,
3038 filtered_flat_args) = self._maybe_define_function(args, kwargs)
-> 3039 return graph_function._call_flat(
3040 filtered_flat_args, captured_inputs=graph_function.captured_inputs)
File ~\anaconda3\envs\tf-gpu1\lib\site-packages\tensorflow\python\eager\function.py:1963, in ConcreteFunction._call_flat(self, args, captured_inputs, cancellation_manager)
1959 possible_gradient_type = gradients_util.PossibleTapeGradientTypes(args)
1960 if (possible_gradient_type == gradients_util.POSSIBLE_GRADIENT_TYPES_NONE
1961 and executing_eagerly):
1962 # No tape is watching; skip to running the function.
-> 1963 return self._build_call_outputs(self._inference_function.call(
1964 ctx, args, cancellation_manager=cancellation_manager))
1965 forward_backward = self._select_forward_and_backward_functions(
1966 args,
1967 possible_gradient_type,
1968 executing_eagerly)
1969 forward_function, args_with_tangents = forward_backward.forward()
File ~\anaconda3\envs\tf-gpu1\lib\site-packages\tensorflow\python\eager\function.py:591, in _EagerDefinedFunction.call(self, ctx, args, cancellation_manager)
589 with _InterpolateFunctionError(self):
590 if cancellation_manager is None:
--> 591 outputs = execute.execute(
592 str(self.signature.name),
593 num_outputs=self._num_outputs,
594 inputs=args,
595 attrs=attrs,
596 ctx=ctx)
597 else:
598 outputs = execute.execute_with_cancellation(
599 str(self.signature.name),
600 num_outputs=self._num_outputs,
(...)
603 ctx=ctx,
604 cancellation_manager=cancellation_manager)
File ~\anaconda3\envs\tf-gpu1\lib\site-packages\tensorflow\python\eager\execute.py:59, in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
57 try:
58 ctx.ensure_initialized()
---> 59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
60 inputs, attrs, num_outputs)
61 except core._NotOkStatusException as e:
62 if name is not None:
ResourceExhaustedError: OOM when allocating tensor with shape[64,64,224,224] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[node model/block1_conv2/Relu (defined at \AppData\Local\Temp\ipykernel_11956\3538519329.py:3) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
[Op:__inference_train_function_1205]
Function call stack:
train_function