When I run the fine-tuned LLaMA model with LoRA to generate results on a single GPU, I get this error: torch.cuda.OutOfMemoryError: CUDA out of memory.
My code:
test_data = Dataset.from_list(torch.load(test_dataset_dir))
tokenizer = LlamaTokenizer.from_pretrained(MODEL_NAMES[model_name])
tokenizer.pad_token_id = 0
tokenizer.bos_token_id = 1
tokenizer.eos_token_id = 2
tokenizer.padding_side = "left"
model = LlamaForCausalLM.from_pretrained(
    MODEL_NAMES[model_name],
    torch_dtype=torch.float16,
    device_map='auto',
    llm_int8_enable_fp32_cpu_offload=True,
)
model = PeftModelForCausalLM.from_pretrained(model, peft_model_id)
model.print_trainable_parameters()
model.to(DEVICE)
model.eval()
model = torch.compile(model)
generation_config = GenerationConfig.from_pretrained(
    MODEL_NAMES[model_name],
    temperature=temperature,
    top_p=top_p,
    top_k=top_k,
)
input_ids = torch.Tensor([[0]*(max_seq_length-len(x)) + x for x in test_data["input_ids"]]).long().to(DEVICE)  # left-pad input_ids to max_seq_length
max_new_tokens = 4
with torch.no_grad():
    outputs = model.generate(
        input_ids=input_ids,
        generation_config=generation_config,
        return_dict_in_generate=return_dict_in_generate,
        output_scores=output_scores,
        output_attentions=output_attentions,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.pad_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
The error is raised when outputs = model.generate() runs:
Traceback (most recent call last):
File "/users/aj2066/llm/generate.py", line 242, in <module>
main(dataset=args.dataset,
File "/users/aj2066/llm/generate.py", line 145, in main
outputs = model.generate(
File "/users/aj2066/.conda/envs/gbvmodel/lib/python3.9/site-packages/peft/peft_model.py", line 1002, in generate
outputs = self.base_model.generate(**kwargs)
File "/users/aj2066/.conda/envs/gbvmodel/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/users/aj2066/.conda/envs/gbvmodel/lib/python3.9/site-packages/transformers/generation/utils.py", line 1538, in generate
return self.greedy_search(
File "/users/aj2066/.conda/envs/gbvmodel/lib/python3.9/site-packages/transformers/generation/utils.py", line 2431, in greedy_search
outputs = self(
File "/users/aj2066/.conda/envs/gbvmodel/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/users/aj2066/.conda/envs/gbvmodel/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/users/aj2066/.conda/envs/gbvmodel/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 810, in forward
outputs = self.model(
File "/users/aj2066/.conda/envs/gbvmodel/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/users/aj2066/.conda/envs/gbvmodel/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 698, in forward
layer_outputs = decoder_layer(
File "/users/aj2066/.conda/envs/gbvmodel/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/users/aj2066/.conda/envs/gbvmodel/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/users/aj2066/.conda/envs/gbvmodel/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 410, in forward
hidden_states = self.input_layernorm(hidden_states)
File "/users/aj2066/.conda/envs/gbvmodel/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/users/aj2066/.conda/envs/gbvmodel/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/users/aj2066/.conda/envs/gbvmodel/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 87, in forward
variance = hidden_states.pow(2).mean(-1, keepdim=True)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 15.62 GiB (GPU 0; 44.56 GiB total capacity; 36.55 GiB already allocated; 7.36 GiB free; 36.56 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
Some common solutions did not work in my case, such as:
- disabling gradient calculation with torch.no_grad()
- clearing the GPU cache with torch.cuda.empty_cache()
- setting PYTORCH_CUDA_ALLOC_CONF, e.g. export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
How can I solve this problem? Many thanks!
[Solved]
Since this is a generation task, the problem is that the entire padded test set is passed to model.generate() as a single batch, which is too large to fit in GPU memory. Splitting the test data into subsets and running the Python script once per subset solves it.
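As an alternative to running the script several times, generation can be done in smaller batches inside one run. Below is a minimal sketch; the generate_batched helper and the batch_size value are hypothetical and not part of the original script:

import torch

# Hypothetical helper: feed model.generate() small slices of the padded
# input_ids tensor instead of the whole test set at once.
def generate_batched(model, input_ids, generation_config, tokenizer,
                     max_new_tokens=4, batch_size=8):
    all_outputs = []
    with torch.no_grad():
        for start in range(0, input_ids.size(0), batch_size):
            batch = input_ids[start:start + batch_size]
            outputs = model.generate(
                input_ids=batch,
                generation_config=generation_config,
                max_new_tokens=max_new_tokens,
                pad_token_id=tokenizer.pad_token_id,
                bos_token_id=tokenizer.bos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )
            all_outputs.append(outputs.cpu())  # move results off the GPU
            torch.cuda.empty_cache()           # release per-batch activations
    return all_outputs

# Usage (replacing the single large generate call):
# outputs = generate_batched(model, input_ids, generation_config, tokenizer)

The key point is that peak memory now scales with batch_size rather than with the size of the whole test set; batch_size can be lowered further if the OOM error still appears.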