To use the GPU on a MacBook with an M1 chip, I installed llama-cpp-python with Metal enabled:
CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python
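If llama-cpp-python was installed before, pip may reuse a previously built wheel without Metal support; forcing a clean rebuild makes sure the CMake flag actually takes effect:
CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install --upgrade --force-reinstall --no-cache-dir llama-cpp-python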
Then I downloaded the model file from https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/tree/main
The model file is llama-2-7b-chat.ggmlv3.q6_K.bin
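For reproducibility, the file can also be fetched from the command line (assuming the standard Hugging Face resolve URL for that repository):
curl -L -o ./models/llama-2-7b-chat.ggmlv3.q6_K.bin https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q6_K.bin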
Then I wrote the following code:
from llama_cpp import Llama

# Load the model with all layers offloaded to the Metal GPU
# (n_gpu_layers=100 exceeds the model's 32 layers, so everything is offloaded).
llm = Llama(
    model_path="./models/llama-2-7b-chat.ggmlv3.q6_K.bin",
    n_gpu_layers=100,
    n_ctx=2048,
)

# Run a simple completion; echo=True includes the prompt in the returned text.
output = llm(
    "Q: Name the planets in the solar system? A: ",
    max_tokens=128,
    stop=["Q:", "\n"],
    echo=True,
)
print(output)
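As a debugging aid, this is a minimal sketch (my own inspection code, not part of the original script) that counts the control characters in the returned text; \x1c is the ASCII "file separator" control character:

import unicodedata

text = output["choices"][0]["text"]
# Control characters fall into the Unicode "Cc" category; \x1c is one of them.
control = [c for c in text if unicodedata.category(c) == "Cc"]
print(f"{len(control)} control characters out of {len(text)} characters total")
# Stripping them is only cosmetic; the underlying generation is still degenerate.
print(repr("".join(c for c in text if unicodedata.category(c) != "Cc")))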
When I ran the code, I got the response below. The generated text degenerates into repeated '\x1c' control characters until the context window fills (finish_reason is 'length'):
{'id': 'cmpl-5a526ad0-5366-4a6d-8408-1e7ee9a641e9', 'object': 'text_completion', 'created': 1692866151, 'model': './models/llama-2-7b-chat.ggmlv3.q6_K.bin', 'choices': [{'text': 'What is the capital of France?\x1c\x1c\x1c\x1c\x1c\x1c\x1c\x1c [... \x1c repeated until the 512-token context is full ...]', 'index': 0, 'logprobs': None, 'finish_reason': 'length'}], 'usage': {'prompt_tokens': 8, 'completion_tokens': 504, 'total_tokens': 512}}
The full logs are below. (Note that the log reports n_ctx = 512, which does not match the n_ctx=2048 passed in the code above.)
llama.cpp: loading model from ./models/llama-2-7b-chat.ggmlv3.q6_K.bin
llama_model_load_internal: format = ggjt v3 (latest)
llama_model_load_internal: n_vocab = 32000
llama_model_load_internal: n_ctx = 512
llama_model_load_internal: n_embd = 4096
llama_model_load_internal: n_mult = 256
llama_model_load_internal: n_head = 32
llama_model_load_internal: n_head_kv = 32
llama_model_load_internal: n_layer = 32
llama_model_load_internal: n_rot = 128
llama_model_load_internal: n_gqa = 1
llama_model_load_internal: rnorm_eps = 5.0e-06
llama_model_load_internal: n_ff = 11008
llama_model_load_internal: freq_base = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype = 18 (mostly Q6_K)
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size = 0.08 MB
llama_model_load_internal: mem required = 5272.43 MB (+ 256.00 MB per state)
llama_new_context_with_model: kv self size = 256.00 MB
ggml_metal_init: allocating
ggml_metal_init: loading '/Users/wangzg/Desktop/Study/llama2/llm/lib/python3.9/site-packages/llama_cpp/ggml-metal.metal'
ggml_metal_init: loaded kernel_add 0x123ed9310
ggml_metal_init: loaded kernel_add_row 0x123edaae0
ggml_metal_init: loaded kernel_mul 0x123edaf70
ggml_metal_init: loaded kernel_mul_row 0x123edb920
ggml_metal_init: loaded kernel_scale 0x123fe05c0
ggml_metal_init: loaded kernel_silu 0x123fdffd0
ggml_metal_init: loaded kernel_relu 0x123fe0a40
ggml_metal_init: loaded kernel_gelu 0x123fe1cb0
ggml_metal_init: loaded kernel_soft_max 0x123fe1500
ggml_metal_init: loaded kernel_diag_mask_inf 0x123fe24f0
ggml_metal_init: loaded kernel_get_rows_f16 0x123fe3a90
ggml_metal_init: loaded kernel_get_rows_q4_0 0x12485e100
ggml_metal_init: loaded kernel_get_rows_q4_1 0x12485ee80
ggml_metal_init: loaded kernel_get_rows_q2_K 0x12485f3c0
ggml_metal_init: loaded kernel_get_rows_q3_K 0x12485fc90
ggml_metal_init: loaded kernel_get_rows_q4_K 0x113e6d380
ggml_metal_init: loaded kernel_get_rows_q5_K 0x113e6dd60
ggml_metal_init: loaded kernel_get_rows_q6_K 0x113e6e270
ggml_metal_init: loaded kernel_rms_norm 0x113e6eca0
ggml_metal_init: loaded kernel_norm 0x123edc290
ggml_metal_init: loaded kernel_mul_mat_f16_f32 0x123edd190
ggml_metal_init: loaded kernel_mul_mat_q4_0_f32 0x123edd750
ggml_metal_init: loaded kernel_mul_mat_q4_1_f32 0x124860750
ggml_metal_init: loaded kernel_mul_mat_q2_K_f32 0x124861510
ggml_metal_init: loaded kernel_mul_mat_q3_K_f32 0x124861ad0
ggml_metal_init: loaded kernel_mul_mat_q4_K_f32 0x124862c60
ggml_metal_init: loaded kernel_mul_mat_q5_K_f32 0x123fe2ab0
ggml_metal_init: loaded kernel_mul_mat_q6_K_f32 0x123fe4620
ggml_metal_init: loaded kernel_mul_mm_f16_f32 0x123fe4c20
ggml_metal_init: loaded kernel_mul_mm_q4_0_f32 0x123fe58b0
ggml_metal_init: loaded kernel_mul_mm_q4_1_f32 0x123fe6490
ggml_metal_init: loaded kernel_mul_mm_q2_K_f32 0x123fe6e60
ggml_metal_init: loaded kernel_mul_mm_q3_K_f32 0x123ee58a0
ggml_metal_init: loaded kernel_mul_mm_q4_K_f32 0x123eddd50
ggml_metal_init: loaded kernel_mul_mm_q5_K_f32 0x123ede350
ggml_metal_init: loaded kernel_mul_mm_q6_K_f32 0x123ee6c10
ggml_metal_init: loaded kernel_rope 0x123ee7c00
ggml_metal_init: loaded kernel_alibi_f32 0x123fe84d0
ggml_metal_init: loaded kernel_cpy_f32_f16 0x123fe7fc0
ggml_metal_init: loaded kernel_cpy_f32_f32 0x123fe9440
ggml_metal_init: loaded kernel_cpy_f16_f16 0x123fe9f30
ggml_metal_init: recommendedMaxWorkingSetSize = 10922.67 MB
ggml_metal_init: hasUnifiedMemory = true
ggml_metal_init: maxTransferRate = built-in GPU
llama_new_context_with_model: compute buffer total size = 73.35 MB
llama_new_context_with_model: max tensor size = 102.54 MB
ggml_metal_add_buffer: allocated 'data ' buffer, size = 5272.78 MB, ( 5273.22 / 10922.67)
ggml_metal_add_buffer: allocated 'eval ' buffer, size = 1.36 MB, ( 5274.58 / 10922.67)
ggml_metal_add_buffer: allocated 'kv ' buffer, size = 258.00 MB, ( 5532.58 / 10922.67)
ggml_metal_add_buffer: allocated 'alloc ' buffer, size = 72.02 MB, ( 5604.59 / 10922.67)
AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
llama_print_timings: load time = 5864.98 ms
llama_print_timings: sample time = 1311.62 ms / 504 runs ( 2.60 ms per token, 384.26 tokens per second)
llama_print_timings: prompt eval time = 5864.95 ms / 8 tokens ( 733.12 ms per token, 1.36 tokens per second)
llama_print_timings: eval time = 53464.54 ms / 503 runs ( 106.29 ms per token, 9.41 tokens per second)
llama_print_timings: total time = 63824.17 ms
[... the script then prints the same output dict shown above ...]
ggml_metal_free: deallocating
Why did I get the repeated '\x1c' characters in the response, and how can I resolve this problem?
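For anyone trying to narrow this down, a minimal check (a sketch, assuming the same model path as above) is to rerun the identical prompt with GPU offload disabled and compare the output:

from llama_cpp import Llama

# Same call as above, but with no layers offloaded to Metal, to test
# whether the '\x1c' tokens only appear when layers run on the GPU.
llm_cpu = Llama(
    model_path="./models/llama-2-7b-chat.ggmlv3.q6_K.bin",
    n_gpu_layers=0,  # CPU only
    n_ctx=2048,
)
output = llm_cpu(
    "Q: Name the planets in the solar system? A: ",
    max_tokens=128,
    stop=["Q:", "\n"],
    echo=True,
)
print(output)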