Added low-level api inference example · coderonion/llama-cpp-python@9af16b6 · GitHub

Commit 9af16b6

Added low-level api inference example
1 parent 3273651 commit 9af16b6
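
The new example drives the llama.cpp C API directly through the package's ctypes bindings: it loads a quantized Alpaca model, tokenizes an instruction-style prompt, evaluates the prompt token by token to fill the context, then samples eight new tokens with top-k/top-p sampling before freeing the context, as shown in the diff below.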

File tree

1 file changed (+41 -0 lines changed)


examples/low_level_api_inference.py

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
"""Example of low-level API inference, based on
https://github.com/ggerganov/llama.cpp/issues/384#issuecomment-1480129622
"""
import multiprocessing

import llama_cpp

N_THREADS = multiprocessing.cpu_count()

prompt = b"\n\n### Instruction:\nWhat is the capital of France?\n\n### Response:\n"

# Create a context with default parameters and load the quantized model.
lparams = llama_cpp.llama_context_default_params()

ctx = llama_cpp.llama_init_from_file(b"models/ggml-alpaca-7b-q4.bin", lparams)

# Warm-up evaluation on a few dummy tokens, as in the upstream example.
tmp = [0, 1, 2, 3]
llama_cpp.llama_eval(ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, N_THREADS)

# Tokenize the prompt. One token per prompt byte (plus one) is always enough;
# slicing the ctypes array down to the real count yields a plain Python list.
embd_inp = (llama_cpp.llama_token * (len(prompt) + 1))()
n_of_tok = llama_cpp.llama_tokenize(ctx, prompt, embd_inp, len(embd_inp), True)
embd_inp = embd_inp[:n_of_tok]

# Evaluate the prompt one token at a time; n_past is the number of tokens
# already in the context.
for i in range(len(embd_inp)):
    llama_cpp.llama_eval(ctx, (llama_cpp.c_int * 1)(embd_inp[i]), 1, i, N_THREADS)

prediction = b""
embd = embd_inp

n = 8  # number of tokens to generate

for i in range(n):
    # Sample the next token over all evaluated tokens:
    # top_k=40, top_p=0.8, temperature=0.2, repeat_penalty=1.0/0.85.
    id = llama_cpp.llama_sample_top_p_top_k(
        ctx, (llama_cpp.c_int * len(embd))(*embd), n_of_tok + i, 40, 0.8, 0.2, 1.0 / 0.85
    )

    embd.append(id)

    prediction += llama_cpp.llama_token_to_str(ctx, id)

    # Evaluate the sampled token; len(embd) - 1 tokens now precede it in the
    # context (the original passed len(embd), an off-by-one in n_past).
    llama_cpp.llama_eval(ctx, (llama_cpp.c_int * 1)(embd[-1]), 1, len(embd) - 1, N_THREADS)

llama_cpp.llama_free(ctx)

print(prediction.decode("utf-8"))
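
For comparison, the same completion can be produced through the package's high-level Llama wrapper instead of the raw ctypes calls. The following is a minimal sketch, not part of this commit, assuming the same local model file and reusing the sampling parameters from the example above.

# Minimal sketch (not part of this commit): the same completion through the
# high-level Llama wrapper; assumes the same model file as the example above.
import multiprocessing

from llama_cpp import Llama

llm = Llama(
    model_path="models/ggml-alpaca-7b-q4.bin",
    n_threads=multiprocessing.cpu_count(),
)
output = llm(
    "\n\n### Instruction:\nWhat is the capital of France?\n\n### Response:\n",
    max_tokens=8,
    top_k=40,
    top_p=0.8,
    temperature=0.2,
    repeat_penalty=1.0 / 0.85,
)
print(output["choices"][0]["text"])

The wrapper performs tokenization, context evaluation, and sampling internally, so the per-token llama_eval loop above collapses into a single call.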

0 commit comments
