examples : server chat mode with llama2 (#2400) · ggml-org/llama.cpp@34ae1ca

Commit 34ae1ca

examples : server chat mode with llama2 (#2400)
* add: server chat mode with llama2
* fix: remove the unnecessary last \n
1 parent d91f3f0 commit 34ae1ca

File tree

2 files changed: +135 -0 lines changed

examples/server-llama2-13B.sh

Lines changed: 26 additions & 0 deletions
#!/bin/bash

set -e

cd "$(dirname "$0")/.." || exit

# Specify the model you want to use here:
MODEL="${MODEL:-./models/llama-2-13b-chat.ggmlv3.q5_K_M.bin}"
PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat-system.txt}

# Adjust to the number of CPU cores you want to use.
N_THREAD="${N_THREAD:-12}"

# Note: you can also override the generation options by specifying them on the command line:
GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 4096 --batch-size 1024}"

# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./server $GEN_OPTIONS \
  --model "$MODEL" \
  --threads "$N_THREAD" \
  --rope-freq-scale 1.0 \
  "$@"

# I used this to test the model with MPS, but omitted it from the general-purpose
# script. If you want to use it, just specify it on the command line:
# -ngl 1 \
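Because every setting above is an environment variable with a default, a one-off run can override them inline, and any extra arguments pass straight through to ./server via "$@". A minimal sketch (the model path and thread count here are illustrative, not the script's defaults):

# Hypothetical invocation; the MODEL and N_THREAD values are examples,
# and -ngl 1 is the optional GPU offload flag mentioned in the script.
MODEL=./models/llama-2-13b-chat.ggmlv3.q4_K_M.bin \
N_THREAD=8 \
./examples/server-llama2-13B.sh -ngl 1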

examples/server/chat-llama2.sh

Lines changed: 109 additions & 0 deletions
#!/bin/bash

API_URL="${API_URL:-http://127.0.0.1:8080}"

CHAT=(
    "Hello, Assistant."
    "Hello. How may I help you today?"
)

INSTRUCTION="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."

trim() {
    shopt -s extglob
    set -- "${1##+([[:space:]])}"
    printf "%s" "${1%%+([[:space:]])}"
}

trim_trailing() {
    shopt -s extglob
    printf "%s" "${1%%+([[:space:]])}"
}

format_prompt() {
    if [[ "${#CHAT[@]}" -eq 0 ]]; then
        echo -n "[INST] <<SYS>>\n${INSTRUCTION}\n<</SYS>>"
    else
        LAST_INDEX=$(( ${#CHAT[@]} - 1 ))
        echo -n "${CHAT[$LAST_INDEX]}\n[INST] $1 [/INST]"
    fi
}

tokenize() {
    curl \
        --silent \
        --request POST \
        --url "${API_URL}/tokenize" \
        --header "Content-Type: application/json" \
        --data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \
    | jq '.tokens[]'
}

N_KEEP=$(tokenize "[INST] <<SYS>>\n${INSTRUCTION}\n<</SYS>>" | wc -l)

chat_completion() {
    PROMPT="$(trim_trailing "$(format_prompt "$1")")"
    DATA="$(echo -n "$PROMPT" | jq -Rs --argjson n_keep $N_KEEP '{
        prompt: .,
        temperature: 0.2,
        top_k: 40,
        top_p: 0.9,
        n_keep: $n_keep,
        n_predict: 1024,
        stop: ["[INST]"],
        stream: true
    }')"

    # Create a temporary file to hold the Python output
    TEMPFILE=$(mktemp)

    exec 3< <(curl \
        --silent \
        --no-buffer \
        --request POST \
        --url "${API_URL}/completion" \
        --header "Content-Type: application/json" \
        --data-raw "${DATA}")

    python -c "
import json
import sys

answer = ''
while True:
    line = sys.stdin.readline()
    if not line:
        break
    if line.startswith('data: '):
        json_content = line[6:].strip()
        content = json.loads(json_content)['content']
        sys.stdout.write(content)
        sys.stdout.flush()
        answer += content

answer = answer.rstrip('\n')

# Write the answer to the temporary file
with open('$TEMPFILE', 'w') as f:
    f.write(answer)
" <&3

    exec 3<&-

    # Read the answer from the temporary file
    ANSWER=$(cat "$TEMPFILE")

    # Clean up the temporary file
    rm "$TEMPFILE"

    printf "\n"

    CHAT+=("$1" "$(trim "$ANSWER")")
}

while true; do
    echo -en "\033[0;32m" # Green color
    read -r -e -p "> " QUESTION
    echo -en "\033[0m" # Reset color
    chat_completion "${QUESTION}"
done
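The inline Python consumes the server's streamed completion: each event arrives as a line of the form "data: {...}" whose JSON carries a "content" field, which the loop prints and accumulates. The N_KEEP computation, by contrast, is a single POST to /tokenize; a standalone sketch of that round trip (the system prompt text is illustrative, and the default API_URL is assumed):

# Count prompt tokens by hand; mirrors tokenize() piped through wc -l above.
curl --silent --request POST \
  --url "http://127.0.0.1:8080/tokenize" \
  --header "Content-Type: application/json" \
  --data-raw '{"content": "[INST] <<SYS>>\nYou are a helpful assistant.\n<</SYS>>"}' \
  | jq '.tokens | length'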

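Putting the two files together (assuming the default model path and API_URL), a typical session launches the server with the first script in one terminal and the chat client in another:

# Terminal 1: start the server (uses the defaults baked into the script)
./examples/server-llama2-13B.sh

# Terminal 2: start the interactive chat loop against that server
bash ./examples/server/chat-llama2.sh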