Merge remote-tracking branch 'ggerganov/master' · ggml-org/llama.cpp@891f506 · GitHub
Commit 891f506

Merge remote-tracking branch 'ggerganov/master'

2 parents f8420dd + 1b78ed2 · commit 891f506

File tree

15 files changed: +381 / -154 lines changed

.github/workflows/build.yml

Lines changed: 10 additions & 10 deletions
```diff
@@ -10,10 +10,10 @@ on:
   push:
     branches:
       - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
 
 env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
@@ -157,15 +157,15 @@ jobs:
       matrix:
         include:
           - build: 'avx2'
-            defines: ''
+            defines: '-DLLAMA_BUILD_SERVER=ON'
           - build: 'avx'
-            defines: '-DLLAMA_AVX2=OFF'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
           - build: 'avx512'
-            defines: '-DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'clblast'
-            defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
           - build: 'openblas'
-            defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include"'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
 
     steps:
       - name: Clone
@@ -187,7 +187,7 @@ jobs:
          curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
          curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
-         rename-item $env:RUNNER_TEMP/clblast_release_dir clblast
+         rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
          foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
            $txt = Get-Content -Path $f -Raw
            $txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
@@ -213,7 +213,6 @@ jobs:
          cd build
          cmake .. ${{ matrix.defines }}
          cmake --build . --config Release
-         cp ../LICENSE ./bin/Release/llama.cpp.txt
 
       - name: Add clblast.dll
         id: add_clblast_dll
@@ -258,6 +257,7 @@ jobs:
        id: pack_artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
+         Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
          7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
 
      - name: Upload artifacts
@@ -292,7 +292,7 @@ jobs:
       run: |
         mkdir build
         cd build
-        cmake .. -DLLAMA_CUBLAS=ON
+        cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
         cmake --build . --config Release
 
     - name: Get commit hash
```
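
For reference, a rough local equivalent of what an updated matrix entry (e.g. `avx2`) now configures might look like the sketch below; it mirrors the build step already shown in the workflow, and the only substantive change is that the server example is built and therefore lands in the packaged zip (illustration only, not part of the commit):

```sh
# build as the 'avx2' CI entry now does, with the server example enabled
mkdir build
cd build
cmake .. -DLLAMA_BUILD_SERVER=ON
cmake --build . --config Release
```
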

CMakeLists.txt

Lines changed: 29 additions & 25 deletions
```diff
@@ -37,42 +37,44 @@ endif()
 #
 
 # general
-option(LLAMA_STATIC "llama: static link libraries" OFF)
-option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
-option(LLAMA_LTO "llama: enable link time optimization" OFF)
+option(LLAMA_STATIC "llama: static link libraries" OFF)
+option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
+option(LLAMA_LTO "llama: enable link time optimization" OFF)
 
 # debug
-option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
-option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
-option(LLAMA_GPROF "llama: enable gprof" OFF)
+option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
+option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
+option(LLAMA_GPROF "llama: enable gprof" OFF)
 
 # sanitizers
-option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
-option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
-option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
+option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
+option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
+option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
 
 # instruction set specific
-option(LLAMA_AVX "llama: enable AVX" ON)
-option(LLAMA_AVX2 "llama: enable AVX2" ON)
-option(LLAMA_AVX512 "llama: enable AVX512" OFF)
-option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
-option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
-option(LLAMA_FMA "llama: enable FMA" ON)
+option(LLAMA_AVX "llama: enable AVX" ON)
+option(LLAMA_AVX2 "llama: enable AVX2" ON)
+option(LLAMA_AVX512 "llama: enable AVX512" OFF)
+option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
+option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
+option(LLAMA_FMA "llama: enable FMA" ON)
 # in MSVC F16C is implied with AVX2/AVX512
 if (NOT MSVC)
-    option(LLAMA_F16C "llama: enable F16C" ON)
+    option(LLAMA_F16C "llama: enable F16C" ON)
 endif()
 
 # 3rd party libs
-option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
-option(LLAMA_BLAS "llama: use BLAS" OFF)
-option(LLAMA_BLAS_VENDOR "llama: BLA_VENDOR from https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" Generic)
-option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
-option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
-
-option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_SERVER "llama: build server example" OFF)
+option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
+option(LLAMA_BLAS "llama: use BLAS" OFF)
+set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
+option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
+set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
+set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
+option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
+
+option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_SERVER "llama: build server example" OFF)
 
 #
 # Build info header
@@ -184,6 +186,8 @@ if (LLAMA_CUBLAS)
         set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
 
         add_compile_definitions(GGML_USE_CUBLAS)
+        add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
+        add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
 
         if (LLAMA_STATIC)
             set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
```
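
Because `LLAMA_BLAS_VENDOR` and the two dmmv sizes are now cache variables, they can be overridden at configure time and flow into the `GGML_CUDA_DMMV_X` / `GGML_CUDA_DMMV_Y` compile definitions. A minimal sketch; the values 64 and 2 are illustrative tuning choices, not defaults from this commit:

```sh
# configure a cuBLAS build and override the dmmv kernel sizes
cmake .. -DLLAMA_CUBLAS=ON \
         -DLLAMA_CUDA_DMMV_X=64 \
         -DLLAMA_CUDA_DMMV_Y=2
```
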

Makefile

Lines changed: 22 additions & 3 deletions
```diff
@@ -1,5 +1,11 @@
 # Define the default target now so that it is always the first target
-default: main quantize quantize-stats perplexity embedding vdot
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot
+
+ifdef LLAMA_BUILD_SERVER
+	BUILD_TARGETS += server
+endif
+
+default: $(BUILD_TARGETS)
 
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
@@ -133,9 +139,19 @@ ifdef LLAMA_CUBLAS
 	OBJS += ggml-cuda.o
 	NVCC = nvcc
 	NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
+ifdef LLAMA_CUDA_DMMV_X
+	NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
+else
+	NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
+endif # LLAMA_CUDA_DMMV_X
+ifdef LLAMA_CUDA_DMMV_Y
+	NVCCFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y)
+else
+	NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
+endif # LLAMA_CUDA_DMMV_Y
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
-endif
+endif # LLAMA_CUBLAS
 ifdef LLAMA_CLBLAST
 	CFLAGS += -DGGML_USE_CLBLAST
 	CXXFLAGS += -DGGML_USE_CLBLAST
@@ -200,7 +216,7 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
 clean:
-	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state build-info.h
+	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot build-info.h
 
 #
 # Examples
@@ -227,6 +243,9 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o
 save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
+
 build-info.h: $(wildcard .git/index) scripts/build-info.sh
 	@sh scripts/build-info.sh > $@.tmp
 	@if ! cmp -s $@.tmp $@; then \
```
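
The same knobs exist on the Makefile path: `LLAMA_BUILD_SERVER` adds the new `server` target, and `LLAMA_CUDA_DMMV_X` / `LLAMA_CUDA_DMMV_Y` override the defaults of 32 and 1 passed to nvcc. A hedged sketch with illustrative values:

```sh
# build everything including the new server example with cuBLAS,
# overriding the dmmv kernel sizes passed to nvcc
make LLAMA_BUILD_SERVER=1 LLAMA_CUBLAS=1 LLAMA_CUDA_DMMV_X=64 LLAMA_CUDA_DMMV_Y=2
```
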

README.md

Lines changed: 79 additions & 6 deletions
````diff
@@ -240,11 +240,11 @@ In order to build llama.cpp you have three different options.
 
 Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:
 
-- Accelerate Framework:
+- **Accelerate Framework**:
 
   This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.
 
-- OpenBLAS:
+- **OpenBLAS**:
 
   This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.
 
@@ -278,11 +278,11 @@ Building the program with BLAS support may lead to some performance improvements
     cmake --build . --config Release
     ```
 
-- BLIS
+- **BLIS**
 
   Check [BLIS.md](BLIS.md) for more information.
 
-- Intel MKL
+- **Intel MKL**
 
   By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. You may also specify it by:
 
@@ -293,7 +293,7 @@ Building the program with BLAS support may lead to some performance improvements
     cmake --build . -config Release
     ```
 
-- cuBLAS
+- **cuBLAS**
 
   This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
   - Using `make`:
@@ -308,8 +308,81 @@ Building the program with BLAS support may lead to some performance improvements
     cmake .. -DLLAMA_CUBLAS=ON
     cmake --build . --config Release
     ```
+  Note: Because llama.cpp uses multiple CUDA streams for matrix multiplication results [are not guaranteed to be reproducible](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility). If you need reproducibility, set `GGML_CUDA_MAX_STREAMS` in the file `ggml-cuda.cu` to 1.
 
-Note: Because llama.cpp uses multiple CUDA streams for matrix multiplication results [are not guaranteed to be reproducible](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility). If you need reproducibility, set `GGML_CUDA_MAX_STREAMS` in the file `ggml-cuda.cu` to 1.
+- **CLBlast**
+
+  OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
+
+  You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
+    - For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed.
+
+  - <details>
+    <summary>Installing the OpenCL SDK from source</summary>
+
+    ```sh
+    git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git
+    mkdir OpenCL-SDK/build
+    cd OpenCL-SDK/build
+    cmake .. -DBUILD_DOCS=OFF \
+      -DBUILD_EXAMPLES=OFF \
+      -DBUILD_TESTING=OFF \
+      -DOPENCL_SDK_BUILD_SAMPLES=OFF \
+      -DOPENCL_SDK_TEST_SAMPLES=OFF
+    cmake --build . --config Release
+    cmake --install . --prefix /some/path
+    ```
+    </details>
+
+  Installing CLBlast: it may be found in your operating system's packages.
+
+  - <details>
+    <summary>If not, then installing from source:</summary>
+
+    ```sh
+    git clone https://github.com/CNugteren/CLBlast.git
+    mkdir CLBlast/build
+    cd CLBlast/build
+    cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
+    cmake --build . --config Release
+    cmake --install . --prefix /some/path
+    ```
+
+    Where `/some/path` is where the built library will be installed (default is `/usr/local`).
+    </details>
+
+  Building:
+
+  - Build with make:
+    ```sh
+    make LLAMA_CLBLAST=1
+    ```
+  - CMake:
+    ```sh
+    mkdir build
+    cd build
+    cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_dir=/some/path
+    cmake --build . --config Release
+    ```
+
+  Running:
+
+  The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.
+
+  To select the correct platform (driver) and device (GPU), you can use the environment variables `GGML_OPENCL_PLATFORM` and `GGML_OPENCL_DEVICE`.
+  The selection can be a number (starting from 0) or a text string to search:
+
+  ```sh
+  GGML_OPENCL_PLATFORM=1 ./main ...
+  GGML_OPENCL_DEVICE=2 ./main ...
+  GGML_OPENCL_PLATFORM=Intel ./main ...
+  GGML_OPENCL_PLATFORM=AMD GGML_OPENCL_DEVICE=1 ./main ...
+  ```
+
+  The default behavior is to find the first GPU device, but when it is an integrated GPU on a laptop, for instance, the selectors are useful.
+  Using the variables it is possible to select a CPU-based driver as well, if so desired.
+
+  You can get a list of platforms and devices from the `clinfo -l` command, etc.
 
 ### Prepare Data & Run
 
````

examples/common.cpp

Lines changed: 15 additions & 1 deletion
```diff
@@ -251,6 +251,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.model = argv[i];
+        } else if (arg == "-a" || arg == "--alias") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model_alias = argv[i];
         } else if (arg == "--lora") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -283,7 +289,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
             params.n_gpu_layers = std::stoi(argv[i]);
+#else
+            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
         } else if (arg == "--mtest") {
@@ -410,7 +421,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
     fprintf(stderr, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     fprintf(stderr, "  --no-penalize-nl      do not penalize newline token\n");
-    fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value\n");
+    fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
+    fprintf(stderr, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
     fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
     fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
@@ -421,8 +433,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     if (llama_mmap_supported()) {
         fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
     fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
     fprintf(stderr, "                        number of layers to store in VRAM\n");
+#endif
     fprintf(stderr, "  --mtest               compute maximum memory usage\n");
     fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
     fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
```

examples/common.h

Lines changed: 1 addition & 0 deletions
```diff
@@ -45,6 +45,7 @@ struct gpt_params {
     float mirostat_eta = 0.10f; // learning rate
 
     std::string model = "models/7B/ggml-model.bin"; // model path
+    std::string model_alias = "unknown"; // model alias
     std::string prompt = "";
     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
     std::string input_prefix = ""; // string to prefix user inputs with
```

0 commit comments
