Merge 'origin/master' into hipblas · LostRuins/koboldcpp@8c2c497 · GitHub

Commit 8c2c497

Merge 'origin/master' into hipblas
2 parents e610466 + 2347463 commit 8c2c497

22 files changed, +481 −48 lines changed

.github/workflows/build.yml

Lines changed: 34 additions & 0 deletions
@@ -104,6 +104,40 @@ jobs:
           cd build
           ctest --verbose --timeout 900
 
+  ubuntu-latest-cmake-mpi:
+    runs-on: ubuntu-latest
+
+    continue-on-error: true
+
+    strategy:
+      matrix:
+        mpi_library: [mpich, libopenmpi-dev]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ${{ matrix.mpi_library }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake -DLLAMA_MPI=ON ..
+          cmake --build . --config Release
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest --verbose
+
   macOS-latest-make:
     runs-on: macos-latest
 

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -20,6 +20,7 @@ build-static/
 build-cublas/
 build-opencl/
 build-metal/
+build-mpi/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/

CMakeLists.txt

Lines changed: 24 additions & 0 deletions
@@ -76,6 +76,7 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
 option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
 option(LLAMA_METAL "llama: use Metal" OFF)
+option(LLAMA_MPI "llama: use MPI" OFF)
 option(LLAMA_K_QUANTS "llama: use k-quants" ON)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
 

@@ -309,6 +310,28 @@ if (LLAMA_METAL)
         )
 endif()
 
+if (LLAMA_MPI)
+    cmake_minimum_required(VERSION 3.10)
+    find_package(MPI)
+    if (MPI_C_FOUND)
+        message(STATUS "MPI found")
+        set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
+        add_compile_definitions(GGML_USE_MPI)
+        add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
+        set(cxx_flags ${cxx_flags} -Wno-cast-qual)
+        set(c_flags ${c_flags} -Wno-cast-qual)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
+        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
+        # Even if you're only using the C header, C++ programs may bring in MPI
+        # C++ functions, so more linkage is needed
+        if (MPI_CXX_FOUND)
+            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_CXX_LIBRARIES})
+        endif()
+    else()
+        message(WARNING "MPI not found")
+    endif()
+endif()
+
 if (LLAMA_CLBLAST)
     find_package(CLBlast)
     if (CLBlast_FOUND)

@@ -509,6 +532,7 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_CUDA}
             ${GGML_SOURCES_OPENCL}
             ${GGML_SOURCES_METAL}
+            ${GGML_SOURCES_MPI}
             ${GGML_SOURCES_EXTRA}
             )
 
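Note: the CMake block above only adds the `GGML_USE_MPI` compile definition, the `ggml-mpi.c`/`ggml-mpi.h` sources, and the MPI link/include flags; how the define is consumed is up to the C/C++ sources. As a minimal illustrative sketch (not the actual `ggml-mpi.c` from this commit), code built this way can guard its MPI-specific paths behind that preprocessor symbol using only standard MPI calls:

```cpp
// Hypothetical sketch: compiles with or without -DGGML_USE_MPI.
// MPI_Init/MPI_Comm_rank/MPI_Comm_size/MPI_Finalize are standard MPI API;
// nothing here is taken from ggml-mpi.c itself.
#include <cstdio>
#ifdef GGML_USE_MPI
#include <mpi.h>
#endif

int main(int argc, char ** argv) {
    int rank = 0, size = 1;
#ifdef GGML_USE_MPI
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
#endif
    // e.g. keep multi-process console output readable by printing from rank 0 only
    if (rank == 0) {
        std::printf("running with %d process(es)\n", size);
    }
#ifdef GGML_USE_MPI
    MPI_Finalize();
#endif
    return 0;
}
```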

Makefile

Lines changed: 9 additions & 0 deletions
@@ -147,6 +147,15 @@ ifndef LLAMA_NO_ACCELERATE
 	endif
 endif # LLAMA_NO_ACCELERATE
 
+ifdef LLAMA_MPI
+	CFLAGS += -DGGML_USE_MPI -Wno-cast-qual
+	CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual
+	OBJS += ggml-mpi.o
+
+ggml-mpi.o: ggml-mpi.c ggml-mpi.h
+	$(CC) $(CFLAGS) -c $< -o $@
+endif # LLAMA_MPI
+
 ifdef LLAMA_OPENBLAS
 	CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
 	LDFLAGS += -lopenblas

README.md

Lines changed: 39 additions & 0 deletions
@@ -268,6 +268,45 @@ Any value larger than 0 will offload the computation to the GPU. For example:
 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1
 ```
 
+### MPI Build
+
+MPI lets you distribute the computation over a cluster of machines. Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine.
+
+First you will need MPI libraries installed on your system. The two most popular (only?) options are [MPICH](https://www.mpich.org) and [OpenMPI](https://www.open-mpi.org). Either can be installed with a package manager (`apt`, Homebrew, MacPorts, etc).
+
+Next you will need to build the project with `LLAMA_MPI` set to true on all machines; if you're building with `make`, you will also need to specify an MPI-capable compiler (when building with CMake, this is configured automatically):
+
+- Using `make`:
+
+  ```bash
+  make CC=mpicc CXX=mpicxx LLAMA_MPI=1
+  ```
+
+- Using `CMake`:
+
+  ```bash
+  cmake -S . -B build -DLLAMA_MPI=ON
+  ```
+
+Once the programs are built, download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines.
+
+Next, ensure password-less SSH access to each machine from the primary host, and create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost".
+
+Here is an example hostfile:
+
+```
+192.168.0.1:2
+malvolio.local:1
+```
+
+The above will distribute the computation across 2 processes on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. Try to keep these numbers small, as inter-process (intra-host) communication is expensive.
+
+Finally, you're ready to run a computation using `mpirun`:
+
+```bash
+mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
+```
+
 ### BLAS Build
 
 Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:
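Note: to make the hostfile slot counts in the MPI section above a little more concrete, here is a rough, hypothetical sketch (not the partitioning scheme actually implemented in this commit) of how a fixed number of model layers could be divided evenly across MPI ranks, which is why each process ends up using roughly an equal amount of RAM:

```cpp
// Hypothetical illustration only: evenly assign n_layers across MPI ranks.
// With the example hostfile (2 slots + 1 slot = 3 ranks) and an assumed
// 32-layer model, ranks would own layers [0,10), [10,21), [21,32).
#include <cstdio>
#include <mpi.h>

int main(int argc, char ** argv) {
    MPI_Init(&argc, &argv);
    int rank = 0, size = 1;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    const int n_layers = 32;                       // assumed model depth
    const int begin = rank * n_layers / size;      // first layer owned by this rank
    const int end = (rank + 1) * n_layers / size;  // one past the last layer it owns

    std::printf("rank %d/%d owns layers [%d, %d)\n", rank, size, begin, end);

    MPI_Finalize();
    return 0;
}
```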

examples/common.cpp

Lines changed: 1 addition & 2 deletions
@@ -267,7 +267,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.lora_adapter = argv[i];
-            params.use_mmap = false;
         } else if (arg == "--lora-base") {
             if (++i >= argc) {
                 invalid_param = true;

@@ -499,7 +498,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, " --mtest compute maximum memory usage\n");
     fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
     fprintf(stderr, " --verbose-prompt print prompt before generation\n");
-    fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
+    fprintf(stderr, " --lora FNAME apply LoRA adapter\n");
     fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
     fprintf(stderr, " -m FNAME, --model FNAME\n");
     fprintf(stderr, " model path (default: %s)\n", params.model.c_str());

examples/embd-input/embd-input-lib.cpp

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
     }
     fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
 
-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);
 
     llama_model * model;
     llama_context * ctx;

examples/embedding/embedding.cpp

Lines changed: 3 additions & 1 deletion
@@ -35,7 +35,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);
 
     llama_model * model;
     llama_context * ctx;

@@ -93,5 +93,7 @@ int main(int argc, char ** argv) {
     llama_free(ctx);
     llama_free_model(model);
 
+    llama_backend_free();
+
     return 0;
 }
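Note: taken together, the example changes in this commit rename `llama_init_backend` to `llama_backend_init` and pair it with a new `llama_backend_free` call at shutdown. Below is a minimal sketch of that lifecycle, using only the calls visible in the diffs above; model loading and evaluation are elided, and the hard-coded `numa` flag stands in for the usual `gpt_params` field:

```cpp
// Sketch of the backend lifecycle used by the examples in this commit.
// Only calls that appear in the diffs are used; loading/evaluation elided.
#include "llama.h"

int main(int argc, char ** argv) {
    bool numa = false;          // normally taken from gpt_params.numa

    llama_backend_init(numa);   // renamed from llama_init_backend()

    llama_model   * model = nullptr;
    llama_context * ctx   = nullptr;
    // ... load the model, create the context, run evaluation ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();       // new in this commit: global backend cleanup

    return 0;
}
```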

examples/main/README.md

Lines changed: 1 addition & 1 deletion
@@ -293,5 +293,5 @@ These options provide extra functionality and customization when running the LLaMA models:
 - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
 - `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
-- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
+- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model. This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.

examples/main/main.cpp

Lines changed: 3 additions & 1 deletion
@@ -105,7 +105,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);
 
     llama_model * model;
     llama_context * ctx;

@@ -671,5 +671,7 @@ int main(int argc, char ** argv) {
     llama_free(ctx);
     llama_free_model(model);
 
+    llama_backend_free();
+
     return 0;
 }

0 commit comments
