Merge remote-tracking branch 'upstream/concedo' · LostRuins/koboldcpp@912e31e · GitHub

Commit 912e31e

Merge remote-tracking branch 'upstream/concedo'
2 parents 74e2703 + ddaa4f2 · commit 912e31e

37 files changed (+6583, -2127 lines)

CMakeLists.txt

Lines changed: 13 additions & 0 deletions
@@ -44,6 +44,7 @@ endif()
 option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
+set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_HIPBLAS "llama: use hipBLAS" ON)
@@ -77,8 +78,11 @@ if (LLAMA_CUBLAS)
     set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)
 
     add_compile_definitions(GGML_USE_CUBLAS)
+    add_compile_definitions(GGML_CUDA_FORCE_DMMV) #non dmmv broken for me
+
     add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
+    add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     if (LLAMA_CUDA_DMMV_F16)
         add_compile_definitions(GGML_CUDA_DMMV_F16)
     endif()
@@ -90,6 +94,15 @@ if (LLAMA_CUBLAS)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
     endif()
 
+    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+        if (LLAMA_CUDA_DMMV_F16)
+            set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics
+        else()
+            set(CMAKE_CUDA_ARCHITECTURES "52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
+        endif()
+    endif()
+    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
+
 else()
     message(WARNING "cuBLAS not found")
 endif()

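For orientation, a minimal sketch of how the new cache variables could be overridden at configure time; the MMV_Y value and the architecture override below are illustrative examples, not settings taken from this commit:

    # hypothetical out-of-source configure and build
    cmake -B build -DLLAMA_CUBLAS=ON -DLLAMA_CUDA_MMV_Y=2 -DCMAKE_CUDA_ARCHITECTURES=61
    cmake --build build -j

If CMAKE_CUDA_ARCHITECTURES is left unset, the new block above defaults it to "61" when LLAMA_CUDA_DMMV_F16 is enabled, or "52;61" otherwise, and reports the chosen value via message(STATUS).
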
Makefile

Lines changed: 3 additions & 1 deletion
@@ -144,16 +144,18 @@ ifdef LLAMA_CUBLAS
     CUBLASLD_FLAGS = -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
     CUBLAS_OBJS = ggml-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
     NVCC = nvcc
-    NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
+    NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_FORCE_DMMV
 ifdef LLAMA_CUDA_DMMV_X
     NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 else
     NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
 endif # LLAMA_CUDA_DMMV_X
 ifdef LLAMA_CUDA_DMMV_Y
+    NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
     NVCCFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y)
 else
     NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
+    NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
 endif # LLAMA_CUDA_DMMV_Y
 ifdef LLAMA_CUDA_DMMV_F16
     NVCCFLAGS += -DGGML_CUDA_DMMV_F16

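Note that the new GGML_CUDA_MMV_Y define is emitted only inside the existing ifdef LLAMA_CUDA_DMMV_Y branch (or pinned to 1 in its else branch), so a hypothetical invocation that overrides it would set both variables; the values below are examples only:

    # illustrative values, not recommendations from the commit
    make LLAMA_CUBLAS=1 LLAMA_CUDA_DMMV_Y=2 LLAMA_CUDA_MMV_Y=2
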
convert.py

Lines changed: 6 additions & 0 deletions
@@ -154,9 +154,15 @@ def guessed(model: 'LazyModel') -> 'Params':
         # try transformer naming first
         if "model.layers.0.self_attn.q_proj.weight" in model:
             n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+        elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
+            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
         else:
             n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
 
+        if n_layer < 1:
+            raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
+                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
+
         n_head=n_embd // 128 # guessed
 
         return Params(

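The layer-counting idiom used in the new baichuan branch mirrors the existing ones: walk indices until the first missing tensor name. A self-contained sketch of that pattern with a made-up weight dictionary (the dictionary below is a toy stand-in, not a real LazyModel):

    import itertools

    # toy stand-in for the model mapping, using the baichuan-style W_pack names
    model = {f"model.layers.{i}.self_attn.W_pack.weight": None for i in range(3)}

    # the first index whose tensor is absent equals the layer count
    n_layer = next(i for i in itertools.count()
                   if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
    assert n_layer == 3
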
examples/alpaca.sh

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 cd `dirname $0`
 cd ..
 
-./main -m ./models/ggml-alpaca-7b-q4.bin \
+./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
        --color \
        -f ./prompts/alpaca.txt \
        --ctx_size 2048 \

examples/common.h

Lines changed: 2 additions & 1 deletion
@@ -31,7 +31,7 @@ struct gpt_params {
     int32_t n_gpu_layers = 0;                       // number of layers to store in VRAM
     int32_t main_gpu = 0;                           // the GPU that is used for scratch and small tensors
     float   tensor_split[LLAMA_MAX_DEVICES] = {0};  // how split tensors should be distributed across GPUs
-    bool    low_vram = 0;                           // if true, reduce VRAM usage at the cost of performance
+    int32_t n_probs = 0;                            // if greater than 0, output the probabilities of top n_probs tokens.
 
     // sampling parameters
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
@@ -59,6 +59,7 @@ struct gpt_params {
     std::string lora_adapter = "";  // lora adapter path
     std::string lora_base = "";     // base model path for the lora adapter
 
+    bool low_vram = false;          // if true, reduce VRAM usage at the cost of performance
     bool memory_f16 = true;         // use f16 instead of f32 for memory kv
     bool random_prompt = false;     // do not randomize prompt if none provided
     bool use_color = false;         // use color to distinguish generations and inputs

examples/embd-input/embd-input-lib.cpp

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
 
     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
 
-    if (params.seed < 0) {
+    if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
     fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);

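Presumably this tracks the upstream switch to an unsigned seed, where "no seed supplied" is signalled by the LLAMA_DEFAULT_SEED sentinel rather than a negative value (a less-than-zero check can never fire on an unsigned type). A rough sketch of the pattern, with the sentinel value assumed here rather than taken from this diff:

    #include <cstdint>
    #include <ctime>

    // stand-in for LLAMA_DEFAULT_SEED; the real constant lives in the llama.cpp headers
    constexpr uint32_t kDefaultSeed = 0xFFFFFFFFu;

    // pick a time-based seed only when the caller left the sentinel in place
    uint32_t resolve_seed(uint32_t requested) {
        return requested == kDefaultSeed ? static_cast<uint32_t>(time(nullptr)) : requested;
    }
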
examples/embedding/embedding.cpp

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@ int main(int argc, char ** argv) {
     params.embedding = true;
 
     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
                 "expect poor results\n", __func__, params.n_ctx);
     }

examples/main/main.cpp

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ int main(int argc, char ** argv) {
     }
 
     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
                 "expect poor results\n", __func__, params.n_ctx);
     } else if (params.n_ctx < 8) {
         fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);

examples/perplexity/perplexity.cpp

Lines changed: 1 addition & 1 deletion
@@ -130,7 +130,7 @@ int main(int argc, char ** argv) {
     params.n_batch = std::min(params.n_batch, params.n_ctx);
 
     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
                 "expect poor results\n", __func__, params.n_ctx);
     }

examples/quantize-stats/quantize-stats.cpp

Lines changed: 7 additions & 7 deletions
@@ -147,7 +147,7 @@ void test_roundtrip_on_chunk(
         const ggml_tensor * layer,
         int64_t offset,
         int64_t chunk_size,
-        const quantize_fns_t & qfns,
+        const ggml_type_traits_t & qfns,
         bool use_reference,
         float * input_scratch,
         char * quantized_scratch,
@@ -163,11 +163,11 @@ void test_roundtrip_on_chunk(
     }
 
     if (use_reference) {
-        qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
+        qfns.from_float_reference(input_scratch, quantized_scratch, chunk_size);
     } else {
-        qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
+        qfns.from_float(input_scratch, quantized_scratch, chunk_size);
     }
-    qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
+    qfns.to_float(quantized_scratch, output_scratch, chunk_size);
 
     update_error_stats(chunk_size, input_scratch, output_scratch, stats);
 }
@@ -177,7 +177,7 @@ void test_roundtrip_on_chunk(
 void test_roundtrip_on_layer(
         std::string & name,
         bool print_layer_stats,
-        const quantize_fns_t & qfns,
+        const ggml_type_traits_t & qfns,
         bool use_reference,
         const ggml_tensor * layer,
         std::vector<float> & input_scratch,
@@ -388,8 +388,8 @@ int main(int argc, char ** argv) {
         if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
             continue;
         }
-        quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
-        if (qfns.quantize_row_q && qfns.dequantize_row_q) {
+        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+        if (qfns.from_float && qfns.to_float) {
             if (params.verbose) {
                 printf("testing %s ...\n", ggml_type_name(type));
             }

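The renamed accessors form the usual quantize/dequantize roundtrip that the hunks above already show: fetch the traits for a type, call from_float, then to_float. A rough, self-contained sketch of that flow, assuming the ggml.h declarations match what the diff uses; the type and buffer sizes below are illustrative:

    #include <vector>
    #include "ggml.h"

    // round-trip a small float buffer through a quantized representation
    void roundtrip_example() {
        std::vector<float> input(256, 0.5f);       // 256 is a multiple of the Q4_0 block size
        std::vector<char>  quantized(1 << 16);     // oversized scratch, for illustration
        std::vector<float> output(input.size());

        ggml_type_traits_t qfns = ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
        if (qfns.from_float && qfns.to_float) {
            qfns.from_float(input.data(), quantized.data(), (int) input.size());
            qfns.to_float(quantized.data(), output.data(), (int) output.size());
        }
    }
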
0 commit comments