Merge remote-tracking branch 'upstream/concedo' · LostRuins/koboldcpp@912e31e · GitHub

Commit 912e31e

Merge remote-tracking branch 'upstream/concedo'
2 parents 74e2703 + ddaa4f2 · commit 912e31e

37 files changed (+6583, -2127 lines)

CMakeLists.txt

Lines changed: 13 additions & 0 deletions
@@ -44,6 +44,7 @@ endif()
 option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
+set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_HIPBLAS "llama: use hipBLAS" ON)
@@ -77,8 +78,11 @@ if (LLAMA_CUBLAS)
     set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)
 
     add_compile_definitions(GGML_USE_CUBLAS)
+    add_compile_definitions(GGML_CUDA_FORCE_DMMV) #non dmmv broken for me
+
     add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
+    add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     if (LLAMA_CUDA_DMMV_F16)
         add_compile_definitions(GGML_CUDA_DMMV_F16)
     endif()
@@ -90,6 +94,15 @@ if (LLAMA_CUBLAS)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
     endif()
 
+    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+        if (LLAMA_CUDA_DMMV_F16)
+            set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics
+        else()
+            set(CMAKE_CUDA_ARCHITECTURES "52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
+        endif()
+    endif()
+    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
+
 else()
     message(WARNING "cuBLAS not found")
 endif()

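For orientation, a minimal sketch of how the new cache variables could be overridden at configure time; the MMV_Y value and the architecture override below are illustrative examples, not settings taken from this commit:

    # hypothetical out-of-source configure and build
    cmake -B build -DLLAMA_CUBLAS=ON -DLLAMA_CUDA_MMV_Y=2 -DCMAKE_CUDA_ARCHITECTURES=61
    cmake --build build -j

If CMAKE_CUDA_ARCHITECTURES is left unset, the new block above defaults it to "61" when LLAMA_CUDA_DMMV_F16 is enabled, or "52;61" otherwise, and reports the chosen value via message(STATUS).
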
Makefile

Lines changed: 3 additions & 1 deletion
@@ -144,16 +144,18 @@ ifdef LLAMA_CUBLAS
     CUBLASLD_FLAGS = -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
     CUBLAS_OBJS = ggml-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
     NVCC = nvcc
-    NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
+    NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_FORCE_DMMV
 ifdef LLAMA_CUDA_DMMV_X
     NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 else
     NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
 endif # LLAMA_CUDA_DMMV_X
 ifdef LLAMA_CUDA_DMMV_Y
+    NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
     NVCCFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y)
 else
     NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
+    NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
 endif # LLAMA_CUDA_DMMV_Y
 ifdef LLAMA_CUDA_DMMV_F16
     NVCCFLAGS += -DGGML_CUDA_DMMV_F16

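Note that the new GGML_CUDA_MMV_Y define is emitted only inside the existing ifdef LLAMA_CUDA_DMMV_Y branch (or pinned to 1 in its else branch), so a hypothetical invocation that overrides it would set both variables; the values below are examples only:

    # illustrative values, not recommendations from the commit
    make LLAMA_CUBLAS=1 LLAMA_CUDA_DMMV_Y=2 LLAMA_CUDA_MMV_Y=2
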
convert.py

Lines changed: 6 additions & 0 deletions
@@ -154,9 +154,15 @@ def guessed(model: 'LazyModel') -> 'Params':
         # try transformer naming first
         if "model.layers.0.self_attn.q_proj.weight" in model:
             n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+        elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
+            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
         else:
             n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
 
+        if n_layer < 1:
+            raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
+                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
+
         n_head=n_embd // 128 # guessed
 
         return Params(

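The layer-counting idiom used in the new baichuan branch mirrors the existing ones: walk indices until the first missing tensor name. A self-contained sketch of that pattern with a made-up weight dictionary (the dictionary below is a toy stand-in, not a real LazyModel):

    import itertools

    # toy stand-in for the model mapping, using the baichuan-style W_pack names
    model = {f"model.layers.{i}.self_attn.W_pack.weight": None for i in range(3)}

    # the first index whose tensor is absent equals the layer count
    n_layer = next(i for i in itertools.count()
                   if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
    assert n_layer == 3
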
examples/alpaca.sh

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 cd `dirname $0`
 cd ..
 
-./main -m ./models/ggml-alpaca-7b-q4.bin \
+./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
        --color \
        -f ./prompts/alpaca.txt \
        --ctx_size 2048 \

examples/common.h

Lines changed: 2 additions & 1 deletion
@@ -31,7 +31,7 @@ struct gpt_params {
     int32_t n_gpu_layers = 0;                       // number of layers to store in VRAM
     int32_t main_gpu = 0;                           // the GPU that is used for scratch and small tensors
     float   tensor_split[LLAMA_MAX_DEVICES] = {0};  // how split tensors should be distributed across GPUs
-    bool    low_vram = 0;                           // if true, reduce VRAM usage at the cost of performance
+    int32_t n_probs = 0;                            // if greater than 0, output the probabilities of top n_probs tokens.
 
     // sampling parameters
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
@@ -59,6 +59,7 @@ struct gpt_params {
     std::string lora_adapter = "";  // lora adapter path
     std::string lora_base = "";     // base model path for the lora adapter
 
+    bool low_vram = false;          // if true, reduce VRAM usage at the cost of performance
     bool memory_f16 = true;         // use f16 instead of f32 for memory kv
     bool random_prompt = false;     // do not randomize prompt if none provided
     bool use_color = false;         // use color to distinguish generations and inputs

examples/embd-input/embd-input-lib.cpp

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
 
     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
 
-    if (params.seed < 0) {
+    if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
     fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);

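Presumably this tracks the upstream switch to an unsigned seed, where "no seed supplied" is signalled by the LLAMA_DEFAULT_SEED sentinel rather than a negative value (a less-than-zero check can never fire on an unsigned type). A rough sketch of the pattern, with the sentinel value assumed here rather than taken from this diff:

    #include <cstdint>
    #include <ctime>

    // stand-in for LLAMA_DEFAULT_SEED; the real constant lives in the llama.cpp headers
    constexpr uint32_t kDefaultSeed = 0xFFFFFFFFu;

    // pick a time-based seed only when the caller left the sentinel in place
    uint32_t resolve_seed(uint32_t requested) {
        return requested == kDefaultSeed ? static_cast<uint32_t>(time(nullptr)) : requested;
    }
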
examples/embedding/embedding.cpp

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@ int main(int argc, char ** argv) {
     params.embedding = true;
 
     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
                 "expect poor results\n", __func__, params.n_ctx);
     }

examples/main/main.cpp

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ int main(int argc, char ** argv) {
     }
 
     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
                 "expect poor results\n", __func__, params.n_ctx);
     } else if (params.n_ctx < 8) {
         fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);

examples/perplexity/perplexity.cpp

Lines changed: 1 addition & 1 deletion
@@ -130,7 +130,7 @@ int main(int argc, char ** argv) {
     params.n_batch = std::min(params.n_batch, params.n_ctx);
 
     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
                 "expect poor results\n", __func__, params.n_ctx);
     }

examples/quantize-stats/quantize-stats.cpp

Lines changed: 7 additions & 7 deletions
@@ -147,7 +147,7 @@ void test_roundtrip_on_chunk(
         const ggml_tensor * layer,
         int64_t offset,
         int64_t chunk_size,
-        const quantize_fns_t & qfns,
+        const ggml_type_traits_t & qfns,
         bool use_reference,
         float * input_scratch,
         char * quantized_scratch,
@@ -163,11 +163,11 @@ void test_roundtrip_on_chunk(
     }
 
     if (use_reference) {
-        qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
+        qfns.from_float_reference(input_scratch, quantized_scratch, chunk_size);
     } else {
-        qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
+        qfns.from_float(input_scratch, quantized_scratch, chunk_size);
     }
-    qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
+    qfns.to_float(quantized_scratch, output_scratch, chunk_size);
 
     update_error_stats(chunk_size, input_scratch, output_scratch, stats);
 }
@@ -177,7 +177,7 @@ void test_roundtrip_on_chunk(
 void test_roundtrip_on_layer(
         std::string & name,
         bool print_layer_stats,
-        const quantize_fns_t & qfns,
+        const ggml_type_traits_t & qfns,
         bool use_reference,
         const ggml_tensor * layer,
         std::vector<float> & input_scratch,
@@ -388,8 +388,8 @@ int main(int argc, char ** argv) {
         if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
             continue;
         }
-        quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
-        if (qfns.quantize_row_q && qfns.dequantize_row_q) {
+        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+        if (qfns.from_float && qfns.to_float) {
             if (params.verbose) {
                 printf("testing %s ...\n", ggml_type_name(type));
             }

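The renamed accessors form the usual quantize/dequantize roundtrip that the hunks above already show: fetch the traits for a type, call from_float, then to_float. A rough, self-contained sketch of that flow, assuming the ggml.h declarations match what the diff uses; the type and buffer sizes below are illustrative:

    #include <vector>
    #include "ggml.h"

    // round-trip a small float buffer through a quantized representation
    void roundtrip_example() {
        std::vector<float> input(256, 0.5f);       // 256 is a multiple of the Q4_0 block size
        std::vector<char>  quantized(1 << 16);     // oversized scratch, for illustration
        std::vector<float> output(input.size());

        ggml_type_traits_t qfns = ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
        if (qfns.from_float && qfns.to_float) {
            qfns.from_float(input.data(), quantized.data(), (int) input.size());
            qfns.to_float(quantized.data(), output.data(), (int) output.size());
        }
    }
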
0 commit comments