LaunchFastllmGemmFp32Int4NoZero优化

黄宇扬 · 黄宇扬 · commit 8278ef00a085 · 2025-02-27T11:03:48.000+08:00
diff --git a/src/devices/cuda/fastllm-cuda.cu b/src/devices/cuda/fastllm-cuda.cu
@@ -2754,7 +2754,7 @@ bool FastllmCudaMatMulFloatInt4Group(const fastllm::Data &input, fastllm::Data &
 }
 
 void LaunchFastllmGemmFp32Int4NoZero(float *input, uint8_t *weight, float *output, float *bias, float *scales, float *mins, int n, int m, int k) {
-    for (int i = 0; i < n; i++) {
+   /* for (int i = 0; i < n; i++) {
         FastllmGemvInt4NoZeroKernel1<64, 1> <<< k, 64 >>>(input + i * m, weight, output + i * k, bias, scales, mins, m, k);
     }
     return;

Original file line number	Diff line number	Diff line change
`@@ -2754,7 +2754,7 @@ bool FastllmCudaMatMulFloatInt4Group(const fastllm::Data &input, fastllm::Data &`
`2754`	`2754`	`}`
`2755`	`2755`
`2756`	`2756`	`void LaunchFastllmGemmFp32Int4NoZero(float *input, uint8_t *weight, float *output, float *bias, float *scales, float *mins, int n, int m, int k) {`
`2757`		`- for (int i = 0; i < n; i++) {`
	`2757`	`+ /* for (int i = 0; i < n; i++) {`
`2758`	`2758`	`FastllmGemvInt4NoZeroKernel1<64, 1> <<< k, 64 >>>(input + i * m, weight, output + i * k, bias, scales, mins, m, k);`
`2759`	`2759`	`}`
`2760`	`2760`	`return;`