Arm AArch64: optimized GEMV and GEMM kernels for q4_0_q8_0, and q8_0_q8_0 quantization by Dibakar · Pull Request #5780 · ggml-org/llama.cpp · GitHub

Arm AArch64: optimized GEMV and GEMM kernels for q4_0_q8_0, and q8_0_q8_0 quantization #5780


Merged: 28 commits, Jul 10, 2024
Changes from 1 commit
Commits (28)
002e36e
Arm AArch64: optimized GEMV and GEMM kernels for q4_0_q8_0, and q8_0_…
Dibakar Feb 28, 2024
340ef07
Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 qu…
Dibakar Apr 22, 2024
81215ff
Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 qu…
Dibakar Apr 23, 2024
6c8d826
Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 qu…
Dibakar Apr 25, 2024
43e1297
Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 qu…
Dibakar Apr 29, 2024
441ab64
Arm AArch64: add copyright claim only to ggml-aarch64.cpp and ggml-aa…
Dibakar Apr 29, 2024
8ee6779
Arm AArch64: minor code refactoring for rebase
Dibakar May 1, 2024
a657246
Arm AArch64: minor code refactoring for resolving a build issue with …
Dibakar May 16, 2024
746b57f
Arm AArch64: minor code refactoring to split the Q4_0_AARC64 type int…
Dibakar May 21, 2024
5d10c21
Arm AArch64: minor code change for resolving a build issue with serve…
Dibakar May 31, 2024
7ac03e5
retrigger checks
Dibakar May 31, 2024
e2c1c47
Arm AArch64: minor code changes for rebase
Dibakar Jun 5, 2024
79b6cdf
Arm AArch64: minor changes to skip the pr#7433 vec_dot code for arm c…
Dibakar Jun 14, 2024
3c1ad5f
Arm AArch64: remove stale LLAMA_QKK_64 from CMakeLists.txt and delete…
Dibakar Jun 14, 2024
a7055b7
Arm AArch64: add reference scalar gemm and gemv, and avoid dynamic me…
Dibakar Jun 18, 2024
cce236b
Arm AArch64: add multithreaded quantization support for the new types…
Dibakar Jun 19, 2024
7a70606
Arm AArch64: minor code refactoring
Dibakar Jun 19, 2024
ffbfabb
Arm AArch64: simplify logic for calling gemm and gemv functions in gg…
Dibakar Jun 23, 2024
cbbfd69
Arm AArch64: minimize changes in ggml_compute_forward_mul_mat
Dibakar Jun 26, 2024
3564644
Arm AArch64: minor code refactoring, and add reference scalar code to…
Dibakar Jul 3, 2024
110d143
Arm AArch64: minor code refactoring
Dibakar Jul 3, 2024
4ff0b22
Arm AArch64: minor code refactoring
Dibakar Jul 6, 2024
42724b4
Arm AArch64: minor code refactoring
Dibakar Jul 8, 2024
e5f4713
rebase on the latest master commit 3fd62a6 and adapt to the new direc…
Dibakar Jul 8, 2024
c2595d0
Arm AArch64: remove a redundant comment
Dibakar Jul 9, 2024
a7abb78
Arm AArch64: add pragma in ggml-aarch64.c to turn -Woverlength-string…
Dibakar Jul 9, 2024
0e84ef1
Arm AArch64: use __aarch64__ check to guard 64-bit neon kernels
Dibakar Jul 9, 2024
c653eb1
Arm AArch64: update docs/build.md README to include compile time flag…
Dibakar Jul 9, 2024
rebase on the latest master commit 3fd62a6 and adapt to the new directory structure
Dibakar committed Jul 8, 2024
commit e5f4713d810c13af60d8fd09400df92ea6a30bdd
10 changes: 9 additions & 1 deletion Makefile
@@ -826,7 +826,8 @@ OBJ_GGML += \
ggml/src/ggml.o \
ggml/src/ggml-alloc.o \
ggml/src/ggml-backend.o \
ggml/src/ggml-quants.o
ggml/src/ggml-quants.o \
ggml/src/ggml-aarch64.o

OBJ_LLAMA = \
src/llama.o \
@@ -959,6 +960,13 @@ ggml/src/ggml-quants.o: \
ggml/src/ggml-common.h
$(CC) $(CFLAGS) -c $< -o $@

ggml/src/ggml-aarch64.o: \
ggml/src/ggml-aarch64.c \
ggml/include/ggml.h \
ggml/src/ggml-aarch64.h \
ggml/src/ggml-common.h
$(CC) $(CFLAGS) -c $< -o $@

ggml/src/ggml-blas.o: \
ggml/src/ggml-blas.cpp \
ggml/include/ggml-blas.h
1 change: 1 addition & 0 deletions ggml/src/CMakeLists.txt
@@ -1153,6 +1153,7 @@ add_library(ggml
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS}
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
ggml-aarch64.c ggml-aarch64.h
)

if (EMSCRIPTEN)
File renamed without changes.
File renamed without changes.
45 changes: 11 additions & 34 deletions ggml/src/ggml.c
@@ -474,18 +474,6 @@ int64_t ggml_cycles_per_ms(void) {
return CLOCKS_PER_SEC/1000;
}

#ifdef GGML_PERF
#define ggml_perf_time_ms() ggml_time_ms()
#define ggml_perf_time_us() ggml_time_us()
#define ggml_perf_cycles() ggml_cycles()
#define ggml_perf_cycles_per_ms() ggml_cycles_per_ms()
#else
#define ggml_perf_time_ms() 0
#define ggml_perf_time_us() 0
#define ggml_perf_cycles() 0
#define ggml_perf_cycles_per_ms() 0
#endif

//
// cross-platform UTF-8 file paths
//
@@ -12272,29 +12260,23 @@ UseGgmlGemm1:;

for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
int64_t i11_processed = 0;
if ((ggml_n_dims(src1) == 2) && from_float_to_mat && gemm) {
for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
4, ne10, interleave_blcksize);
}
i11_processed = ne11 - ne11 % 4;
}
for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
ne10);
}
}
}
for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
int64_t i11_processed = 0;
if ((ggml_n_dims(src1) == 2) && from_float_to_mat && gemm) {
for (int64_t i11 = 0; i11 < ne11 - ne11 % 4; i11 += 4) {
from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, 4, ne10, interleave_blcksize);
wdata += row_size * 4;
}
i11_processed = ne11 - ne11 % 4;
}
for (int64_t i11 = i11_processed; i11 < ne11; ++i11) {
from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
wdata += row_size;
}
}
}
}

if (ith == 0) {
// Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
@@ -12368,11 +12350,6 @@ UseGgmlGemm2:;
const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;

// The first chunk comes from our thread_id, the rest will get auto-assigned.
int current_chunk = ith;
//if (ith == 0)
// printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d. Fp Ops/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1);

if ((ggml_n_dims(src0) == 2) && gemv) {
const void * src1_wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const size_t src1_col_stride = ggml_is_contiguous(src1) || src1->type != vec_dot_type ? ggml_row_size(vec_dot_type, ne10) : nb11;
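
Note on the reworked quantization loop in ggml_compute_forward_mul_mat shown above: when src1 is 2-D and the optimized GEMM kernel is available, each thread quantizes interleaved blocks of four src1 rows via from_float_to_mat, and the leftover rows (ne11 % 4) fall back to per-row quantization with from_float_to_vec_dot using the same thread stride. The standalone C sketch below only illustrates that row-partitioning pattern; it is not the ggml implementation, the function name show_rows_for_thread is invented for illustration, and the printf calls stand in for the actual quantization calls.

// Standalone sketch (assumed names, not ggml code): how nth threads split
// ne11 src1 rows into interleaved 4-row blocks, with the remainder handled
// one row at a time using the same stride.
#include <stdint.h>
#include <stdio.h>

static void show_rows_for_thread(int64_t ne11, int ith, int nth) {
    // 4-row blocks: stands in for the from_float_to_mat path in the diff.
    for (int64_t i11 = (int64_t) ith * 4; i11 < ne11 - ne11 % 4; i11 += (int64_t) nth * 4) {
        printf("thread %d: rows %lld..%lld quantized as one 4-row block\n",
               ith, (long long) i11, (long long) (i11 + 3));
    }
    // Remainder rows: stands in for the per-row from_float_to_vec_dot path.
    const int64_t i11_processed = ne11 - ne11 % 4;
    for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
        printf("thread %d: row %lld quantized individually\n", ith, (long long) i11);
    }
}

int main(void) {
    const int64_t ne11 = 10; // src1 rows
    const int nth = 2;       // threads
    for (int ith = 0; ith < nth; ++ith) {
        show_rows_for_thread(ne11, ith, nth);
    }
    return 0;
}

With ne11 = 10 and nth = 2, this prints one 4-row block per thread (rows 0..3 and 4..7) followed by rows 8 and 9 handled individually.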