common/llama: align structures for reduce cacheline size on 64bit platforms by GermanAizek · Pull Request #13710 · ggml-org/llama.cpp · GitHub

common/llama: align structures for reduce cacheline size on 64bit platforms #13710

Open
GermanAizek wants to merge 1 commit into master
Conversation

GermanAizek
Contributor

This PR decreases the cost of copying, moving, and creating these structures, but only on common 64-bit processors, due to their 8-byte data alignment.

The smaller a structure or class, the more likely it fits into the CPU cache. Most processors are 64-bit already, so the change won't make anything worse.

Pahole example output:

  • The comment /* XXX {n} bytes hole, try to pack */ shows where optimization is possible by rearranging the order of fields in structures and classes.
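
Output like the dumps below can be reproduced with the pahole tool (from the dwarves package) on a binary built with debug info; the library path used here is only an illustrative assumption, not a fixed location:

  pahole -C llm_graph_context build/bin/libllama.so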

Master branch

struct llm_graph_context {
        const enum llm_arch        arch;                 /*     0     4 */

        /* XXX 4 bytes hole, try to pack */

        const struct llama_hparams  & hparams;           /*     8     8 */
        const struct llama_cparams  & cparams;           /*    16     8 */
        const struct llama_ubatch  & ubatch;             /*    24     8 */
        const int64_t              n_embd;               /*    32     8 */
        const int64_t              n_layer;              /*    40     8 */
        const int64_t              n_rot;                /*    48     8 */
        const int64_t              n_ctx;                /*    56     8 */
        /* --- cacheline 1 boundary (64 bytes) --- */
        const int64_t              n_head;               /*    64     8 */
        const int64_t              n_head_kv;            /*    72     8 */
        const int64_t              n_embd_head_k;        /*    80     8 */
        const int64_t              n_embd_k_gqa;         /*    88     8 */
        const int64_t              n_embd_head_v;        /*    96     8 */
        const int64_t              n_embd_v_gqa;         /*   104     8 */
        const int64_t              n_expert;             /*   112     8 */
        const int64_t              n_expert_used;        /*   120     8 */
        /* --- cacheline 2 boundary (128 bytes) --- */
        const float                freq_base;            /*   128     4 */
        const float                freq_scale;           /*   132     4 */
        const float                ext_factor;           /*   136     4 */
        const float                attn_factor;          /*   140     4 */
        const float                beta_fast;            /*   144     4 */
        const float                beta_slow;            /*   148     4 */
        const float                norm_eps;             /*   152     4 */
        const float                norm_rms_eps;         /*   156     4 */
        const int32_t              n_tokens;             /*   160     4 */
        const int32_t              n_outputs;            /*   164     4 */
        const int32_t              n_ctx_orig;           /*   168     4 */
        const enum llama_pooling_type pooling_type;      /*   172     4 */
        const enum llama_rope_type rope_type;            /*   176     4 */

        /* XXX 4 bytes hole, try to pack */

        struct ggml_context *      ctx0;                 /*   184     8 */
        /* --- cacheline 3 boundary (192 bytes) --- */
        ggml_backend_sched_t       sched;                /*   192     8 */
        ggml_backend_t             backend_cpu;          /*   200     8 */
        const struct llama_adapter_cvec  * cvec;         /*   208     8 */
        const llama_adapter_loras  * loras;              /*   216     8 */
        const class llama_memory_i  * memory;            /*   224     8 */
        const struct llama_cross  * cross;               /*   232     8 */
        const llm_graph_cb  &      cb_func;              /*   240     8 */
        class unique_ptr<llm_graph_result, std::default_delete<llm_graph_result> > res; /*   248     8 */

        /* size: 256, cachelines: 4, members: 38 */
        /* sum members: 248, holes: 2, sum holes: 8 */
};

struct llm_graph_params {
        struct ggml_context *      ctx;                  /*     0     8 */
        const enum llm_arch        arch;                 /*     8     4 */

        /* XXX 4 bytes hole, try to pack */

        const struct llama_hparams  & hparams;           /*    16     8 */
        const struct llama_cparams  & cparams;           /*    24     8 */
        const struct llama_ubatch  & ubatch;             /*    32     8 */
        ggml_backend_sched_t       sched;                /*    40     8 */
        ggml_backend_t             backend_cpu;          /*    48     8 */
        const struct llama_adapter_cvec  * cvec;         /*    56     8 */
        /* --- cacheline 1 boundary (64 bytes) --- */
        const llama_adapter_loras  * loras;              /*    64     8 */
        const class llama_memory_i  * memory;            /*    72     8 */
        const struct llama_cross  * cross;               /*    80     8 */
        int32_t                    n_outputs;            /*    88     4 */

        /* XXX 4 bytes hole, try to pack */

        const llm_graph_cb  &      cb;                   /*    96     8 */

        /* size: 104, cachelines: 2, members: 13 */
        /* sum members: 96, holes: 2, sum holes: 8 */
        /* last cacheline: 40 bytes */
};

struct llama_model_params {
        ggml_backend_dev_t *       devices;              /*     0     8 */
        const struct llama_model_tensor_buft_override  * tensor_buft_overrides; /*     8     8 */
        int32_t                    n_gpu_layers;         /*    16     4 */
        enum llama_split_mode      split_mode;           /*    20     4 */
        int32_t                    main_gpu;             /*    24     4 */

        /* XXX 4 bytes hole, try to pack */

        const float  *             tensor_split;         /*    32     8 */
        llama_progress_callback    progress_callback;    /*    40     8 */
        void *                     progress_callback_user_data; /*    48     8 */
        const struct llama_model_kv_override  * kv_overrides; /*    56     8 */
        /* --- cacheline 1 boundary (64 bytes) --- */
        bool                       vocab_only;           /*    64     1 */
        bool                       use_mmap;             /*    65     1 */
        bool                       use_mlock;            /*    66     1 */
        bool                       check_tensors;        /*    67     1 */

        /* size: 72, cachelines: 2, members: 13 */
        /* sum members: 64, holes: 1, sum holes: 4 */
        /* padding: 4 */
        /* last cacheline: 8 bytes */
};

This PR

struct llm_graph_context {
        const enum llm_arch        arch;                 /*     0     4 */
        const enum llama_rope_type rope_type;            /*     4     4 */
        const struct llama_hparams  & hparams;           /*     8     8 */
        const struct llama_cparams  & cparams;           /*    16     8 */
        const struct llama_ubatch  & ubatch;             /*    24     8 */
        const int64_t              n_embd;               /*    32     8 */
        const int64_t              n_layer;              /*    40     8 */
        const int64_t              n_rot;                /*    48     8 */
        const int64_t              n_ctx;                /*    56     8 */
        /* --- cacheline 1 boundary (64 bytes) --- */
        const int64_t              n_head;               /*    64     8 */
        const int64_t              n_head_kv;            /*    72     8 */
        const int64_t              n_embd_head_k;        /*    80     8 */
        const int64_t              n_embd_k_gqa;         /*    88     8 */
        const int64_t              n_embd_head_v;        /*    96     8 */
        const int64_t              n_embd_v_gqa;         /*   104     8 */
        const int64_t              n_expert;             /*   112     8 */
        const int64_t              n_expert_used;        /*   120     8 */
        /* --- cacheline 2 boundary (128 bytes) --- */
        const float                freq_base;            /*   128     4 */
        const float                freq_scale;           /*   132     4 */
        const float                ext_factor;           /*   136     4 */
        const float                attn_factor;          /*   140     4 */
        const float                beta_fast;            /*   144     4 */
        const float                beta_slow;            /*   148     4 */
        const float                norm_eps;             /*   152     4 */
        const float                norm_rms_eps;         /*   156     4 */
        const int32_t              n_tokens;             /*   160     4 */
        const int32_t              n_outputs;            /*   164     4 */
        const int32_t              n_ctx_orig;           /*   168     4 */
        const enum llama_pooling_type pooling_type;      /*   172     4 */
        struct ggml_context *      ctx0;                 /*   176     8 */
        ggml_backend_sched_t       sched;                /*   184     8 */
        /* --- cacheline 3 boundary (192 bytes) --- */
        ggml_backend_t             backend_cpu;          /*   192     8 */
        const struct llama_adapter_cvec  * cvec;         /*   200     8 */
        const llama_adapter_loras  * loras;              /*   208     8 */
        const class llama_memory_i  * memory;            /*   216     8 */
        const struct llama_cross  * cross;               /*   224     8 */
        const llm_graph_cb  &      cb_func;              /*   232     8 */
        class unique_ptr<llm_graph_result, std::default_delete<llm_graph_result> > res; /*   240     8 */

        /* size: 248, cachelines: 4, members: 38 */
        /* last cacheline: 56 bytes */
};   /* saved 8 bytes! */


struct llm_graph_params {
        struct ggml_context *      ctx;                  /*     0     8 */
        const enum llm_arch        arch;                 /*     8     4 */
        int32_t                    n_outputs;            /*    12     4 */
        const struct llama_hparams  & hparams;           /*    16     8 */
        const struct llama_cparams  & cparams;           /*    24     8 */
        const struct llama_ubatch  & ubatch;             /*    32     8 */
        ggml_backend_sched_t       sched;                /*    40     8 */
        ggml_backend_t             backend_cpu;          /*    48     8 */
        const struct llama_adapter_cvec  * cvec;         /*    56     8 */
        /* --- cacheline 1 boundary (64 bytes) --- */
        const llama_adapter_loras  * loras;              /*    64     8 */
        const class llama_memory_i  * memory;            /*    72     8 */
        const struct llama_cross  * cross;               /*    80     8 */
        const llm_graph_cb  &      cb;                   /*    88     8 */

        /* size: 96, cachelines: 2, members: 13 */
        /* last cacheline: 32 bytes */
};   /* saved 8 bytes! */

struct llama_model_params {
        ggml_backend_dev_t *       devices;              /*     0     8 */
        const struct llama_model_tensor_buft_override  * tensor_buft_overrides; /*     8     8 */
        int32_t                    n_gpu_layers;         /*    16     4 */
        enum llama_split_mode      split_mode;           /*    20     4 */
        int32_t                    main_gpu;             /*    24     4 */
        bool                       check_tensors;        /*    28     1 */
        bool                       use_mlock;            /*    29     1 */
        bool                       use_mmap;             /*    30     1 */
        bool                       vocab_only;           /*    31     1 */
        const float  *             tensor_split;         /*    32     8 */
        llama_progress_callback    progress_callback;    /*    40     8 */
        void *                     progress_callback_user_data; /*    48     8 */
        const struct llama_model_kv_override  * kv_overrides; /*    56     8 */

        /* size: 64, cachelines: 1, members: 13 */
};   /* saved 8 bytes and 1 cacheline! */
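
To make the reordering technique itself concrete, here is a minimal standalone sketch with a toy struct (hypothetical, not taken from the codebase); the expected sizes assume a typical 64-bit (LP64) ABI where int64_t needs 8-byte alignment:

#include <cstdint>

// Toy example: identical members, different declaration order.
struct toy_unpacked {
    int32_t a;   // offset  0, 4 bytes
    // 4-byte hole: the next member needs 8-byte alignment
    int64_t b;   // offset  8, 8 bytes
    int32_t c;   // offset 16, 4 bytes
    // 4 bytes of tail padding round the size up to a multiple of 8
};               // sizeof == 24 on a typical LP64 target

struct toy_packed {
    int64_t b;   // offset  0, 8 bytes
    int32_t a;   // offset  8, 4 bytes
    int32_t c;   // offset 12, 4 bytes, filling what would otherwise be padding
};               // sizeof == 16 on a typical LP64 target

static_assert(sizeof(toy_unpacked) == 24, "assumes a typical LP64 layout");
static_assert(sizeof(toy_packed)   == 16, "assumes a typical LP64 layout");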

Info about the technique:

https://hpc.rz.rptu.de/Tutorials/AVX/alignment.shtml

https://wr.informatik.uni-hamburg.de/_media/teaching/wintersemester_2013_2014/epc-14-haase-svenhendrik-alignmentinc-presentation.pdf

https://en.wikipedia.org/wiki/Data_structure_alignment

https://stackoverflow.com/a/20882083

https://zijishi.xyz/post/optimization-technique/learning-to-use-data-alignment/

Affected structs:

  • llm_graph_context from 256 to 248 bytes
  • llm_graph_params from 104 to 96 bytes
  • llama_sampler_chain from 48 to 40 bytes
  • llama_model_loader from 328 to 320 bytes (saved 1 cacheline)
  • llama_model_params from 72 to 64 bytes (saved 1 cacheline)
  • common_log_entry from 48 to 40 bytes
  • templates_params from 112 to 96 bytes (saved 16 bytes)
  • common_chat_params from 152 to 144 bytes
  • common_chat_templates_inputs from 136 to 128 bytes (saved 1 cacheline)
  • common_params from 4960 to 4888 bytes (saved 1 cacheline)
  • common_params_sampling from 288 to 280 bytes
  • common_grammar_trigger from 48 to 40 bytes
  • cpu_params from 532 to 528 bytes
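
For illustration only (not part of this PR): a hypothetical compile-time guard next to a struct definition could keep such a size from silently regressing; the threshold assumes a typical 64-bit build where llama_model_params packs into 64 bytes as listed above.

// Hypothetical guard, not in this PR: catch accidental growth past one cache line.
// 64 is the pahole-reported size of llama_model_params on a typical 64-bit build.
static_assert(sizeof(llama_model_params) <= 64,
              "llama_model_params no longer fits in one cache line; check member order");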

@GermanAizek
Contributor Author

@USBhost, if you're not too lazy, can you test it? I'll try to post my results later; my PC configuration is not very powerful.

@ngxson
Collaborator
ngxson commented May 22, 2025

Are you sure this is something we need to handle ourselves? AFAIK all modern compilers already do this under the hood.

@GermanAizek
Contributor Author

@ngxson, a modern compiler may not always perform this optimization, so we make the alignment explicit rather than implied.
Benchmarks would be more interesting to look at; these structures are most likely affected through initialization speed.
Also, I haven't looked at the assembly generated with -O2, -O3, and -Ofast yet.

@ngxson
Collaborator
ngxson commented May 22, 2025

Besides, I don't think these structs are copied that much. Most of the copies inside llama.cpp either copy large buffers (strings, data) or copy tensors. I won't feel convinced unless you can tell me exactly how many milliseconds faster this runs.

@ngxson ngxson removed their request for review May 22, 2025 15:12
@slaren
Member
slaren commented May 22, 2025

C/C++ compilers typically cannot reorder structs. But ultimately, none of these structs are used in performance-sensitive paths, and it is not worth making the code less readable, introducing a breaking change, and risking new bugs for what is essentially going to save one cache miss out of a billion.

@GermanAizek
Contributor Author

AFAIK all modern compilers already do this under the hood

@ngxson,
I've already done this before, and by the benchmarks it really did improve the situation, albeit slightly.
That's why I've now decided to align all the structures completely, in case there are still opportunities left.
#7267

@GermanAizek
Contributor Author
GermanAizek commented May 22, 2025

But ultimately, none of these structs are used in performance-sensitive paths

I also think this will not affect overall performance, but because of the reordered structures the compiler can generate quite different assembly, so it is advisable to test it with a benchmark and go by the benchmark results.
Visually, yes, it hurts code style and breaks the ABI.
But I tried to keep it readable by preserving the comments from previous contributors.

@ngxson
Collaborator
ngxson commented May 22, 2025

If I understand correctly, you went from 877.91 s on master to 878.38 s with this PR - so unless I missed something, this even slows things down and isn't an improvement.

But tbh I think this brings less than 0.1% performance improvement while adding another task for other contributors: making sure the structs stay aligned - it just doesn't seem economically appealing to me.
