llama : add option to override model tensor buffers by slaren · Pull Request #11397 · ggml-org/llama.cpp · GitHub

llama : add option to override model tensor buffers #11397

Merged: 9 commits, Apr 2, 2025
disable pipeline parallelism when there are tensor overrides
slaren committed Apr 2, 2025
commit 2e4e8b13444824f0a7b10bdfc1cab8a859b4daf8
src/llama-context.cpp: 2 additions & 1 deletion

@@ -255,7 +255,8 @@ llama_context::llama_context(
         model.n_devices() > 1 &&
         model.params.n_gpu_layers > (int) model.hparams.n_layer &&
         model.params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
-        cparams.offload_kqv;
+        cparams.offload_kqv &&
+        !model.has_tensor_overrides();

     // pipeline parallelism requires support for async compute and events in all devices
     if (pipeline_parallel) {
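Rationale (inferred from the change, not stated in the commit message): user-specified buffer overrides can place individual weight tensors outside the regular layer-wise split, so the even layer-to-device distribution that pipeline parallelism relies on no longer holds. Condensed as a predicate, the gate after this commit reads as below; this is an illustrative sketch only, as upstream evaluates the expression inline in the llama_context constructor rather than through a helper:

// Sketch only: all of these conditions must hold for llama.cpp to
// enable pipeline parallelism after this commit.
static bool pipeline_parallel_enabled(const llama_model & model, const llama_cparams & cparams) {
    return model.n_devices() > 1                                   // more than one backend device
        && model.params.n_gpu_layers > (int) model.hparams.n_layer // every layer offloaded
        && model.params.split_mode == LLAMA_SPLIT_MODE_LAYER       // layer-wise split across devices
        && cparams.offload_kqv                                     // KV cache offloaded as well
        && !model.has_tensor_overrides();                          // new: no tensor buffer overrides
}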
src/llama-model.cpp: 7 additions & 0 deletions

@@ -379,9 +379,12 @@ struct llama_model::impl {
     layer_dev dev_input = {};
     layer_dev dev_output = {};
     std::vector<layer_dev> dev_layer;
+
+    bool has_tensor_overrides;
 };

 llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
+    pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
 }

 llama_model::~llama_model() {}

@@ -4169,6 +4172,10 @@ ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
     });
 }

+bool llama_model::has_tensor_overrides() const {
+    return pimpl->has_tensor_overrides;
+}
+
 const ggml_tensor * llama_model::get_tensor(const char * name) const {
     auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
         [name](const std::pair<std::string, ggml_tensor *> & it) {
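The constructor change above detects overrides by checking only tensor_buft_overrides[0].pattern: the override array introduced by this PR is terminated by an entry whose pattern is NULL, so a non-NULL first pattern implies at least one real override. A minimal sketch of setting overrides through the C API follows, assuming the llama_model_tensor_buft_override struct ({ const char * pattern; ggml_backend_buffer_type_t buft; }) added by this PR; the pattern string below is purely illustrative:

// Sketch: route tensors whose names match a regex to the CPU buffer type.
// The array must end with a NULL-pattern sentinel, which is what the
// llama_model constructor checks via tensor_buft_overrides[0].pattern.
#include "llama.h"
#include "ggml-backend.h"

llama_model_params make_params_with_overrides(void) {
    static const llama_model_tensor_buft_override overrides[] = {
        { "ffn_.*_exps", ggml_backend_cpu_buffer_type() }, // illustrative pattern
        { NULL, NULL },                                    // sentinel terminates the list
    };

    llama_model_params params = llama_model_default_params();
    params.tensor_buft_overrides = overrides;
    return params;
}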
src/llama-model.h: 2 additions & 0 deletions

@@ -382,6 +382,8 @@ struct llama_model {

     ggml_backend_buffer_type_t select_buft(int il) const;

+    bool has_tensor_overrides() const;
+
     const struct ggml_tensor * get_tensor(const char * name) const;

     // TODO: move this to new llm_arch_model_i interface