server : fix default draft model parameters (ggerganov#10586)
* server : force F16 KV cache for the draft model

ggml-ci

* server : fix draft params

ggml-ci

* server : various params fixes

ggml-ci
ggerganov authored Dec 3, 2024
1 parent 642330a commit 70b98fa

examples/server/server.cpp (14 additions, 3 deletions)
@@ -696,8 +696,9 @@ struct server_context {

             params_dft.devices      = params_base.speculative.devices;
             params_dft.model        = params_base.speculative.model;
-            params_dft.n_ctx        = params_base.speculative.n_ctx;
+            params_dft.n_ctx        = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
+            params_dft.n_parallel   = 1;

             common_init_result llama_init_dft = common_init_from_params(params_dft);
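
The first hunk changes two defaults: when no draft context size is given, the draft model's context now defaults to an equal share of the base context per parallel slot, and the draft model itself is loaded with n_parallel = 1, since each slot later gets its own draft context. A minimal standalone sketch of the defaulting rule (the helper name and the sample values are illustrative, not from the server code):

    #include <cstdio>

    // Hypothetical helper mirroring the ternary in the hunk above: if the user
    // did not set a draft context size (n_ctx_spec == 0), give each of the
    // n_parallel slots an equal share of the base context; otherwise honor the
    // explicit value.
    static int default_draft_n_ctx(int n_ctx_spec, int n_ctx_base, int n_parallel) {
        return n_ctx_spec == 0 ? n_ctx_base / n_parallel : n_ctx_spec;
    }

    int main() {
        printf("%d\n", default_draft_n_ctx(   0, 8192, 4)); // default: 8192/4 = 2048
        printf("%d\n", default_draft_n_ctx(4096, 8192, 4)); // explicit value wins: 4096
        return 0;
    }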

@@ -717,8 +718,14 @@ struct server_context {
                 return false;
             }

-            cparams_dft = common_context_params_to_llama(params_base);
-            cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context);
+            const int n_ctx_dft = llama_n_ctx(llama_init_dft.context);
+
+            cparams_dft = common_context_params_to_llama(params_dft);
+            cparams_dft.n_batch = n_ctx_dft;
+
+            // force F16 KV cache for the draft model for extra performance
+            cparams_dft.type_k = GGML_TYPE_F16;
+            cparams_dft.type_v = GGML_TYPE_F16;

             // the context is not needed - we will create one for each slot
             llama_free(llama_init_dft.context);
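
The second hunk derives the draft context parameters from params_dft rather than params_base (which carries the base model's settings), sizes the draft batch to the draft context, and pins the draft KV cache to F16 even when the base model runs a quantized cache. A sketch of that pinning against the llama.cpp C API; type_k and type_v are real llama_context_params fields, while the standalone setup around them is illustrative:

    #include "llama.h"

    int main() {
        llama_context_params cparams = llama_context_default_params();

        // the draft model's KV cache is small, so F16 costs little extra memory
        // while avoiding the overhead of a quantized cache
        cparams.type_k = GGML_TYPE_F16;
        cparams.type_v = GGML_TYPE_F16;

        return 0;
    }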
@@ -2322,6 +2329,10 @@ struct server_context {
                 continue;
             }

+            if (slot.state != SLOT_STATE_GENERATING) {
+                continue;
+            }
+
             llama_token id = slot.sampled;

             struct common_speculative_params params_spec;
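
The last hunk adds a guard so the speculative-decoding pass only runs for slots that are actively generating; idle slots and slots still processing their prompt are skipped before any drafting work starts. A minimal sketch of the pattern, where SLOT_STATE_GENERATING matches the server's enum but the other states and the slot type are illustrative:

    #include <vector>

    enum slot_state {
        SLOT_STATE_IDLE,
        SLOT_STATE_PROCESSING_PROMPT,
        SLOT_STATE_GENERATING,
    };

    struct slot_t {
        slot_state state;
        int        sampled; // last sampled token id
    };

    // only slots mid-generation reach the drafting step
    static void speculation_pass(std::vector<slot_t> & slots) {
        for (auto & slot : slots) {
            if (slot.state != SLOT_STATE_GENERATING) {
                continue; // nothing to draft for idle / prompt-processing slots
            }
            // ... draft tokens continuing from slot.sampled ...
        }
    }

    int main() {
        std::vector<slot_t> slots = {
            { SLOT_STATE_IDLE,       -1 },
            { SLOT_STATE_GENERATING, 42 },
        };
        speculation_pass(slots);
        return 0;
    }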