llama : llama_perf + option to disable timings during decode
ggml-ci
ggerganov committed Sep 8, 2024
1 parent fbb7fcf commit 471e7e1
Showing 6 changed files with 81 additions and 29 deletions.
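In short: the commit introduces a public llama_perf_data struct and a llama_perf_get() accessor alongside the existing llama_perf_print()/llama_perf_reset(), and threads a new no_perf flag from gpt_params through llama_context_params into llama_cparams so the per-decode ggml_time_us() bookkeeping in llama_synchronize() can be skipped.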
1 change: 1 addition & 0 deletions common/common.cpp
@@ -2810,6 +2810,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
cparams.cb_eval_user_data = params.cb_eval_user_data;
cparams.offload_kqv = !params.no_kv_offload;
cparams.flash_attn = params.flash_attn;
+ cparams.no_perf = params.no_perf;

cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
1 change: 1 addition & 0 deletions common/common.h
@@ -204,6 +204,7 @@ struct gpt_params {
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool flash_attn = false; // flash attention
+ bool no_perf = false; // disable performance metrics (TODO: add llama_arg)

bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool logits_all = false; // return logits for all tokens in the batch
2 changes: 1 addition & 1 deletion common/sampling.cpp
@@ -139,7 +139,7 @@ std::string gpt_sampler_params::print() const {
struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

- lparams.no_perf = false; // TODO: control via params
+ lparams.no_perf = params.no_perf;

auto * result = new gpt_sampler {
/* .params = */ params,
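The flag now flows from gpt_sampler_params into the chain instead of being hard-coded. A minimal sketch of the same wiring against the public sampler-chain API (the true here is illustrative, not a default):

llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
lparams.no_perf = true; // skip timing of individual sampling calls
struct llama_sampler * chain = llama_sampler_chain_init(lparams);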
1 change: 1 addition & 0 deletions common/sampling.h
@@ -39,6 +39,7 @@ struct gpt_sampler_params {
float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = false; // consider newlines as a repeatable token
bool ignore_eos = false;
+ bool no_perf = false; // disable performance metrics

std::vector<enum gpt_sampler_type> samplers = {
GPT_SAMPLER_TYPE_TOP_K,
21 changes: 20 additions & 1 deletion include/llama.h
@@ -343,7 +343,7 @@ extern "C" {
bool embeddings; // if true, extract embeddings (together with logits)
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
- //bool no_perf; // whether to measure performance timings, TODO: implement
+ bool no_perf; // whether to disable performance timing measurements

// Abort callback
// if it returns true, execution of llama_decode() will be aborted
@@ -1168,11 +1168,30 @@ extern "C" {
// NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
//

+ // performance timing information
+ struct llama_perf_data {
+     // llama_context
+     double t_start_ms;
+     double t_load_ms;
+     double t_p_eval_ms;
+     double t_eval_ms;
+
+     int32_t n_p_eval;
+     int32_t n_eval;
+
+     // llama_sampler_chain
+     double t_sample_ms;
+
+     int32_t n_sample;
+ };

enum llama_perf_type {
LLAMA_PERF_TYPE_CONTEXT = 0,
LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1,
};

+ LLAMA_API struct llama_perf_data llama_perf_get(const void * ctx, enum llama_perf_type type);

LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type);
LLAMA_API void llama_perf_reset( void * ctx, enum llama_perf_type type);

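A hedged sketch of consuming the new accessor from application code (assumes an already-created llama_context * ctx; only the declarations shown above are used):

// pull raw timing data instead of logging it
struct llama_perf_data data = llama_perf_get(ctx, LLAMA_PERF_TYPE_CONTEXT);
fprintf(stderr, "eval: %.2f ms over %d tokens\n", data.t_eval_ms, data.n_eval);

// or keep the existing log-and-reset flow
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
llama_perf_reset(ctx, LLAMA_PERF_TYPE_CONTEXT);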
84 changes: 57 additions & 27 deletions src/llama.cpp
@@ -2482,6 +2482,7 @@ struct llama_cparams {
bool causal_attn;
bool offload_kqv;
bool flash_attn;
+ bool no_perf;

enum llama_pooling_type pooling_type;

@@ -6657,8 +6658,6 @@ static bool llm_load_tensors(
bool use_mlock,
llama_progress_callback progress_callback,
void * progress_callback_user_data) {
- model.t_start_us = ggml_time_us();
-
auto & hparams = model.hparams;

model.split_mode = split_mode;
@@ -8589,14 +8588,13 @@ static bool llm_load_tensors(
}
}

- // loading time will be recalculate after the first eval, so
- // we take page faults deferred by mmap() into consideration
- model.t_load_us = ggml_time_us() - model.t_start_us;
return true;
}

// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
+ model.t_start_us = ggml_time_us();
+
try {
llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);

@@ -8658,6 +8656,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
return -1;
}

+ // loading time will be recalculated after the first eval, so
+ // we take page faults deferred by mmap() into consideration
+ model.t_load_us = ggml_time_us() - model.t_start_us;
+
return 0;
}

@@ -17939,6 +17941,7 @@ struct llama_context_params llama_context_default_params() {
/*.embeddings =*/ false,
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
+ /*.no_perf =*/ true,
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
};
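Note the library-level default flips to no_perf = true: a bare llama_context no longer accumulates decode timings unless the caller opts in, while the common layer keeps them on by default (gpt_params.no_perf = false above). Opting in directly would look like this sketch (context creation elided):

struct llama_context_params cparams = llama_context_default_params();
cparams.no_perf = false; // collect prompt/eval timings during decode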
@@ -18149,6 +18152,7 @@ struct llama_context * llama_new_context_with_model(
cparams.embeddings = params.embeddings;
cparams.offload_kqv = params.offload_kqv;
cparams.flash_attn = params.flash_attn;
+ cparams.no_perf = params.no_perf;
cparams.pooling_type = params.pooling_type;

cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
@@ -20067,10 +20071,14 @@ void llama_synchronize(struct llama_context * ctx) {

// add the evaluation to the stats
if (ctx->n_queued_tokens == 1) {
- ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+ if (!ctx->cparams.no_perf) {
+     ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+ }
ctx->n_eval++;
} else if (ctx->n_queued_tokens > 1) {
- ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+ if (!ctx->cparams.no_perf) {
+     ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+ }
ctx->n_p_eval += ctx->n_queued_tokens;
}
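Only the ggml_time_us() accounting is gated by the flag here; n_eval and n_p_eval still advance, so token counts stay accurate even with timing collection disabled.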

@@ -20677,39 +20685,61 @@ const char * llama_print_system_info(void) {
return s.c_str();
}

- void llama_perf_print(const void * ctx, enum llama_perf_type type) {
+ llama_perf_data llama_perf_get(const void * ctx, enum llama_perf_type type) {
+     llama_perf_data data = {};
+
+     if (ctx == nullptr) {
+         return data;
+     }

switch (type) {
case LLAMA_PERF_TYPE_CONTEXT:
{
const auto * p = (const struct llama_context *) ctx;

- const double t_start_ms = 1e-3 * p->t_start_us;
- const double t_end_ms = 1.00 * ggml_time_ms();
- const double t_load_ms = 1e-3 * p->t_load_us;
- const double t_p_eval_ms = 1e-3 * p->t_p_eval_us;
- const double t_eval_ms = 1e-3 * p->t_eval_us;
+ data.t_start_ms = 1e-3 * p->t_start_us;
+ data.t_load_ms = 1e-3 * p->t_load_us;
+ data.t_p_eval_ms = 1e-3 * p->t_p_eval_us;
+ data.t_eval_ms = 1e-3 * p->t_eval_us;
+ data.n_p_eval = std::max(1, p->n_p_eval);
+ data.n_eval = std::max(1, p->n_eval);
} break;
case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
{
const auto * smpl = (const struct llama_sampler *) ctx;
const auto * p = (const struct llama_sampler_chain *) smpl->ctx;

- const int32_t n_p_eval = std::max(0, p->n_p_eval);
- const int32_t n_eval = std::max(1, p->n_eval);
+ data.t_sample_ms = 1e-3 * p->t_sample_us;
+ data.n_sample = std::max(0, p->n_sample);
} break;
default:
GGML_ABORT("invalid perf type");
}

+     return data;
+ }

LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, t_load_ms);
void llama_perf_print(const void * ctx, enum llama_perf_type type) {
switch (type) {
case LLAMA_PERF_TYPE_CONTEXT:
{
const auto data = llama_perf_get(ctx, type);

const double t_end_ms = 1e-3 * ggml_time_us();

LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
- __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval);
+ __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
- __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval);
- LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval));
+ __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+ LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
} break;
case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
{
- const auto * smpl = (const struct llama_sampler *) ctx;
- const auto * p = (const struct llama_sampler_chain *) smpl->ctx;
-
- const double t_sampler_ms = 1e-3 * p->t_sample_us;
-
- const int32_t n_sampler = std::max(0, p->n_sample);
+ const auto data = llama_perf_get(ctx, type);

LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
- __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler);
+ __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
} break;
default:
GGML_ABORT("invalid perf type");
@@ -20729,7 +20759,7 @@ void llama_perf_reset(void * ctx, enum llama_perf_type type) {
case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
{
auto * smpl = (struct llama_sampler *) ctx;
- auto * p = (struct llama_sampler_chain *) smpl->ctx;
+ auto * p    = (struct llama_sampler_chain *) smpl->ctx;

p->t_sample_us = p->n_sample = 0;
} break;
