llama : llama_perf + option to disable timings during decode #9355

Merged: 9 commits, Sep 13, 2024
9 changes: 9 additions & 0 deletions common/common.cpp
@@ -995,6 +995,14 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
params.flash_attn = true;
}
).set_env("LLAMA_ARG_FLASH_ATTN"));
add_opt(llama_arg(
{"--no-perf"},
format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
[](gpt_params & params) {
params.no_perf = true;
params.sparams.no_perf = true;
}
).set_env("LLAMA_ARG_NO_PERF"));
add_opt(llama_arg(
{"-p", "--prompt"}, "PROMPT",
ex == LLAMA_EXAMPLE_MAIN
@@ -2810,6 +2818,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.cb_eval_user_data = params.cb_eval_user_data;
cparams.offload_kqv = !params.no_kv_offload;
cparams.flash_attn = params.flash_attn;
cparams.no_perf = params.no_perf;

cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
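(Not part of the diff: a minimal sketch, assuming the common helpers shown above, of how the new flag propagates from gpt_params into the context parameters. The same switch is exposed on the command line as --no-perf.)

#include "common.h"

// Illustrative only: build context params with the internal timings disabled,
// the programmatic equivalent of passing --no-perf.
static llama_context_params cparams_without_perf() {
    gpt_params params;
    params.no_perf         = true;  // context-level timings (llama_context)
    params.sparams.no_perf = true;  // sampler-chain timings

    llama_context_params cparams = llama_context_params_from_gpt_params(params);
    // cparams.no_perf is now true, so a context created from it skips the
    // timing accumulation in llama_synchronize()
    return cparams;
}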
1 change: 1 addition & 0 deletions common/common.h
@@ -204,6 +204,7 @@ struct gpt_params {
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool flash_attn = false; // flash attention
bool no_perf = false; // disable internal libllama performance timings

bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool logits_all = false; // return logits for all tokens in the batch
2 changes: 1 addition & 1 deletion common/sampling.cpp
@@ -139,7 +139,7 @@ std::string gpt_sampler_params::print() const {
struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

lparams.no_perf = false; // TODO: control via params
lparams.no_perf = params.no_perf;

auto * result = new gpt_sampler {
/* .params = */ params,
1 change: 1 addition & 0 deletions common/sampling.h
@@ -39,6 +39,7 @@ struct gpt_sampler_params {
float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = false; // consider newlines as a repeatable token
bool ignore_eos = false;
bool no_perf = false; // disable performance metrics

std::vector<enum gpt_sampler_type> samplers = {
GPT_SAMPLER_TYPE_TOP_K,
21 changes: 20 additions & 1 deletion include/llama.h
@@ -343,7 +343,7 @@ extern "C" {
bool embeddings; // if true, extract embeddings (together with logits)
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
//bool no_perf; // whether to measure performance timings, TODO: implement
bool no_perf; // whether to measure performance timings

// Abort callback
Comment on lines 344 to 348

Owner (author): This is a minor libllama API breaking change due to the addition of the no_perf parameter.

Collaborator: I don't think this will be a breaking change, since struct llama_context_params is expected to be created by llama_context_default_params(), right?
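(Sketch of the pattern referred to above; the field values are illustrative, not from the PR. A caller that starts from llama_context_default_params() picks up the new field's default automatically, so only code that fills in llama_context_params by hand would need to change.)

#include "llama.h"

// Illustrative caller: unaffected by the new no_perf field because it starts from the defaults.
static llama_context * make_ctx(llama_model * model) {
    llama_context_params cparams = llama_context_default_params(); // no_perf initialized here
    cparams.n_ctx      = 4096;   // example overrides
    cparams.flash_attn = true;
    return llama_new_context_with_model(model, cparams);
}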


// if it returns true, execution of llama_decode() will be aborted
@@ -1168,11 +1168,30 @@ extern "C" {
// NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
//

// performance timing information
struct llama_perf_data {
// llama_context
double t_start_ms;
double t_load_ms;
double t_p_eval_ms;
double t_eval_ms;

int32_t n_p_eval;
int32_t n_eval;

// llama_sampler_chain
double t_sample_ms;

int32_t n_sample;
};

enum llama_perf_type {
LLAMA_PERF_TYPE_CONTEXT = 0,
LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1,
};

LLAMA_API struct llama_perf_data llama_perf_get(const void * ctx, enum llama_perf_type type);
Collaborator: I think it would be preferable to have two separate functions, just to remove the possibility of calling it with the wrong type of pointer.
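(A sketch of the split being suggested; these names and structs are hypothetical and not part of this PR. Taking the concrete pointer types lets the compiler reject a call made with the wrong kind of object.)

// Hypothetical alternative API shape, for illustration only:
struct llama_perf_context_data {
    double  t_start_ms, t_load_ms, t_p_eval_ms, t_eval_ms;
    int32_t n_p_eval, n_eval;
};

struct llama_perf_sampler_data {
    double  t_sample_ms;
    int32_t n_sample;
};

LLAMA_API struct llama_perf_context_data llama_perf_context(const struct llama_context * ctx);
LLAMA_API struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * chain);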


LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type);
LLAMA_API void llama_perf_reset( void * ctx, enum llama_perf_type type);
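(Not part of the diff: a short usage sketch for the new accessor, computing throughput numbers directly instead of parsing the printed log.)

#include "llama.h"

// Illustrative helpers: tokens/second straight from llama_perf_data.
static double decode_tokens_per_second(const struct llama_context * lctx) {
    const llama_perf_data data = llama_perf_get(lctx, LLAMA_PERF_TYPE_CONTEXT);
    return data.t_eval_ms > 0.0 ? 1e3 * data.n_eval / data.t_eval_ms : 0.0;
}

// smpl must be a sampler chain (e.g. created with llama_sampler_chain_init()).
static double sample_tokens_per_second(const struct llama_sampler * smpl) {
    const llama_perf_data data = llama_perf_get(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
    return data.t_sample_ms > 0.0 ? 1e3 * data.n_sample / data.t_sample_ms : 0.0;
}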

84 changes: 57 additions & 27 deletions src/llama.cpp
@@ -2482,6 +2482,7 @@ struct llama_cparams {
bool causal_attn;
bool offload_kqv;
bool flash_attn;
bool no_perf;

enum llama_pooling_type pooling_type;

@@ -6657,8 +6658,6 @@ static bool llm_load_tensors(
bool use_mlock,
llama_progress_callback progress_callback,
void * progress_callback_user_data) {
model.t_start_us = ggml_time_us();

auto & hparams = model.hparams;

model.split_mode = split_mode;
@@ -8589,14 +8588,13 @@ static bool llm_load_tensors(
}
}

// loading time will be recalculate after the first eval, so
// we take page faults deferred by mmap() into consideration
model.t_load_us = ggml_time_us() - model.t_start_us;
return true;
}

// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
model.t_start_us = ggml_time_us();

try {
llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);

@@ -8658,6 +8656,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
return -1;
}

// loading time will be recalculate after the first eval, so
// we take page faults deferred by mmap() into consideration
model.t_load_us = ggml_time_us() - model.t_start_us;

return 0;
}

@@ -17939,6 +17941,7 @@ struct llama_context_params llama_context_default_params() {
/*.embeddings =*/ false,
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.no_perf =*/ true,
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
};
@@ -18149,6 +18152,7 @@ struct llama_context * llama_new_context_with_model(
cparams.embeddings = params.embeddings;
cparams.offload_kqv = params.offload_kqv;
cparams.flash_attn = params.flash_attn;
cparams.no_perf = params.no_perf;
cparams.pooling_type = params.pooling_type;

cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
@@ -20067,10 +20071,14 @@ void llama_synchronize(struct llama_context * ctx) {

// add the evaluation to the stats
if (ctx->n_queued_tokens == 1) {
ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
if (!ctx->cparams.no_perf) {
ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
}
ctx->n_eval++;
} else if (ctx->n_queued_tokens > 1) {
ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
if (!ctx->cparams.no_perf) {
ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
}
ctx->n_p_eval += ctx->n_queued_tokens;
}

@@ -20677,39 +20685,61 @@ const char * llama_print_system_info(void) {
return s.c_str();
}

void llama_perf_print(const void * ctx, enum llama_perf_type type) {
llama_perf_data llama_perf_get(const void * ctx, enum llama_perf_type type) {
llama_perf_data data = {};

if (ctx == nullptr) {
return data;
}

switch (type) {
case LLAMA_PERF_TYPE_CONTEXT:
{
const auto * p = (const struct llama_context *) ctx;

const double t_start_ms = 1e-3 * p->t_start_us;
const double t_end_ms = 1.00 * ggml_time_ms();
const double t_load_ms = 1e-3 * p->t_load_us;
const double t_p_eval_ms = 1e-3 * p->t_p_eval_us;
const double t_eval_ms = 1e-3 * p->t_eval_us;
data.t_start_ms = 1e-3 * p->t_start_us;
data.t_load_ms = 1e-3 * p->t_load_us;
data.t_p_eval_ms = 1e-3 * p->t_p_eval_us;
data.t_eval_ms = 1e-3 * p->t_eval_us;
data.n_p_eval = std::max(1, p->n_p_eval);
data.n_eval = std::max(1, p->n_eval);
} break;
case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
{
const auto * smpl = (const struct llama_sampler *) ctx;
const auto * p = (const struct llama_sampler_chain *) smpl->ctx;

const int32_t n_p_eval = std::max(0, p->n_p_eval);
const int32_t n_eval = std::max(1, p->n_eval);
data.t_sample_ms = 1e-3 * p->t_sample_us;
data.n_sample = std::max(0, p->n_sample);
} break;
default:
GGML_ABORT("invalid perf type");
}

return data;
}

LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, t_load_ms);
void llama_perf_print(const void * ctx, enum llama_perf_type type) {
switch (type) {
case LLAMA_PERF_TYPE_CONTEXT:
{
const auto data = llama_perf_get(ctx, type);

const double t_end_ms = 1e-3 * ggml_time_us();

LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval);
__func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval);
LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval));
__func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
} break;
case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
{
const auto * smpl = (const struct llama_sampler *) ctx;
const auto * p = (const struct llama_sampler_chain *) smpl->ctx;

const double t_sampler_ms = 1e-3 * p->t_sample_us;

const int32_t n_sampler = std::max(0, p->n_sample);
const auto data = llama_perf_get(ctx, type);

LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler);
__func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
} break;
default:
GGML_ABORT("invalid perf type");
@@ -20729,7 +20759,7 @@ void llama_perf_reset(void * ctx, enum llama_perf_type type) {
case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
{
auto * smpl = (struct llama_sampler *) ctx;
auto * p = (struct llama_sampler_chain *) smpl->ctx;
auto * p = (struct llama_sampler_chain *) smpl->ctx;

p->t_sample_us = p->n_sample = 0;
} break;
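(For completeness, a sketch of the typical example-program flow with the reworked API; lctx and smpl are assumed to be an initialized context and a llama_sampler chain built with llama_sampler_chain_init().)

#include "llama.h"

static void report_and_reset(struct llama_context * lctx, struct llama_sampler * smpl) {
    // Report the accumulated stats. With cparams.no_perf set, the prompt/eval times
    // stay at zero while the token counters still advance (see llama_synchronize above).
    llama_perf_print(lctx, LLAMA_PERF_TYPE_CONTEXT);
    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);

    // Clear the counters and timers before the next run.
    llama_perf_reset(lctx, LLAMA_PERF_TYPE_CONTEXT);
    llama_perf_reset(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
}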