llama : llama_perf + option to disable timings during decode #9355

Merged: 9 commits, Sep 13, 2024
common/arg.cpp: 8 additions & 0 deletions
@@ -725,6 +725,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
params.prompt = value;
}
));
+ add_opt(llama_arg(
+ {"--no-perf"},
+ format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
+ [](gpt_params & params) {
+ params.no_perf = true;
+ params.sparams.no_perf = true;
+ }
+ ).set_env("LLAMA_ARG_NO_PERF"));
add_opt(llama_arg(
{"-f", "--file"}, "FNAME",
"a file containing the prompt (default: none)",
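The new option follows the standard llama_arg pattern used throughout common/arg.cpp. As a hedged illustration of that same shape, here is a hypothetical sibling option; the flag, its help text, and the env var name are illustrative assumptions, not part of this PR, and it presumes the existing gpt_params::warmup field:

```cpp
// Hypothetical option registered inside gpt_params_parser_init(), where
// add_opt() and llama_arg are in scope, mirroring --no-perf above.
add_opt(llama_arg(
    {"--no-warmup"},
    "skip the warmup run (hypothetical example)",
    [](gpt_params & params) {
        params.warmup = false; // assumes the existing gpt_params::warmup field
    }
).set_env("LLAMA_ARG_NO_WARMUP"));
```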
common/common.cpp: 2 additions & 1 deletion
@@ -828,7 +828,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
}
llama_kv_cache_clear(lctx);
llama_synchronize(lctx);
- llama_perf_reset(lctx, LLAMA_PERF_TYPE_CONTEXT);
+ llama_perf_reset_context(lctx);
}

iparams.model = model;
@@ -924,6 +924,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
cparams.cb_eval_user_data = params.cb_eval_user_data;
cparams.offload_kqv = !params.no_kv_offload;
cparams.flash_attn = params.flash_attn;
+ cparams.no_perf = params.no_perf;

cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
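Because llama_context_params is obtained from llama_context_default_params(), existing callers that never touch no_perf keep working unchanged, which is also the compatibility point raised in the review thread further down. A minimal, hedged sketch of direct libllama usage with timings disabled (the model path is a placeholder; error handling omitted):

```cpp
#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path

    llama_context_params cparams = llama_context_default_params();
    cparams.no_perf = true; // new in this PR: do not accumulate perf timings during decode

    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // ... tokenize and llama_decode() as usual ...

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}
```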
common/common.h: 2 additions & 0 deletions
@@ -124,6 +124,7 @@ struct gpt_sampler_params {
float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = false; // consider newlines as a repeatable token
bool ignore_eos = false;
+ bool no_perf = false; // disable performance metrics

std::vector<enum gpt_sampler_type> samplers = {
GPT_SAMPLER_TYPE_TOP_K,
@@ -246,6 +247,7 @@ struct gpt_params {
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool flash_attn = false; // flash attention
+ bool no_perf = false; // disable performance metrics

bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool logits_all = false; // return logits for all tokens in the batch
common/sampling.cpp: 3 additions & 3 deletions
@@ -142,7 +142,7 @@ std::string gpt_sampler_params::print() const {
struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

- lparams.no_perf = false; // TODO: control via params
+ lparams.no_perf = params.no_perf;

auto * result = new gpt_sampler {
/* .params = */ params,
@@ -257,10 +257,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) {
// TODO: measure grammar performance

if (gsmpl) {
- llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+ llama_perf_print_sampler(gsmpl->chain);
}
if (ctx) {
- llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+ llama_perf_print_context(ctx);
}
}

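gpt_sampler_init() now simply forwards the user's choice into the sampler chain. A hedged sketch of the equivalent setup against the bare libllama sampler API (the helper name is mine, and the greedy sampler is only a placeholder for a real chain):

```cpp
#include "llama.h"

// Build a sampler chain that skips sampling-time measurements,
// mirroring what gpt_sampler_init() does when sparams.no_perf is set.
llama_sampler * make_quiet_sampler() {
    llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
    lparams.no_perf = true;

    llama_sampler * smpl = llama_sampler_chain_init(lparams);
    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
    return smpl;
}
```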
examples/batched-bench/batched-bench.cpp: 1 addition & 1 deletion
@@ -209,7 +209,7 @@ int main(int argc, char ** argv) {
}

LOG_TEE("\n");
- llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+ llama_perf_print_context(ctx);

llama_batch_free(batch);

examples/batched.swift/Sources/main.swift: 2 additions & 2 deletions
@@ -200,8 +200,8 @@ let t_main_end = ggml_time_us()

print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n")

- llama_perf_print(UnsafeRawPointer(context), LLAMA_PERF_TYPE_CONTEXT)
- llama_perf_print(UnsafeRawPointer(smpl), LLAMA_PERF_TYPE_SAMPLER_CHAIN)
+ llama_perf_print_sampler(smpl)
+ llama_perf_print_context(context)

private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
let utf8Count = text.utf8.count
examples/batched/batched.cpp: 2 additions & 2 deletions
@@ -229,8 +229,8 @@ int main(int argc, char ** argv) {
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

LOG_TEE("\n");
- llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
- llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+ llama_perf_print_sampler(smpl);
+ llama_perf_print_context(ctx);

fprintf(stderr, "\n");

examples/embedding/embedding.cpp: 1 addition & 1 deletion
@@ -308,7 +308,7 @@ int main(int argc, char ** argv) {
}

LOG_TEE("\n");
- llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+ llama_perf_print_context(ctx);

// clean up
llama_batch_free(batch);
examples/eval-callback/eval-callback.cpp: 1 addition & 1 deletion
@@ -182,7 +182,7 @@ int main(int argc, char ** argv) {
}

LOG_TEE("\n");
- llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+ llama_perf_print_context(ctx);

llama_free(ctx);
llama_free_model(model);
examples/imatrix/imatrix.cpp: 1 addition & 1 deletion
@@ -637,7 +637,7 @@ int main(int argc, char ** argv) {
g_collector.save_imatrix();

LOG_TEE("\n");
- llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+ llama_perf_print_context(ctx);

llama_free(ctx);
llama_free_model(model);
examples/llama-bench/llama-bench.cpp: 1 addition & 1 deletion
@@ -1630,7 +1630,7 @@ int main(int argc, char ** argv) {
fflush(p_err->fout);
}

- llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+ llama_perf_print_context(ctx);

llama_free(ctx);

examples/llava/llava-cli.cpp: 2 additions & 2 deletions
@@ -308,7 +308,7 @@ int main(int argc, char ** argv) {
// process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt);

- llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
+ llama_perf_print_context(ctx_llava->ctx_llama);
llava_image_embed_free(image_embed);
ctx_llava->model = NULL;
llava_free(ctx_llava);
@@ -325,7 +325,7 @@ int main(int argc, char ** argv) {
// process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt);

- llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
+ llama_perf_print_context(ctx_llava->ctx_llama);
llava_image_embed_free(image_embed);
ctx_llava->model = NULL;
llava_free(ctx_llava);
examples/llava/minicpmv-cli.cpp: 1 addition & 1 deletion
@@ -319,7 +319,7 @@ int main(int argc, char ** argv) {
}
}
printf("\n");
- llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
+ llama_perf_print_context(ctx_llava->ctx_llama);

ctx_llava->model = NULL;
llava_free(ctx_llava);
examples/lookup/lookup.cpp: 1 addition & 2 deletions
@@ -240,8 +240,7 @@ int main(int argc, char ** argv){
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);

LOG_TEE("\ntarget:\n\n");
- llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
- llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+ gpt_perf_print(ctx, smpl);

gpt_sampler_free(smpl);

examples/parallel/parallel.cpp: 1 addition & 1 deletion
@@ -415,7 +415,7 @@ int main(int argc, char ** argv) {
LOG_TEE("\n");

// TODO: print sampling/grammar timings for all clients
- llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+ llama_perf_print_context(ctx);

llama_batch_free(batch);

examples/passkey/passkey.cpp: 1 addition & 1 deletion
@@ -256,7 +256,7 @@ int main(int argc, char ** argv) {
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

LOG_TEE("\n");
- llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+ llama_perf_print_context(ctx);

fprintf(stderr, "\n");

examples/perplexity/perplexity.cpp: 1 addition & 1 deletion
@@ -2049,7 +2049,7 @@ int main(int argc, char ** argv) {
}

LOG_TEE("\n");
- llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+ llama_perf_print_context(ctx);
write_logfile(ctx, params, model, results);

llama_free(ctx);
examples/retrieval/retrieval.cpp: 1 addition & 1 deletion
@@ -292,7 +292,7 @@ int main(int argc, char ** argv) {
}

LOG_TEE("\n");
- llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+ llama_perf_print_context(ctx);

// clean up
llama_batch_free(query_batch);
examples/simple/simple.cpp: 2 additions & 2 deletions
@@ -154,8 +154,8 @@ int main(int argc, char ** argv) {
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

LOG_TEE("\n");
- llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
- llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+ llama_perf_print_sampler(smpl);
+ llama_perf_print_context(ctx);

fprintf(stderr, "\n");

examples/speculative/speculative.cpp: 1 addition & 1 deletion
@@ -616,7 +616,7 @@ int main(int argc, char ** argv) {

LOG_TEE("\ndraft:\n\n");
// TODO: print sampling/grammar timings for all drafts
- llama_perf_print(ctx_dft, LLAMA_PERF_TYPE_CONTEXT);
+ llama_perf_print_context(ctx_dft);

LOG_TEE("\ntarget:\n\n");
gpt_perf_print(ctx_tgt, smpl);
include/llama.h: 23 additions & 6 deletions
@@ -343,7 +343,7 @@ extern "C" {
bool embeddings; // if true, extract embeddings (together with logits)
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
- //bool no_perf; // whether to measure performance timings, TODO: implement
+ bool no_perf; // whether to measure performance timings

// Abort callback
Comment on lines 344 to 348:

Owner (author): This is a minor libllama API breaking change due to the addition of the no_perf parameter.

Collaborator: I don't think this will be a breaking change, since struct llama_context_params is expected to be created by llama_context_default_params(), right?

// if it returns true, execution of llama_decode() will be aborted
@@ -1169,13 +1169,30 @@ extern "C" {
// NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
//

- enum llama_perf_type {
- LLAMA_PERF_TYPE_CONTEXT = 0,
- LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1,
+ struct llama_perf_data_context {
+ double t_start_ms;
+ double t_load_ms;
+ double t_p_eval_ms;
+ double t_eval_ms;
+
+ int32_t n_p_eval;
+ int32_t n_eval;
};

+ struct llama_perf_data_sampler {
+ double t_sample_ms;
+
+ int32_t n_sample;
+ };

- LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type);
- LLAMA_API void llama_perf_reset( void * ctx, enum llama_perf_type type);
+ LLAMA_API struct llama_perf_data_context llama_perf_context(const struct llama_context * ctx);
+ LLAMA_API struct llama_perf_data_sampler llama_perf_sampler(const struct llama_sampler * chain);
+
+ LLAMA_API void llama_perf_print_context(const struct llama_context * ctx);
+ LLAMA_API void llama_perf_print_sampler(const struct llama_sampler * chain);
+
+ LLAMA_API void llama_perf_reset_context(struct llama_context * ctx);
+ LLAMA_API void llama_perf_reset_sampler(struct llama_sampler * chain);

LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);

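Taken together, the split API replaces the old type-switched calls with one getter, one printer, and one reset per object. A hedged usage sketch (the function name report_perf is mine; ctx and smpl are assumed to be a valid context and sampler chain, as in the sketches above):

```cpp
#include <cstdio>

#include "llama.h"

void report_perf(llama_context * ctx, llama_sampler * smpl) {
    // raw counters, for custom reporting
    const llama_perf_data_context pdc = llama_perf_context(ctx);
    const llama_perf_data_sampler pds = llama_perf_sampler(smpl);

    fprintf(stderr, "eval:   %d tokens in %.2f ms\n", pdc.n_eval,   pdc.t_eval_ms);
    fprintf(stderr, "sample: %d tokens in %.2f ms\n", pds.n_sample, pds.t_sample_ms);

    // formatted reports, in the order the examples use (sampler first, then context)
    llama_perf_print_sampler(smpl);
    llama_perf_print_context(ctx);

    // zero the counters before the next measurement window
    llama_perf_reset_sampler(smpl);
    llama_perf_reset_context(ctx);
}
```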