From f89fe2732c5709f6e86d5f4aee2e6d2a561f2eb2 Mon Sep 17 00:00:00 2001
From: HanishKVC
Date: Fri, 10 May 2024 15:51:58 +0530
Subject: [PATCH 1/6] Main+: optionally allow special tokens from user in interactive mode (#7097)

@hanishkvc added a new `--interactive-specials` flag which allows special
tokens to be inserted from the user side into the embedding stream.
---
 common/common.cpp      | 6 ++++++
 common/common.h        | 1 +
 examples/main/main.cpp | 2 +-
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/common/common.cpp b/common/common.cpp
index 0535508ba98df..484e673349071 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -901,6 +901,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.interactive = true;
         return true;
     }
+    if (arg == "--interactive-specials") {
+        params.interactive_specials = true;
+        return true;
+    }
     if (arg == "--embedding") {
         params.embedding = true;
         return true;
     }
@@ -1422,6 +1426,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" -h, --help show this help message and exit\n");
     printf(" --version show version and build info\n");
     printf(" -i, --interactive run in interactive mode\n");
+    printf(" --interactive-specials allow special tokens in user text, in interactive mode\n");
     printf(" --interactive-first run in interactive mode and wait for input right away\n");
     printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n");
     printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
@@ -2652,6 +2657,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     dump_string_yaml_multiline(stream, "in_suffix", params.input_prefix.c_str());
     fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
     fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
+    fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false");
     fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
     fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
     fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
diff --git a/common/common.h b/common/common.h
index 6f00a2cca8883..d80344f2a61bb 100644
--- a/common/common.h
+++ b/common/common.h
@@ -140,6 +140,7 @@ struct gpt_params {
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs
     bool interactive = false; // interactive mode
+    bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
     bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
     bool chatml = false; // chatml mode (used for models trained on chatml syntax)
     bool prompt_cache_all = false; // save user input and generations to prompt cache
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 49acd6bab4074..f3e445c16d6a9 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -879,7 +879,7 @@ int main(int argc, char ** argv) {
             }
 
             const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-            const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
+            const auto line_inp = ::llama_tokenize(ctx, buffer, false, params.interactive_specials);
            const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);

            LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());

From 4e3880978f8b1bf546dd4e6f3b524d6b8739c49c Mon Sep 17 00:00:00 2001
From: Justine Tunney
Date: Fri, 10 May 2024 07:01:08 -0400
Subject: [PATCH 2/6] Fix memory bug in grammar parser (#7194)

The llama.cpp grammar parser had a bug where forgetting to add a closing
quotation mark to strings would cause parsing to crash. Anyone running a
server on a public endpoint is advised to upgrade. To reproduce this bug:

    ./llamafile -m foo.gguf -p bar --grammar 'root::="'

Credit for discovering and reporting this issue goes to Eclypsium Security
Researcher Richard Johnson.
---
 common/common.cpp            | 8 +++-----
 common/grammar-parser.cpp    | 9 +++++++++
 examples/llava/llava-cli.cpp | 5 +++++
 examples/main/main.cpp       | 4 ++++
 4 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 484e673349071..ba1ecf0e59c8b 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1371,14 +1371,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
             std::replace(arg.begin(), arg.end(), '_', '-');
         }
-
         if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
             throw std::invalid_argument("error: unknown argument: " + arg);
         }
-    }
-
-    if (invalid_param) {
-        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
+        if (invalid_param) {
+            throw std::invalid_argument("error: invalid parameter for argument: " + arg);
+        }
     }
 
     if (params.prompt_cache_all &&
diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp
index 2a1301569793a..fecb7cd713ea1 100644
--- a/common/grammar-parser.cpp
+++ b/common/grammar-parser.cpp
@@ -142,6 +142,9 @@ namespace grammar_parser {
             pos++;
             last_sym_start = out_elements.size();
             while (*pos != '"') {
+                if (!*pos) {
+                    throw std::runtime_error("unexpected end of input");
+                }
                 auto char_pair = parse_char(pos);
                 pos = char_pair.second;
                 out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
@@ -156,6 +159,9 @@ namespace grammar_parser {
             }
             last_sym_start = out_elements.size();
             while (*pos != ']') {
+                if (!*pos) {
+                    throw std::runtime_error("unexpected end of input");
+                }
                 auto char_pair = parse_char(pos);
                 pos = char_pair.second;
                 enum llama_gretype type = last_sym_start < out_elements.size()
@@ -164,6 +170,9 @@ namespace grammar_parser {
                 out_elements.push_back({type, char_pair.first});
 
                 if (pos[0] == '-' && pos[1] != ']') {
+                    if (!pos[1]) {
+                        throw std::runtime_error("unexpected end of input");
+                    }
                     auto endchar_pair = parse_char(pos + 1);
                     pos = endchar_pair.second;
                     out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index 157a680b5ecdb..da60ddf2f057d 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -189,6 +189,11 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
     LOG_TEE("\n");
 
     struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
+    if (!ctx_sampling) {
+        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
+        exit(1);
+    }
+
     std::string response = "";
     for (int i = 0; i < max_tgt_len; i++) {
         const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index f3e445c16d6a9..9dee41001f12c 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -523,6 +523,10 @@ int main(int argc, char ** argv) {
     }
 
     struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
+    if (!ctx_sampling) {
+        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
+        exit(1);
+    }
 
     while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
         // predict

From 25c6e82e7a1ad25a42b0894e87d9b5c557409516 Mon Sep 17 00:00:00 2001
From: slaren
Date: Fri, 10 May 2024 14:28:01 +0200
Subject: [PATCH 3/6] llama : use n_vocab to differentiate between mistral 7B and llama3 8B (#7200)

---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index e7b3fd8b433b4..2f1123d4e1678 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3860,7 +3860,7 @@ static void llm_load_hparams(
             switch (hparams.n_layer) {
                 case 22: model.type = e_model::MODEL_1B; break;
                 case 26: model.type = e_model::MODEL_3B; break;
-                case 32: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_7B : e_model::MODEL_8B; break; // LLaMa 8B v3 uses GQA
+                case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
                 case 40: model.type = e_model::MODEL_13B; break;
                 case 48: model.type = e_model::MODEL_34B; break;
                 case 60: model.type = e_model::MODEL_30B; break;

From 8c660242d708d3913a2adc2b6e4a9ee9cf5e4ce7 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 10 May 2024 17:53:04 +0300
Subject: [PATCH 4/6] convert : print "ignore_merges" field

---
 convert-hf-to-gguf-update.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
index e6468772210f8..b5eb41eacdab7 100755
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -161,6 +161,8 @@ def download_file_with_auth(url, token, save_path):
     logger.info("normalizer: " + json.dumps(normalizer, indent=4))
     pre_tokenizer = cfg["pre_tokenizer"]
     logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
+    if "ignore_merges" in cfg["model"]:
+        logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
 
     logger.info("")

From 18e437665ce626dddbd79119aa7498493e7cb13b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 10 May 2024 18:20:10 +0300
Subject: [PATCH 5/6] metal : fix flash attention kernel requirements (#7169)

* metal : fix flash attention kernel requirements

ggml-ci

* metal : fix ggml_metal_supports_op

ggml-ci
---
 ggml-metal.m | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/ggml-metal.m b/ggml-metal.m
index c6817f01f2bdb..18ce5b88a1c01 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -633,14 +633,14 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64, flash_attn_ext_f16_h64, true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80, flash_attn_ext_f16_h80, true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64, flash_attn_ext_f16_h64, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80, flash_attn_ext_f16_h80, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, ctx->support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true);
@@ -772,8 +772,9 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
         case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_ARGSORT:
         case GGML_OP_LEAKY_RELU:
-        case GGML_OP_FLASH_ATTN_EXT:
             return true;
+        case GGML_OP_FLASH_ATTN_EXT:
+            return ctx->support_simdgroup_mm; // TODO: over-restricted for vec-kernels
         case GGML_OP_MUL_MAT:
         case GGML_OP_MUL_MAT_ID:
             return ctx->support_simdgroup_reduction &&

From e849648888a11de13aaaa4cb2eda3f5a9c7b444d Mon Sep 17 00:00:00 2001
From: slaren
Date: Fri, 10 May 2024 18:03:54 +0200
Subject: [PATCH 6/6] llama-bench : add pp+tg test type (#7199)
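
The new pg test type runs prompt processing and text generation back to back
in a single benchmark instance, selected with `-pg` (e.g. `-pg 512,128`) and
reported as one combined row such as `pp512+tg128`. As a rough standalone
sketch of how such a value becomes an (n_prompt, n_gen) pair (illustrative
only; the actual parsing lives in parse_cmd_params in llama-bench.cpp):

    // Illustrative sketch: split a "-pg" value like "512,128" into a
    // prompt-length / generation-length pair, rejecting malformed input.
    #include <cstdio>
    #include <sstream>
    #include <string>
    #include <vector>

    static std::vector<std::string> split(const std::string & s, char delim) {
        std::vector<std::string> parts;
        std::string item;
        std::stringstream ss(s);
        while (std::getline(ss, item, delim)) {
            parts.push_back(item);
        }
        return parts;
    }

    int main() {
        const std::string arg = "512,128";   // value passed to -pg
        const auto p = split(arg, ',');
        if (p.size() != 2) {
            fprintf(stderr, "invalid -pg value: %s\n", arg.c_str());
            return 1;
        }
        const int n_prompt = std::stoi(p[0]);
        const int n_gen    = std::stoi(p[1]);
        printf("n_prompt = %d, n_gen = %d\n", n_prompt, n_gen);
        return 0;
    }

llama-bench builds one extra test instance per such pair and skips a pair
where both values are zero, as the diff below shows.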
---
 examples/llama-bench/README.md       | 18 +++---
 examples/llama-bench/llama-bench.cpp | 63 ++++++++++++++++++++++++----
 scripts/compare-llama-bench.py       |  8 +++-
 3 files changed, 74 insertions(+), 15 deletions(-)

diff --git a/examples/llama-bench/README.md b/examples/llama-bench/README.md
index 10f37b4418897..8578405646af7 100644
--- a/examples/llama-bench/README.md
+++ b/examples/llama-bench/README.md
@@ -26,16 +26,21 @@ options:
   -m, --model (default: models/7B/ggml-model-q4_0.gguf)
   -p, --n-prompt (default: 512)
   -n, --n-gen (default: 128)
-  -b, --batch-size (default: 512)
-  -ctk , --cache-type-k (default: f16)
-  -ctv , --cache-type-v (default: f16)
-  -t, --threads (default: 112)
+  -pg (default: 512,128)
+  -b, --batch-size (default: 2048)
+  -ub, --ubatch-size (default: 512)
+  -ctk, --cache-type-k (default: f16)
+  -ctv, --cache-type-v (default: f16)
+  -t, --threads (default: 16)
   -ngl, --n-gpu-layers (default: 99)
   -sm, --split-mode (default: layer)
   -mg, --main-gpu (default: 0)
   -nkvo, --no-kv-offload <0|1> (default: 0)
+  -fa, --flash-attn <0|1> (default: 0)
   -mmp, --mmap <0|1> (default: 1)
-  -ts, --tensor_split (default: 0)
+  --numa (default: disabled)
+  -embd, --embeddings <0|1> (default: 0)
+  -ts, --tensor-split (default: 0)
   -r, --repetitions (default: 5)
   -o, --output (default: md)
   -v, --verbose (default: 0)
@@ -43,10 +48,11 @@ options:
 Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
 ```
 
-llama-bench can perform two types of tests:
+llama-bench can perform three types of tests:
 
 - Prompt processing (pp): processing a prompt in batches (`-p`)
 - Text generation (tg): generating a sequence of tokens (`-n`)
+- Prompt processing + text generation (pg): processing a prompt followed by generating a sequence of tokens (`-pg`)
 
 With the exception of `-r`, `-o` and `-v`, all options can be specified multiple times to run multiple tests. Each pp and tg test is run with all combinations of the specified options.
 To specify multiple values for an option, the values can be separated by commas (e.g. `-n 16,32`), or the option can be specified multiple times (e.g. `-n 16 -n 32`).
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 40128ec444334..8b965e1990ba5 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -161,10 +161,17 @@ static const char * split_mode_str(llama_split_mode mode) {
     }
 }
 
+static std::string pair_str(const std::pair<int, int> & p) {
+    static char buf[32];
+    snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second);
+    return buf;
+}
+
 struct cmd_params {
     std::vector<std::string> model;
     std::vector<int> n_prompt;
     std::vector<int> n_gen;
+    std::vector<std::pair<int, int>> n_pg;
     std::vector<int> n_batch;
     std::vector<int> n_ubatch;
     std::vector<ggml_type> type_k;
@@ -188,6 +195,7 @@ static const cmd_params cmd_params_defaults = {
     /* model */ {"models/7B/ggml-model-q4_0.gguf"},
     /* n_prompt */ {512},
     /* n_gen */ {128},
+    /* n_pg */ {{512, 128}},
     /* n_batch */ {2048},
     /* n_ubatch */ {512},
     /* type_k */ {GGML_TYPE_F16},
@@ -215,10 +223,11 @@ static void print_usage(int /* argc */, char ** argv) {
     printf(" -m, --model (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
     printf(" -p, --n-prompt (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
     printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
+    printf(" -pg (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
     printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    printf(" -ub N, --ubatch-size (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
-    printf(" -ctk , --cache-type-k (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
-    printf(" -ctv , --cache-type-v (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
+    printf(" -ub, --ubatch-size (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
+    printf(" -ctk, --cache-type-k (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
+    printf(" -ctv, --cache-type-v (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
     printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
     printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
     printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
@@ -304,6 +313,17 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
         }
         auto p = split(argv[i], split_delim);
         params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
+    } else if (arg == "-pg") {
+        if (++i >= argc) {
+            invalid_param = true;
+            break;
+        }
+        auto p = split(argv[i], ',');
+        if (p.size() != 2) {
+            invalid_param = true;
+            break;
+        }
+        params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])});
     } else if (arg == "-b" || arg == "--batch-size") {
         if (++i >= argc) {
             invalid_param = true;
             break;
         }
@@ -493,6 +513,7 @@
     if (params.model.empty()) { params.model = cmd_params_defaults.model; }
     if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
     if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; }
+    if (params.n_pg.empty()) { params.n_pg = cmd_params_defaults.n_pg; }
     if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; }
     if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; }
     if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
@@ -632,6 +653,31 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
             };
             instances.push_back(instance);
         }
+
+        for (const auto & n_pg : params.n_pg) {
+            if (n_pg.first == 0 && n_pg.second == 0) {
+                continue;
+            }
+            cmd_params_instance instance = {
+                /* .model = */ m,
+                /* .n_prompt = */ n_pg.first,
+                /* .n_gen = */ n_pg.second,
+                /* .n_batch = */ nb,
+                /* .n_ubatch = */ nub,
+                /* .type_k = */ tk,
+                /* .type_v = */ tv,
+                /* .n_threads = */ nt,
+                /* .n_gpu_layers = */ nl,
+                /* .split_mode = */ sm,
+                /* .main_gpu = */ mg,
+                /* .no_kv_offload= */ nkvo,
+                /* .flash_attn = */ fa,
+                /* .tensor_split = */ ts,
+                /* .use_mmap = */ mmp,
+                /* .embeddings = */ embd,
+            };
+            instances.push_back(instance);
+        }
     }
 
     return instances;
@@ -965,6 +1011,9 @@ struct markdown_printer : public printer {
         if (field == "n_gpu_layers") {
             return 3;
         }
+        if (field == "test") {
+            return 13;
+        }
 
         int width = std::max((int)field.length(), 10);
@@ -1091,12 +1140,11 @@ struct markdown_printer : public printer {
                 value = test::get_backend();
             } else if (field == "test") {
                 if (t.n_prompt > 0 && t.n_gen == 0) {
-                    snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
+                    snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
                 } else if (t.n_gen > 0 && t.n_prompt == 0) {
-                    snprintf(buf, sizeof(buf), "tg %d", t.n_gen);
+                    snprintf(buf, sizeof(buf), "tg%d", t.n_gen);
                 } else {
-                    assert(false);
-                    exit(1);
+                    snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
                 }
                 value = buf;
             } else if (field == "t/s") {
@@ -1297,6 +1345,7 @@ int main(int argc, char ** argv) {
         llama_kv_cache_clear(ctx);
 
         uint64_t t_start = get_time_ns();
+
         if (t.n_prompt > 0) {
             test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
         }
diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py
index fed3c1ee3d298..0ede9e67c2f15 100755
--- a/scripts/compare-llama-bench.py
+++ b/scripts/compare-llama-bench.py
@@ -325,8 +325,12 @@ def get_rows(properties):
     for row in rows_show:
         n_prompt = int(row[-4])
         n_gen = int(row[-3])
-        assert n_prompt == 0 or n_gen == 0
-        test_name = f"tg{n_gen}" if n_prompt == 0 else f"pp{n_prompt}"
+        if n_prompt != 0 and n_gen == 0:
+            test_name = f"pp{n_prompt}"
+        elif n_prompt == 0 and n_gen != 0:
+            test_name = f"tg{n_gen}"
+        else:
+            test_name = f"pp{n_prompt}+tg{n_gen}"
         #            Regular columns   test name   avg t/s values   Speedup
         #            VVVVVVVVVVVVV     VVVVVVVVV   VVVVVVVVVVVVVV   VVVVVVV
         table.append(list(row[:-4]) + [test_name] + list(row[-2:]) + [float(row[-1]) / float(row[-2])])
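
With this change the test label is derived the same way in llama-bench's
markdown printer and in scripts/compare-llama-bench.py: prompt-only runs are
reported as `pp<N>`, generation-only runs as `tg<N>`, and combined runs as
`pp<N>+tg<M>`. A standalone sketch of that naming rule (illustrative only, not
the project code):

    // Illustrative sketch of the three-way test naming used above.
    #include <cstdio>
    #include <string>

    static std::string test_name(int n_prompt, int n_gen) {
        char buf[32];
        if (n_prompt > 0 && n_gen == 0) {
            snprintf(buf, sizeof(buf), "pp%d", n_prompt);
        } else if (n_gen > 0 && n_prompt == 0) {
            snprintf(buf, sizeof(buf), "tg%d", n_gen);
        } else {
            snprintf(buf, sizeof(buf), "pp%d+tg%d", n_prompt, n_gen);
        }
        return buf;
    }

    int main() {
        printf("%s\n", test_name(512, 0).c_str());   // pp512
        printf("%s\n", test_name(0, 128).c_str());   // tg128
        printf("%s\n", test_name(512, 128).c_str()); // pp512+tg128
        return 0;
    }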