From 1442677f92e45a475be7b4d056e3633d1d6f813b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 4 Jun 2024 21:23:39 +0300 Subject: [PATCH] common : refactor cli arg parsing (#7675) * common : gpt_params_parse do not print usage * common : rework usage print (wip) * common : valign * common : rework print_usage * infill : remove cfg support * common : reorder args * server : deduplicate parameters ggml-ci * common : add missing header ggml-ci * common : remote --random-prompt usages ggml-ci * examples : migrate to gpt_params ggml-ci * batched-bench : migrate to gpt_params * retrieval : migrate to gpt_params * common : change defaults for escape and n_ctx * common : remove chatml and instruct params ggml-ci * common : passkey use gpt_params --- common/common.cpp | 823 +++++++++++++------ common/common.h | 105 ++- examples/batched-bench/README.md | 8 +- examples/batched-bench/batched-bench.cpp | 92 +-- examples/batched/README.md | 2 +- examples/batched/batched.cpp | 73 +- examples/embedding/embedding.cpp | 4 +- examples/eval-callback/eval-callback.cpp | 6 +- examples/gguf-split/tests.sh | 10 +- examples/gritlm/gritlm.cpp | 2 + examples/imatrix/imatrix.cpp | 8 +- examples/infill/infill.cpp | 134 +-- examples/llama-bench/llama-bench.cpp | 48 +- examples/llava/llava-cli.cpp | 14 +- examples/lookahead/lookahead.cpp | 3 +- examples/lookup/lookup-create.cpp | 2 + examples/lookup/lookup-stats.cpp | 1 + examples/lookup/lookup.cpp | 1 + examples/main/README.md | 5 +- examples/main/main.cpp | 69 +- examples/parallel/parallel.cpp | 3 +- examples/passkey/README.md | 2 +- examples/passkey/passkey.cpp | 66 +- examples/perplexity/perplexity.cpp | 12 +- examples/quantize/tests.sh | 4 +- examples/retrieval/retrieval.cpp | 90 +- examples/save-load-state/save-load-state.cpp | 1 + examples/server/server.cpp | 700 ++-------------- examples/server/utils.hpp | 7 - examples/simple/README.md | 2 +- examples/simple/simple.cpp | 50 +- examples/speculative/speculative.cpp | 3 +- 
llama.cpp | 2 +- scripts/run-with-preset.py | 4 +- 34 files changed, 900 insertions(+), 1456 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index df583db8390d2..c8df9a4ce8ef5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -7,20 +7,21 @@ #include #include +#include #include +#include +#include #include #include #include -#include #include +#include #include #include #include #include #include #include -#include -#include #if defined(__APPLE__) && defined(__MACH__) #include @@ -237,10 +238,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } } - if (params.prompt_cache_all && - (params.interactive || params.interactive_first || - params.instruct)) { - + if (params.prompt_cache_all && (params.interactive || params.interactive_first)) { throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } @@ -265,22 +263,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { - bool result = true; + const auto params_org = params; // the example can modify the default params + try { - if (!gpt_params_parse_ex(argc, argv, params)) { - gpt_params_print_usage(argc, argv, gpt_params()); - exit(0); + if (!gpt_params_parse_ex(argc, argv, params) || params.usage) { + params = params_org; + params.usage = true; + return false; } - } - catch (const std::invalid_argument & ex) { + } catch (const std::invalid_argument & ex) { fprintf(stderr, "%s\n", ex.what()); - gpt_params_print_usage(argc, argv, gpt_params()); - exit(1); + return false; } - return result; + + return true; } bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) { + const char split_delim = ','; + llama_sampling_params & sparams = params.sparams; if (arg == "-s" || arg == "--seed") { @@ -288,7 +289,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string 
& arg, gpt_pa invalid_param = true; return true; } - // This is temporary, in the future the samplign state will be moved fully to llama_sampling_context. + // TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context. params.seed = std::stoul(argv[i]); sparams.seed = std::stoul(argv[i]); return true; @@ -349,6 +350,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.escape = true; return true; } + if (arg == "--no-escape") { + params.escape = false; + return true; + } if (arg == "--prompt-cache") { if (++i >= argc) { invalid_param = true; @@ -403,7 +408,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } return true; } - if (arg == "-n" || arg == "--n-predict") { + if (arg == "-n" || arg == "--predict" || arg == "--n-predict") { if (++i >= argc) { invalid_param = true; return true; @@ -900,34 +905,22 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.interactive = true; return true; } - if (arg == "--interactive-specials") { - params.interactive_specials = true; - return true; - } - if (arg == "--special") { + if (arg == "-sp" || arg == "--special") { params.special = true; return true; } - if (arg == "--embedding") { + if (arg == "--embedding" || arg == "--embeddings") { params.embedding = true; return true; } - if (arg == "--interactive-first") { + if (arg == "-if" || arg == "--interactive-first") { params.interactive_first = true; return true; } - if (arg == "-ins" || arg == "--instruct") { - params.instruct = true; - return true; - } if (arg == "-cnv" || arg == "--conversation") { params.conversation = true; return true; } - if (arg == "-cml" || arg == "--chatml") { - params.chatml = true; - return true; - } if (arg == "--infill") { params.infill = true; return true; @@ -964,7 +957,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.flash_attn = true; return 
true; } - if (arg == "--color") { + if (arg == "-co" || arg == "--color") { params.use_color = true; return true; } @@ -972,26 +965,26 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.use_mlock = true; return true; } - if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { + if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") { if (++i >= argc) { invalid_param = true; return true; } params.n_gpu_layers = std::stoi(argv[i]); if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); } return true; } - if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") { + if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") { if (++i >= argc) { invalid_param = true; return true; } params.n_gpu_layers_draft = std::stoi(argv[i]); if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); + fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); } return true; @@ -1087,6 +1080,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa else { invalid_param = true; } return true; } + if (arg == "-v" || arg == "--verbose") { + params.verbose = true; + return true; + } if (arg == "--verbose-prompt") { params.verbose_prompt = true; return true; @@ -1151,24 +1148,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.ppl_stride = std::stoi(argv[i]); return true; 
} - if (arg == "-ptc" || arg == "--print-token-count") { + if (arg == "--ppl-output-type") { if (++i >= argc) { invalid_param = true; return true; } - params.n_print = std::stoi(argv[i]); - return true; - } - if (arg == "--check-tensors") { - params.check_tensors = true; + params.ppl_output_type = std::stoi(argv[i]); return true; } - if (arg == "--ppl-output-type") { + if (arg == "-ptc" || arg == "--print-token-count") { if (++i >= argc) { invalid_param = true; return true; } - params.ppl_output_type = std::stoi(argv[i]); + params.n_print = std::stoi(argv[i]); + return true; + } + if (arg == "--check-tensors") { + params.check_tensors = true; return true; } if (arg == "--hellaswag") { @@ -1242,19 +1239,15 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } return true; } - if (arg == "-h" || arg == "--help") { - gpt_params_print_usage(argc, argv, gpt_params()); - exit(0); + if (arg == "-h" || arg == "--help" || arg == "--usage" ) { + params.usage = true; + return true; } if (arg == "--version") { fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); exit(0); } - if (arg == "--random-prompt") { - params.random_prompt = true; - return true; - } if (arg == "--in-prefix-bos") { params.input_prefix_bos = true; return true; @@ -1321,6 +1314,229 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } return true; } + if (arg == "--host") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.hostname = argv[i]; + return true; + } + if (arg == "--port") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.port = std::stoi(argv[i]); + return true; + } + if (arg == "--path") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.public_path = argv[i]; + return true; + } + if (arg == "--api-key") { + if (++i >= argc) { + invalid_param = true; + return 
true; + } + params.api_keys.push_back(argv[i]); + return true; + } + if (arg == "--api-key-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::ifstream key_file(argv[i]); + if (!key_file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + std::string key; + while (std::getline(key_file, key)) { + if (!key.empty()) { + params.api_keys.push_back(key); + } + } + key_file.close(); + return true; + } + if (arg == "--ssl-key-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.ssl_file_key = argv[i]; + return true; + } + if (arg == "--ssl-cert-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.ssl_file_cert = argv[i]; + return true; + } + if (arg == "--timeout" || arg == "-to") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.timeout_read = std::stoi(argv[i]); + params.timeout_write = std::stoi(argv[i]); + return true; + } + if (arg == "-spf" || arg == "--system-prompt-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + std::string system_prompt; + std::copy( + std::istreambuf_iterator(file), + std::istreambuf_iterator(), + std::back_inserter(system_prompt) + ); + params.system_prompt = system_prompt; + return true; + } + if (arg == "--log-format") { + if (++i >= argc) { + invalid_param = true; + return true; + } + if (std::strcmp(argv[i], "json") == 0) { + params.log_json = true; + } else if (std::strcmp(argv[i], "text") == 0) { + params.log_json = false; + } else { + invalid_param = true; + return true; + } + return true; + } + if (arg == "--no-slots") { + params.endpoint_slots = false; + return true; + } + if (arg == "--metrics") { + params.endpoint_metrics = true; + return true; + } + if (arg == "--slot-save-path") { + if 
(++i >= argc) { + invalid_param = true; + return true; + } + params.slot_save_path = argv[i]; + // if doesn't end with DIRECTORY_SEPARATOR, add it + if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { + params.slot_save_path += DIRECTORY_SEPARATOR; + } + return true; + } + if (arg == "--chat-template") { + if (++i >= argc) { + invalid_param = true; + return true; + } + if (!llama_chat_verify_template(argv[i])) { + fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]); + fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n"); + invalid_param = true; + return true; + } + params.chat_template = argv[i]; + return true; + } + if (arg == "-pps") { + params.is_pp_shared = true; + return true; + } + if (arg == "-npp") { + if (++i >= argc) { + invalid_param = true; + return true; + } + auto p = string_split(argv[i], split_delim); + params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); + return true; + } + if (arg == "-ntg") { + if (++i >= argc) { + invalid_param = true; + return true; + } + auto p = string_split(argv[i], split_delim); + params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); + return true; + } + if (arg == "-npl") { + if (++i >= argc) { + invalid_param = true; + return true; + } + auto p = string_split(argv[i], split_delim); + params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); + return true; + } + if (arg == "--context-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::ifstream file(argv[i], std::ios::binary); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + params.context_files.push_back(argv[i]); + return true; + } + if (arg == "--chunk-size") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.chunk_size = std::stoi(argv[i]); + return true; + } + if (arg == 
"--chunk-separator") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.chunk_separator = argv[i]; + return true; + } + if (arg == "--junk") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_junk = std::stoi(argv[i]); + return true; + } + if (arg == "--pos") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.i_pos = std::stoi(argv[i]); + return true; + } #ifndef LOG_DISABLE_LOGS // Parse args for logging parameters if (log_param_single_parse(argv[i])) { @@ -1348,6 +1564,16 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return false; } +#ifdef __GNUC__ +#ifdef __MINGW32__ +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +#endif +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) +#endif + void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { const llama_sampling_params & sparams = params.sparams; @@ -1359,198 +1585,290 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param } sampler_type_names.pop_back(); - printf("\n"); - printf("usage: %s [options]\n", argv[0]); - printf("\n"); - printf("options:\n"); - printf(" -h, --help show this help message and exit\n"); - printf(" --version show version and build info\n"); - printf(" -i, --interactive run in interactive mode\n"); - printf(" --special special tokens output enabled\n"); - printf(" --interactive-specials allow special tokens in user text, in interactive mode\n"); - printf(" --interactive-first run in interactive mode and wait for input right away\n"); - printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n"); - printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n"); - printf(" -cml, --chatml run in chatml mode (use with 
ChatML-compatible models)\n"); - printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n"); - printf(" -r PROMPT, --reverse-prompt PROMPT\n"); - printf(" halt generation at PROMPT, return control in interactive mode\n"); - printf(" (can be specified more than once for multiple prompts).\n"); - printf(" --color colorise output to distinguish prompt and user input from generations\n"); - printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"); - printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads); - printf(" -tb N, --threads-batch N\n"); - printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n"); - printf(" -td N, --threads-draft N"); - printf(" number of threads to use during generation (default: same as --threads)\n"); - printf(" -tbd N, --threads-batch-draft N\n"); - printf(" number of threads to use during batch and prompt processing (default: same as --threads-draft)\n"); - printf(" -p PROMPT, --prompt PROMPT\n"); - printf(" prompt to start generation with (default: empty)\n"); - printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); - printf(" --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n"); - printf(" --prompt-cache-all if specified, saves user input and generations to cache as well.\n"); - printf(" not supported with --interactive or other interactive options\n"); - printf(" --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n"); - printf(" --random-prompt start with a randomized prompt.\n"); - printf(" --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string\n"); - printf(" --in-prefix STRING string to prefix user inputs with (default: empty)\n"); - printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n"); - printf(" -f FNAME, --file 
FNAME\n"); - printf(" prompt file to start generation.\n"); - printf(" -bf FNAME, --binary-file FNAME\n"); - printf(" binary file containing multiple choice tasks.\n"); - printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); - printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx); - printf(" -b N, --batch-size N logical maximum batch size (default: %d)\n", params.n_batch); - printf(" -ub N, --ubatch-size N\n"); - printf(" physical maximum batch size (default: %d)\n", params.n_ubatch); - printf(" --samplers samplers that will be used for generation in the order, separated by \';\'\n"); - printf(" (default: %s)\n", sampler_type_names.c_str()); - printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str()); - printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k); - printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p); - printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p); - printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z); - printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p); - printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n); - printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat); - printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present); - printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq); - printf(" --dynatemp-range N dynamic 
temperature range (default: %.1f, 0.0 = disabled)\n", (double)sparams.dynatemp_range); - printf(" --dynatemp-exp N dynamic temperature exponent (default: %.1f)\n", (double)sparams.dynatemp_exponent); - printf(" --mirostat N use Mirostat sampling.\n"); - printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"); - printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat); - printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)sparams.mirostat_eta); - printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)sparams.mirostat_tau); - printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n"); - printf(" modifies the likelihood of token appearing in the completion,\n"); - printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"); - printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n"); - printf(" --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n"); - printf(" --grammar-file FNAME file to read grammar from\n"); - printf(" -j SCHEMA, --json-schema SCHEMA\n"); - printf(" JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object.\n"); - printf(" For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead\n"); - printf(" --cfg-negative-prompt PROMPT\n"); - printf(" negative prompt to use for guidance. (default: empty)\n"); - printf(" --cfg-negative-prompt-file FNAME\n"); - printf(" negative prompt file to use for guidance. 
(default: empty)\n"); - printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale); - printf(" --rope-scaling {none,linear,yarn}\n"); - printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n"); - printf(" --rope-scale N RoPE context scaling factor, expands context by a factor of N\n"); - printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n"); - printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n"); - printf(" --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)\n"); - printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n"); - printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n"); - printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); - printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); - printf(" --pooling {none,mean,cls}\n"); - printf(" pooling type for embeddings, use model default if unspecified\n"); - printf(" -dt N, --defrag-thold N\n"); - printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold); - printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); - printf(" --penalize-nl penalize newline tokens\n"); - printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp); - printf(" --all-logits return logits for all tokens in the batch (default: disabled)\n"); - printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n"); - printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks); - printf(" --winogrande compute Winogrande score over random tasks 
from datafile supplied with -f\n"); - printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks); - printf(" --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f\n"); - printf(" --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks); - printf(" --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base\n"); - printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); - printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft); - printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); - printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel); - printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences); - printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split); - printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); - printf(" -fa, --flash-attn enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled"); - printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n"); - printf(" --image IMAGE_FILE path to an image file. use with multimodal models. Specify multiple times for batching\n"); + struct option_info { + LLAMA_COMMON_ATTRIBUTE_FORMAT(4, 5) + option_info(const std::string & tags, const char * args, const char * desc, ...) 
: tags(tags), args(args), desc(desc) { + va_list args_list; + va_start(args_list, desc); + char buffer[1024]; + vsnprintf(buffer, sizeof(buffer), desc, args_list); + va_end(args_list); + this->desc = buffer; + } + + option_info(const std::string & grp) : grp(grp) {} + + std::string tags; + std::string args; + std::string desc; + std::string grp; + }; + + std::vector options; + + // TODO: filter by tags + + options.push_back({ "general" }); + options.push_back({ "*", "-h, --help, --usage", "print usage and exit" }); + options.push_back({ "*", " --version", "show version and build info" }); + options.push_back({ "*", "-v, --verbose", "print verbose information" }); + options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" }); + options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" }); + options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? 
"true" : "false" }); + options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed }); + options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads }); + options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" }); + options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" }); + options.push_back({ "speculative", "-tbd, --threads-batch-draft N", + "number of threads to use during batch and prompt processing (default: same as --threads-draft)" }); + options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft }); + options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split }); + options.push_back({ "*", "-lcs, --lookup-cache-static FNAME", + "path to static lookup cache to use for lookup decoding (not updated by generation)" }); + options.push_back({ "*", "-lcd, --lookup-cache-dynamic FNAME", + "path to dynamic lookup cache to use for lookup decoding (updated by generation)" }); + + options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx }); + options.push_back({ "*", "-n, --predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict }); + options.push_back({ "*", "-b, --batch-size N", "logical maximum batch size (default: %d)", params.n_batch }); + options.push_back({ "*", "-ub, --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch }); + options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep }); + options.push_back({ "*", " --chunks 
N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks }); + options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" }); + options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with (default: '%s')", params.prompt.c_str() }); + options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" }); + options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" }); + options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false" }); + options.push_back({ "*", " --no-escape", "do not process escape sequences" }); + options.push_back({ "main", "-ptc, --print-token-count N", "print token count every N tokens (default: %d)", params.n_print }); + options.push_back({ "main", " --prompt-cache FNAME", "file to cache prompt state for faster startup (default: none)" }); + options.push_back({ "main", " --prompt-cache-all", "if specified, saves user input and generations to cache as well\n" + "not supported with --interactive or other interactive options" }); + options.push_back({ "main", " --prompt-cache-ro", "if specified, uses the prompt cache but does not update it" }); + options.push_back({ "main", "-r, --reverse-prompt PROMPT", + "halt generation at PROMPT, return control in interactive mode\n" + "can be specified more than once for multiple prompts" }); + options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" }); + options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix) (default: %s)", params.conversation ? "true" : "false" }); + options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? 
"true" : "false" }); + options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" }); + options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" }); + options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" }); + options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" }); + options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" }); + + options.push_back({ "sampling" }); + options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n" + "(default: %s)", sampler_type_names.c_str() }); + options.push_back({ "*", " --sampling-seq SEQUENCE", + "simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str() }); + options.push_back({ "*", " --ignore-eos", "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)" }); + options.push_back({ "*", " --penalize-nl", "penalize newline tokens (default: %s)", sparams.penalize_nl ? 
"true" : "false" }); + options.push_back({ "*", " --temp N", "temperature (default: %.1f)", (double)sparams.temp }); + options.push_back({ "*", " --top-k N", "top-k sampling (default: %d, 0 = disabled)", sparams.top_k }); + options.push_back({ "*", " --top-p N", "top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p }); + options.push_back({ "*", " --min-p N", "min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p }); + options.push_back({ "*", " --tfs N", "tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z }); + options.push_back({ "*", " --typical N", "locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typical_p }); + options.push_back({ "*", " --repeat-last-n N", "last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n }); + options.push_back({ "*", " --repeat-penalty N", "penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat }); + options.push_back({ "*", " --presence-penalty N", "repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present }); + options.push_back({ "*", " --frequency-penalty N", "repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_freq }); + options.push_back({ "*", " --dynatemp-range N", "dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sparams.dynatemp_range }); + options.push_back({ "*", " --dynatemp-exp N", "dynamic temperature exponent (default: %.1f)", (double)sparams.dynatemp_exponent }); + options.push_back({ "*", " --mirostat N", "use Mirostat sampling.\n" + "Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" + "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat }); + options.push_back({ "*", " --mirostat-lr N", "Mirostat learning rate, parameter eta (default: %.1f)", 
(double)sparams.mirostat_eta }); + options.push_back({ "*", " --mirostat-ent N", "Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau }); + options.push_back({ "*", " -l TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n" + "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" + "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" }); + options.push_back({ "main", " --cfg-negative-prompt PROMPT", + "negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str() }); + options.push_back({ "main", " --cfg-negative-prompt-file FNAME", + "negative prompt file to use for guidance" }); + options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale }); + + options.push_back({ "grammar" }); + options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() }); + options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" }); + options.push_back({ "*", "-j, --json-schema SCHEMA", + "JSON schema to constrain generations (https://json-schema.org/), e.g. 
`{}` for any JSON object\n" + "For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" }); + + options.push_back({ "embedding" }); + options.push_back({ "embedding", " --pooling {none,mean,cls}", + "pooling type for embeddings, use model default if unspecified" }); + + options.push_back({ "context hacking" }); + options.push_back({ "*", " --rope-scaling {none,linear,yarn}", + "RoPE frequency scaling method, defaults to linear unless specified by the model" }); + options.push_back({ "*", " --rope-scale N", "RoPE context scaling factor, expands context by a factor of N" }); + options.push_back({ "*", " --rope-freq-base N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)" }); + options.push_back({ "*", " --rope-freq-scale N", "RoPE frequency scaling factor, expands context by a factor of 1/N" }); + options.push_back({ "*", " --yarn-orig-ctx N", "YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx }); + options.push_back({ "*", " --yarn-ext-factor N", "YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor }); + options.push_back({ "*", " --yarn-attn-factor N", "YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor }); + options.push_back({ "*", " --yarn-beta-slow N", "YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow }); + options.push_back({ "*", " --yarn-beta-fast N", "YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast }); + options.push_back({ "*", "-gan, --grp-attn-n N", "group-attention factor (default: %d)", params.grp_attn_n }); + options.push_back({ "*", "-gaw, --grp-attn-w N", "group-attention width (default: %.1f)", (double)params.grp_attn_w }); + options.push_back({ "*", "-dkvc, --dump-kv-cache", "verbose print of the KV cache" }); + options.push_back({ "*", "-nkvo, --no-kv-offload", "disable 
KV offload" }); + options.push_back({ "*", "-ctk, --cache-type-k TYPE", "KV cache data type for K (default: %s)", params.cache_type_k.c_str() }); + options.push_back({ "*", "-ctv, --cache-type-v TYPE", "KV cache data type for V (default: %s)", params.cache_type_v.c_str() }); + + options.push_back({ "perplexity" }); + options.push_back({ "perplexity", " --all-logits", "return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false" }); + options.push_back({ "perplexity", " --hellaswag", "compute HellaSwag score over random tasks from datafile supplied with -f" }); + options.push_back({ "perplexity", " --hellaswag-tasks N", "number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks }); + options.push_back({ "perplexity", " --winogrande", "compute Winogrande score over random tasks from datafile supplied with -f" }); + options.push_back({ "perplexity", " --winogrande-tasks N", "number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks }); + options.push_back({ "perplexity", " --multiple-choice", "compute multiple choice score over random tasks from datafile supplied with -f" }); + options.push_back({ "perplexity", " --multiple-choice-tasks N", + "number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks }); + options.push_back({ "perplexity", " --kl-divergence", "computes KL-divergence to logits provided via --kl-divergence-base" }); + options.push_back({ "perplexity", " --ppl-stride N", "stride for perplexity calculation (default: %d)", params.ppl_stride }); + options.push_back({ "perplexity", " --ppl-output-type {0,1}", + "output type for perplexity calculation (default: %d)", params.ppl_output_type }); + + options.push_back({ "parallel" }); + options.push_back({ "*", "-dt, --defrag-thold N", "KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold }); + 
options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel }); + options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences }); + options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" }); + + options.push_back({ "multi-modality" }); + options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" }); + options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" }); + + options.push_back({ "backend" }); + options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" }); if (llama_supports_mlock()) { - printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); + options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" }); } if (llama_supports_mmap()) { - printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); - } - printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n"); - printf(" - distribute: spread execution evenly over all nodes\n"); - printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n"); - printf(" - numactl: use the CPU map provided by numactl\n"); - printf(" if run without this previously, it is recommended to drop the system page cache before using this\n"); - printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n"); + options.push_back({ "*", " --no-mmap", "do not memory-map model (slower load but may reduce pageouts if not using mlock)" }); + } + options.push_back({ "*", " --numa TYPE", "attempt optimizations that help on some NUMA systems\n" + " - distribute: spread execution evenly over 
all nodes\n" + " - isolate: only spawn threads on CPUs on the node that execution started on\n" + " - numactl: use the CPU map provided by numactl\n" + "if run without this previously, it is recommended to drop the system page cache before using this\n" + "see https://github.com/ggerganov/llama.cpp/issues/1437" }); + if (llama_supports_gpu_offload()) { - printf(" -ngl N, --n-gpu-layers N\n"); - printf(" number of layers to store in VRAM\n"); - printf(" -ngld N, --n-gpu-layers-draft N\n"); - printf(" number of layers to store in VRAM for the draft model\n"); - printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n"); - printf(" how to split the model across multiple GPUs, one of:\n"); - printf(" - none: use one GPU only\n"); - printf(" - layer (default): split layers and KV across GPUs\n"); - printf(" - row: split rows across GPUs\n"); - printf(" -ts SPLIT, --tensor-split SPLIT\n"); - printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n"); - printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); - printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu); - } - printf(" --rpc SERVERS comma separated list of RPC servers\n"); - printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false"); - printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? 
"true" : "false"); - printf(" -gan N, --grp-attn-n N\n"); - printf(" group-attention factor (default: %d)\n", params.grp_attn_n); - printf(" -gaw N, --grp-attn-w N\n"); - printf(" group-attention width (default: %.1f)\n", (double)params.grp_attn_w); - printf(" -dkvc, --dump-kv-cache\n"); - printf(" verbose print of the KV cache\n"); - printf(" -nkvo, --no-kv-offload\n"); - printf(" disable KV offload\n"); - printf(" -ctk TYPE, --cache-type-k TYPE\n"); - printf(" KV cache data type for K (default: %s)\n", params.cache_type_k.c_str()); - printf(" -ctv TYPE, --cache-type-v TYPE\n"); - printf(" KV cache data type for V (default: %s)\n", params.cache_type_v.c_str()); - printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n"); - printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); - printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n"); - printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); - printf(" --control-vector FNAME\n"); - printf(" add a control vector\n"); - printf(" --control-vector-scaled FNAME S\n"); - printf(" add a control vector with user defined scaling S\n"); - printf(" --control-vector-layer-range START END\n"); - printf(" layer range to apply the control vector(s) to, start and end inclusive\n"); - printf(" -m FNAME, --model FNAME\n"); - printf(" model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH); - printf(" -md FNAME, --model-draft FNAME\n"); - printf(" draft model for speculative decoding (default: unused)\n"); - printf(" -mu MODEL_URL, --model-url MODEL_URL\n"); - printf(" model download url (default: unused)\n"); - printf(" -hfr REPO, --hf-repo REPO\n"); - printf(" Hugging Face model repository (default: unused)\n"); - printf(" -hff FILE, --hf-file FILE\n"); - printf(" Hugging Face model file (default: unused)\n"); - 
printf(" -ld LOGDIR, --logdir LOGDIR\n"); - printf(" path under which to save YAML logs (no logging if unset)\n"); - printf(" -lcs FNAME, --lookup-cache-static FNAME\n"); - printf(" path to static lookup cache to use for lookup decoding (not updated by generation)\n"); - printf(" -lcd FNAME, --lookup-cache-dynamic FNAME\n"); - printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n"); - printf(" --override-kv KEY=TYPE:VALUE\n"); - printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); - printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); - printf(" -ptc N, --print-token-count N\n"); - printf(" print token count every N tokens (default: %d)\n", params.n_print); - printf(" --check-tensors check model tensor data for invalid values\n"); - printf("\n"); + options.push_back({ "*", "-ngl, --gpu-layers N", + "number of layers to store in VRAM" }); + options.push_back({ "*", "-ngld, --gpu-layers-draft N", + "number of layers to store in VRAM for the draft model" }); + options.push_back({ "*", "-sm, --split-mode SPLIT_MODE", + "how to split the model across multiple GPUs, one of:\n" + " - none: use one GPU only\n" + " - layer (default): split layers and KV across GPUs\n" + " - row: split rows across GPUs" }); + options.push_back({ "*", "-ts, --tensor-split SPLIT", + "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" }); + options.push_back({ "*", "-mg, --main-gpu i", "the GPU to use for the model (with split-mode = none),\n" + "or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu }); + } + + options.push_back({ "model" }); + options.push_back({ "*", " --check-tensors", "check model tensor data for invalid values (default: %s)", params.check_tensors ? 
"true" : "false" }); + options.push_back({ "*", " --override-kv KEY=TYPE:VALUE", + "advanced option to override model metadata by key. may be specified multiple times.\n" + "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" }); + options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" }); + options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" }); + options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" }); + options.push_back({ "*", " --control-vector FNAME", "add a control vector" }); + options.push_back({ "*", " --control-vector-scaled FNAME SCALE", + "add a control vector with user defined scaling SCALE" }); + options.push_back({ "*", " --control-vector-layer-range START END", + "layer range to apply the control vector(s) to, start and end inclusive" }); + options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n" + "or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH }); + options.push_back({ "*", "-md, --model-draft FNAME", "draft model for speculative decoding (default: unused)" }); + options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" }); + options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" }); + options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" }); + + options.push_back({ "retrieval" }); + options.push_back({ "retrieval", " --context-file FNAME", "file to load context from (repeat to specify multiple files)" }); + options.push_back({ "retrieval", " --chunk-size N", "minimum length of embedded text chunks (default: %d)", params.chunk_size }); + options.push_back({ "retrieval", " --chunk-separator STRING", + "separator between chunks (default: '%s')", 
params.chunk_separator.c_str() }); + + options.push_back({ "passkey" }); + options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk }); + options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos }); + + options.push_back({ "bench" }); + options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" }); + options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" }); + options.push_back({ "bench", "-ntg n0,n1,...", "number of text generation tokens" }); + options.push_back({ "bench", "-npl n0,n1,...", "number of parallel prompts" }); + + options.push_back({ "server" }); + options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() }); + options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port }); + options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() }); + options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? 
"enabled" : "disabled" }); + options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" }); + options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" }); + options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" }); + options.push_back({ "server", " --ssl-cert-file FNAME", "path to file a PEM-encoded SSL certificate" }); + options.push_back({ "server", " --timeout N", "server read/write timeout in seconds (default: %d)", params.timeout_read }); + options.push_back({ "server", " --system-prompt-file FNAME", + "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications" }); + options.push_back({ "server", " --log-format {text,json}", + "log output format: json or text (default: json)" }); + options.push_back({ "server", " --metrics", "enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled" }); + options.push_back({ "server", " --no-slots", "disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? 
"enabled" : "disabled" }); + options.push_back({ "server", " --slot-save-path PATH", "path to save slot kv cache (default: disabled)" }); + options.push_back({ "server", " --chat-template JINJA_TEMPLATE", + "set custom jinja chat template (default: template taken from model's metadata)\n" + "only commonly used templates are accepted:\n" + "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" }); + #ifndef LOG_DISABLE_LOGS - log_print_usage(); + options.push_back({ "logging" }); + options.push_back({ "*", " --simple-io", "use basic IO for better compatibility in subprocesses and limited consoles" }); + options.push_back({ "*", "-ld, --logdir LOGDIR", "path under which to save YAML logs (no logging if unset)" }); + options.push_back({ "logging", " --log-test", "Run simple logging test" }); + options.push_back({ "logging", " --log-disable", "Disable trace logs" }); + options.push_back({ "logging", " --log-enable", "Enable trace logs" }); + options.push_back({ "logging", " --log-file FNAME", "Specify a log filename (without extension)" }); + options.push_back({ "logging", " --log-new", "Create a separate new log file on start. " + "Each log file will have unique name: \"..log\"" }); + options.push_back({ "logging", " --log-append", "Don't truncate the old log file." 
}); #endif // LOG_DISABLE_LOGS + + printf("usage: %s [options]\n", argv[0]); + + for (const auto & o : options) { + if (!o.grp.empty()) { + printf("\n%s:\n\n", o.grp.c_str()); + continue; + } + printf(" %-32s", o.args.c_str()); + if (o.args.length() > 30) { + printf("\n%34s", ""); + } + + const auto desc = o.desc; + size_t start = 0; + size_t end = desc.find('\n'); + while (end != std::string::npos) { + printf("%s\n%34s", desc.substr(start, end - start).c_str(), ""); + start = end + 1; + end = desc.find('\n', start); + } + + printf("%s\n", desc.substr(start).c_str()); + } + printf("\n"); } std::string gpt_params_get_system_info(const gpt_params & params) { @@ -1610,24 +1928,6 @@ std::string string_get_sortable_timestamp() { return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns); } -std::string string_random_prompt(std::mt19937 & rng) { - const int r = rng() % 10; - switch (r) { - case 0: return "So"; - case 1: return "Once upon a time"; - case 2: return "When"; - case 3: return "The"; - case 4: return "After"; - case 5: return "If"; - case 6: return "import"; - case 7: return "He"; - case 8: return "She"; - case 9: return "They"; - } - - GGML_UNREACHABLE(); -} - void string_process_escapes(std::string & input) { std::size_t input_len = input.length(); std::size_t output_idx = 0; @@ -2503,6 +2803,12 @@ bool llama_should_add_bos_token(const llama_model * model) { return add_bos != -1 ? 
bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM); } +bool llama_chat_verify_template(const std::string & tmpl) { + llama_chat_message chat[] = {{"user", "test"}}; + int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0); + return res >= 0; +} + // // KV cache utils // @@ -2902,9 +3208,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str()); fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false"); yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str()); - fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false"); fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false"); - fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false"); fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false"); fprintf(stream, "keep: %d # default: 0\n", params.n_keep); fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str()); @@ -2954,7 +3258,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false"); fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false"); yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens); - fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? 
"true" : "false"); fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat); fprintf(stream, "reverse_prompt:\n"); diff --git a/common/common.h b/common/common.h index 264504830a7f0..e0a08a61b7424 100644 --- a/common/common.h +++ b/common/common.h @@ -60,7 +60,7 @@ struct gpt_params { int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) int32_t n_threads_batch_draft = -1; int32_t n_predict = -1; // new tokens to predict - int32_t n_ctx = 512; // context size + int32_t n_ctx = 0; // context size int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_keep = 0; // number of tokens to keep from initial prompt @@ -99,23 +99,23 @@ struct gpt_params { // // sampling parameters struct llama_sampling_params sparams; - std::string model = ""; // model path - std::string model_draft = ""; // draft model for speculative decoding + std::string model = ""; // model path + std::string model_draft = ""; // draft model for speculative decoding std::string model_alias = "unknown"; // model alias - std::string model_url = ""; // model url to download - std::string hf_repo = ""; // HF repo - std::string hf_file = ""; // HF file + std::string model_url = ""; // model url to download + std::string hf_repo = ""; // HF repo + std::string hf_file = ""; // HF file std::string prompt = ""; - std::string prompt_file = ""; // store the external prompt file name - std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state - std::string input_prefix = ""; // string to prefix user inputs with - std::string input_suffix = ""; // string to suffix user inputs with - std::vector antiprompt; // string upon seeing which more user input is prompted - std::string logdir = ""; // directory in which to save YAML log files + std::string prompt_file = ""; // store the 
external prompt file name + std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state + std::string input_prefix = ""; // string to prefix user inputs with + std::string input_suffix = ""; // string to suffix user inputs with + std::string logdir = ""; // directory in which to save YAML log files std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding - std::string logits_file = ""; // file for saving *all* logits + std::string logits_file = ""; // file for saving *all* logits + std::vector antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts) std::vector kv_overrides; // TODO: avoid tuple, use struct @@ -127,8 +127,8 @@ struct gpt_params { int32_t control_vector_layer_start = -1; // layer range for control vector int32_t control_vector_layer_end = -1; // layer range for control vector - int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used. - int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line + int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used. 
+ int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line // (which is more convenient to use for plotting) // bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt @@ -142,19 +142,17 @@ struct gpt_params { bool kl_divergence = false; // compute KL divergence - bool random_prompt = false; // do not randomize prompt if none provided + bool usage = false; // print usage bool use_color = false; // use color to distinguish generations and inputs - bool interactive = false; // interactive mode - bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode bool special = false; // enable special token output + bool interactive = false; // interactive mode + bool interactive_first = false; // wait for user input immediately bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix) - bool chatml = false; // chatml mode (used for models trained on chatml syntax) bool prompt_cache_all = false; // save user input and generations to prompt cache bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it bool embedding = false; // get only sentence embedding - bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\" - bool interactive_first = false; // wait for user input immediately + bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\" bool multiline_input = false; // reverse the usage of `\` bool simple_io = false; // improves compatibility with subprocesses and limited consoles bool cont_batching = true; // insert new sequences for decoding on-the-fly @@ -162,10 +160,10 @@ struct gpt_params { bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool ignore_eos = false; // ignore generated EOS tokens - bool instruct = false; // instruction mode (used for Alpaca models) bool logits_all = false; // return 
logits for all tokens in the batch bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory + bool verbose = false; bool verbose_prompt = false; // print prompt tokens before generation bool display_prompt = true; // print prompt before generation bool infill = false; // use infill mode @@ -180,6 +178,47 @@ struct gpt_params { // multimodal models (see examples/llava) std::string mmproj = ""; // path to multimodal projector std::vector image; // path to image file(s) + + // server params + int32_t port = 8080; + int32_t timeout_read = 600; + int32_t timeout_write = timeout_read; + int32_t n_threads_http = -1; + + std::string hostname = "127.0.0.1"; + std::string public_path = ""; + std::string chat_template = ""; + std::string system_prompt = ""; + + std::vector api_keys; + + std::string ssl_file_key = ""; + std::string ssl_file_cert = ""; + + bool endpoint_slots = true; + bool endpoint_metrics = false; + + bool log_json = false; + + std::string slot_save_path; + + // batched-bench params + bool is_pp_shared = false; + + std::vector n_pp; + std::vector n_tg; + std::vector n_pl; + + // retrieval params + std::vector context_files; // context files to embed + + int32_t chunk_size = 64; // chunk size for context embedding + + std::string chunk_separator = "\n"; // chunk separator for context embedding + + // passkey params + int32_t n_junk = 250; // number of times to repeat the junk text + int32_t i_pos = -1; // position of the passkey in the junk text }; void gpt_params_handle_model_default(gpt_params & params); @@ -199,7 +238,20 @@ std::vector string_split(std::string input, char separator); std::string string_strip(const std::string & str); std::string string_get_sortable_timestamp(); -std::string string_random_prompt(std::mt19937 & rng); + +template +static std::vector string_split(const std::string & str, char delim) { + std::vector values; + std::istringstream str_stream(str); + std::string token; + while 
(std::getline(str_stream, token, delim)) { + T value; + std::istringstream token_stream(token); + token_stream >> value; + values.push_back(value); + } + return values; +} bool string_parse_kv_override(const char * data, std::vector & overrides); void string_process_escapes(std::string & input); @@ -282,6 +334,13 @@ std::string llama_detokenize_bpe( // defaults to true when model type is SPM, otherwise false. bool llama_should_add_bos_token(const llama_model * model); +// +// Chat template utils +// + +// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid +bool llama_chat_verify_template(const std::string & tmpl); + // // KV cache utils // diff --git a/examples/batched-bench/README.md b/examples/batched-bench/README.md index bf951baf7f096..fa4baf6403e9e 100644 --- a/examples/batched-bench/README.md +++ b/examples/batched-bench/README.md @@ -10,16 +10,16 @@ There are 2 modes of operation: - `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. 
`N_KV = PP + B*TG`) ```bash -./batched-bench MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] [MMQ] +./batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps] # LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared -./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 2048 512 0 99 +./batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99 # LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared -./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 2048 512 1 99 +./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps # custom set of batches -./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 512 512 0 999 0 128,256,512 128,256 1,2,4,8,16,32 +./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 ``` ## Sample results diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 2924d8116f44f..718f0a61a1878 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -28,67 +28,27 @@ static std::vector parse_list(char * p) { return ret; } -int main(int argc, char ** argv) { - gpt_params params; - - if (argc == 1 || argv[1][0] == '-') { - printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [FATTN] [IS_PP_SHARED] [NGL] \n" , argv[0]); - printf(" , and PL are comma-separated lists of numbers without spaces\n\n"); - printf(" example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]); - return 1 ; - } - - int n_kv_max = 2048; - int n_batch = 2048; - int n_ubatch = 512; - bool flash_attn = false; - int is_pp_shared = 0; - int n_gpu_layers = 0; - - std::vector n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, }; - std::vector n_tg = { 128, 256, }; - std::vector n_pl = { 1, 2, 4, 8, 16, 32, }; - 
//std::vector n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, }; - - if (argc >= 2) { - params.model = argv[1]; - } - - if (argc >= 3) { - n_kv_max = std::atoi(argv[2]); - } - - if (argc >= 4) { - n_batch = std::atoi(argv[3]); - } - - if (argc >= 5) { - n_ubatch = std::atoi(argv[4]); - } - - if (argc >= 6) { - flash_attn = std::atoi(argv[5]); - } +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, argv, params); - if (argc >= 7) { - is_pp_shared = std::atoi(argv[6]); - } + LOG_TEE("\nexample usage:\n"); + LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]); + LOG_TEE("\n"); +} - if (argc >= 8) { - n_gpu_layers = std::atoi(argv[7]); - } +int main(int argc, char ** argv) { + gpt_params params; - if (argc >= 9) { - n_pp = parse_list(argv[8]); + if (!gpt_params_parse(argc, argv, params)) { + print_usage(argc, argv, params); + return 1; } - if (argc >= 10) { - n_tg = parse_list(argv[9]); - } + int is_pp_shared = params.is_pp_shared; - if (argc >= 11) { - n_pl = parse_list(argv[10]); - } + std::vector n_pp = params.n_pp; + std::vector n_tg = params.n_tg; + std::vector n_pl = params.n_pl; // init LLM @@ -97,12 +57,7 @@ int main(int argc, char ** argv) { // initialize the model - llama_model_params model_params = llama_model_default_params(); - - const std::vector t_split(llama_max_devices(), 0.0f); - - model_params.n_gpu_layers = n_gpu_layers; - model_params.tensor_split = t_split.data(); + llama_model_params model_params = llama_model_params_from_gpt_params(params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); @@ -111,16 +66,7 @@ int main(int argc, char ** argv) { return 1; } - llama_context_params ctx_params = llama_context_default_params(); - - ctx_params.seed = 1234; - ctx_params.n_ctx = n_kv_max; - ctx_params.n_batch = n_batch; - ctx_params.n_ubatch = n_ubatch; - 
ctx_params.flash_attn = flash_attn; - - ctx_params.n_threads = params.n_threads; - ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + llama_context_params ctx_params = llama_context_params_from_gpt_params(params); // ensure enough sequences are available ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end()); @@ -132,6 +78,8 @@ int main(int argc, char ** argv) { return 1; } + const int32_t n_kv_max = llama_n_ctx(ctx); + llama_batch batch = llama_batch_init(n_kv_max, 0, 1); // decode in batches of ctx_params.n_batch tokens @@ -175,7 +123,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, flash_attn, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); + LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); LOG_TEE("\n"); LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); diff --git a/examples/batched/README.md b/examples/batched/README.md index 5d730331769fb..ed204c3088882 100644 --- a/examples/batched/README.md +++ b/examples/batched/README.md @@ -3,7 +3,7 @@ The example demonstrates batched generation from a given prompt ```bash -./batched ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" 4 +./batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4 ... 
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 591bc6e57645c..62d9b144d3340 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -7,48 +7,31 @@ #include #include -int main(int argc, char ** argv) { - gpt_params params; - - if (argc == 1 || argv[1][0] == '-') { - printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]); - return 1 ; - } - - // number of parallel batches - int n_parallel = 1; +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, argv, params); - // total length of the sequences including the prompt - int n_len = 32; - - // number of layers to offload to the GPU - int n_gpu_layers = 0; - - if (argc >= 2) { - params.model = argv[1]; - } + LOG_TEE("\nexample usage:\n"); + LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]); + LOG_TEE("\n"); +} - if (argc >= 3) { - params.prompt = argv[2]; - } +int main(int argc, char ** argv) { + gpt_params params; - if (argc >= 4) { - n_parallel = std::atoi(argv[3]); - } + params.prompt = "Hello my name is"; + params.n_predict = 32; - if (argc >= 5) { - n_len = std::atoi(argv[4]); + if (!gpt_params_parse(argc, argv, params)) { + print_usage(argc, argv, params); + return 1; } - if (argc >= 6) { - n_gpu_layers = std::atoi(argv[5]); - } - if (params.prompt.empty()) { - params.prompt = "Hello my name is"; - } + // number of parallel batches + int n_parallel = params.n_parallel; - string_process_escapes(params.prompt); + // total length of the sequences including the prompt + int n_predict = 32; // init LLM @@ -57,9 +40,7 @@ int main(int argc, char ** argv) { // initialize the model - llama_model_params model_params = llama_model_default_params(); - - model_params.n_gpu_layers = n_gpu_layers; + llama_model_params model_params = llama_model_params_from_gpt_params(params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); @@ -73,18 
+54,14 @@ int main(int argc, char ** argv) { std::vector tokens_list; tokens_list = ::llama_tokenize(model, params.prompt, true); - const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel; + const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel; // initialize the context - llama_context_params ctx_params = llama_context_default_params(); + llama_context_params ctx_params = llama_context_params_from_gpt_params(params); - ctx_params.seed = 1234; ctx_params.n_ctx = n_kv_req; - ctx_params.n_batch = std::max(n_len, n_parallel); - ctx_params.n_seq_max = n_parallel; - ctx_params.n_threads = params.n_threads; - ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + ctx_params.n_batch = std::max(n_predict, n_parallel); llama_context * ctx = llama_new_context_with_model(model, ctx_params); @@ -93,9 +70,9 @@ int main(int argc, char ** argv) { return 1; } - const int n_ctx = llama_n_ctx(ctx); + const int n_ctx = llama_n_ctx(ctx); - LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req); + LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req); // make sure the KV cache is big enough to hold all the prompt and generated tokens if (n_kv_req > n_ctx) { @@ -156,7 +133,7 @@ int main(int argc, char ** argv) { const auto t_main_start = ggml_time_us(); - while (n_cur <= n_len) { + while (n_cur <= n_predict) { // prepare the next batch llama_batch_clear(batch); @@ -192,7 +169,7 @@ int main(int argc, char ** argv) { //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); // is it an end of generation? 
-> mark the stream as finished - if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { + if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { i_batch[i] = -1; LOG_TEE("\n"); if (n_parallel > 1) { diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 004399b5f7eb8..244751e003d9e 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -63,6 +63,7 @@ int main(int argc, char ** argv) { gpt_params params; if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } @@ -79,9 +80,6 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); std::mt19937 rng(params.seed); - if (params.random_prompt) { - params.prompt = string_random_prompt(rng); - } llama_backend_init(); llama_numa_init(params.numa); diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 51d67d6d97ae6..64cd338c26351 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -140,20 +140,18 @@ static bool run(llama_context * ctx, const gpt_params & params) { } int main(int argc, char ** argv) { - callback_data cb_data; gpt_params params; + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } print_build_info(); std::mt19937 rng(params.seed); - if (params.random_prompt) { - params.prompt = string_random_prompt(rng); - } llama_backend_init(); llama_numa_init(params.numa); diff --git a/examples/gguf-split/tests.sh b/examples/gguf-split/tests.sh index 7ca6fa7f20de8..3bc0fa47110e3 100755 --- a/examples/gguf-split/tests.sh +++ b/examples/gguf-split/tests.sh @@ -41,7 +41,7 @@ echo PASS echo # 2b. 
Test the sharded model is loading properly -$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --random-prompt --n-predict 32 +$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32 echo PASS echo @@ -51,7 +51,7 @@ echo PASS echo # 3b. Test the merged model is loading properly -$MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32 +$MAIN --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32 echo PASS echo @@ -61,7 +61,7 @@ echo PASS echo # 4b. Test the sharded model is loading properly -$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --random-prompt --n-predict 32 +$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32 echo PASS echo @@ -71,7 +71,7 @@ echo #echo # 5b. Test the merged model is loading properly -#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --random-prompt --n-predict 32 +#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32 #echo PASS #echo @@ -81,7 +81,7 @@ echo PASS echo # 6b. 
Test the sharded model is loading properly -$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --random-prompt --n-predict 32 +$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32 echo PASS echo diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 52fd719b38ee5..2135157916c97 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -153,7 +153,9 @@ static std::string gritlm_instruction(const std::string & instruction) { int main(int argc, char * argv[]) { gpt_params params; + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 25a2351cc64d3..e050c09d2f38b 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -533,7 +533,6 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool } int main(int argc, char ** argv) { - StatParams sparams; std::string prev_result_file; std::string combine_files; @@ -581,7 +580,9 @@ int main(int argc, char ** argv) { gpt_params params; params.n_batch = 512; - if (!gpt_params_parse(args.size(), args.data(), params)) { + + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } @@ -597,9 +598,6 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); std::mt19937 rng(params.seed); - if (params.random_prompt) { - params.prompt = string_random_prompt(rng); - } sparams.dataset = params.prompt_file; g_collector.set_parameters(std::move(sparams)); diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 539f781847893..0e4ec79c693fa 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -107,6 +107,7 @@ int main(int argc, char ** argv) { g_params = ¶ms; if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } @@ 
-139,27 +140,6 @@ int main(int argc, char ** argv) { LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); params.n_ctx = 8; } - if (params.instruct) { - printf("\n************\n"); - printf("%s: please use the 'main' tool for instruct mode\n", __func__); - printf("************\n\n"); - - return 0; - } - if (params.chatml) { - printf("\n************\n"); - printf("%s: please use the 'main' tool for chatml mode\n", __func__); - printf("************\n\n"); - - return 0; - } - if (!params.antiprompt.empty()) { - printf("\n************\n"); - printf("%s: please use the 'main' tool for antiprompt mode\n", __func__); - printf("************\n\n"); - - return 0; - } if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) { printf("\n************\n"); printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__); @@ -167,20 +147,6 @@ int main(int argc, char ** argv) { return 0; } - if (params.random_prompt) { - printf("\n************\n"); - printf("%s: please use the 'main' tool for random prompt mode\n", __func__); - printf("************\n\n"); - - return 0; - } - if (!params.path_prompt_cache.empty()) { - printf("\n************\n"); - printf("%s: infill does not support prompt caching\n", __func__); - printf("************\n\n"); - - return 0; - } if (params.rope_freq_base != 0.0) { LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); @@ -207,17 +173,13 @@ int main(int argc, char ** argv) { llama_model * model; llama_context * ctx; - llama_context * ctx_guidance = NULL; + g_model = &model; g_ctx = &ctx; // load the model and apply lora adapter, if any LOG("%s: load the model and apply lora adapter, if any\n", __func__); std::tie(model, ctx) = llama_init_from_gpt_params(params); - if (sparams.cfg_scale > 1.f) { - struct llama_context_params lparams = llama_context_params_from_gpt_params(params); - ctx_guidance = 
llama_new_context_with_model(model, lparams); - } if (model == NULL) { LOG_TEE("%s: error: unable to load model\n", __func__); @@ -273,25 +235,6 @@ int main(int argc, char ** argv) { LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); } - // Tokenize negative prompt - std::vector guidance_inp; - int guidance_offset = 0; - int original_prompt_len = 0; - if (ctx_guidance) { - LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt)); - - guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true); - LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str()); - - std::vector original_inp = ::llama_tokenize(ctx, params.prompt, true); - LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str()); - - original_prompt_len = original_inp.size(); - guidance_offset = (int)guidance_inp.size() - original_prompt_len; - LOG("original_prompt_len: %s", log_tostr(original_prompt_len)); - LOG("guidance_offset: %s", log_tostr(guidance_offset)); - } - if ((int) embd_inp.size() > n_ctx - 4) { LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); return 1; @@ -319,15 +262,6 @@ int main(int argc, char ** argv) { LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); } - if (ctx_guidance) { - LOG_TEE("\n"); - LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str()); - LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); - for (int i = 0; i < (int) guidance_inp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); - } - } - if (params.n_keep > 0) { LOG_TEE("%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { @@ -395,12 +329,11 @@ int main(int argc, char ** argv) { is_interacting = 
params.interactive_first; } - bool input_echo = true; + bool input_echo = true; - int n_past = 0; - int n_remain = params.n_predict; - int n_consumed = 0; - int n_past_guidance = 0; + int n_past = 0; + int n_remain = params.n_predict; + int n_consumed = 0; std::vector input_tokens; g_input_tokens = &input_tokens; std::vector output_tokens; g_output_tokens = &output_tokens; @@ -410,7 +343,6 @@ int main(int argc, char ** argv) { console::set_display(console::prompt); std::vector embd; - std::vector embd_guidance; struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); @@ -436,7 +368,7 @@ int main(int argc, char ** argv) { // if we run out of context: // - take the n_keep first tokens from the original prompt (via n_past) // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches - if (n_past + (int) embd.size() + std::max(0, guidance_offset) > n_ctx) { + if (n_past + (int) embd.size() > n_ctx) { if (params.n_predict == -2) { LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); break; @@ -453,11 +385,7 @@ int main(int argc, char ** argv) { n_past -= n_discard; - if (ctx_guidance) { - n_past_guidance -= n_discard; - } - - LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); + LOG("after swap: n_past = %d\n", n_past); LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); @@ -465,45 +393,6 @@ int main(int argc, char ** argv) { // evaluate tokens in batches // embd is typically prepared beforehand to fit within a batch, but not always - - if (ctx_guidance) { - int input_size = 0; - llama_token * input_buf = NULL; - - if (n_past_guidance < (int) guidance_inp.size()) { - // Guidance context should have the same data with these modifications: - // - // * Replace the initial prompt - // * Shift everything by guidance_offset - embd_guidance = guidance_inp; - if (embd.begin() + original_prompt_len < embd.end()) { - embd_guidance.insert( - 
embd_guidance.end(), - embd.begin() + original_prompt_len, - embd.end() - ); - } - - input_buf = embd_guidance.data(); - input_size = embd_guidance.size(); - - LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str()); - } else { - input_buf = embd.data(); - input_size = embd.size(); - } - - for (int i = 0; i < input_size; i += params.n_batch) { - int n_eval = std::min(input_size - i, params.n_batch); - if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) { - LOG_TEE("%s : failed to eval\n", __func__); - return 1; - } - - n_past_guidance += n_eval; - } - } - for (int i = 0; i < (int) embd.size(); i += params.n_batch) { int n_eval = (int) embd.size() - i; if (n_eval > params.n_batch) { @@ -525,11 +414,9 @@ int main(int argc, char ** argv) { } embd.clear(); - embd_guidance.clear(); if ((int) embd_inp.size() <= n_consumed && !is_interacting) { - - const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); + const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr); llama_sampling_accept(ctx_sampling, ctx, id, true); @@ -583,7 +470,6 @@ int main(int argc, char ** argv) { // if not currently processing queued inputs; if ((int) embd_inp.size() <= n_consumed) { - // deal with eot token in infill mode if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){ if (is_interacting && !params.interactive_first) { @@ -644,7 +530,6 @@ int main(int argc, char ** argv) { embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); embd_inp.push_back(llama_token_middle(model)); embd.clear(); - embd_guidance.clear(); n_remain = params.n_predict; n_past = 0; n_consumed = 0; @@ -751,7 +636,6 @@ int main(int argc, char ** argv) { llama_print_timings(ctx); write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); - if (ctx_guidance) { llama_free(ctx_guidance); } llama_free(ctx); llama_free_model(model); diff 
--git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index fa7ad1bdb9e4a..5c31548a6c25c 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -41,20 +41,6 @@ static std::string join(const std::vector & values, const std::string & delim return str.str(); } -template -static std::vector split(const std::string & str, char delim) { - std::vector values; - std::istringstream str_stream(str); - std::string token; - while (std::getline(str_stream, token, delim)) { - T value; - std::istringstream token_stream(token); - token_stream >> value; - values.push_back(value); - } - return values; -} - template static std::vector transform_to_str(const std::vector & values, F f) { std::vector str_values; @@ -322,28 +308,28 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); params.model.insert(params.model.end(), p.begin(), p.end()); } else if (arg == "-p" || arg == "--n-prompt") { if (++i >= argc) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end()); } else if (arg == "-n" || arg == "--n-gen") { if (++i >= argc) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); params.n_gen.insert(params.n_gen.end(), p.begin(), p.end()); } else if (arg == "-pg") { if (++i >= argc) { invalid_param = true; break; } - auto p = split(argv[i], ','); + auto p = string_split(argv[i], ','); if (p.size() != 2) { invalid_param = true; break; @@ -354,21 +340,21 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); params.n_batch.insert(params.n_batch.end(), p.begin(), p.end()); } 
else if (arg == "-ub" || arg == "--ubatch-size") { if (++i >= argc) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end()); } else if (arg == "-ctk" || arg == "--cache-type-k") { if (++i >= argc) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); std::vector types; for (const auto & t : p) { ggml_type gt = ggml_type_from_name(t); @@ -384,7 +370,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); std::vector types; for (const auto & t : p) { ggml_type gt = ggml_type_from_name(t); @@ -400,14 +386,14 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); params.n_threads.insert(params.n_threads.end(), p.begin(), p.end()); } else if (arg == "-ngl" || arg == "--n-gpu-layers") { if (++i >= argc) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); } else if (arg == "-rpc" || arg == "--rpc") { if (++i >= argc) { @@ -420,7 +406,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); std::vector modes; for (const auto & m : p) { llama_split_mode mode; @@ -442,13 +428,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - params.main_gpu = split(argv[i], split_delim); + params.main_gpu = string_split(argv[i], split_delim); } else if (arg == "-nkvo" || arg == "--no-kv-offload") { if (++i >= argc) { invalid_param = 
true; break; } - auto p = split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end()); } else if (arg == "--numa") { if (++i >= argc) { @@ -466,28 +452,28 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end()); } else if (arg == "-mmp" || arg == "--mmap") { if (++i >= argc) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end()); } else if (arg == "-embd" || arg == "--embeddings") { if (++i >= argc) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); params.embeddings.insert(params.embeddings.end(), p.begin(), p.end()); } else if (arg == "-ts" || arg == "--tensor-split") { if (++i >= argc) { invalid_param = true; break; } - for (auto ts : split(argv[i], split_delim)) { + for (auto ts : string_split(argv[i], split_delim)) { // split string by ; and / const std::regex regex{R"([;/]+)"}; std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1}; diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index c974900f21e20..8c7dd2ae3d0dc 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -112,9 +112,12 @@ struct llava_context { struct llama_model * model = NULL; }; -static void show_additional_info(int /*argc*/, char ** argv) { - LOG_TEE("\n example usage: %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); - LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n"); +static void print_usage(int argc, char ** argv, const gpt_params & params) { + 
gpt_params_print_usage(argc, argv, params); + + LOG_TEE("\n example usage:\n"); + LOG_TEE("\n %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); + LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n"); } static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) { @@ -278,7 +281,7 @@ int main(int argc, char ** argv) { gpt_params params; if (!gpt_params_parse(argc, argv, params)) { - show_additional_info(argc, argv); + print_usage(argc, argv, params); return 1; } @@ -290,8 +293,7 @@ int main(int argc, char ** argv) { #endif // LOG_DISABLE_LOGS if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { - gpt_params_print_usage(argc, argv, params); - show_additional_info(argc, argv); + print_usage(argc, argv, {}); return 1; } auto model = llava_init(¶ms); diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index 54f060a85b263..fb20ad93f9c1d 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -37,7 +37,8 @@ struct ngram_container { int main(int argc, char ** argv) { gpt_params params; - if (gpt_params_parse(argc, argv, params) == false) { + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } diff --git a/examples/lookup/lookup-create.cpp b/examples/lookup/lookup-create.cpp index 1c230c9667c71..d713f6f2194a8 100644 --- a/examples/lookup/lookup-create.cpp +++ b/examples/lookup/lookup-create.cpp @@ -14,8 +14,10 @@ int main(int argc, char ** argv){ gpt_params params; if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } + // init llama.cpp llama_backend_init(); llama_numa_init(params.numa); diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp index 87ecc0a4f1394..0b171c87273d1 100644 --- 
a/examples/lookup/lookup-stats.cpp +++ b/examples/lookup/lookup-stats.cpp @@ -16,6 +16,7 @@ int main(int argc, char ** argv){ gpt_params params; if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index 83dbee91a8362..80ecd925d5962 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -15,6 +15,7 @@ int main(int argc, char ** argv){ gpt_params params; if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } diff --git a/examples/main/README.md b/examples/main/README.md index ee930f4e79a0d..4eaa684758b3e 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -53,13 +53,13 @@ The following command generates "infinite" text from a starting prompt (you can #### Unix-based systems (Linux, macOS, etc.): ```bash -./main -m models/7B/ggml-model.bin --ignore-eos -n -1 --random-prompt +./main -m models/7B/ggml-model.bin --ignore-eos -n -1 ``` #### Windows: ```powershell -main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt +main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 ``` ## Common Options @@ -80,7 +80,6 @@ The `main` program provides several ways to interact with the LLaMA models using - `--prompt PROMPT`: Provide a prompt directly as a command-line option. - `--file FNAME`: Provide a file containing a prompt or multiple prompts. - `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.) -- `--random-prompt`: Start with a randomized prompt. 
## Interaction diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 44949ba869e70..b97b7b7937f02 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -122,8 +122,10 @@ int main(int argc, char ** argv) { g_params = ¶ms; if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } + llama_sampling_params & sparams = params.sparams; #ifndef LOG_DISABLE_LOGS @@ -180,9 +182,6 @@ int main(int argc, char ** argv) { LOG_TEE("%s: seed = %u\n", __func__, params.seed); std::mt19937 rng(params.seed); - if (params.random_prompt) { - params.prompt = string_random_prompt(rng); - } LOG("%s: llama backend init\n", __func__); llama_backend_init(); @@ -250,11 +249,8 @@ int main(int argc, char ** argv) { std::vector embd_inp; - if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) { + if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) { LOG("tokenize the prompt\n"); - if (params.chatml) { - params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>"; - } embd_inp = ::llama_tokenize(ctx, params.prompt, true, true); } else { LOG("use session tokens\n"); @@ -332,37 +328,13 @@ int main(int argc, char ** argv) { } // number of tokens to keep when resetting context - if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) { + if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) { params.n_keep = (int)embd_inp.size(); } else { params.n_keep += add_bos; // always keep the BOS token } - // prefix & suffix for instruct mode - const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true); - const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true); - - LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str()); - LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str()); - - // chatml prefix & suffix - 
const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true); - const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true); - - LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str()); - LOG("cml_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_sfx).c_str()); - - // in instruct mode, we inject a prefix and a suffix to each input by the user - if (params.instruct) { - params.interactive_first = true; - params.antiprompt.emplace_back("### Instruction:\n\n"); - } - // similar for chatml mode - else if (params.chatml) { - params.interactive_first = true; - params.antiprompt.emplace_back("<|im_start|>user\n"); - } - else if (params.conversation) { + if (params.conversation) { params.interactive_first = true; } @@ -823,15 +795,13 @@ int main(int argc, char ** argv) { is_interacting = true; printf("\n"); - } else if (params.instruct || params.chatml) { - is_interacting = true; } } if (n_past > 0 && is_interacting) { LOG("waiting for user input\n"); - if (params.conversation || params.instruct || params.chatml) { + if (params.conversation) { printf("\n> "); } @@ -874,24 +844,12 @@ int main(int argc, char ** argv) { const size_t original_size = embd_inp.size(); - // instruct mode: insert instruction prefix - if (params.instruct && !is_antiprompt) { - LOG("inserting instruction prefix\n"); - n_consumed = embd_inp.size(); - embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end()); - } - // chatml mode: insert user chat prefix - if (params.chatml && !is_antiprompt) { - LOG("inserting chatml prefix\n"); - n_consumed = embd_inp.size(); - embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end()); - } if (params.escape) { string_process_escapes(buffer); } const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); - const auto line_inp = ::llama_tokenize(ctx, buffer, false, params.interactive_specials); + const auto line_inp = ::llama_tokenize(ctx, buffer, false, false); const auto 
line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); @@ -900,17 +858,6 @@ int main(int argc, char ** argv) { embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end()); - // instruct mode: insert response suffix - if (params.instruct) { - LOG("inserting instruction suffix\n"); - embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); - } - // chatml mode: insert assistant chat suffix - if (params.chatml) { - LOG("inserting chatml suffix\n"); - embd_inp.insert(embd_inp.end(), cml_sfx.begin(), cml_sfx.end()); - } - for (size_t i = original_size; i < embd_inp.size(); ++i) { const llama_token token = embd_inp[i]; output_tokens.push_back(token); @@ -935,7 +882,7 @@ int main(int argc, char ** argv) { } // end of generation - if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.instruct || params.interactive || params.chatml)) { + if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) { LOG_TEE(" [end of text]\n"); break; } diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index c731abb726dc2..7faeaec975ae3 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -100,7 +100,8 @@ int main(int argc, char ** argv) { gpt_params params; - if (gpt_params_parse(argc, argv, params) == false) { + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } diff --git a/examples/passkey/README.md b/examples/passkey/README.md index 4a22bb55975be..9e7a119ba3e0b 100644 --- a/examples/passkey/README.md +++ b/examples/passkey/README.md @@ -8,5 +8,5 @@ See the following PRs for more info: ### Usage ```bash -make -j && ./passkey ./models/llama-7b-v2/ggml-model-f16.gguf 250 +make -j && ./passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250 ``` diff --git 
a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index f2ef9ca10d4a2..d03215cd1e0a9 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -6,46 +6,32 @@ #include #include -int main(int argc, char ** argv) { - gpt_params params; - - if (argc == 1 || argv[1][0] == '-') { - printf("usage: %s MODEL_PATH N_JUNK N_GRP I_POS SEED\n" , argv[0]); - return 1 ; - } - - int seed = -1; +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, argv, params); - int n_junk = 250; // number of times to repeat the junk text - int n_keep = 32; // number of tokens in the prompt prefix - int n_grp = 1; // if more than 1 - perform LongLM SelfExtend - int i_pos = -1; // position of the passkey in the junk text - - if (argc >= 2) { - params.model = argv[1]; - } - - if (argc >= 3) { - n_junk = std::stoi(argv[2]); - } + LOG_TEE("\nexample usage:\n"); + LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]); + LOG_TEE("\n"); +} - if (argc >= 4) { - n_grp = std::stoi(argv[3]); - } +int main(int argc, char ** argv) { + gpt_params params; - if (argc >= 5) { - i_pos = std::stoi(argv[4]); - } + params.n_junk = 250; + params.n_keep = 32; + params.i_pos = -1; - if (argc >= 6) { - seed = std::stoi(argv[5]); + if (!gpt_params_parse(argc, argv, params)) { + print_usage(argc, argv, params); + return 1; } - if (seed == -1) { - seed = time(NULL); - } + srand(params.seed == LLAMA_DEFAULT_SEED ? 
time(NULL) : params.seed); - srand(seed); + int n_junk = params.n_junk; + int n_keep = params.n_keep; + int n_grp = params.grp_attn_n; + int i_pos = params.i_pos; if (i_pos == -1) { i_pos = rand() % n_junk; @@ -76,9 +62,7 @@ int main(int argc, char ** argv) { // initialize the model - llama_model_params model_params = llama_model_default_params(); - - model_params.n_gpu_layers = 99; // offload all layers to the GPU + llama_model_params model_params = llama_model_params_from_gpt_params(params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); @@ -89,13 +73,9 @@ int main(int argc, char ** argv) { // initialize the context - llama_context_params ctx_params = llama_context_default_params(); + llama_context_params ctx_params = llama_context_params_from_gpt_params(params); - ctx_params.seed = seed; - ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep; - ctx_params.n_batch = 512; - ctx_params.n_threads = params.n_threads; - ctx_params.n_threads_batch = params.n_threads_batch == -1 ? 
params.n_threads : params.n_threads_batch; + ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep; GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp"); @@ -135,7 +115,7 @@ int main(int argc, char ** argv) { LOG_TEE("prompt tokens: %d\n", n_tokens_all); //LOG_TEE("prompt: %s\n", params.prompt.c_str()); - llama_batch batch = llama_batch_init(512, 0, 1); + llama_batch batch = llama_batch_init(params.n_batch, 0, 1); int n_past = 0; diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 30e5e282ef5cf..0bd78c21a86a1 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1032,7 +1032,7 @@ struct winogrande_entry { std::vector seq_tokens[2]; }; -static std::vector load_winogrande_from_csv(const std::string& prompt) { +static std::vector load_winogrande_from_csv(const std::string & prompt) { std::vector result; std::istringstream in(prompt); std::string line; @@ -1964,12 +1964,14 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { int main(int argc, char ** argv) { gpt_params params; + params.n_ctx = 512; + params.logits_all = true; + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } - params.logits_all = true; - const int32_t n_ctx = params.n_ctx; if (n_ctx <= 0) { @@ -2006,9 +2008,6 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); std::mt19937 rng(params.seed); - if (params.random_prompt) { - params.prompt = string_random_prompt(rng); - } llama_backend_init(); llama_numa_init(params.numa); @@ -2027,6 +2026,7 @@ int main(int argc, char ** argv) { } const int n_ctx_train = llama_n_ctx_train(model); + if (params.n_ctx > n_ctx_train) { fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, params.n_ctx); diff --git a/examples/quantize/tests.sh b/examples/quantize/tests.sh index 
a3ca74c68e7e5..38e28ffc365ee 100644 --- a/examples/quantize/tests.sh +++ b/examples/quantize/tests.sh @@ -47,7 +47,7 @@ echo PASS echo # 3a. Test the requanted model is loading properly -$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --random-prompt --n-predict 32 +$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --n-predict 32 echo PASS echo @@ -57,7 +57,7 @@ echo PASS echo # 4b. Test the requanted model is loading properly -$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --random-prompt --n-predict 32 +$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --n-predict 32 echo PASS echo diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 4e7530706d4a9..55b7b2f70ae2a 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -4,72 +4,12 @@ #include #include -struct retrieval_params { - std::vector context_files; // context files to embed - int32_t chunk_size = 64; // chunk size for context embedding - std::string chunk_separator = "\n"; // chunk separator for context embedding -}; +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, argv, params); -static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) { - gpt_params_print_usage(argc, argv, gpt_params); - printf("retrieval options:\n"); - printf(" --context-file FNAME file containing context to embed.\n"); - printf(" specify multiple files by providing --context-file option multiple times.\n"); - printf(" --chunk-size N minimum length of embedded text chunk (default:%d)\n", params.chunk_size); - printf(" --chunk-separator STRING\n"); - printf(" string to separate chunks (default: \"\\n\")\n"); - printf("\n"); -} - -static void retrieval_params_parse(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & retrieval_params) { - int i = 1; - std::string arg; - while (i < argc) { - arg = 
argv[i]; - bool invalid_gpt_param = false; - if(gpt_params_find_arg(argc, argv, argv[i], gpt_params, i, invalid_gpt_param)) { - if (invalid_gpt_param) { - fprintf(stderr, "error: invalid argument: %s\n", arg.c_str()); - retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params); - exit(1); - } - // option was parsed by gpt_params_find_arg - } else if (arg == "--context-file") { - if (++i >= argc) { - fprintf(stderr, "error: missing argument for --context-file\n"); - retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params); - exit(1); - } - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params); - exit(1); - } - // store the external file name in params - retrieval_params.context_files.push_back(argv[i]); - } else if (arg == "--chunk-size") { - if (++i >= argc) { - fprintf(stderr, "error: missing argument for --chunk-size\n"); - retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params); - exit(1); - } - retrieval_params.chunk_size = std::stoi(argv[i]); - } else if (arg == "--chunk-separator") { - if (++i >= argc) { - fprintf(stderr, "error: missing argument for --chunk-separator\n"); - retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params); - exit(1); - } - retrieval_params.chunk_separator = argv[i]; - } else { - // unknown argument - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params); - exit(1); - } - i++; - } + LOG_TEE("\nexample usage:\n"); + LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]); + LOG_TEE("\n"); } struct chunk { @@ -171,33 +111,35 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu int main(int argc, char ** argv) { gpt_params params; - 
retrieval_params retrieval_params; - retrieval_params_parse(argc, argv, params, retrieval_params); + if (!gpt_params_parse(argc, argv, params)) { + print_usage(argc, argv, params); + return 1; + } // For BERT models, batch size must be equal to ubatch size params.n_ubatch = params.n_batch; + params.embedding = true; - if (retrieval_params.chunk_size <= 0) { + if (params.chunk_size <= 0) { fprintf(stderr, "chunk_size must be positive\n"); return 1; } - if (retrieval_params.context_files.empty()) { + if (params.context_files.empty()) { fprintf(stderr, "context_files must be specified\n"); return 1; } - params.embedding = true; print_build_info(); printf("processing files:\n"); - for (auto & context_file : retrieval_params.context_files) { + for (auto & context_file : params.context_files) { printf("%s\n", context_file.c_str()); } std::vector chunks; - for (auto & context_file : retrieval_params.context_files) { - std::vector file_chunk = chunk_file(context_file, retrieval_params.chunk_size, retrieval_params.chunk_separator); + for (auto & context_file : params.context_files) { + std::vector file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator); chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end()); } printf("Number of chunks: %ld\n", chunks.size()); @@ -242,7 +184,7 @@ int main(int argc, char ** argv) { return 1; } // add eos if not present - if (inp.empty() || inp.back() != llama_token_eos(model)) { + if (llama_token_eos(model) >= 0 && (inp.empty() || inp.back() != llama_token_eos(model))) { inp.push_back(llama_token_eos(model)); } chunk.tokens = inp; diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index c3b766882dbec..00c2277ac2827 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -11,6 +11,7 @@ int main(int argc, char ** argv) { params.prompt = "The quick brown fox"; if (!gpt_params_parse(argc, argv, params)) 
{ + gpt_params_print_usage(argc, argv, params); return 1; } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index fc6d90848f099..d581cad95974d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -123,29 +123,6 @@ struct slot_params { json input_suffix; }; -struct server_params { - int32_t port = 8080; - int32_t read_timeout = 600; - int32_t write_timeout = 600; - int32_t n_threads_http = -1; - - std::string hostname = "127.0.0.1"; - std::string public_path = ""; - std::string chat_template = ""; - std::string system_prompt = ""; - - std::vector api_keys; - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - std::string ssl_key_file = ""; - std::string ssl_cert_file = ""; -#endif - - bool slots_endpoint = true; - bool metrics_endpoint = false; - std::string slot_save_path; -}; - struct server_slot { int id; int id_task = -1; @@ -1261,7 +1238,7 @@ struct server_context { } json get_formated_generation(const server_slot & slot) const { - const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model)); + const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model)); const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second); std::vector samplers_sequence; @@ -2334,561 +2311,6 @@ struct server_context { } }; -static void server_print_usage(const char * argv0, const gpt_params & params, const server_params & sparams) { - printf("usage: %s [options]\n", argv0); - printf("\n"); - printf("options:\n"); - printf(" -h, --help show this help message and exit\n"); - printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? 
"enabled" : "disabled"); - printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n"); - printf(" --threads-http N number of threads in the http server pool to process requests (default: max(hardware concurrency - 1, --parallel N + 2))\n"); - printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); - printf(" --rope-scaling {none,linear,yarn}\n"); - printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n"); - printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n"); - printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n"); - printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n"); - printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n"); - printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); - printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); - printf(" --pooling {none,mean,cls} pooling type for embeddings, use model default if unspecified\n"); - printf(" -dt N, --defrag-thold N\n"); - printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold); - printf(" -b N, --batch-size N logical maximum batch size (default: %d)\n", params.n_batch); - printf(" -ub N, --ubatch-size N physical maximum batch size (default: %d)\n", params.n_ubatch); - if (llama_supports_mlock()) { - printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); - } - if (llama_supports_mmap()) { - printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); - } - printf(" --numa TYPE 
attempt optimizations that help on some NUMA systems\n"); - printf(" - distribute: spread execution evenly over all nodes\n"); - printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n"); - printf(" - numactl: use the CPU map provided my numactl\n"); - if (llama_supports_gpu_offload()) { - printf(" -ngl N, --n-gpu-layers N\n"); - printf(" number of layers to store in VRAM\n"); - printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n"); - printf(" how to split the model across multiple GPUs, one of:\n"); - printf(" - none: use one GPU only\n"); - printf(" - layer (default): split layers and KV across GPUs\n"); - printf(" - row: split rows across GPUs\n"); - printf(" -ts SPLIT --tensor-split SPLIT\n"); - printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n"); - printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); - printf(" or for intermediate results and KV (with split-mode = row)\n"); - printf(" -nkvo, --no-kv-offload\n"); - printf(" disable KV offload\n"); - } - printf(" -m FNAME, --model FNAME\n"); - printf(" model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH); - printf(" -mu MODEL_URL, --model-url MODEL_URL\n"); - printf(" model download url (default: unused)\n"); - printf(" -hfr REPO, --hf-repo REPO\n"); - printf(" Hugging Face model repository (default: unused)\n"); - printf(" -hff FILE, --hf-file FILE\n"); - printf(" Hugging Face model file (default: unused)\n"); - printf(" -a ALIAS, --alias ALIAS\n"); - printf(" set an alias for the model, will be added as `model` field in completion response\n"); - printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); - printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); - printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); - 
printf(" --port PORT port to listen (default (default: %d)\n", sparams.port); - printf(" --rpc SERVERS comma separated list of RPC servers\n"); - printf(" --path PUBLIC_PATH path from which to serve static files (default: disabled)\n"); - printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n"); - printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n"); -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - printf(" --ssl-key-file FNAME path to file a PEM-encoded SSL private key\n"); - printf(" --ssl-cert-file FNAME path to file a PEM-encoded SSL certificate\n"); -#endif - printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); - printf(" --embeddings enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled"); - printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel); - printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: enabled)\n"); - printf(" -fa, --flash-attn enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled"); - printf(" -spf FNAME, --system-prompt-file FNAME\n"); - printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n"); - printf(" -ctk TYPE, --cache-type-k TYPE\n"); - printf(" KV cache data type for K (default: f16)\n"); - printf(" -ctv TYPE, --cache-type-v TYPE\n"); - printf(" KV cache data type for V (default: f16)\n"); - printf(" --log-format log output format: json or text (default: json)\n"); - printf(" --log-disable disables logging to a file.\n"); - printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n"); - printf(" --metrics enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? 
"enabled" : "disabled"); - printf(" --slot-save-path PATH path to save slot kv cache (default: disabled)\n"); - printf("\n"); - printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict); - printf(" --override-kv KEY=TYPE:VALUE\n"); - printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); - printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); - printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`\n"); - printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`\n"); - printf(" --chat-template JINJA_TEMPLATE\n"); - printf(" set custom jinja chat template (default: template taken from model's metadata)\n"); - printf(" only commonly used templates are accepted:\n"); - printf(" https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template\n"); - printf("\n"); -} - -static void server_params_parse(int argc, char ** argv, server_params & sparams, gpt_params & params) { - gpt_params default_params; - server_params default_sparams; - - std::string arg; - bool invalid_param = false; - - for (int i = 1; i < argc; i++) { - arg = argv[i]; - if (arg == "--port") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.port = std::stoi(argv[i]); - } else if (arg == "--rpc") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.rpc_servers = argv[i]; - } else if (arg == "--host") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.hostname = argv[i]; - } else if (arg == "--path") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.public_path = argv[i]; - } else if (arg == "--api-key") { - if (++i >= argc) { - 
invalid_param = true; - break; - } - sparams.api_keys.push_back(argv[i]); - } else if (arg == "--api-key-file") { - if (++i >= argc) { - invalid_param = true; - break; - } - std::ifstream key_file(argv[i]); - if (!key_file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - break; - } - std::string key; - while (std::getline(key_file, key)) { - if (key.size() > 0) { - sparams.api_keys.push_back(key); - } - } - key_file.close(); - - } -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - else if (arg == "--ssl-key-file") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.ssl_key_file = argv[i]; - } else if (arg == "--ssl-cert-file") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.ssl_cert_file = argv[i]; - } -#endif - else if (arg == "--timeout" || arg == "-to") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.read_timeout = std::stoi(argv[i]); - sparams.write_timeout = std::stoi(argv[i]); - } else if (arg == "-m" || arg == "--model") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.model = argv[i]; - } else if (arg == "-mu" || arg == "--model-url") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.model_url = argv[i]; - } else if (arg == "-hfr" || arg == "--hf-repo") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.hf_repo = argv[i]; - } else if (arg == "-hff" || arg == "--hf-file") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.hf_file = argv[i]; - } else if (arg == "-a" || arg == "--alias") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.model_alias = argv[i]; - } else if (arg == "-h" || arg == "--help") { - server_print_usage(argv[0], default_params, default_sparams); - exit(0); - } else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_ctx = std::stoi(argv[i]); - } else if (arg == 
"--rope-scaling") { - if (++i >= argc) { - invalid_param = true; - break; - } - std::string value(argv[i]); - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } - else { invalid_param = true; break; } - } else if (arg == "--rope-freq-base") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.rope_freq_base = std::stof(argv[i]); - } else if (arg == "--rope-freq-scale") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.rope_freq_scale = std::stof(argv[i]); - } else if (arg == "--yarn-ext-factor") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.yarn_ext_factor = std::stof(argv[i]); - } - else if (arg == "--yarn-attn-factor") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.yarn_attn_factor = std::stof(argv[i]); - } else if (arg == "--yarn-beta-fast") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.yarn_beta_fast = std::stof(argv[i]); - } else if (arg == "--yarn-beta-slow") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.yarn_beta_slow = std::stof(argv[i]); - } else if (arg == "--pooling") { - if (++i >= argc) { - invalid_param = true; - break; - } - std::string value(argv[i]); - /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } - else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } - else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } - else { invalid_param = true; break; } - } else if (arg == "--defrag-thold" || arg == "-dt") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.defrag_thold = std::stof(argv[i]); - } else if (arg == "--threads" || arg == "-t") { - if (++i >= argc) - { - invalid_param = true; - break; - } - params.n_threads = 
std::stoi(argv[i]); - } else if (arg == "--grp-attn-n" || arg == "-gan") { - if (++i >= argc) { - invalid_param = true; - break; - } - - params.grp_attn_n = std::stoi(argv[i]); - } else if (arg == "--grp-attn-w" || arg == "-gaw") { - if (++i >= argc) { - invalid_param = true; - break; - } - - params.grp_attn_w = std::stoi(argv[i]); - } else if (arg == "--threads-batch" || arg == "-tb") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_threads_batch = std::stoi(argv[i]); - } else if (arg == "--threads-http") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.n_threads_http = std::stoi(argv[i]); - } else if (arg == "-b" || arg == "--batch-size") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_batch = std::stoi(argv[i]); - } else if (arg == "-ub" || arg == "--ubatch-size") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_ubatch = std::stoi(argv[i]); - } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { - if (++i >= argc) { - invalid_param = true; - break; - } - if (llama_supports_gpu_offload()) { - params.n_gpu_layers = std::stoi(argv[i]); - } else { - LOG_WARNING( - "Not compiled with GPU offload support, --n-gpu-layers option will be ignored. " - "See main README.md for information on enabling GPU BLAS support", - {{"n_gpu_layers", params.n_gpu_layers}}); - } - } else if (arg == "-nkvo" || arg == "--no-kv-offload") { - params.no_kv_offload = true; - } else if (arg == "--split-mode" || arg == "-sm") { - if (++i >= argc) { - invalid_param = true; - break; - } - std::string arg_next = argv[i]; - if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_MODE_NONE; - } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_MODE_LAYER; - } else if (arg_next == "row") { - params.split_mode = LLAMA_SPLIT_MODE_ROW; - } else { - invalid_param = true; - break; - } -#ifndef GGML_USE_CUDA - fprintf(stderr, "warning: llama.cpp was compiled without CUDA. 
Setting the split mode has no effect.\n"); -#endif // GGML_USE_CUDA - } else if (arg == "--tensor-split" || arg == "-ts") { - if (++i >= argc) { - invalid_param = true; - break; - } -#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL) - std::string arg_next = argv[i]; - - // split string by , and / - const std::regex regex{R"([,/]+)"}; - std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; - std::vector split_arg{it, {}}; - GGML_ASSERT(split_arg.size() <= llama_max_devices()); - - for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device) { - if (i_device < split_arg.size()) { - params.tensor_split[i_device] = std::stof(split_arg[i_device]); - } else { - params.tensor_split[i_device] = 0.0f; - } - } -#else - LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n", {}); -#endif // GGML_USE_CUDA - } else if (arg == "--main-gpu" || arg == "-mg") { - if (++i >= argc) { - invalid_param = true; - break; - } -#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL) - params.main_gpu = std::stoi(argv[i]); -#else - LOG_WARNING("llama.cpp was compiled without CUDA. 
It is not possible to set a main GPU.", {}); -#endif - } else if (arg == "--lora") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_adapter.emplace_back(argv[i], 1.0f); - params.use_mmap = false; - } else if (arg == "--lora-scaled") { - if (++i >= argc) { - invalid_param = true; - break; - } - const char * lora_adapter = argv[i]; - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); - params.use_mmap = false; - } else if (arg == "--lora-base") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_base = argv[i]; - } else if (arg == "-v" || arg == "--verbose") { -#if SERVER_VERBOSE != 1 - LOG_WARNING("server.cpp is not built with verbose logging.", {}); -#else - server_verbose = true; -#endif - } else if (arg == "--mlock") { - params.use_mlock = true; - } else if (arg == "--no-mmap") { - params.use_mmap = false; - } else if (arg == "--numa") { - if (++i >= argc) { - invalid_param = true; - break; - } else { - std::string value(argv[i]); - /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } - else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } - else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } - else { invalid_param = true; break; } - } - } else if (arg == "--embedding" || arg == "--embeddings") { - params.embedding = true; - } else if (arg == "-cb" || arg == "--cont-batching") { - params.cont_batching = true; - } else if (arg == "-fa" || arg == "--flash-attn") { - params.flash_attn = true; - } else if (arg == "-np" || arg == "--parallel") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_parallel = std::stoi(argv[i]); - } else if (arg == "-n" || arg == "--n-predict") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_predict = std::stoi(argv[i]); - } else if (arg == "-spf" || arg == "--system-prompt-file") { - if 
(++i >= argc) { - invalid_param = true; - break; - } - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - break; - } - std::string system_prompt; - std::copy( - std::istreambuf_iterator(file), - std::istreambuf_iterator(), - std::back_inserter(system_prompt) - ); - sparams.system_prompt = system_prompt; - } else if (arg == "-ctk" || arg == "--cache-type-k") { - params.cache_type_k = argv[++i]; - } else if (arg == "-ctv" || arg == "--cache-type-v") { - params.cache_type_v = argv[++i]; - } else if (arg == "--log-format") { - if (++i >= argc) { - invalid_param = true; - break; - } - if (std::strcmp(argv[i], "json") == 0) { - server_log_json = true; - } else if (std::strcmp(argv[i], "text") == 0) { - server_log_json = false; - } else { - invalid_param = true; - break; - } - } else if (arg == "--log-disable") { - log_set_target(stdout); - LOG_INFO("logging to file is disabled.", {}); - } else if (arg == "--slots-endpoint-disable") { - sparams.slots_endpoint = false; - } else if (arg == "--metrics") { - sparams.metrics_endpoint = true; - } else if (arg == "--slot-save-path") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.slot_save_path = argv[i]; - // if doesn't end with DIRECTORY_SEPARATOR, add it - if (!sparams.slot_save_path.empty() && sparams.slot_save_path[sparams.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { - sparams.slot_save_path += DIRECTORY_SEPARATOR; - } - } else if (arg == "--chat-template") { - if (++i >= argc) { - invalid_param = true; - break; - } - if (!verify_custom_template(argv[i])) { - fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]); - fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n"); - invalid_param = true; - break; - } - sparams.chat_template = argv[i]; - } else if (arg == "--override-kv") { - if (++i >= argc) { - invalid_param = true; - break; - 
} - if (!string_parse_kv_override(argv[i], params.kv_overrides)) { - fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); - invalid_param = true; - break; - } - } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - server_print_usage(argv[0], default_params, default_sparams); - exit(1); - } - } - - gpt_params_handle_model_default(params); - - if (!params.kv_overrides.empty()) { - params.kv_overrides.emplace_back(); - params.kv_overrides.back().key[0] = 0; - } - - if (invalid_param) { - fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - server_print_usage(argv[0], default_params, default_sparams); - exit(1); - } -} - static void log_server_request(const httplib::Request & req, const httplib::Response & res) { // skip GH copilot requests when using default port if (req.path == "/v1/health" || req.path == "/v1/completions") { @@ -2929,16 +2351,22 @@ int main(int argc, char ** argv) { log_disable(); #endif // own arguments required by this example - gpt_params params; - server_params sparams; + gpt_params params; + + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); + return 1; + } + + // TODO: not great to use extern vars + server_log_json = params.log_json; + server_verbose = params.verbose; // struct that contains llama context and inference server_context ctx_server; - server_params_parse(argc, argv, sparams, params); - - if (!sparams.system_prompt.empty()) { - ctx_server.system_prompt_set(sparams.system_prompt); + if (!params.system_prompt.empty()) { + ctx_server.system_prompt_set(params.system_prompt); } if (params.model_alias == "unknown") { @@ -2962,10 +2390,10 @@ int main(int argc, char ** argv) { std::unique_ptr svr; #ifdef CPPHTTPLIB_OPENSSL_SUPPORT - if (sparams.ssl_key_file != "" && sparams.ssl_cert_file != "") { - LOG_INFO("Running with SSL", {{"key", sparams.ssl_key_file}, {"cert", sparams.ssl_cert_file}}); + if (params.ssl_file_key != "" && 
params.ssl_file_cert != "") { + LOG_INFO("Running with SSL", {{"key", params.ssl_file_key}, {"cert", params.ssl_file_cert}}); svr.reset( - new httplib::SSLServer(sparams.ssl_cert_file.c_str(), sparams.ssl_key_file.c_str()) + new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str()) ); } else { LOG_INFO("Running without SSL", {}); @@ -3019,24 +2447,24 @@ int main(int argc, char ** argv) { }); // set timeouts and change hostname and port - svr->set_read_timeout (sparams.read_timeout); - svr->set_write_timeout(sparams.write_timeout); + svr->set_read_timeout (params.timeout_read); + svr->set_write_timeout(params.timeout_write); - if (!svr->bind_to_port(sparams.hostname, sparams.port)) { - fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", sparams.hostname.c_str(), sparams.port); + if (!svr->bind_to_port(params.hostname, params.port)) { + fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", params.hostname.c_str(), params.port); return 1; } std::unordered_map log_data; - log_data["hostname"] = sparams.hostname; - log_data["port"] = std::to_string(sparams.port); + log_data["hostname"] = params.hostname; + log_data["port"] = std::to_string(params.port); - if (sparams.api_keys.size() == 1) { - auto key = sparams.api_keys[0]; + if (params.api_keys.size() == 1) { + auto key = params.api_keys[0]; log_data["api_key"] = "api_key: ****" + key.substr(std::max((int)(key.length() - 4), 0)); - } else if (sparams.api_keys.size() > 1) { - log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded"; + } else if (params.api_keys.size() > 1) { + log_data["api_key"] = "api_key: " + std::to_string(params.api_keys.size()) + " keys loaded"; } // load the model @@ -3053,10 +2481,10 @@ int main(int argc, char ** argv) { const auto model_meta = ctx_server.model_meta(); // if a custom chat template is not supplied, we will use the one that comes with the model (if any) - if 
(sparams.chat_template.empty()) { + if (params.chat_template.empty()) { if (!ctx_server.validate_model_chat_template()) { LOG_ERROR("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {}); - sparams.chat_template = "chatml"; + params.chat_template = "chatml"; } } @@ -3068,11 +2496,11 @@ int main(int argc, char ** argv) { chat.push_back({{"role", "assistant"}, {"content", "Hi there"}}); chat.push_back({{"role", "user"}, {"content", "How are you?"}}); - const std::string chat_example = format_chat(ctx_server.model, sparams.chat_template, chat); + const std::string chat_example = format_chat(ctx_server.model, params.chat_template, chat); LOG_INFO("chat template", { {"chat_example", chat_example}, - {"built_in", sparams.chat_template.empty()}, + {"built_in", params.chat_template.empty()}, }); } @@ -3080,7 +2508,7 @@ int main(int argc, char ** argv) { // Middlewares // - auto middleware_validate_api_key = [&sparams, &res_error](const httplib::Request & req, httplib::Response & res) { + auto middleware_validate_api_key = [&params, &res_error](const httplib::Request & req, httplib::Response & res) { // TODO: should we apply API key to all endpoints, including "/health" and "/models"? 
static const std::set protected_endpoints = { "/props", @@ -3098,7 +2526,7 @@ int main(int argc, char ** argv) { }; // If API key is not set, skip validation - if (sparams.api_keys.empty()) { + if (params.api_keys.empty()) { return true; } @@ -3113,7 +2541,7 @@ int main(int argc, char ** argv) { std::string prefix = "Bearer "; if (auth_header.substr(0, prefix.size()) == prefix) { std::string received_api_key = auth_header.substr(prefix.size()); - if (std::find(sparams.api_keys.begin(), sparams.api_keys.end(), received_api_key) != sparams.api_keys.end()) { + if (std::find(params.api_keys.begin(), params.api_keys.end(), received_api_key) != params.api_keys.end()) { return true; // API key is valid } } @@ -3168,7 +2596,7 @@ int main(int argc, char ** argv) { }; res.status = 200; // HTTP OK - if (sparams.slots_endpoint && req.has_param("include_slots")) { + if (params.endpoint_slots && req.has_param("include_slots")) { health["slots"] = result.data.at("slots"); } @@ -3194,7 +2622,7 @@ int main(int argc, char ** argv) { }; const auto handle_slots = [&](const httplib::Request &, httplib::Response & res) { - if (!sparams.slots_endpoint) { + if (!params.endpoint_slots) { res_error(res, format_error_response("This server does not support slots endpoint.", ERROR_TYPE_NOT_SUPPORTED)); return; } @@ -3218,7 +2646,7 @@ int main(int argc, char ** argv) { }; const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) { - if (!sparams.metrics_endpoint) { + if (!params.endpoint_metrics) { res_error(res, format_error_response("This server does not support metrics endpoint.", ERROR_TYPE_NOT_SUPPORTED)); return; } @@ -3318,14 +2746,14 @@ int main(int argc, char ** argv) { res.status = 200; // HTTP OK }; - const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) { + const auto handle_slots_save = [&ctx_server, &res_error, &params](const httplib::Request & req, httplib::Response & res, int 
id_slot) { json request_data = json::parse(req.body); std::string filename = request_data.at("filename"); if (!fs_validate_filename(filename)) { res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); return; } - std::string filepath = sparams.slot_save_path + filename; + std::string filepath = params.slot_save_path + filename; server_task task; task.type = SERVER_TASK_TYPE_SLOT_SAVE; @@ -3348,14 +2776,14 @@ int main(int argc, char ** argv) { } }; - const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) { + const auto handle_slots_restore = [&ctx_server, &res_error, &params](const httplib::Request & req, httplib::Response & res, int id_slot) { json request_data = json::parse(req.body); std::string filename = request_data.at("filename"); if (!fs_validate_filename(filename)) { res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); return; } - std::string filepath = sparams.slot_save_path + filename; + std::string filepath = params.slot_save_path + filename; server_task task; task.type = SERVER_TASK_TYPE_SLOT_RESTORE; @@ -3530,9 +2958,9 @@ int main(int argc, char ** argv) { res.set_content(models.dump(), "application/json; charset=utf-8"); }; - const auto handle_chat_completions = [&ctx_server, &sparams, &res_error](const httplib::Request & req, httplib::Response & res) { + const auto handle_chat_completions = [&ctx_server, &params, &res_error](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), sparams.chat_template); + json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template); const int id_task = ctx_server.queue_tasks.get_new_id(); @@ -3757,29 +3185,29 @@ int main(int argc, char ** argv) { // // register 
static assets routes - if (!sparams.public_path.empty()) { + if (!params.public_path.empty()) { // Set the base directory for serving static files - svr->set_base_dir(sparams.public_path); + svr->set_base_dir(params.public_path); } + // using embedded static files - svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8")); - svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8")); - svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8")); - svr->Get("/json-schema-to-grammar.mjs", handle_static_file( - json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8")); + svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8")); + svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8")); + svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8")); + svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8")); // add new-ui files - svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8")); - svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8")); + svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8")); + svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8")); svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, 
theme_mangotango_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8")); - svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8")); - svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8")); - svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8")); + svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8")); + svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8")); + svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8")); + svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8")); // register API routes svr->Get ("/health", handle_health); @@ -3798,7 +3226,7 @@ int main(int argc, char ** argv) { svr->Post("/v1/embeddings", 
handle_embeddings); svr->Post("/tokenize", handle_tokenize); svr->Post("/detokenize", handle_detokenize); - if (!sparams.slot_save_path.empty()) { + if (!params.slot_save_path.empty()) { // only enable slot endpoints if slot_save_path is set svr->Post("/slots/:id_slot", handle_slots_action); } @@ -3806,12 +3234,12 @@ int main(int argc, char ** argv) { // // Start the server // - if (sparams.n_threads_http < 1) { + if (params.n_threads_http < 1) { // +2 threads for monitoring endpoints - sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1); + params.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1); } - log_data["n_threads_http"] = std::to_string(sparams.n_threads_http); - svr->new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); }; + log_data["n_threads_http"] = std::to_string(params.n_threads_http); + svr->new_task_queue = [&params] { return new httplib::ThreadPool(params.n_threads_http); }; LOG_INFO("HTTP server listening", log_data); diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index d8a2286e4b1df..b7bfb41d35edc 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -116,13 +116,6 @@ static inline void server_log(const char * level, const char * function, int lin // chat template utils // -// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid -inline bool verify_custom_template(const std::string & tmpl) { - llama_chat_message chat[] = {{"user", "test"}}; - int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0); - return res >= 0; -} - // Format given chat. 
If tmpl is empty, we take the template from model metadata inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector & messages) { size_t alloc_size = 0; diff --git a/examples/simple/README.md b/examples/simple/README.md index 5d24b1046935c..49e24501cc02b 100644 --- a/examples/simple/README.md +++ b/examples/simple/README.md @@ -3,7 +3,7 @@ The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt. ```bash -./simple ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" +./simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" ... diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index b0f8e0fdc4987..69a92cf7dc0c0 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -6,28 +6,27 @@ #include #include -int main(int argc, char ** argv) { - gpt_params params; +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, argv, params); - if (argc == 1 || argv[1][0] == '-') { - printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]); - return 1 ; - } + LOG_TEE("\nexample usage:\n"); + LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]); + LOG_TEE("\n"); +} - if (argc >= 2) { - params.model = argv[1]; - } +int main(int argc, char ** argv) { + gpt_params params; - if (argc >= 3) { - params.prompt = argv[2]; - } + params.prompt = "Hello my name is"; + params.n_predict = 32; - if (params.prompt.empty()) { - params.prompt = "Hello my name is"; + if (!gpt_params_parse(argc, argv, params)) { + print_usage(argc, argv, params); + return 1; } // total length of the sequence including the prompt - const int n_len = 32; + const int n_predict = params.n_predict; // init LLM @@ -36,9 +35,7 @@ int main(int argc, char ** argv) { // initialize the model - llama_model_params model_params = llama_model_default_params(); - - // model_params.n_gpu_layers = 99; 
// offload all layers to the GPU + llama_model_params model_params = llama_model_params_from_gpt_params(params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); @@ -49,12 +46,7 @@ int main(int argc, char ** argv) { // initialize the context - llama_context_params ctx_params = llama_context_default_params(); - - ctx_params.seed = 1234; - ctx_params.n_ctx = 2048; - ctx_params.n_threads = params.n_threads; - ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + llama_context_params ctx_params = llama_context_params_from_gpt_params(params); llama_context * ctx = llama_new_context_with_model(model, ctx_params); @@ -69,14 +61,14 @@ int main(int argc, char ** argv) { tokens_list = ::llama_tokenize(ctx, params.prompt, true); const int n_ctx = llama_n_ctx(ctx); - const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size()); + const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size()); - LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_kv_req); + LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req); // make sure the KV cache is big enough to hold all the prompt and generated tokens if (n_kv_req > n_ctx) { LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__); - LOG_TEE("%s: either reduce n_len or increase n_ctx\n", __func__); + LOG_TEE("%s: either reduce n_predict or increase n_ctx\n", __func__); return 1; } @@ -115,7 +107,7 @@ int main(int argc, char ** argv) { const auto t_main_start = ggml_time_us(); - while (n_cur <= n_len) { + while (n_cur <= n_predict) { // sample the next token { auto n_vocab = llama_n_vocab(model); @@ -134,7 +126,7 @@ int main(int argc, char ** argv) { const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); // is it an end of generation? 
- if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { + if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { LOG_TEE("\n"); break; diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 12e46fbc91a24..0939a1a6a7a38 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -27,7 +27,8 @@ struct seq_draft { int main(int argc, char ** argv) { gpt_params params; - if (gpt_params_parse(argc, argv, params) == false) { + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } diff --git a/llama.cpp b/llama.cpp index c05e2bdb7ae44..06889126ecdc4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -108,7 +108,7 @@ // LLAMA_ATTRIBUTE_FORMAT(2, 3) -static void llama_log_internal (ggml_log_level level, const char* format, ...); +static void llama_log_internal (ggml_log_level level, const char * format, ...); static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data); #define LLAMA_LOG_INFO(...) 
llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__) diff --git a/scripts/run-with-preset.py b/scripts/run-with-preset.py index e986a36045549..0d7219113ec3f 100755 --- a/scripts/run-with-preset.py +++ b/scripts/run-with-preset.py @@ -13,12 +13,12 @@ CLI_ARGS_MAIN_PERPLEXITY = [ "batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape", "export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag", - "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "instruct", + "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base", "low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock", "model", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q", "np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt", - "prompt-cache", "prompt-cache-all", "prompt-cache-ro", "random-prompt", "repeat-last-n", + "prompt-cache", "prompt-cache-all", "prompt-cache-ro", "repeat-last-n", "repeat-penalty", "reverse-prompt", "rope-freq-base", "rope-freq-scale", "rope-scale", "seed", "simple-io", "tensor-split", "threads", "temp", "tfs", "top-k", "top-p", "typical", "verbose-prompt"