refine example-specific args
ngxson committed Sep 6, 2024
1 parent 53244f9 commit e1281d0
Showing 5 changed files with 29 additions and 19 deletions.
28 changes: 18 additions & 10 deletions common/common.cpp
@@ -720,21 +720,21 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
         [&params]() {
             params.verbose_prompt = true;
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(llama_arg(
         {"--no-display-prompt"},
         format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
         [&params]() {
             params.display_prompt = false;
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(llama_arg(
         {"-co", "--color"},
         format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
         [&params]() {
             params.use_color = true;
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
     add_opt(llama_arg(
         {"-s", "--seed"}, "SEED",
         format("RNG seed (default: %d, use random seed for < 0)", params.seed),
@@ -996,7 +996,9 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
     ).set_env("LLAMA_ARG_FLASH_ATTN"));
     add_opt(llama_arg(
         {"-p", "--prompt"}, "PROMPT",
-        "prompt to start generation with\n",
+        ex == LLAMA_EXAMPLE_MAIN
+            ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt"
+            : "prompt to start generation with",
         [&params](std::string value) {
             params.prompt = value;
         }
@@ -1102,7 +1104,13 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(llama_arg(
         {"-cnv", "--conversation"},
-        "run in conversation mode, does not print special tokens and suffix/prefix\n",
+        format(
+            "run in conversation mode:\n"
+            "- does not print special tokens and suffix/prefix\n"
+            "- interactive mode is also enabled\n"
+            "(default: %s)",
+            params.conversation ? "true" : "false"
+        ),
         [&params]() {
             params.conversation = true;
         }
@@ -1625,14 +1633,14 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
         [&params](std::string value) {
             params.mmproj = value;
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
     add_opt(llama_arg(
         {"--image"}, "FILE",
         "path to an image file. use with multimodal models. Specify multiple times for batching",
         [&params](std::string value) {
             params.image.emplace_back(value);
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
 #ifdef GGML_USE_RPC
     add_opt(llama_arg(
         {"--rpc"}, "SERVERS",
@@ -1692,7 +1700,7 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
                 fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
             }
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(llama_arg(
         {"-sm", "--split-mode"}, "{none,layer,row}",
         "how to split the model across multiple GPUs, one of:\n"
@@ -1837,7 +1845,7 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
         [&params](std::string value) {
             params.model_draft = value;
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(llama_arg(
         {"-mu", "--model-url"}, "MODEL_URL",
         "model download url (default: unused)",
@@ -2178,7 +2186,7 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
         [&params]() {
             params.simple_io = true;
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
     add_opt(llama_arg(
         {"-ld", "--logdir"}, "LOGDIR",
         "path under which to save YAML logs (no logging if unset)",
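The changes above scope each flag to the examples where it applies by chaining `.set_examples({...})` onto `llama_arg`. A minimal, self-contained sketch of that idea, using hypothetical stand-in types (`cli_arg`, `example_id`, `collect_args`) rather than the real llama.cpp implementation:

```cpp
#include <cstdio>
#include <set>
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-ins for llama_example / llama_arg; the real types in
// common/common.h differ, this only illustrates the scoping idea.
enum example_id { EX_MAIN, EX_SERVER, EX_LLAVA, EX_COUNT };

struct cli_arg {
    std::vector<std::string> flags;
    std::string              help;
    std::set<example_id>     examples;   // empty = visible to every example

    cli_arg & set_examples(std::set<example_id> ex) {
        examples = std::move(ex);
        return *this;                    // enables the chained style seen in the diff
    }

    bool in_example(example_id ex) const {
        return examples.empty() || examples.count(ex) > 0;
    }
};

// Each example only collects (and prints in --help) the args scoped to it.
static std::vector<cli_arg> collect_args(example_id current, const std::vector<cli_arg> & all) {
    std::vector<cli_arg> out;
    for (const auto & a : all) {
        if (a.in_example(current)) {
            out.push_back(a);
        }
    }
    return out;
}

int main() {
    std::vector<cli_arg> all = {
        cli_arg{{"-p", "--prompt"}, "prompt to start generation with"},
        cli_arg{{"-cnv", "--conversation"}, "run in conversation mode"}.set_examples({EX_MAIN}),
        cli_arg{{"--mmproj"}, "path to a multimodal projector file"}.set_examples({EX_LLAVA}),
    };
    for (const auto & a : collect_args(EX_SERVER, all)) {
        std::printf("%-15s %s\n", a.flags.back().c_str(), a.help.c_str());
    }
    // only "--prompt" is listed for the server example; the scoped flags are hidden
    return 0;
}
```

Filtering on the current example is what keeps a flag such as `--mmproj` out of the server's option list while it still appears for the llava example, which is also why the corresponding rows disappear from examples/server/README.md further down.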
1 change: 1 addition & 0 deletions common/common.h
@@ -77,6 +77,7 @@ enum llama_example {
     LLAMA_EXAMPLE_SERVER,
     LLAMA_EXAMPLE_CVECTOR_GENERATOR,
     LLAMA_EXAMPLE_EXPORT_LORA,
+    LLAMA_EXAMPLE_LLAVA,
 
     LLAMA_EXAMPLE_COUNT,
 };
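Placing `LLAMA_EXAMPLE_LLAVA` ahead of the `LLAMA_EXAMPLE_COUNT` sentinel keeps the sentinel equal to the number of real entries without manual bookkeeping. A small illustration of the idiom, using a trimmed-down hypothetical enum (`llama_example_demo`) rather than the actual header:

```cpp
#include <cstdio>

// New entries go before the *_COUNT member, so the sentinel always equals
// the number of real entries above it.
enum llama_example_demo {
    DEMO_EXAMPLE_MAIN,
    DEMO_EXAMPLE_SERVER,
    DEMO_EXAMPLE_LLAVA,   // newly added entry

    DEMO_EXAMPLE_COUNT,   // == 3, updates automatically
};

int main() {
    // per-example tables can be sized from the sentinel without hard-coding
    bool enabled[DEMO_EXAMPLE_COUNT] = {};
    enabled[DEMO_EXAMPLE_LLAVA] = true;
    std::printf("number of examples: %d\n", (int) DEMO_EXAMPLE_COUNT);   // prints 3
    return 0;
}
```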
2 changes: 1 addition & 1 deletion examples/llava/llava-cli.cpp
@@ -278,7 +278,7 @@ int main(int argc, char ** argv) {
 
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_LLAVA, print_usage);
     if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }
9 changes: 8 additions & 1 deletion examples/main/main.cpp
@@ -40,6 +40,13 @@ static std::vector<llama_token> * g_output_tokens;
 static bool is_interacting = false;
 static bool need_insert_eot = false;
 
+static void print_usage(int, char ** argv) {
+    printf("\nexample usage:\n");
+    printf("\n  text generation:     %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
+    printf("\n  chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
+    printf("\n");
+}
+
 static bool file_exists(const std::string & path) {
     std::ifstream f(path.c_str());
     return f.good();
@@ -131,7 +138,7 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
 int main(int argc, char ** argv) {
     gpt_params params;
     g_params = &params;
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_MAIN);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_MAIN, print_usage);
 
     if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
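The new `print_usage` callback is handed to `gpt_params_parser_init` so each example can print its own usage lines. A rough sketch of how a parser can accept and invoke such a callback; the parser body (`parse_args_demo`, `usage_callback`) is an assumption for illustration, not the actual `gpt_params_parse` logic:

```cpp
#include <cstdio>
#include <string>

typedef void (*usage_callback)(int, char **);

static void print_usage_demo(int, char ** argv) {
    printf("\nexample usage:\n");
    printf("\n  text generation: %s -m your_model.gguf -p \"hello\" -n 128\n", argv[0]);
    printf("\n");
}

static bool parse_args_demo(int argc, char ** argv, usage_callback print_usage) {
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];
        if (arg == "-h" || arg == "--help") {
            if (print_usage) {
                print_usage(argc, argv);   // example-specific usage block
            }
            return false;                  // caller exits after printing help
        }
        // ... remaining options would be handled here ...
    }
    return true;
}

int main(int argc, char ** argv) {
    if (!parse_args_demo(argc, argv, print_usage_demo)) {
        return 1;
    }
    printf("parsed %d argument(s)\n", argc - 1);
    return 0;
}
```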
8 changes: 1 addition & 7 deletions examples/server/README.md
@@ -25,7 +25,6 @@ The project is under active development, and we are [looking for feedback and co
 | `--verbosity N` | set specific verbosity level (default: 0) |
 | `--verbose-prompt` | print a verbose prompt before generation (default: false) |
 | `--no-display-prompt` | don't print prompt at generation (default: false) |
-| `-co, --color` | colorise output to distinguish prompt and user input from generations (default: false) |
 | `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) |
 | `-t, --threads N` | number of threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
 | `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
@@ -46,7 +45,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
 | `--chunks N` | max number of chunks to process (default: -1, -1 = all) |
 | `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
-| `-p, --prompt PROMPT` | prompt to start generation with<br/> |
+| `-p, --prompt PROMPT` | prompt to start generation with |
 | `-f, --file FNAME` | a file containing the prompt (default: none) |
 | `--in-file FNAME` | an input file (repeat to specify multiple files) |
 | `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
@@ -96,13 +95,10 @@ The project is under active development, and we are [looking for feedback and co
 | `-ns, --sequences N` | number of sequences to decode (default: 1) |
 | `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
 | `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
-| `--mmproj FILE` | path to a multimodal projector file for LLaVA. see examples/llava/README.md |
-| `--image FILE` | path to an image file. use with multimodal models. Specify multiple times for batching |
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing |
 | `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) |
 | `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437 |
 | `-ngl, --gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
-| `-ngld, --gpu-layers-draft N` | number of layers to store in VRAM for the draft model |
 | `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs |
 | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1 |
 | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0) |
@@ -115,7 +111,6 @@ The project is under active development, and we are [looking for feedback and co
 | `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
 | `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_MODEL) |
 | `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)<br/>(env: LLAMA_ARG_MODEL) |
-| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused) |
 | `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
 | `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
 | `-hff, --hf-file FILE` | Hugging Face model file (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
@@ -138,7 +133,6 @@ The project is under active development, and we are [looking for feedback and co
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted:<br/>https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
 | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
-| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
 | `-ld, --logdir LOGDIR` | path under which to save YAML logs (no logging if unset) |
 | `--log-test` | Log test |
 | `--log-disable` | Log disable |
