refine example-specific args
ngxson committed Sep 6, 2024
1 parent 53244f9 commit e1281d0
Showing 5 changed files with 29 additions and 19 deletions.
28 changes: 18 additions & 10 deletions common/common.cpp
@@ -720,21 +720,21 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
         [&params]() {
             params.verbose_prompt = true;
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(llama_arg(
         {"--no-display-prompt"},
         format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
         [&params]() {
             params.display_prompt = false;
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(llama_arg(
         {"-co", "--color"},
         format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
         [&params]() {
             params.use_color = true;
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
     add_opt(llama_arg(
         {"-s", "--seed"}, "SEED",
         format("RNG seed (default: %d, use random seed for < 0)", params.seed),
@@ -996,7 +996,9 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
     ).set_env("LLAMA_ARG_FLASH_ATTN"));
     add_opt(llama_arg(
         {"-p", "--prompt"}, "PROMPT",
-        "prompt to start generation with\n",
+        ex == LLAMA_EXAMPLE_MAIN
+            ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt"
+            : "prompt to start generation with",
         [&params](std::string value) {
             params.prompt = value;
         }
@@ -1102,7 +1104,13 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(llama_arg(
         {"-cnv", "--conversation"},
-        "run in conversation mode, does not print special tokens and suffix/prefix\n",
+        format(
+            "run in conversation mode:\n"
+            "- does not print special tokens and suffix/prefix\n"
+            "- interactive mode is also enabled\n"
+            "(default: %s)",
+            params.conversation ? "true" : "false"
+        ),
         [&params]() {
             params.conversation = true;
         }
@@ -1625,14 +1633,14 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
         [&params](std::string value) {
             params.mmproj = value;
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
     add_opt(llama_arg(
         {"--image"}, "FILE",
         "path to an image file. use with multimodal models. Specify multiple times for batching",
         [&params](std::string value) {
             params.image.emplace_back(value);
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
 #ifdef GGML_USE_RPC
     add_opt(llama_arg(
         {"--rpc"}, "SERVERS",
@@ -1692,7 +1700,7 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
                 fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
             }
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(llama_arg(
         {"-sm", "--split-mode"}, "{none,layer,row}",
         "how to split the model across multiple GPUs, one of:\n"
@@ -1837,7 +1845,7 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
         [&params](std::string value) {
             params.model_draft = value;
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(llama_arg(
         {"-mu", "--model-url"}, "MODEL_URL",
         "model download url (default: unused)",
@@ -2178,7 +2186,7 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
         [&params]() {
             params.simple_io = true;
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
     add_opt(llama_arg(
         {"-ld", "--logdir"}, "LOGDIR",
         "path under which to save YAML logs (no logging if unset)",
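The changes above scope each flag to the examples where it applies by chaining `.set_examples({...})` onto `llama_arg`. A minimal, self-contained sketch of that idea, using hypothetical stand-in types (`cli_arg`, `example_id`, `collect_args`) rather than the real llama.cpp implementation:

```cpp
#include <cstdio>
#include <set>
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-ins for llama_example / llama_arg; the real types in
// common/common.h differ, this only illustrates the scoping idea.
enum example_id { EX_MAIN, EX_SERVER, EX_LLAVA, EX_COUNT };

struct cli_arg {
    std::vector<std::string> flags;
    std::string              help;
    std::set<example_id>     examples;   // empty = visible to every example

    cli_arg & set_examples(std::set<example_id> ex) {
        examples = std::move(ex);
        return *this;                    // enables the chained style seen in the diff
    }

    bool in_example(example_id ex) const {
        return examples.empty() || examples.count(ex) > 0;
    }
};

// Each example only collects (and prints in --help) the args scoped to it.
static std::vector<cli_arg> collect_args(example_id current, const std::vector<cli_arg> & all) {
    std::vector<cli_arg> out;
    for (const auto & a : all) {
        if (a.in_example(current)) {
            out.push_back(a);
        }
    }
    return out;
}

int main() {
    std::vector<cli_arg> all = {
        cli_arg{{"-p", "--prompt"}, "prompt to start generation with"},
        cli_arg{{"-cnv", "--conversation"}, "run in conversation mode"}.set_examples({EX_MAIN}),
        cli_arg{{"--mmproj"}, "path to a multimodal projector file"}.set_examples({EX_LLAVA}),
    };
    for (const auto & a : collect_args(EX_SERVER, all)) {
        std::printf("%-15s %s\n", a.flags.back().c_str(), a.help.c_str());
    }
    // only "--prompt" is listed for the server example; the scoped flags are hidden
    return 0;
}
```

Filtering on the current example is what keeps a flag such as `--mmproj` out of the server's option list while it still appears for the llava example, which is also why the corresponding rows disappear from examples/server/README.md further down.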
1 change: 1 addition & 0 deletions common/common.h
@@ -77,6 +77,7 @@ enum llama_example {
     LLAMA_EXAMPLE_SERVER,
     LLAMA_EXAMPLE_CVECTOR_GENERATOR,
     LLAMA_EXAMPLE_EXPORT_LORA,
+    LLAMA_EXAMPLE_LLAVA,
 
     LLAMA_EXAMPLE_COUNT,
 };
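Placing `LLAMA_EXAMPLE_LLAVA` ahead of the `LLAMA_EXAMPLE_COUNT` sentinel keeps the sentinel equal to the number of real entries without manual bookkeeping. A small illustration of the idiom, using a trimmed-down hypothetical enum (`llama_example_demo`) rather than the actual header:

```cpp
#include <cstdio>

// New entries go before the *_COUNT member, so the sentinel always equals
// the number of real entries above it.
enum llama_example_demo {
    DEMO_EXAMPLE_MAIN,
    DEMO_EXAMPLE_SERVER,
    DEMO_EXAMPLE_LLAVA,   // newly added entry

    DEMO_EXAMPLE_COUNT,   // == 3, updates automatically
};

int main() {
    // per-example tables can be sized from the sentinel without hard-coding
    bool enabled[DEMO_EXAMPLE_COUNT] = {};
    enabled[DEMO_EXAMPLE_LLAVA] = true;
    std::printf("number of examples: %d\n", (int) DEMO_EXAMPLE_COUNT);   // prints 3
    return 0;
}
```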
2 changes: 1 addition & 1 deletion examples/llava/llava-cli.cpp
@@ -278,7 +278,7 @@ int main(int argc, char ** argv) {
 
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_LLAVA, print_usage);
     if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }
9 changes: 8 additions & 1 deletion examples/main/main.cpp
@@ -40,6 +40,13 @@ static std::vector<llama_token> * g_output_tokens;
 static bool is_interacting = false;
 static bool need_insert_eot = false;
 
+static void print_usage(int, char ** argv) {
+    printf("\nexample usage:\n");
+    printf("\n  text generation:     %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
+    printf("\n  chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
+    printf("\n");
+}
+
 static bool file_exists(const std::string & path) {
     std::ifstream f(path.c_str());
     return f.good();
@@ -131,7 +138,7 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
 int main(int argc, char ** argv) {
     gpt_params params;
     g_params = &params;
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_MAIN);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_MAIN, print_usage);
 
     if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
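The new `print_usage` callback is handed to `gpt_params_parser_init` so each example can print its own usage lines. A rough sketch of how a parser can accept and invoke such a callback; the parser body (`parse_args_demo`, `usage_callback`) is an assumption for illustration, not the actual `gpt_params_parse` logic:

```cpp
#include <cstdio>
#include <string>

typedef void (*usage_callback)(int, char **);

static void print_usage_demo(int, char ** argv) {
    printf("\nexample usage:\n");
    printf("\n  text generation: %s -m your_model.gguf -p \"hello\" -n 128\n", argv[0]);
    printf("\n");
}

static bool parse_args_demo(int argc, char ** argv, usage_callback print_usage) {
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];
        if (arg == "-h" || arg == "--help") {
            if (print_usage) {
                print_usage(argc, argv);   // example-specific usage block
            }
            return false;                  // caller exits after printing help
        }
        // ... remaining options would be handled here ...
    }
    return true;
}

int main(int argc, char ** argv) {
    if (!parse_args_demo(argc, argv, print_usage_demo)) {
        return 1;
    }
    printf("parsed %d argument(s)\n", argc - 1);
    return 0;
}
```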
8 changes: 1 addition & 7 deletions examples/server/README.md
@@ -25,7 +25,6 @@ The project is under active development, and we are [looking for feedback and co
 | `--verbosity N` | set specific verbosity level (default: 0) |
 | `--verbose-prompt` | print a verbose prompt before generation (default: false) |
 | `--no-display-prompt` | don't print prompt at generation (default: false) |
-| `-co, --color` | colorise output to distinguish prompt and user input from generations (default: false) |
 | `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) |
 | `-t, --threads N` | number of threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
 | `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
@@ -46,7 +45,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
 | `--chunks N` | max number of chunks to process (default: -1, -1 = all) |
 | `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
-| `-p, --prompt PROMPT` | prompt to start generation with<br/> |
+| `-p, --prompt PROMPT` | prompt to start generation with |
 | `-f, --file FNAME` | a file containing the prompt (default: none) |
 | `--in-file FNAME` | an input file (repeat to specify multiple files) |
 | `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
@@ -96,13 +95,10 @@ The project is under active development, and we are [looking for feedback and co
 | `-ns, --sequences N` | number of sequences to decode (default: 1) |
 | `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
 | `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
-| `--mmproj FILE` | path to a multimodal projector file for LLaVA. see examples/llava/README.md |
-| `--image FILE` | path to an image file. use with multimodal models. Specify multiple times for batching |
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing |
 | `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) |
 | `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437 |
 | `-ngl, --gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
-| `-ngld, --gpu-layers-draft N` | number of layers to store in VRAM for the draft model |
 | `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs |
 | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1 |
 | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0) |
@@ -115,7 +111,6 @@ The project is under active development, and we are [looking for feedback and co
 | `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
 | `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_MODEL) |
 | `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)<br/>(env: LLAMA_ARG_MODEL) |
-| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused) |
 | `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
 | `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
 | `-hff, --hf-file FILE` | Hugging Face model file (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
@@ -138,7 +133,6 @@ The project is under active development, and we are [looking for feedback and co
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted:<br/>https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
 | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
-| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
 | `-ld, --logdir LOGDIR` | path under which to save YAML logs (no logging if unset) |
 | `--log-test` | Log test |
 | `--log-disable` | Log disable |
