Skip to content

Commit

Permalink
common : refactor cli arg parsing (ggerganov#7675)
Browse files Browse the repository at this point in the history
* common : gpt_params_parse do not print usage

* common : rework usage print (wip)

* common : valign

* common : rework print_usage

* infill : remove cfg support

* common : reorder args

* server : deduplicate parameters

ggml-ci

* common : add missing header

ggml-ci

* common : remote --random-prompt usages

ggml-ci

* examples : migrate to gpt_params

ggml-ci

* batched-bench : migrate to gpt_params

* retrieval : migrate to gpt_params

* common : change defaults for escape and n_ctx

* common : remove chatml and instruct params

ggml-ci

* common : passkey use gpt_params
  • Loading branch information
ggerganov authored Jun 4, 2024
1 parent 554c247 commit 1442677
Show file tree
Hide file tree
Showing 34 changed files with 900 additions and 1,456 deletions.
823 changes: 563 additions & 260 deletions common/common.cpp

Large diffs are not rendered by default.

105 changes: 82 additions & 23 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ struct gpt_params {
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t n_ctx = 0; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
Expand Down Expand Up @@ -99,23 +99,23 @@ struct gpt_params {
// // sampling parameters
struct llama_sampling_params sparams;

std::string model = ""; // model path
std::string model_draft = ""; // draft model for speculative decoding
std::string model = ""; // model path
std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
std::string model_url = ""; // model url to download
std::string hf_repo = ""; // HF repo
std::string hf_file = ""; // HF file
std::string model_url = ""; // model url to download
std::string hf_repo = ""; // HF repo
std::string hf_file = ""; // HF file
std::string prompt = "";
std::string prompt_file = ""; // store the external prompt file name
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
std::string input_prefix = ""; // string to prefix user inputs with
std::string input_suffix = ""; // string to suffix user inputs with
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
std::string logdir = ""; // directory in which to save YAML log files
std::string prompt_file = ""; // store the external prompt file name
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
std::string input_prefix = ""; // string to prefix user inputs with
std::string input_suffix = ""; // string to suffix user inputs with
std::string logdir = ""; // directory in which to save YAML log files
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
std::string logits_file = ""; // file for saving *all* logits
std::string logits_file = ""; // file for saving *all* logits

std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides;

// TODO: avoid tuple, use struct
Expand All @@ -127,8 +127,8 @@ struct gpt_params {
int32_t control_vector_layer_start = -1; // layer range for control vector
int32_t control_vector_layer_end = -1; // layer range for control vector

int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
// (which is more convenient to use for plotting)
//
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
Expand All @@ -142,30 +142,28 @@ struct gpt_params {

bool kl_divergence = false; // compute KL divergence

bool random_prompt = false; // do not randomize prompt if none provided
bool usage = false; // print usage
bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode
bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
bool special = false; // enable special token output
bool interactive = false; // interactive mode
bool interactive_first = false; // wait for user input immediately
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
bool chatml = false; // chatml mode (used for models trained on chatml syntax)
bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it

bool embedding = false; // get only sentence embedding
bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
bool interactive_first = false; // wait for user input immediately
bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool flash_attn = false; // flash attention

bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool ignore_eos = false; // ignore generated EOS tokens
bool instruct = false; // instruction mode (used for Alpaca models)
bool logits_all = false; // return logits for all tokens in the batch
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool verbose = false;
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
bool infill = false; // use infill mode
Expand All @@ -180,6 +178,47 @@ struct gpt_params {
// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector
std::vector<std::string> image; // path to image file(s)

// server params
int32_t port = 8080;
int32_t timeout_read = 600;
int32_t timeout_write = timeout_read;
int32_t n_threads_http = -1;

std::string hostname = "127.0.0.1";
std::string public_path = "";
std::string chat_template = "";
std::string system_prompt = "";

std::vector<std::string> api_keys;

std::string ssl_file_key = "";
std::string ssl_file_cert = "";

bool endpoint_slots = true;
bool endpoint_metrics = false;

bool log_json = false;

std::string slot_save_path;

// batched-bench params
bool is_pp_shared = false;

std::vector<int32_t> n_pp;
std::vector<int32_t> n_tg;
std::vector<int32_t> n_pl;

// retrieval params
std::vector<std::string> context_files; // context files to embed

int32_t chunk_size = 64; // chunk size for context embedding

std::string chunk_separator = "\n"; // chunk separator for context embedding

// passkey params
int32_t n_junk = 250; // number of times to repeat the junk text
int32_t i_pos = -1; // position of the passkey in the junk text
};

void gpt_params_handle_model_default(gpt_params & params);
Expand All @@ -199,7 +238,20 @@ std::vector<std::string> string_split(std::string input, char separator);

std::string string_strip(const std::string & str);
std::string string_get_sortable_timestamp();
std::string string_random_prompt(std::mt19937 & rng);

template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
std::vector<T> values;
std::istringstream str_stream(str);
std::string token;
while (std::getline(str_stream, token, delim)) {
T value;
std::istringstream token_stream(token);
token_stream >> value;
values.push_back(value);
}
return values;
}

bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);
Expand Down Expand Up @@ -282,6 +334,13 @@ std::string llama_detokenize_bpe(
// defaults to true when model type is SPM, otherwise false.
bool llama_should_add_bos_token(const llama_model * model);

//
// Chat template utils
//

// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
bool llama_chat_verify_template(const std::string & tmpl);

//
// KV cache utils
//
Expand Down
8 changes: 4 additions & 4 deletions examples/batched-bench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,16 @@ There are 2 modes of operation:
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)

```bash
./batched-bench MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
./batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]

# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 2048 512 0 99
./batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99

# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 2048 512 1 99
./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps

# custom set of batches
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 512 512 0 999 0 128,256,512 128,256 1,2,4,8,16,32
./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
```

## Sample results
Expand Down
92 changes: 20 additions & 72 deletions examples/batched-bench/batched-bench.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,67 +28,27 @@ static std::vector<int> parse_list(char * p) {
return ret;
}

int main(int argc, char ** argv) {
gpt_params params;

if (argc == 1 || argv[1][0] == '-') {
printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [FATTN] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
printf(" example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
return 1 ;
}

int n_kv_max = 2048;
int n_batch = 2048;
int n_ubatch = 512;
bool flash_attn = false;
int is_pp_shared = 0;
int n_gpu_layers = 0;

std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
std::vector<int> n_tg = { 128, 256, };
std::vector<int> n_pl = { 1, 2, 4, 8, 16, 32, };
//std::vector<int> n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, };

if (argc >= 2) {
params.model = argv[1];
}

if (argc >= 3) {
n_kv_max = std::atoi(argv[2]);
}

if (argc >= 4) {
n_batch = std::atoi(argv[3]);
}

if (argc >= 5) {
n_ubatch = std::atoi(argv[4]);
}

if (argc >= 6) {
flash_attn = std::atoi(argv[5]);
}
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);

if (argc >= 7) {
is_pp_shared = std::atoi(argv[6]);
}
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
LOG_TEE("\n");
}

if (argc >= 8) {
n_gpu_layers = std::atoi(argv[7]);
}
int main(int argc, char ** argv) {
gpt_params params;

if (argc >= 9) {
n_pp = parse_list(argv[8]);
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
return 1;
}

if (argc >= 10) {
n_tg = parse_list(argv[9]);
}
int is_pp_shared = params.is_pp_shared;

if (argc >= 11) {
n_pl = parse_list(argv[10]);
}
std::vector<int> n_pp = params.n_pp;
std::vector<int> n_tg = params.n_tg;
std::vector<int> n_pl = params.n_pl;

// init LLM

Expand All @@ -97,12 +57,7 @@ int main(int argc, char ** argv) {

// initialize the model

llama_model_params model_params = llama_model_default_params();

const std::vector<float> t_split(llama_max_devices(), 0.0f);

model_params.n_gpu_layers = n_gpu_layers;
model_params.tensor_split = t_split.data();
llama_model_params model_params = llama_model_params_from_gpt_params(params);

llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

Expand All @@ -111,16 +66,7 @@ int main(int argc, char ** argv) {
return 1;
}

llama_context_params ctx_params = llama_context_default_params();

ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_max;
ctx_params.n_batch = n_batch;
ctx_params.n_ubatch = n_ubatch;
ctx_params.flash_attn = flash_attn;

ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
llama_context_params ctx_params = llama_context_params_from_gpt_params(params);

// ensure enough sequences are available
ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
Expand All @@ -132,6 +78,8 @@ int main(int argc, char ** argv) {
return 1;
}

const int32_t n_kv_max = llama_n_ctx(ctx);

llama_batch batch = llama_batch_init(n_kv_max, 0, 1);

// decode in batches of ctx_params.n_batch tokens
Expand Down Expand Up @@ -175,7 +123,7 @@ int main(int argc, char ** argv) {
}

LOG_TEE("\n");
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, flash_attn, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("\n");

LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
Expand Down
2 changes: 1 addition & 1 deletion examples/batched/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
The example demonstrates batched generation from a given prompt

```bash
./batched ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" 4
./batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4

...

Expand Down
Loading

0 comments on commit 1442677

Please sign in to comment.