Skip to content

Commit

Permalink
tokenize : add --show-count-only (token) option
Browse files Browse the repository at this point in the history
This commit adds a new option to the tokenize example that allows
printing only the total number of tokens without printing the actual
tokens.

The motivation for this change is that it can be useful to know the
total number of tokens in a given input without having to print the
tokens themselves.
  • Loading branch information
danbev committed Aug 26, 2024
1 parent 7a3df79 commit 6e075c8
Showing 1 changed file with 26 additions and 19 deletions.
45 changes: 26 additions & 19 deletions examples/tokenize/tokenize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ static void print_usage_information(const char * argv0, FILE * stream) {
fprintf(stream, " --no-parse-special do not parse control tokens.\n");
fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n");
fprintf(stream, " --show-count print the total number of tokens.\n");
fprintf(stream, " --show-count-only only print the total number of tokens (skip the printing of the actual tokens).\n");
}

static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
Expand Down Expand Up @@ -199,6 +200,7 @@ int main(int raw_argc, char ** raw_argv) {
bool no_parse_special = false;
bool disable_logging = false;
bool show_token_count = false;
bool show_token_count_only = false;
const char * model_path = NULL;
const char * prompt_path = NULL;
const char * prompt_arg = NULL;
Expand Down Expand Up @@ -259,6 +261,9 @@ int main(int raw_argc, char ** raw_argv) {
else if (arg == "--show-count") {
show_token_count = true;
}
else if (arg == "--show-count-only") {
show_token_count_only = show_token_count = true;
}
else {
fprintf(stderr, "Error: unknown option '%s'\n", argv[iarg].c_str());
return 1;
Expand Down Expand Up @@ -369,30 +374,32 @@ int main(int raw_argc, char ** raw_argv) {
std::vector<llama_token> tokens;
tokens = ::llama_tokenize(model, prompt, add_bos, parse_special);

if (printing_ids) {
printf("[");
}

for (int i = 0; i < (int) tokens.size(); i++) {
if (!show_token_count_only) {
if (printing_ids) {
if (i > 0) {
printf(", ");
}
printf("%d", tokens[i]);
} else {
bool invalid_utf8 = false;
printf("%6d -> '", tokens[i]);
write_utf8_cstr_to_stdout(llama_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
if (invalid_utf8) {
printf("' (utf-8 decode failure)\n");
printf("[");
}

for (int i = 0; i < (int) tokens.size(); i++) {
if (printing_ids) {
if (i > 0) {
printf(", ");
}
printf("%d", tokens[i]);
} else {
printf("'\n");
bool invalid_utf8 = false;
printf("%6d -> '", tokens[i]);
write_utf8_cstr_to_stdout(llama_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
if (invalid_utf8) {
printf("' (utf-8 decode failure)\n");
} else {
printf("'\n");
}
}
}
}

if (printing_ids) {
printf("]\n");
if (printing_ids) {
printf("]\n");
}
}

if (show_token_count) {
Expand Down

0 comments on commit 6e075c8

Please sign in to comment.