tests : refactor vocab tests
ggml-ci
ggerganov committed Apr 29, 2024
1 parent ef4cca9 commit 43708d2
Showing 15 changed files with 316 additions and 1,010 deletions.
43 changes: 28 additions & 15 deletions convert-hf-to-gguf-update.py
@@ -46,8 +46,8 @@ class TOKENIZER_TYPE(IntEnum):
 
 # TODO: add models here, base models preferred
 models = [
-    { "name": "llama-v2",       "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
-    { "name": "llama-v3",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
+    { "name": "llama-spm",      "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
+    { "name": "llama-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
     { "name": "deepseek-llm",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
     { "name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
     { "name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
@@ -64,7 +64,7 @@ def download_file_with_auth(url, token, save_path):
     if response.status_code == 200:
         with open(save_path, 'wb') as f:
             f.write(response.content)
-        print("File downloaded successfully.")
+        print(f"File {save_path} downloaded successfully")
     else:
         print(f"Failed to download file. Status code: {response.status_code}")

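For context, the helper being patched is small; the following is a minimal sketch of the full function, assuming the Hugging Face token is passed as a standard Bearer authorization header (the request setup sits outside this hunk):

import os
import requests

def download_file_with_auth(url, token, save_path):
    # assumption: the token is sent as a Bearer auth header
    headers = {"Authorization": f"Bearer {token}"}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        # assumption: the target directory may not exist yet
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        with open(save_path, 'wb') as f:
            f.write(response.content)
        print(f"File {save_path} downloaded successfully")
    else:
        print(f"Failed to download file. Status code: {response.status_code}")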
@@ -82,6 +82,10 @@ def download_file_with_auth(url, token, save_path):
 
     print(f"Downloading {name} to models/tokenizers/{name}")
 
+    url = f"{repo}/raw/main/config.json"
+    save_path = f"models/tokenizers/{name}/config.json"
+    download_file_with_auth(url, token, save_path)
+
     url = f"{repo}/raw/main/tokenizer.json"
     save_path = f"models/tokenizers/{name}/tokenizer.json"
     download_file_with_auth(url, token, save_path)
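The newly fetched config.json gives the conversion step model metadata alongside the tokenizer files. As an illustration only, a consumer could read it like this (assuming the standard Hugging Face config layout with an "architectures" field, and `name` bound by the surrounding loop):

import json

with open(f"models/tokenizers/{name}/config.json") as f:
    config = json.load(f)
print(name, "->", config.get("architectures"))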
@@ -219,7 +223,7 @@ def download_file_with_auth(url, token, save_path):
     "333333333",
 ]
 
-# write the tests in ./models/test-vocab-inp.txt
+# write the tests to ./models/ggml-vocab-{name}.gguf.inp
 # the format is:
 #
 # test0
@@ -229,14 +233,7 @@ def download_file_with_auth(url, token, save_path):
 # ...
 #
 
-with open(f"models/test-vocab-inp.txt", "w") as f:
-    for text in tests:
-        f.write(f"{text}")
-        f.write("\n__ggml_vocab_test__\n")
-
-print("Tests written in ./models/test-vocab-inp.txt")
-
-# with each model, encode all tests and write the results in ./models/test-vocab-out-{name}.txt
+# with each model, encode all tests and write the results in ./models/ggml-vocab-{name}.gguf.out
 # for each test, write the resulting tokens on a separate line
 
 for model in models:
@@ -247,11 +244,27 @@ def download_file_with_auth(url, token, save_path):
     from transformers import AutoTokenizer
     tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
 
-    with open(f"models/test-vocab-out-{name}.txt", "w") as f:
+    with open(f"models/ggml-vocab-{name}.gguf.inp", "w") as f:
         for text in tests:
-            res = tokenizer.encode(text)
+            f.write(f"{text}")
+            f.write("\n__ggml_vocab_test__\n")
+
+    with open(f"models/ggml-vocab-{name}.gguf.out", "w") as f:
+        for text in tests:
+            res = tokenizer.encode(text, add_special_tokens=False)
             for r in res:
                 f.write(f" {r}")
             f.write("\n")
 
-    print(f"Test results for {name} written in ./models/test-vocab-out-{name}.txt")
+    print(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
+
+# generate commands for creating vocab files
+
+print("\nRun the following commands to generate the vocab files for testing:\n")
+
+for model in models:
+    name = model["name"]
+
+    print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")
+
+print("\n")
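The .inp/.out pair is a plain-text protocol: prompts separated by the __ggml_vocab_test__ marker in the .inp file, and one line of space-separated token ids per prompt in the .out file. A minimal sketch of a reader for these files (illustrative only; the real consumer is the C++ test-tokenizer-0 below):

def read_vocab_tests(name: str):
    # prompts are separated by the __ggml_vocab_test__ marker; the writer
    # appends the marker after every prompt, so drop the trailing empty entry
    with open(f"models/ggml-vocab-{name}.gguf.inp") as f:
        prompts = f.read().split("\n__ggml_vocab_test__\n")[:-1]

    # one line of space-separated token ids per prompt
    with open(f"models/ggml-vocab-{name}.gguf.out") as f:
        expected = [[int(t) for t in line.split()] for line in f]

    assert len(prompts) == len(expected)
    return list(zip(prompts, expected))

The convert-hf-to-gguf.py commands printed at the end then produce the matching vocab-only ggml-vocab-{name}.gguf models that the C++ tests load.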
2 changes: 1 addition & 1 deletion convert-hf-to-gguf.py
@@ -283,7 +283,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         # don't do this manually - use the convert-hf-to-gguf-update.py script!
         if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
             # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
-            res = "llama-v3"
+            res = "llama-bpe"
         if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
             # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
             res = "deepseek-llm"
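The chkhsh values compared here fingerprint the tokenizer's behavior rather than its files, so a silent change in pre-tokenization is caught. A sketch of how such a fingerprint can be computed, assuming it is the SHA-256 hex digest of the stringified token-id list produced for a fixed probe string:

from hashlib import sha256

from transformers import AutoTokenizer

def tokenizer_fingerprint(model_dir: str, chktxt: str) -> str:
    # assumption: hash the repr of the token-id list for a fixed probe text,
    # so any change in pre-tokenization changes the fingerprint
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    chktok = tokenizer.encode(chktxt)
    return sha256(str(chktok).encode()).hexdigest()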
7 changes: 4 additions & 3 deletions llama.cpp
@@ -4339,8 +4339,9 @@ static void llm_load_vocab(
                 tokenizer_pre == "default") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         } else if (
-                tokenizer_pre == "llama3" ||
-                tokenizer_pre == "llama-v3") {
+                tokenizer_pre == "llama3"   ||
+                tokenizer_pre == "llama-v3" ||
+                tokenizer_pre == "llama-bpe") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
         } else if (
             tokenizer_pre == "deepseek-llm") {
@@ -12583,7 +12584,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
-                if (add_special && vocab.special_add_bos == 1) {
+                if (add_special && vocab.special_add_bos != 0) {
                     GGML_ASSERT(vocab.special_bos_id != -1);
                     output.push_back(vocab.special_bos_id);
                 }
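The == 1 to != 0 change matters because the add-BOS flag read from GGUF metadata is tri-state (an assumption based on the convention used elsewhere in llama.cpp: -1 when the key is absent, 0 or 1 when set). With != 0, a BPE vocab now defaults to adding BOS when the metadata is missing instead of silently skipping it. A small sketch of the semantics:

def should_add_bos(special_add_bos: int) -> bool:
    # tri-state flag: -1 = not specified in metadata, 0 = false, 1 = true
    # old behavior: special_add_bos == 1  (BOS only when explicitly enabled)
    # new behavior: special_add_bos != 0  (BOS unless explicitly disabled)
    return special_add_bos != 0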
Binary file modified models/ggml-vocab-deepseek-coder.gguf
Binary file modified models/ggml-vocab-deepseek-llm.gguf
Binary file removed models/ggml-vocab-llama-v3.gguf
Binary file removed models/ggml-vocab-llama.gguf
35 changes: 20 additions & 15 deletions tests/CMakeLists.txt
@@ -65,21 +65,16 @@ function(llama_target_and_test source)
     set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${LLAMA_TEST_LABEL})
 endfunction()
 
-# llama_target_and_test(test-double-float.cpp) # SLOW
-llama_target_and_test(test-quantize-fns.cpp)
-llama_target_and_test(test-quantize-perf.cpp)
-llama_target_and_test(test-sampling.cpp)
-llama_target_and_test(test-chat-template.cpp)
-
-llama_target_and_test(test-tokenizer-0-llama.cpp          NAME test-tokenizer-0-llama          ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
-llama_target_and_test(test-tokenizer-0-llama-v3.cpp       NAME test-tokenizer-0-llama-v3       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-v3.gguf)
-llama_target_and_test(test-tokenizer-0-falcon.cpp         NAME test-tokenizer-0-falcon         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-
-llama_target_and_test(test-tokenizer-0-deepseek-coder.cpp NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
-llama_target_and_test(test-tokenizer-0-deepseek-llm.cpp   NAME test-tokenizer-0-deepseek-llm   ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
+# build test-tokenizer-0 target once and add many tests
+add_executable(test-tokenizer-0 test-tokenizer-0.cpp get-model.cpp)
+target_link_libraries(test-tokenizer-0 PRIVATE common)
+install(TARGETS test-tokenizer-0 RUNTIME)
 
-llama_target_and_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-llama    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
-llama_target_and_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm      ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe      ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm   ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
 
 # build test-tokenizer-1-bpe target once and add many tests
 add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp get-model.cpp)
@@ -96,9 +91,19 @@ llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CUR
 llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-bloom ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
 
+# build test-tokenizer-1-spm target once and add many tests
+add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp get-model.cpp)
+target_link_libraries(test-tokenizer-1-spm PRIVATE common)
+install(TARGETS test-tokenizer-1-spm RUNTIME)
+
+llama_target_and_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
+llama_target_and_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan  ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
+
+# llama_target_and_test(test-double-float.cpp) # SLOW
+llama_target_and_test(test-quantize-fns.cpp)
+llama_target_and_test(test-quantize-perf.cpp)
+llama_target_and_test(test-sampling.cpp)
+llama_target_and_test(test-chat-template.cpp)
 
 llama_target_and_test(test-grammar-parser.cpp)
 llama_target_and_test(test-llama-grammar.cpp)
188 changes: 0 additions & 188 deletions tests/test-tokenizer-0-deepseek-coder.cpp

This file was deleted.
