diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
index 553229a59b145..b3a344ef3d59c 100644
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -68,6 +68,7 @@ def download_file_with_auth(url, token, save_path):
     else:
         print(f"Failed to download file. Status code: {response.status_code}")
 
+# download the tokenizer models
 for model in models:
     name = model["name"]
     repo = model["repo"]
@@ -173,3 +174,84 @@ def download_file_with_auth(url, token, save_path):
 
 print("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
 print("\n")
+
+# generate tests for each tokenizer model
+
+tests = [
+    "",
+    " ",
+    "  ",
+    "   ",
+    "\t",
+    "\n",
+    "\n\n",
+    "\n\n\n",
+    "\t\n",
+    "Hello world",
+    " Hello world",
+    "Hello World",
+    " Hello World",
+    " Hello World!",
+    "Hello, world!",
+    " Hello, world!",
+    " this is πŸ¦™.cpp",
+    "w048 7tuijk dsdfhu",
+    "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ",
+    "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰",
+    "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)",
+    "Hello",
+    " Hello",
+    "  Hello",
+    "   Hello",
+    "    Hello",
+    "    Hello\n    Hello",
+    " (",
+    "\n =",
+    "' era",
+    "Hello, y'all! How are you 😁 ?ζˆ‘ζƒ³εœ¨appleε·₯作1314151倩~",
+    "3",
+    "33",
+    "333",
+    "3333",
+    "33333",
+    "333333",
+    "3333333",
+    "33333333",
+    "333333333",
+]
+
+# write the tests in ./models/test-vocab-inp.txt
+# the format is:
+#
+#   test0
+#   __ggml_vocab_test__
+#   test1
+#   __ggml_vocab_test__
+#   ...
+#
+
+with open(f"models/test-vocab-inp.txt", "w") as f:
+    for text in tests:
+        f.write(f"{text}")
+        f.write("\n__ggml_vocab_test__\n")
+
+print("Tests written in ./models/test-vocab-inp.txt")
+
+# with each model, encode all tests and write the results in ./models/test-vocab-out-{name}.txt
+# for each test, write the resulting tokens on a separate line
+
+for model in models:
+    name = model["name"]
+    tokt = model["tokt"]
+
+    # create the tokenizer
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+
+    with open(f"models/test-vocab-out-{name}.txt", "w") as f:
+        for text in tests:
+            res = tokenizer.encode(text)
+            for r in res:
+                f.write(f" {r}")
+            f.write("\n")
+
+    print(f"Test results for {name} written in ./models/test-vocab-out-{name}.txt")
diff --git a/models/ggml-vocab-stablelm-3b-4e1t.gguf b/models/ggml-vocab-stablelm.gguf
similarity index 100%
rename from models/ggml-vocab-stablelm-3b-4e1t.gguf
rename to models/ggml-vocab-stablelm.gguf
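Note (not part of the patch): the two generated files are aligned one-to-one — the i-th chunk of models/test-vocab-inp.txt, delimited by `__ggml_vocab_test__`, pairs with the i-th line of token IDs in models/test-vocab-out-{name}.txt. A minimal sketch of reading such a pair back, assuming exactly the format written by the script above (`read_vocab_tests` is an illustrative helper, not part of the patch):

```python
# sketch: parse one generated input/output pair back into (text, tokens) tuples
def read_vocab_tests(name: str) -> list[tuple[str, list[int]]]:
    with open("models/test-vocab-inp.txt", encoding="utf-8") as f:
        # the final separator leaves an empty trailing chunk - drop it
        texts = f.read().split("\n__ggml_vocab_test__\n")[:-1]
    with open(f"models/test-vocab-out-{name}.txt", encoding="utf-8") as f:
        tokens = [[int(tok) for tok in line.split()] for line in f]
    assert len(texts) == len(tokens), "input/output files are out of sync"
    return list(zip(texts, tokens))
```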
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 3acf28ba41cdd..ca7bbb6ec9d70 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,10 +1,40 @@
+function(llama_test target)
+    include(CMakeParseArguments)
+    set(options)
+    set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
+    set(multiValueArgs ARGS)
+    cmake_parse_arguments(LLAMA_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    if (NOT DEFINED LLAMA_TEST_LABEL)
+        set(LLAMA_TEST_LABEL "main")
+    endif()
+    if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY)
+        set(LLAMA_TEST_WORKING_DIRECTORY .)
+    endif()
+    if (DEFINED LLAMA_TEST_NAME)
+        set(TEST_NAME ${LLAMA_TEST_NAME})
+    else()
+        set(TEST_NAME ${target})
+    endif()
+
+    set(TEST_TARGET ${target})
+
+    add_test(
+        NAME ${TEST_NAME}
+        WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}
+        COMMAND $<TARGET_FILE:${TEST_TARGET}>
+        ${LLAMA_TEST_ARGS})
+
+    set_property(TEST ${TEST_NAME} PROPERTY LABELS ${LLAMA_TEST_LABEL})
+endfunction()
+
 # Builds and runs a test source file.
 # Optional args:
 # - NAME: name of the executable & test target (defaults to the source file name without extension)
 # - LABEL: label for the test (defaults to main)
 # - ARGS: arguments to pass to the test executable
 # - WORKING_DIRECTORY
-function(llama_test source)
+function(llama_target_and_test source)
     include(CMakeParseArguments)
     set(options)
     set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
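Note (not part of the patch): this hunk splits the old helper in two — the new `llama_test(target ...)` only registers a CTest entry against an already-built target, while the renamed `llama_target_and_test(source ...)` keeps the old behavior of building the executable and registering a test in one call. That split is what allows the `test-tokenizer-1-bpe` binary to be compiled once and reused for every BPE vocab test instead of being rebuilt per vocab, as the next hunk shows.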
@@ -35,45 +65,54 @@ function(llama_test source)
     set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${LLAMA_TEST_LABEL})
 endfunction()
 
-# llama_test(test-double-float.cpp) # SLOW
-llama_test(test-quantize-fns.cpp)
-llama_test(test-quantize-perf.cpp)
-llama_test(test-sampling.cpp)
-llama_test(test-chat-template.cpp)
-
-llama_test(test-tokenizer-0-llama.cpp NAME test-tokenizer-0-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
-llama_test(test-tokenizer-0-llama-v3.cpp NAME test-tokenizer-0-llama-v3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-v3.gguf)
-llama_test(test-tokenizer-0-falcon.cpp NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-
-llama_test(test-tokenizer-0-deepseek-coder.cpp NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
-llama_test(test-tokenizer-0-deepseek-llm.cpp NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
-
-llama_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
-llama_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
-
-llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
-llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
-llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-stablelm-3b-4e1t ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm-3b-4e1t.gguf)
-llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
-llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
-llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
-llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-gpt2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
-#llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-bloom ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
-
-llama_test(test-grammar-parser.cpp)
-llama_test(test-llama-grammar.cpp)
-llama_test(test-grammar-integration.cpp)
-llama_test(test-grad0.cpp)
-# llama_test(test-opt.cpp) # SLOW
-llama_test(test-backend-ops.cpp)
-
-llama_test(test-rope.cpp)
-
-llama_test(test-model-load-cancel.cpp LABEL "model")
-llama_test(test-autorelease.cpp LABEL "model")
-
-llama_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
+# llama_target_and_test(test-double-float.cpp) # SLOW
+llama_target_and_test(test-quantize-fns.cpp)
+llama_target_and_test(test-quantize-perf.cpp)
+llama_target_and_test(test-sampling.cpp)
+llama_target_and_test(test-chat-template.cpp)
+
+llama_target_and_test(test-tokenizer-0-llama.cpp NAME test-tokenizer-0-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
+llama_target_and_test(test-tokenizer-0-llama-v3.cpp NAME test-tokenizer-0-llama-v3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-v3.gguf)
+llama_target_and_test(test-tokenizer-0-falcon.cpp NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+
+llama_target_and_test(test-tokenizer-0-deepseek-coder.cpp NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
+llama_target_and_test(test-tokenizer-0-deepseek-llm.cpp NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
+
+llama_target_and_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
+llama_target_and_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
+
+# build test-tokenizer-1-bpe target once and add many tests
+add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp get-model.cpp)
+target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
+install(TARGETS test-tokenizer-1-bpe RUNTIME)
+
+llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
+llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
+llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-stablelm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm.gguf)
+llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
+llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
+llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
+llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-bloom ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
+
+llama_target_and_test(test-grammar-parser.cpp)
+llama_target_and_test(test-llama-grammar.cpp)
+llama_target_and_test(test-grammar-integration.cpp)
+llama_target_and_test(test-grad0.cpp)
+# llama_target_and_test(test-opt.cpp) # SLOW
+llama_target_and_test(test-backend-ops.cpp)
+
+llama_target_and_test(test-rope.cpp)
+
+llama_target_and_test(test-model-load-cancel.cpp LABEL "model")
+llama_target_and_test(test-autorelease.cpp LABEL "model")
+
+llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
 target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
 
 # dummy executable - not installed
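Note (not part of the patch): after regenerating the vocab test files with convert-hf-to-gguf-update.py and rebuilding, the registered tests can be selected through the labels this file sets up — `ctest -L main` runs the default-labeled tests, `ctest -L model` selects the download-dependent `test-model-load-cancel` and `test-autorelease`, and a single vocab test can be run by name, e.g. `ctest -R test-tokenizer-1-stablelm`.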