tests : refactor vocab tests
ggml-ci
ggerganov committed Apr 29, 2024
1 parent ef4cca9 commit 43708d2
Showing 15 changed files with 316 additions and 1,010 deletions.
43 changes: 28 additions & 15 deletions convert-hf-to-gguf-update.py
@@ -46,8 +46,8 @@ class TOKENIZER_TYPE(IntEnum):
 
 # TODO: add models here, base models preferred
 models = [
-    { "name": "llama-v2",       "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
-    { "name": "llama-v3",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
+    { "name": "llama-spm",      "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
+    { "name": "llama-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
     { "name": "deepseek-llm",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
     { "name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
     { "name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
@@ -64,7 +64,7 @@ def download_file_with_auth(url, token, save_path):
     if response.status_code == 200:
         with open(save_path, 'wb') as f:
             f.write(response.content)
-        print("File downloaded successfully.")
+        print(f"File {save_path} downloaded successfully")
     else:
         print(f"Failed to download file. Status code: {response.status_code}")

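For context, the helper being patched is small; the following is a minimal sketch of the full function, assuming the Hugging Face token is passed as a standard Bearer authorization header (the request setup sits outside this hunk):

import os
import requests

def download_file_with_auth(url, token, save_path):
    # assumption: the token is sent as a Bearer auth header
    headers = {"Authorization": f"Bearer {token}"}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        # assumption: the target directory may not exist yet
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        with open(save_path, 'wb') as f:
            f.write(response.content)
        print(f"File {save_path} downloaded successfully")
    else:
        print(f"Failed to download file. Status code: {response.status_code}")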
@@ -82,6 +82,10 @@ def download_file_with_auth(url, token, save_path):
 
     print(f"Downloading {name} to models/tokenizers/{name}")
 
+    url = f"{repo}/raw/main/config.json"
+    save_path = f"models/tokenizers/{name}/config.json"
+    download_file_with_auth(url, token, save_path)
+
     url = f"{repo}/raw/main/tokenizer.json"
     save_path = f"models/tokenizers/{name}/tokenizer.json"
     download_file_with_auth(url, token, save_path)
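The newly fetched config.json gives the conversion step model metadata alongside the tokenizer files. As an illustration only, a consumer could read it like this (assuming the standard Hugging Face config layout with an "architectures" field, and `name` bound by the surrounding loop):

import json

with open(f"models/tokenizers/{name}/config.json") as f:
    config = json.load(f)
print(name, "->", config.get("architectures"))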
@@ -219,7 +223,7 @@ def download_file_with_auth(url, token, save_path):
     "333333333",
 ]
 
-# write the tests in ./models/test-vocab-inp.txt
+# write the tests to ./models/ggml-vocab-{name}.gguf.inp
 # the format is:
 #
 # test0
@@ -229,14 +233,7 @@ def download_file_with_auth(url, token, save_path):
 # ...
 #
 
-with open(f"models/test-vocab-inp.txt", "w") as f:
-    for text in tests:
-        f.write(f"{text}")
-        f.write("\n__ggml_vocab_test__\n")
-
-print("Tests written in ./models/test-vocab-inp.txt")
-
-# with each model, encode all tests and write the results in ./models/test-vocab-out-{name}.txt
+# with each model, encode all tests and write the results in ./models/ggml-vocab-{name}.gguf.out
 # for each test, write the resulting tokens on a separate line
 
 for model in models:
@@ -247,11 +244,27 @@ def download_file_with_auth(url, token, save_path):
     from transformers import AutoTokenizer
     tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
 
-    with open(f"models/test-vocab-out-{name}.txt", "w") as f:
+    with open(f"models/ggml-vocab-{name}.gguf.inp", "w") as f:
         for text in tests:
-            res = tokenizer.encode(text)
+            f.write(f"{text}")
+            f.write("\n__ggml_vocab_test__\n")
+
+    with open(f"models/ggml-vocab-{name}.gguf.out", "w") as f:
+        for text in tests:
+            res = tokenizer.encode(text, add_special_tokens=False)
             for r in res:
                 f.write(f" {r}")
             f.write("\n")
 
-    print(f"Test results for {name} written in ./models/test-vocab-out-{name}.txt")
+    print(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
+
+# generate commands for creating vocab files
+
+print("\nRun the following commands to generate the vocab files for testing:\n")
+
+for model in models:
+    name = model["name"]
+
+    print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")
+
+print("\n")
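The .inp/.out pair is a plain-text protocol: prompts separated by the __ggml_vocab_test__ marker in the .inp file, and one line of space-separated token ids per prompt in the .out file. A minimal sketch of a reader for these files (illustrative only; the real consumer is the C++ test-tokenizer-0 below):

def read_vocab_tests(name: str):
    # prompts are separated by the __ggml_vocab_test__ marker; the writer
    # appends the marker after every prompt, so drop the trailing empty entry
    with open(f"models/ggml-vocab-{name}.gguf.inp") as f:
        prompts = f.read().split("\n__ggml_vocab_test__\n")[:-1]

    # one line of space-separated token ids per prompt
    with open(f"models/ggml-vocab-{name}.gguf.out") as f:
        expected = [[int(t) for t in line.split()] for line in f]

    assert len(prompts) == len(expected)
    return list(zip(prompts, expected))

The convert-hf-to-gguf.py commands printed at the end then produce the matching vocab-only ggml-vocab-{name}.gguf models that the C++ tests load.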
2 changes: 1 addition & 1 deletion convert-hf-to-gguf.py
@@ -283,7 +283,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         # don't do this manually - use the convert-hf-to-gguf-update.py script!
         if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
             # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
-            res = "llama-v3"
+            res = "llama-bpe"
         if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
             # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
             res = "deepseek-llm"
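The chkhsh values compared here fingerprint the tokenizer's behavior rather than its files, so a silent change in pre-tokenization is caught. A sketch of how such a fingerprint can be computed, assuming it is the SHA-256 hex digest of the stringified token-id list produced for a fixed probe string:

from hashlib import sha256

from transformers import AutoTokenizer

def tokenizer_fingerprint(model_dir: str, chktxt: str) -> str:
    # assumption: hash the repr of the token-id list for a fixed probe text,
    # so any change in pre-tokenization changes the fingerprint
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    chktok = tokenizer.encode(chktxt)
    return sha256(str(chktok).encode()).hexdigest()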
7 changes: 4 additions & 3 deletions llama.cpp
@@ -4339,8 +4339,9 @@ static void llm_load_vocab(
                 tokenizer_pre == "default") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         } else if (
-                tokenizer_pre == "llama3" ||
-                tokenizer_pre == "llama-v3") {
+                tokenizer_pre == "llama3"   ||
+                tokenizer_pre == "llama-v3" ||
+                tokenizer_pre == "llama-bpe") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
         } else if (
             tokenizer_pre == "deepseek-llm") {
@@ -12583,7 +12584,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
-                if (add_special && vocab.special_add_bos == 1) {
+                if (add_special && vocab.special_add_bos != 0) {
                     GGML_ASSERT(vocab.special_bos_id != -1);
                     output.push_back(vocab.special_bos_id);
                 }
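The == 1 to != 0 change matters because the add-BOS flag read from GGUF metadata is tri-state (an assumption based on the convention used elsewhere in llama.cpp: -1 when the key is absent, 0 or 1 when set). With != 0, a BPE vocab now defaults to adding BOS when the metadata is missing instead of silently skipping it. A small sketch of the semantics:

def should_add_bos(special_add_bos: int) -> bool:
    # tri-state flag: -1 = not specified in metadata, 0 = false, 1 = true
    # old behavior: special_add_bos == 1  (BOS only when explicitly enabled)
    # new behavior: special_add_bos != 0  (BOS unless explicitly disabled)
    return special_add_bos != 0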
Binary file modified models/ggml-vocab-deepseek-coder.gguf
Binary file modified models/ggml-vocab-deepseek-llm.gguf
Binary file removed models/ggml-vocab-llama-v3.gguf
Binary file removed models/ggml-vocab-llama.gguf
35 changes: 20 additions & 15 deletions tests/CMakeLists.txt
@@ -65,21 +65,16 @@ function(llama_target_and_test source)
     set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${LLAMA_TEST_LABEL})
 endfunction()
 
-# llama_target_and_test(test-double-float.cpp) # SLOW
-llama_target_and_test(test-quantize-fns.cpp)
-llama_target_and_test(test-quantize-perf.cpp)
-llama_target_and_test(test-sampling.cpp)
-llama_target_and_test(test-chat-template.cpp)
-
-llama_target_and_test(test-tokenizer-0-llama.cpp          NAME test-tokenizer-0-llama          ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
-llama_target_and_test(test-tokenizer-0-llama-v3.cpp       NAME test-tokenizer-0-llama-v3       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-v3.gguf)
-llama_target_and_test(test-tokenizer-0-falcon.cpp         NAME test-tokenizer-0-falcon         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-
-llama_target_and_test(test-tokenizer-0-deepseek-coder.cpp NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
-llama_target_and_test(test-tokenizer-0-deepseek-llm.cpp   NAME test-tokenizer-0-deepseek-llm   ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
+# build test-tokenizer-0 target once and add many tests
+add_executable(test-tokenizer-0 test-tokenizer-0.cpp get-model.cpp)
+target_link_libraries(test-tokenizer-0 PRIVATE common)
+install(TARGETS test-tokenizer-0 RUNTIME)
 
-llama_target_and_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-llama    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
-llama_target_and_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm      ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe      ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm   ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
 
 # build test-tokenizer-1-bpe target once and add many tests
 add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp get-model.cpp)
@@ -96,9 +91,19 @@ llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CUR
 llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-bloom ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
 
+# build test-tokenizer-1-spm target once and add many tests
+add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp get-model.cpp)
+target_link_libraries(test-tokenizer-1-spm PRIVATE common)
+install(TARGETS test-tokenizer-1-spm RUNTIME)
+
+llama_target_and_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
+llama_target_and_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan  ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
+
+# llama_target_and_test(test-double-float.cpp) # SLOW
+llama_target_and_test(test-quantize-fns.cpp)
+llama_target_and_test(test-quantize-perf.cpp)
+llama_target_and_test(test-sampling.cpp)
+llama_target_and_test(test-chat-template.cpp)
 
 llama_target_and_test(test-grammar-parser.cpp)
 llama_target_and_test(test-llama-grammar.cpp)
188 changes: 0 additions & 188 deletions tests/test-tokenizer-0-deepseek-coder.cpp

This file was deleted.
