tests : fix test-tokenizer-random.py
Apparently, gcc applies optimisations even when pre-processing,
which confuses pycparser.
compilade committed Jul 7, 2024
1 parent 6f215f1 commit 6ec70c9
Showing 1 changed file with 8 additions and 7 deletions.
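
The core of the fix is the added "-O0": on toolchains that default to a higher optimisation level, gcc predefines macros such as __OPTIMIZE__, and system headers can then expand into GNU-specific constructs that pycparser cannot parse. A minimal sketch of the preprocessing step, assuming gcc is on PATH (the helper name and paths are illustrative, not taken from the test):

    import subprocess


    def preprocess_header(path_header: str, path_includes: list[str]) -> str:
        # "-O0" pins gcc's predefined macros to an unoptimised baseline
        # (no __OPTIMIZE__), "-E -P" stops after preprocessing and omits
        # linemarkers, and the -D flags stub out GNU keywords that
        # pycparser does not understand.
        cmd = ["gcc", "-O0", "-E", "-P",
               "-D__restrict=", "-D__attribute__(x)=", "-D__asm__(x)="]
        cmd += ["-I" + path for path in path_includes] + [path_header]
        res = subprocess.run(cmd, stdout=subprocess.PIPE, check=True)
        return res.stdout.decode()


    # The result is plain C that pycparser can handle, e.g.:
    #   pycparser.CParser().parse(preprocess_header("llama.h", ["./include"]))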
tests/test-tokenizer-random.py (8 additions, 7 deletions):

@@ -15,6 +15,7 @@
 import random
 import unicodedata
 
+from pathlib import Path
 from typing import Any, Iterator, cast
 from typing_extensions import Buffer
 
@@ -39,7 +40,7 @@ def __init__(self, path_llama_h: str | None = None, path_includes: list[str] = [
         self.lib.llama_backend_init()
 
     def _load_libllama_cffi(self, path_llama_h: str, path_includes: list[str], path_libllama: str) -> tuple[cffi.FFI, Any]:
-        cmd = ["gcc", "-E", "-P", "-D__restrict=", "-D__attribute__(x)=", "-D__asm__(x)="]
+        cmd = ["gcc", "-O0", "-E", "-P", "-D__restrict=", "-D__attribute__(x)=", "-D__asm__(x)="]
         cmd += ["-I" + path for path in path_includes] + [path_llama_h]
         res = subprocess.run(cmd, stdout=subprocess.PIPE)
         assert (res.returncode == 0)
@@ -480,8 +481,8 @@ def check_detokenizer(text: str, text1: str, text2: str) -> bool:
 
 def main(argv: list[str] | None = None):
     parser = argparse.ArgumentParser()
-    parser.add_argument("vocab_file", help="path to vocab 'gguf' file")
-    parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
+    parser.add_argument("vocab_file", type=str, help="path to vocab 'gguf' file")
+    parser.add_argument("dir_tokenizer", type=str, help="directory containing 'tokenizer.model' file")
     parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
     args = parser.parse_args(argv)
 
@@ -523,7 +524,7 @@ def main(argv: list[str] | None = None):
         format = "%(levelname)s %(message)s",
     )
 
-    path_tokenizers = "./models/tokenizers/"
+    path_tokenizers = Path("./models/tokenizers/")
     path_vocab_format = "./models/ggml-vocab-%s.gguf"
 
     tokenizers = [
@@ -559,6 +560,6 @@
     for tokenizer in tokenizers:
         logger.info("-" * 50)
         logger.info(f"TOKENIZER: '{tokenizer}'")
-        vocab_file = path_vocab_format % tokenizer
-        dir_tokenizer = path_tokenizers + "/" + tokenizer
-        main([vocab_file, dir_tokenizer, "--verbose"])
+        vocab_file = Path(path_vocab_format % tokenizer)
+        dir_tokenizer = path_tokenizers / tokenizer
+        main([str(vocab_file), str(dir_tokenizer), "--verbose"])
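
The remaining hunks swap manual string concatenation for pathlib. For reference, a minimal sketch of the pattern (the tokenizer name below is a placeholder, not taken from the collapsed tokenizers list):

    from pathlib import Path

    path_tokenizers = Path("./models/tokenizers/")
    # The "/" operator joins path components portably; note that pathlib
    # normalises away the leading "./".
    dir_tokenizer = path_tokenizers / "some-tokenizer"
    print(dir_tokenizer)  # models/tokenizers/some-tokenizer on POSIX
    # str() converts back for APIs that expect plain strings, such as
    # the argv list passed to main() above.
    main([str(dir_tokenizer)])  # hypothetical call for illustration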
