refactor: Clean up constants and simplify the custom hf hub api
teleprint-me committed Jun 2, 2024
1 parent ce8524a commit 5836d6c
Showing 2 changed files with 161 additions and 106 deletions.
79 changes: 43 additions & 36 deletions gguf-py/gguf/constants.py
@@ -29,6 +29,7 @@ class General:
         SOURCE_URL = "general.source.url"
         SOURCE_REPO = "general.source.repository"
         FILE_TYPE = "general.file_type"
+        ENDIANESS = "general.endianess"
 
     class LLM:
         VOCAB_SIZE = "{arch}.vocab_size"
@@ -77,20 +78,20 @@ class SSM:
         TIME_STEP_RANK = "{arch}.ssm.time_step_rank"

     class Tokenizer:
-        MODEL = "tokenizer.model" # STRING: e.g. llama, gpt2, etc...
-        TYPE = "tokenizer.type" # STRING: BPE, SPM, WPM, etc.
-        NORM = "tokenizer.norm" # OBJECT {"type": "ByteLevel", ...}
-        PRE = "tokenizer.pre" # OBJECT {"type": "ByteLevel", ...}
-        ADDED = "tokenizer.added" # ARRAY of OBJECTs: [{"id": 1, ...}, ...]
-        VOCAB = "tokenizer.vocab" # ARRAY of STRINGs: ["[BOS]", ...]
-        MERGES = "tokenizer.merges" # ARRAY of STRINGs: ["▁ t", ...]
-        TOKEN_TYPE = "tokenizer.token_type" # ARRAY of INT [2, ...]
-        TOKEN_TYPE_COUNT = "tokenizer.token_type_count" # BERT token types
-        SCORES = "tokenizer.scores" # WPM only
+        MODEL = "tokenizer.model"                         # STRING: e.g. llama, gpt2, etc...
+        TYPE = "tokenizer.type"                           # STRING: BPE, SPM, WPM, etc.
+        NORM = "tokenizer.norm"                           # OBJECT {"type": "ByteLevel", ...}
+        PRE = "tokenizer.pre"                             # OBJECT {"type": "ByteLevel", ...}
+        ADDED = "tokenizer.added"                         # ARRAY of OBJECTs: [{"id": 1, ...}, ...]
+        VOCAB = "tokenizer.vocab"                         # ARRAY of STRINGs: ["[BOS]", ...]
+        MERGES = "tokenizer.merges"                       # ARRAY of STRINGs: ["▁ t", ...]
+        TOKEN_TYPE = "tokenizer.token_type"               # ARRAY of INT [2, ...]
+        TOKEN_TYPE_COUNT = "tokenizer.token_type_count"   # BERT token types
+        SCORES = "tokenizer.scores"                       # WPM only
         BOS_ID = "tokenizer.bos_token_id"
         EOS_ID = "tokenizer.eos_token_id"
         UNK_ID = "tokenizer.unknown_token_id"
-        SEP_ID = "tokenizer.seperator_token_id"
+        SEP_ID = "tokenizer.separator_token_id"  # Fixed typo
         PAD_ID = "tokenizer.padding_token_id"
         CLS_ID = "tokenizer.cls_token_id"
         MASK_ID = "tokenizer.mask_token_id"
@@ -1038,6 +1039,19 @@ def get_type(val: Any) -> GGUFValueType:
 }


+#
+# Model File Types
+#
+class ModelFileExtension(Enum):
+    PT = ".pt"  # torch
+    PTH = ".pth"  # torch
+    BIN = ".bin"  # torch
+    SAFETENSORS = ".safetensors"  # safetensors
+    JSON = ".json"  # transformers/tokenizers
+    MODEL = ".model"  # sentencepiece
+    GGUF = ".gguf"  # ggml/llama.cpp
+
+
 #
 # Tokenizer Types
 #
@@ -1050,51 +1064,43 @@ class GGUFTokenType(IntEnum):
     BYTE = 6


-class GGUFTokenizerType(Enum):
+class HFTokenizerType(Enum):
     SPM = "SPM"  # SentencePiece LLaMa tokenizer
     BPE = "BPE"  # BytePair GPT-2 tokenizer
     WPM = "WPM"  # WordPiece BERT tokenizer


-#
-# Model File Types
-#
-class GGUFFileExtension(Enum):
-    PT = ".pt"  # torch
-    PTH = ".pth"  # torch
-    BIN = ".bin"  # torch
-    SAFETENSORS = ".safetensors"  # safetensors
-    JSON = ".json"  # transformers/tokenizers
-    MODEL = ".model"  # sentencepiece
-    GGUF = ".gguf"  # ggml/llama.cpp
-
-
 #
 # Normalizer Types
 #
-class GGUFNormalizerType(Enum):
+class HFNormalizerType(Enum):
     SEQUENCE = "Sequence"
-    NFC = "NFC"
-    NFD = "NFD"
-    NFKC = "NFKC"
-    NFKD = "NFKD"
+    NFC = "NFC"
+    NFD = "NFD"
+    NFKC = "NFKC"
+    NFKD = "NFKD"


 #
 # Pre-tokenizer Types
 #
-class GGUFPreTokenizerType(Enum):
-    WHITESPACE = "Whitespace"
-    METASPACE = "Metaspace"
-    BYTE_LEVEL = "ByteLevel"
+class HFPreTokenizerType(Enum):
+    WHITESPACE = "Whitespace"
+    METASPACE = "Metaspace"
+    BYTE_LEVEL = "ByteLevel"
     BERT_PRE_TOKENIZER = "BertPreTokenizer"
-    SEQUENCE = "Sequence"
+    SEQUENCE = "Sequence"


 #
 # HF Vocab Files
 #
-HF_TOKENIZER_BPE_FILES: tuple[str, ...] = ("config.json", "tokenizer_config.json", "tokenizer.json",)
+HF_TOKENIZER_BPE_FILES = (
+    "config.json",
+    "tokenizer_config.json",
+    "tokenizer.json",
+)
 
 HF_TOKENIZER_SPM_FILES: tuple[str, ...] = HF_TOKENIZER_BPE_FILES + ("tokenizer.model",)

 #
@@ -1123,6 +1129,7 @@ class GGUFPreTokenizerType(Enum):
 KEY_GENERAL_SOURCE_URL = GGUFMetadataKeys.General.SOURCE_URL
 KEY_GENERAL_SOURCE_REPO = GGUFMetadataKeys.General.SOURCE_REPO
 KEY_GENERAL_FILE_TYPE = GGUFMetadataKeys.General.FILE_TYPE
+KEY_GENERAL_ENDIANESS = GGUFMetadataKeys.General.ENDIANESS
 
 # LLM
 KEY_VOCAB_SIZE = GGUFMetadataKeys.LLM.VOCAB_SIZE
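To make the reorganized surface concrete, here is a minimal usage sketch of the renamed enums and vocab-file tuples. It assumes the module is importable as gguf.constants; the helper names required_vocab_files and is_torch_checkpoint are illustrative, not code from this commit:

from pathlib import Path

from gguf.constants import (
    HF_TOKENIZER_BPE_FILES,
    HF_TOKENIZER_SPM_FILES,
    HFTokenizerType,
    ModelFileExtension,
)


def required_vocab_files(tokenizer_type: HFTokenizerType) -> tuple[str, ...]:
    # SPM checkouts also need the sentencepiece model file; BPE and WPM
    # tokenizers ship everything in the JSON files.
    if tokenizer_type == HFTokenizerType.SPM:
        return HF_TOKENIZER_SPM_FILES
    return HF_TOKENIZER_BPE_FILES


def is_torch_checkpoint(path: Path) -> bool:
    # ModelFileExtension values are plain suffix strings (".pt", ".pth", ".bin").
    return path.suffix in {
        ModelFileExtension.PT.value,
        ModelFileExtension.PTH.value,
        ModelFileExtension.BIN.value,
    }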
