feat: display eom/eot in tokenizer
Signed-off-by: thxCode <[email protected]>
thxCode committed Aug 5, 2024
1 parent 3d12f93 commit 41345ae
Showing 3 changed files with 54 additions and 30 deletions.
60 changes: 30 additions & 30 deletions cmd/gguf-parser/README.md
@@ -108,11 +108,11 @@ $ gguf-parser --path="~/.cache/lm-studio/models/NousResearch/Hermes-2-Pro-Mistra
| ARCHITECTURE | 32768 | 4096 | 4 | true | 32 | 32 | 14336 | 0 | 32032 |
+--------------+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
| \ | Model | Tokens Size | Tokens Len | Added Tokens Len | BOS Token | EOS Token | Unknown Token | Separator Token | Padding Token |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
| TOKENIZER | llama | 450.50 KiB | 32032 | N/A | 1 | 32000 | N/A | N/A | N/A |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
+--------------+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| \ | Model | Tokens Size | Tokens Len | Added Tokens Len | BOS Token | EOS Token | EOT Token | EOM Token | Unknown Token | Separator Token | Padding Token |
+--------------+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| TOKENIZER | llama | 450.50 KiB | 32032 | N/A | 1 | 32000 | N/A | N/A | N/A | N/A | N/A |
+--------------+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
+--------------+-------+--------------+-----------------+--------------+----------------+----------------+----------------+------------------------------+------------+-------------+
| \ | Arch | Context Size | Flash Attention | MMap Support | Embedding Only | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM |
@@ -138,11 +138,11 @@ $ gguf-parser --url="https://huggingface.co/NousResearch/Nous-Hermes-2-Mixtral-8
| ARCHITECTURE | 32768 | 4096 | 4 | true | 32 | 32 | 14336 | 8 | 32002 |
+--------------+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
| \ | Model | Tokens Size | Tokens Len | Added Tokens Len | BOS Token | EOS Token | Unknown Token | Separator Token | Padding Token |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
| TOKENIZER | llama | 449.91 KiB | 32002 | N/A | 1 | 32000 | 0 | N/A | 2 |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
+--------------+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| \ | Model | Tokens Size | Tokens Len | Added Tokens Len | BOS Token | EOS Token | EOT Token | EOM Token | Unknown Token | Separator Token | Padding Token |
+--------------+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| TOKENIZER | llama | 449.91 KiB | 32002 | N/A | 1 | 32000 | N/A | N/A | 0 | N/A | 2 |
+--------------+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
+--------------+-------+--------------+-----------------+--------------+----------------+----------------+----------------+------------------------------------+------------+-------------+
| \ | Arch | Context Size | Flash Attention | MMap Support | Embedding Only | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM |
@@ -168,11 +168,11 @@ $ gguf-parser --hf-repo="openbmb/MiniCPM-Llama3-V-2_5-gguf" --hf-file="ggml-mode
| ARCHITECTURE | 8192 | 4096 | 4 | true | 32 | 32 | 14336 | 0 | 128256 |
+--------------+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
| \ | Model | Tokens Size | Tokens Len | Added Tokens Len | BOS Token | EOS Token | Unknown Token | Separator Token | Padding Token |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
| TOKENIZER | gpt2 | 2 MiB | 128256 | N/A | 128000 | 128001 | 128002 | N/A | 0 |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
+--------------+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| \ | Model | Tokens Size | Tokens Len | Added Tokens Len | BOS Token | EOS Token | EOT Token | EOM Token | Unknown Token | Separator Token | Padding Token |
+--------------+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| TOKENIZER | gpt2 | 2 MiB | 128256 | N/A | 128000 | 128001 | N/A | N/A | 128002 | N/A | 0 |
+--------------+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
+--------------+-------+--------------+-----------------+--------------+----------------+----------------+----------------+---------------------------------+------------+-------------+
| \ | Arch | Context Size | Flash Attention | MMap Support | Embedding Only | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM |
@@ -198,11 +198,11 @@ $ gguf-parser --ms-repo="shaowenchen/chinese-alpaca-2-13b-16k-gguf" --ms-file="c
| ARCHITECTURE | 16384 | 5120 | 1 | true | N/A | 40 | 13824 | 0 | 55296 |
+--------------+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
| \ | Model | Tokens Size | Tokens Len | Added Tokens Len | BOS Token | EOS Token | Unknown Token | Separator Token | Padding Token |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
| TOKENIZER | llama | 769.83 KiB | 55296 | N/A | 1 | 2 | N/A | N/A | N/A |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
+--------------+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| \ | Model | Tokens Size | Tokens Len | Added Tokens Len | BOS Token | EOS Token | EOT Token | EOM Token | Unknown Token | Separator Token | Padding Token |
+--------------+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| TOKENIZER | llama | 769.83 KiB | 55296 | N/A | 1 | 2 | N/A | N/A | N/A | N/A | N/A |
+--------------+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
+--------------+-------+--------------+-----------------+--------------+----------------+----------------+----------------+-----------------------------------+------------+-------------+
| \ | Arch | Context Size | Flash Attention | MMap Support | Embedding Only | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM |
@@ -228,11 +228,11 @@ $ gguf-parser --ol-model="gemma2"
| ARCHITECTURE | 8192 | 3584 | 2 | true | 16 | 42 | 14336 | 0 | 256000 |
+--------------+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
| \ | Model | Tokens Size | Tokens Len | Added Tokens Len | BOS Token | EOS Token | Unknown Token | Separator Token | Padding Token |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
| TOKENIZER | llama | 3.80 MiB | 256000 | N/A | 2 | 1 | 3 | N/A | 0 |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
+--------------+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| \ | Model | Tokens Size | Tokens Len | Added Tokens Len | BOS Token | EOS Token | EOT Token | EOM Token | Unknown Token | Separator Token | Padding Token |
+--------------+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| TOKENIZER | llama | 3.80 MiB | 256000 | N/A | 2 | 1 | N/A | N/A | 3 | N/A | 0 |
+--------------+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
+--------------+--------+--------------+-----------------+--------------+----------------+----------------+----------------+---------------------------------+------------+-------------+
| \ | Arch | Context Size | Flash Attention | MMap Support | Embedding Only | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM |
@@ -258,11 +258,11 @@ $ gguf-parser --ol-model="gemma2" --ol-usage
| ARCHITECTURE | 8192 | 3584 | 2 | true | 16 | 42 | 14336 | 0 | 256000 |
+--------------+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
| \ | Model | Tokens Size | Tokens Len | Added Tokens Len | BOS Token | EOS Token | Unknown Token | Separator Token | Padding Token |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
| TOKENIZER | llama | 3.80 MiB | 256000 | N/A | 2 | 1 | 3 | N/A | 0 |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
+--------------+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| \ | Model | Tokens Size | Tokens Len | Added Tokens Len | BOS Token | EOS Token | EOT Token | EOM Token | Unknown Token | Separator Token | Padding Token |
+--------------+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| TOKENIZER | llama | 3.80 MiB | 256000 | N/A | 2 | 1 | N/A | N/A | 3 | N/A | 0 |
+--------------+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
+--------------+--------+--------------+-----------------+--------------+----------------+----------------+----------------+----------------------------------+------------+-------------+
| \ | Arch | Context Size | Flash Attention | MMap Support | Embedding Only | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM |
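The new EOT/EOM columns surface additional stop-token IDs that chat runtimes commonly check when cutting off generation. The sketch below is illustrative only: the helper name `isStopToken` is not part of this repository, and the sample values are simply taken from the MiniCPM-Llama3-V-2_5 row above, where EOS is 128001 and EOT/EOM are reported as N/A (-1).

```go
package main

import "fmt"

// isStopToken reports whether id matches one of the tokenizer's terminal
// tokens. An ID of -1 (shown as "N/A" in the tables above) means the GGUF
// metadata does not define that token, so it is skipped.
func isStopToken(id, eos, eot, eom int64) bool {
	for _, stop := range []int64{eos, eot, eom} {
		if stop >= 0 && id == stop {
			return true
		}
	}
	return false
}

func main() {
	// Sample values from the MiniCPM-Llama3-V-2_5 row: EOS=128001, EOT/EOM=N/A.
	fmt.Println(isStopToken(128001, 128001, -1, -1)) // true
	fmt.Println(isStopToken(42, 128001, -1, -1))     // false
}
```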
4 changes: 4 additions & 0 deletions cmd/gguf-parser/main.go
@@ -906,6 +906,8 @@ func mainAction(c *cli.Context) error {
"Added Tokens Len",
"BOS Token",
"EOS Token",
"EOT Token",
"EOM Token",
"Unknown Token",
"Separator Token",
"Padding Token",
@@ -918,6 +920,8 @@ func mainAction(c *cli.Context) error {
sprintf(tenary(t.AddedTokensLength <= 0, "N/A", t.AddedTokensLength)),
sprintf(tenary(t.BOSTokenID < 0, "N/A", t.BOSTokenID)),
sprintf(tenary(t.EOSTokenID < 0, "N/A", t.EOSTokenID)),
sprintf(tenary(t.EOTTokenID < 0, "N/A", t.EOTTokenID)),
sprintf(tenary(t.EOMTokenID < 0, "N/A", t.EOMTokenID)),
sprintf(tenary(t.UnknownTokenID < 0, "N/A", t.UnknownTokenID)),
sprintf(tenary(t.SeparatorTokenID < 0, "N/A", t.SeparatorTokenID)),
sprintf(tenary(t.PaddingTokenID < 0, "N/A", t.PaddingTokenID)),
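For reference, `sprintf` and `tenary` are small CLI helpers used throughout this table-rendering code. The sketch below is an assumption of how they might behave (the real helpers in cmd/gguf-parser may differ); it shows why a missing token ID of -1 renders as "N/A".

```go
package main

import "fmt"

// tenary picks onTrue when cond holds, otherwise onFalse. Because the CLI
// mixes string ("N/A") and integer arguments, this sketch uses `any`.
func tenary(cond bool, onTrue, onFalse any) any {
	if cond {
		return onTrue
	}
	return onFalse
}

// sprintf renders an arbitrary value as its display string.
func sprintf(v any) string { return fmt.Sprintf("%v", v) }

func main() {
	var eotTokenID int64 = -1     // -1: no EOT token recorded in the GGUF metadata
	var eomTokenID int64 = 128008 // illustrative value only

	fmt.Println(sprintf(tenary(eotTokenID < 0, "N/A", eotTokenID))) // N/A
	fmt.Println(sprintf(tenary(eomTokenID < 0, "N/A", eomTokenID))) // 128008
}
```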
20 changes: 20 additions & 0 deletions file_tokenizer.go
@@ -20,6 +20,14 @@ type GGUFTokenizerMetadata struct {
//
// Use -1 if the token is not found.
EOSTokenID int64 `json:"eosTokenID"`
// EOTTokenID is the ID of the end of text token.
//
// Use -1 if the token is not found.
EOTTokenID int64 `json:"eotTokenID"`
// EOMTokenID is the ID of the end of message token.
//
// Use -1 if the token is not found.
EOMTokenID int64 `json:"eomTokenID"`
// UnknownTokenID is the ID of the unknown token.
//
// Use -1 if the token is not found.
@@ -50,6 +58,8 @@ func (gf *GGUFFile) Tokenizer() (gt GGUFTokenizerMetadata) {
addedTokensKey = "tokenizer.ggml.added_tokens"
bosTokenIDKey = "tokenizer.ggml.bos_token_id"
eosTokenIDKey = "tokenizer.ggml.eos_token_id"
eotTokenIDKey = "tokenizer.ggml.eot_token_id"
eomTokenIDKey = "tokenizer.ggml.eom_token_id"
unknownTokenIDKey = "tokenizer.ggml.unknown_token_id"
separatorTokenIDKey = "tokenizer.ggml.separator_token_id"
paddingTokenIDKey = "tokenizer.ggml.padding_token_id"
@@ -62,13 +72,17 @@ func (gf *GGUFFile) Tokenizer() (gt GGUFTokenizerMetadata) {
addedTokensKey,
bosTokenIDKey,
eosTokenIDKey,
eotTokenIDKey,
eomTokenIDKey,
unknownTokenIDKey,
separatorTokenIDKey,
paddingTokenIDKey,
})

gt.BOSTokenID = -1
gt.EOSTokenID = -1
gt.EOTTokenID = -1
gt.EOMTokenID = -1
gt.UnknownTokenID = -1
gt.SeparatorTokenID = -1
gt.PaddingTokenID = -1
@@ -95,6 +109,12 @@ func (gf *GGUFFile) Tokenizer() (gt GGUFTokenizerMetadata) {
if v, ok := m[eosTokenIDKey]; ok {
gt.EOSTokenID = ValueNumeric[int64](v)
}
if v, ok := m[eotTokenIDKey]; ok {
gt.EOTTokenID = ValueNumeric[int64](v)
}
if v, ok := m[eomTokenIDKey]; ok {
gt.EOMTokenID = ValueNumeric[int64](v)
}
if v, ok := m[unknownTokenIDKey]; ok {
gt.UnknownTokenID = ValueNumeric[int64](v)
}
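A consumer-side sketch of the new fields follows. The import path and the `ParseGGUFFile` entry point are assumptions based on the public API of gguf-parser-go and may not match this commit exactly; the model file name is a placeholder.

```go
package main

import (
	"fmt"
	"log"

	parser "github.com/thxcode/gguf-parser-go" // import path assumed for this commit
)

func main() {
	// ParseGGUFFile is assumed to be the package's file-parsing entry point;
	// adjust the call to match the released API.
	gf, err := parser.ParseGGUFFile("model.Q4_K_M.gguf")
	if err != nil {
		log.Fatal(err)
	}

	t := gf.Tokenizer()
	// Both IDs default to -1 when tokenizer.ggml.eot_token_id /
	// tokenizer.ggml.eom_token_id are absent from the GGUF metadata.
	fmt.Println("EOT token ID:", t.EOTTokenID)
	fmt.Println("EOM token ID:", t.EOMTokenID)
}
```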
