Greedily tokenize strings by iteratively matching the longest tokens, compatible with `transformers.PreTrainedTokenizer` and `transformers.AutoTokenizer`.
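The core idea can be sketched in a few lines: repeatedly find the longest vocabulary entry occurring in the remaining text, split around it, and recurse on both sides. The following is only an illustrative sketch of that strategy, not the library's actual implementation (which falls back to byte tokens such as `<0xE4>` rather than raw characters):

```python
def greedy_longest_match(text: str, vocab: set[str], max_len: int) -> list[str]:
    """Illustrative sketch: split around the longest matching vocab entry."""
    for length in range(min(max_len, len(text)), 0, -1):
        for start in range(len(text) - length + 1):
            piece = text[start:start + length]
            if piece in vocab:
                return (greedy_longest_match(text[:start], vocab, max_len)
                        + [piece]
                        + greedy_longest_match(text[start + length:], vocab, max_len))
    # Nothing matched: fall back to single characters
    # (the real tokenizer falls back to byte tokens instead).
    return list(text)

print(greedy_longest_match('你好你好呀', {'你好呀'}, 3))
# ['你', '好', '你好呀']
```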
To install from source:

```bash
git clone https://github.com/ModelTC/greedy-tokenizer.git
cd greedy-tokenizer
pip install -e .
```
Or use the source file directly.
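For example, something like the following may suffice (a sketch only; the actual file layout in the repository may differ, so check it first):

```bash
# Hypothetical layout: verify where the module lives in the repository
# before copying it into your own project.
cp -r greedy-tokenizer/greedy_tokenizer my_project/
```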
Usage example:

```python
from greedy_tokenizer import GreedyTokenizer
from transformers import AutoTokenizer

# Construct a GreedyTokenizer from another PreTrainedTokenizer
tokenizer = GreedyTokenizer.from_other_pretrained(
    "internlm/internlm2-chat-7b",
    trust_remote_code=True,
    revision="main",
    use_fast=False,
)
# Or, you can use:
# old_tokenizer = AutoTokenizer.from_pretrained(...)
# tokenizer = GreedyTokenizer.mock_tokenizer(old_tokenizer)

seq = "Hello! 你好呀!🌠"
tokens = tokenizer.tokenize(seq)
print(tokens)
# ['Hello', '!', ' ', '你好', '呀', '!', '<0xF0>', '<0x9F>', '<0x8C>', '<0xA0>']
assert tokenizer.convert_tokens_to_string(tokens) == seq

# A GreedyTokenizer can also be saved and loaded
tokenizer.save_pretrained("/tmp/internlm2-chat-gt")
tokenizer = AutoTokenizer.from_pretrained(
    "/tmp/internlm2-chat-gt",
    trust_remote_code=True,
    use_fast=False,
)
```
No subwords required!
```python
gt = GreedyTokenizer(vocab=[f'<0x{i:02x}>' for i in range(256)] + ['你好呀'])
print(gt.tokenize('你好你好呀'))
# ['<0xe4>', '<0xbd>', '<0xa0>', '<0xe5>', '<0xa5>', '<0xbd>', '你好呀']
```
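Thanks to the byte-token fallback, tokenization stays lossless even for strings the vocabulary cannot cover directly; round-tripping should recover the input exactly:

```python
# Byte-level fallback keeps the mapping reversible.
assert gt.convert_tokens_to_string(gt.tokenize('你好你好呀')) == '你好你好呀'
```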
To run the tests:

```bash
pip install -e ".[test]"
pytest -s

# You can set some environment variables, e.g.:
# DATASET=happylkx/InstructCoder COLUMN=input pytest -s
```
- © 2023 Chielo Newctle <[email protected]>
- © 2023 ModelTC Team
This project is licensed under either of

- Apache License, Version 2.0
- MIT License

at your option.

The SPDX license identifier for this project is `MIT OR Apache-2.0`.