chore: fixed data paths in tokenizer tests
le1nux committed Jan 13, 2025
1 parent 14718cc commit 4648f1b
Showing 1 changed file with 23 additions and 16 deletions.
39 changes: 23 additions & 16 deletions tests/dataloader/test_end_to_end_indexation_and_tokenization.py
@@ -20,7 +20,7 @@
 )
 sentence_piece_settings = TokenizerSettings(
     tokenizer_type=TokenizerTypes.sentence_piece,
-    tokenizer_name_or_path="../data/tokenizer/sentencepiece_dclm/en_32k_tokenizer.model",
+    tokenizer_name_or_path="data/tokenizer/sentencepiece_dclm/en_32k_tokenizer.model",
 )


@@ -29,65 +29,72 @@
     [
         # without errors
         # test with the actual eod token
-        (gpt2_settings, Path("data/datasets/lorem_ipsum_long.jsonl"), "text", "<|endoftext|>", False, None),
-        (xlm_roberta_large_settings, Path("data/datasets/lorem_ipsum_long.jsonl"), "text", "</s>", False, None),
-        (sentence_piece_settings, Path("data/datasets/lorem_ipsum_long.jsonl"), "text", "</s>", False, None),
+        (gpt2_settings, Path("tests/data/datasets/lorem_ipsum_long.jsonl"), "text", "<|endoftext|>", False, None),
+        (xlm_roberta_large_settings, Path("tests/data/datasets/lorem_ipsum_long.jsonl"), "text", "</s>", False, None),
+        (sentence_piece_settings, Path("tests/data/datasets/lorem_ipsum_long.jsonl"), "text", "</s>", False, None),
         # without \n in the last line
         (
             gpt2_settings,
-            Path("data/datasets/lorem_ipsum_without_last_newline.jsonl"),
+            Path("tests/data/datasets/lorem_ipsum_without_last_newline.jsonl"),
             "text",
             "<|endoftext|>",
             False,
             None,
         ),
         (
             xlm_roberta_large_settings,
-            Path("data/datasets/lorem_ipsum_without_last_newline.jsonl"),
+            Path("tests/data/datasets/lorem_ipsum_without_last_newline.jsonl"),
             "text",
             "</s>",
             False,
             None,
         ),
         (
             sentence_piece_settings,
-            Path("data/datasets/lorem_ipsum_without_last_newline.jsonl"),
+            Path("tests/data/datasets/lorem_ipsum_without_last_newline.jsonl"),
             "text",
             "</s>",
             False,
             None,
         ),
-        (gpt2_settings, Path("data/datasets/danish_test_dataset.jsonl"), "text", "<|endoftext|>", False, None),
-        (xlm_roberta_large_settings, Path("data/datasets/danish_test_dataset.jsonl"), "text", "</s>", False, None),
-        (sentence_piece_settings, Path("data/datasets/danish_test_dataset.jsonl"), "text", "</s>", False, None),
+        (gpt2_settings, Path("tests/data/datasets/danish_test_dataset.jsonl"), "text", "<|endoftext|>", False, None),
+        (
+            xlm_roberta_large_settings,
+            Path("tests/data/datasets/danish_test_dataset.jsonl"),
+            "text",
+            "</s>",
+            False,
+            None,
+        ),
+        (sentence_piece_settings, Path("tests/data/datasets/danish_test_dataset.jsonl"), "text", "</s>", False, None),
         # we also accept tokens as eod token that are not the original eod token or any other special token
         # A normal token such as "a" will pass through. It is the users obligation to pick the correct eod token
         # for a given tokenizer. The reason is that there is no way to get this information for all tokenizer
         # implementations regarding the true eod token!
-        (gpt2_settings, Path("data/datasets/lorem_ipsum_long.jsonl"), "text", "a", False, None),
-        (xlm_roberta_large_settings, Path("data/datasets/lorem_ipsum_long.jsonl"), "text", "a", False, None),
-        (sentence_piece_settings, Path("data/datasets/lorem_ipsum_long.jsonl"), "text", "a", False, None),
+        (gpt2_settings, Path("tests/data/datasets/lorem_ipsum_long.jsonl"), "text", "a", False, None),
+        (xlm_roberta_large_settings, Path("tests/data/datasets/lorem_ipsum_long.jsonl"), "text", "a", False, None),
+        (sentence_piece_settings, Path("tests/data/datasets/lorem_ipsum_long.jsonl"), "text", "a", False, None),
         # with errors / warnings
        # eod token is not a single token
         (
             gpt2_settings,
-            Path("data/datasets/lorem_ipsum_long.jsonl"),
+            Path("tests/data/datasets/lorem_ipsum_long.jsonl"),
             "text",
             "abc123",
             False,
             "The provided eod token .* has the same token id (.*) as the unk token",
         ),
         (
             xlm_roberta_large_settings,
-            Path("data/datasets/lorem_ipsum_long.jsonl"),
+            Path("tests/data/datasets/lorem_ipsum_long.jsonl"),
             "text",
             "abc123",
             False,
             "The provided eod token .* has the same token id (.*) as the unk token",
         ),
         # with errors
         # eod token is not a single token
-        (sentence_piece_settings, Path("data/datasets/lorem_ipsum_long.jsonl"), "text", "abc123", True, None),
+        (sentence_piece_settings, Path("tests/data/datasets/lorem_ipsum_long.jsonl"), "text", "abc123", True, None),
     ],
 )
 def test_end_to_end_indexation_and_tokenization_consistency(
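For context, the parametrized cases above exercise how each tokenizer handles the configured eod token: a genuine eod token (or even an ordinary token such as "a") passes through, while a string like "abc123" that is not a single token in the vocabulary either triggers the unk-token warning matched by the regex or raises an error. The following is a minimal sketch of that kind of check, assuming a plain Hugging Face AutoTokenizer as a stand-in for the project's own TokenizerSettings/TokenizerTypes wrappers; the helper name check_eod_token is hypothetical and this is not the repository's implementation.

import warnings

from transformers import AutoTokenizer


def check_eod_token(tokenizer, eod_token: str) -> int:
    """Hypothetical helper: resolve the eod token to an id and warn if it collapses to the unk token."""
    token_id = tokenizer.convert_tokens_to_ids(eod_token)
    if tokenizer.unk_token_id is not None and token_id == tokenizer.unk_token_id:
        # Mirrors the warning pattern expected by the parametrized cases above.
        warnings.warn(
            f"The provided eod token {eod_token} has the same token id ({token_id}) as the unk token"
        )
    return token_id


if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")
    check_eod_token(tokenizer, "</s>")    # genuine eod token: maps to its own single id
    check_eod_token(tokenizer, "abc123")  # no single-token match: falls back to the unk id and warns

Whether a bad eod token merely warns (the GPT-2 and XLM-RoBERTa cases) or fails outright (the final SentencePiece case, whose error flag is True) depends on the tokenizer implementation, which is why the test parametrizes all three tokenizers.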
