diff --git a/CHANGELOG_DEV.md b/CHANGELOG_DEV.md index 42e629f95..d30a6280d 100644 --- a/CHANGELOG_DEV.md +++ b/CHANGELOG_DEV.md @@ -153,4 +153,14 @@ https://github.com/Modalities/modalities/blob/0483362abac93e45850e56adaea7921e96 I added a switch case that maps to the respective byte sizes, when packing the data. -This adds some inefficiencies as a vobabulary size > 65536 already requires 4 bytes per token, effectively doubling the storage requirements. \ No newline at end of file +This adds some inefficiencies, as a vocabulary size > 65536 already requires 4 bytes per token, effectively doubling the storage requirements. + + +## PR #283: Bug fix: Only append eod token once when packing / tokenizing + +Some HF tokenizers, such as `xlm-roberta-large`, automatically add special tokens (e.g., the eod token) when encoding text, whereas others, such as `gpt2`, do not add special tokens. + +This side effect in the transformers library has led to the eod token being appended twice when tokenizing / packing our data. We added a check for this and now append the eod token only once: +https://github.com/Modalities/modalities/blob/1c1ccdc973283c45bc8c9fadf4d20f03e435cd04/src/modalities/dataloader/create_packed_data.py#L327-L330 + +Additionally, I added a script that verifies the consistency of the indexation and tokenization of a given JSONL file. We run the indexation and tokenization routines in modalities and compare the result to the same JSONL file tokenized by applying the HF tokenizer directly. \ No newline at end of file diff --git a/data/tokenizer/sentencepiece_dclm/en_32k_tokenizer.model b/data/tokenizer/sentencepiece_dclm/en_32k_tokenizer.model new file mode 100644 index 000000000..0be1dd03a Binary files /dev/null and b/data/tokenizer/sentencepiece_dclm/en_32k_tokenizer.model differ diff --git a/src/modalities/dataloader/create_packed_data.py b/src/modalities/dataloader/create_packed_data.py index 7dc5fce49..5984de182 100644 --- a/src/modalities/dataloader/create_packed_data.py +++ b/src/modalities/dataloader/create_packed_data.py @@ -63,8 +63,8 @@ def __init__( self.tokenizer = tokenizer self.eod_token = eod_token self._token_size_in_bytes = self._get_required_num_of_bytes_to_repr(self.tokenizer.vocab_size) - encoded_eod_token = self.tokenizer.get_token_id(self.eod_token) - self._encoded_eos_token_as_bytes = self._encoded_token_to_bytes(encoded_eod_token) + eod_token_id = self.tokenizer.get_token_id(self.eod_token) + self._encoded_eod_token_as_bytes = self._encoded_token_to_bytes(eod_token_id) self.jq_filter = jq.compile(jq_pattern) self._number_of_processes = number_of_processes self._reader = LargeFileLinesReader(src_path, index_path=index_path) # reads string with utf-8 encoding @@ -323,7 +323,10 @@ def _process_line(self, line: str, process_id: int) -> bytes: tokens = self.tokenizer.tokenize(jq_retrieved_text) if len(tokens) == 0: raise EmptySampleError("Received empty sample...") - return b"".join(map(self._encoded_token_to_bytes, tokens)) + self._encoded_eos_token_as_bytes + token_byte_string = b"".join(map(self._encoded_token_to_bytes, tokens)) + if not token_byte_string.endswith(self._encoded_eod_token_as_bytes): + token_byte_string = token_byte_string + self._encoded_eod_token_as_bytes + return token_byte_string class EmbeddedStreamData: diff --git a/src/modalities/tokenization/tokenizer_wrapper.py b/src/modalities/tokenization/tokenizer_wrapper.py index e9e778fc0..211f5801f 100644 --- a/src/modalities/tokenization/tokenizer_wrapper.py +++ b/src/modalities/tokenization/tokenizer_wrapper.py @@
-1,3 +1,4 @@ +import warnings from abc import ABC from typing import Optional @@ -62,6 +63,20 @@ def get_token_id(self, token: str) -> int: """ raise NotImplementedError + def is_special_token_id(self, token_id: int) -> bool: + """Returns whether a token ID is a special token ID. + + Args: + token_id (int): Token ID to check. + + Raises: + NotImplementedError: Must be implemented by a subclass. + + Returns: + bool: Flag whether the token ID is a special token ID. + """ + raise NotImplementedError + class PreTrainedHFTokenizer(TokenizerWrapper): """Wrapper for pretrained Hugging Face tokenizers.""" @@ -102,6 +117,7 @@ def __init__( self.max_length = max_length self.truncation = truncation self.padding = padding + self.special_token_ids = set(self.tokenizer.all_special_ids) @property def vocab_size(self) -> int: @@ -163,10 +179,23 @@ def get_token_id(self, token: str) -> int: int: Token ID. """ token_id = self.tokenizer.convert_tokens_to_ids(token) - if isinstance(token_id, list): + if not isinstance(token_id, int): raise ValueError("Token is not represented by a single token id!") + if token_id == self.tokenizer.unk_token_id: + warnings.warn(f"The provided eod token {token} has the same token id ({token_id}) as the unk token") return token_id + def is_special_token_id(self, token_id: int) -> bool: + """Returns whether a token ID is a special token ID. + + Args: + token_id (int): Token ID to check. + + Returns: + bool: Flag whether the token ID is a special token ID. + """ + return token_id in self.special_token_ids + class PreTrainedSPTokenizer(TokenizerWrapper): """Wrapper for pretrained SentencePiece tokenizers.""" @@ -189,8 +220,8 @@ def tokenize(self, text: str) -> list[int]: Returns: list[int]: List of token IDs. """ - tokens = self.tokenizer.encode(text) - return tokens + token_ids = self.tokenizer.Encode(text) + return token_ids def decode(self, token_ids: list[int]) -> str: """Decodes a list of token IDs into the original text. @@ -201,7 +232,7 @@ def decode(self, token_ids: list[int]) -> str: Returns: str: Decoded text. """ - decoded_text = self.tokenizer.decode(token_ids) + decoded_text = self.tokenizer.Decode(token_ids) return decoded_text @property @@ -226,6 +257,19 @@ def get_token_id(self, token: str) -> int: int: Token ID. """ piece_id = self.tokenizer.PieceToId(token) + if not isinstance(piece_id, int): + raise ValueError("Token cannot be represented by a single token id!") if piece_id == self.tokenizer.unk_id(): - raise ValueError("Token is not represented by a single token id!") + raise ValueError("Token cannot be represented by a single token id!") return piece_id + + def is_special_token_id(self, token_id: int) -> bool: + """Returns whether a token ID is a special token ID. + + Args: + token_id (int): Token ID to check. + + Returns: + bool: Flag whether the token ID is a special token ID.
+ """ + return self.tokenizer.IsControl(token_id) diff --git a/src/modalities/utils/verify_tokenization_consistency.py b/src/modalities/utils/verify_tokenization_consistency.py new file mode 100644 index 000000000..224a8cfee --- /dev/null +++ b/src/modalities/utils/verify_tokenization_consistency.py @@ -0,0 +1,205 @@ +import json +import os +import pickle +import tempfile +import warnings +from enum import Enum +from pathlib import Path +from typing import Callable + +import sentencepiece as spm +import tqdm +from transformers import AutoTokenizer + +from modalities.api import create_raw_data_index, pack_encoded_data +from modalities.dataloader.dataset import PackedMemMapDatasetBase + + +class TokenizerTypes(Enum): + sentence_piece = "sentence_piece" + hugging_face = "hugging_face" + + +def _run_tokenization( + src_path: Path, index_path: Path, pbin_path: Path, eod_token: str, tokenizer_config: dict, jq_pattern: str = ".text" +): + # create index + create_raw_data_index(src_path=src_path, index_path=index_path) + # run tokenization + num_cpus = os.cpu_count() + + tokenization_config_dict = { + "settings": { + "src_path": src_path, + "dst_path": pbin_path, + "index_path": index_path, + "jq_pattern": jq_pattern, + "num_cpus": num_cpus, + "eod_token": eod_token, + "processing_batch_size": 10, + "raw_samples_queue_size": 300, + "processed_samples_queue_size": 300, + }, + "tokenizer": {**tokenizer_config}, + } + + pack_encoded_data(config_dict=tokenization_config_dict) + + +def _verify_index(src_path: Path, index_path: Path): + with open(src_path, "rb") as f: + jsonl_binary_string = f.read() + + with open(src_path, "rb") as f: + binary_string_list = f.readlines() + + with open(src_path, "r", encoding="utf-8") as f: + string_list = f.readlines() + + with open(index_path, "rb") as f: + jsonl_index = pickle.load(f) + + assert ( + len(jsonl_binary_string.split(b"\n")) - int(jsonl_binary_string.endswith(b"\n")) + == len(binary_string_list) + == len(string_list) + == len(jsonl_index) + ) + + for i, (offset, length) in tqdm.tqdm(enumerate(jsonl_index), desc="Verifying index"): + # check that the index works correctly on the binary data + binary_string = binary_string_list[i] + if binary_string.endswith(b"\n"): + binary_string = binary_string[:-1] + assert jsonl_binary_string[offset : offset + length] == binary_string + + # check that string when encoded with utf-8 matches the binary data + string = string_list[i] + if string.endswith("\n"): + string = string[:-1] + assert jsonl_binary_string[offset : offset + length] == string.encode("utf-8") + + +def _verify_pbin( + src_path: Path, + pbin_path: Path, + eod_token_id: int, + tokenizer: Callable[[str], list[int]], + jsonl_text_key: str, +): + dataset = PackedMemMapDatasetBase(raw_data_path=pbin_path, sample_key="text", load_index=True) + + with open(src_path, "r", encoding="utf-8") as f: + string_list = f.readlines() + string_list_tokenized = [tokenizer(json.loads(string)[jsonl_text_key]) for string in string_list] + + for i in tqdm.tqdm(range(len(dataset)), desc="Verifying pbin"): + pbin_sample = dataset[i]["text"] + recomputed_sample = string_list_tokenized[i] + + # make sure that only the last token is the eod token + # and that the second last token is not the eod token + assert pbin_sample[-1] == eod_token_id + assert pbin_sample[-2] != eod_token_id + + # we need to check if tokenizer adds the eod token as + # some tokenizers don't add the eod token at the end of the string + # whereas modalities always adds the eod token at the end of the string + 
if recomputed_sample[-1] != eod_token_id: + if i == 0: + warnings.warn("The tokenizer does not add the eod token at the end of the string!") + assert len(pbin_sample) - 1 == len(recomputed_sample) + assert all(pbin_sample[:-1] == recomputed_sample) + else: + assert len(pbin_sample) == len(recomputed_sample) + assert all(pbin_sample == recomputed_sample) + + +def build_hf_tokenization_components(tokenizer_path_or_name: str, eod_token: str): + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path_or_name) + + def tokenizer_callable(text: str) -> list[int]: + return tokenizer(text, add_special_tokens=True, max_length=51200000, padding=False, truncation=False)[ + "input_ids" + ] + + tokenizer_config = { + "component_key": "tokenizer", + "variant_key": "pretrained_hf_tokenizer", + "config": { + "pretrained_model_name_or_path": tokenizer_path_or_name, + "padding": False, + "max_length": 51200000, + }, + } + + eod_token_id = tokenizer.convert_tokens_to_ids(eod_token) + return tokenizer_callable, tokenizer_config, eod_token_id + + +def build_sp_tokenization_components(tokenizer_path: Path, eod_token: str): + tokenizer = spm.SentencePieceProcessor() + tokenizer.Load(tokenizer_path) + + def tokenizer_callable(text: str) -> list[int]: + return tokenizer.Encode(text) + + tokenizer_config = { + "component_key": "tokenizer", + "variant_key": "pretrained_sp_tokenizer", + "config": { + "tokenizer_model_file": tokenizer_path, + }, + } + + eod_token_id = tokenizer.PieceToId(eod_token) + return tokenizer_callable, tokenizer_config, eod_token_id + + +def verify_tokenization_consistency( + src_path: Path, + eod_token: str, + eod_token_id: int, + tokenizer: Callable[[str], list[int]], + tokenizer_config: dict, + jsonl_text_key: str, +): + """Verifies that the indexation and tokenization are consistent. + This function applies the indexation and tokenization routines and then verifies + that the index always captures entire samples and that the tokens in the JSON + are correctly determined.
+ For an example verification, check out the test_end_to_end_indexation_and_tokenization_consistency test. + + Args: + src_path (Path): Path to the JSONL file + eod_token (str): The end-of-document token + eod_token_id (int): The token id of the end-of-document token + tokenizer (Callable[[str], list[int]]): Callable executing the tokenization + tokenizer_config (dict): Tokenizer config (same as used in the tokenization entry point) + jsonl_text_key (str): The key mapping to the text of interest in each JSON line + """ + # run indexation and tokenization + with tempfile.TemporaryDirectory() as tmp_dir: + index_path = Path(tmp_dir) / "index.idx" + pbin_path = Path(tmp_dir) / "data.pbin" + _run_tokenization( + src_path=src_path, + index_path=index_path, + pbin_path=pbin_path, + eod_token=eod_token, + tokenizer_config=tokenizer_config, + jq_pattern=f".{jsonl_text_key}", + ) + + # verify the index + _verify_index(src_path=src_path, index_path=index_path) + print("Index verified") + # verify the tokenized data + _verify_pbin( + src_path=src_path, + pbin_path=pbin_path, + eod_token_id=eod_token_id, + tokenizer=tokenizer, + jsonl_text_key=jsonl_text_key, + ) + print("Tokenization verified") diff --git a/tests/data/datasets/lorem_ipsum_long.jsonl b/tests/data/datasets/lorem_ipsum_long.jsonl index 6a3e4c01d..3ce5f9e28 100644 --- a/tests/data/datasets/lorem_ipsum_long.jsonl +++ b/tests/data/datasets/lorem_ipsum_long.jsonl @@ -497,4 +497,4 @@ {"text": "496 Lorem ipsum dolor sit amet, consetetur "} {"text": "497 Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum.
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet."} {"text": "498 Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. 
At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet."} -{"text": "499 Lorem ipsum dolor sit amet, consetetur sadipscing elitr"} \ No newline at end of file +{"text": "499 Lorem ipsum dolor sit amet, consetetur sadipscing elitr"} diff --git a/tests/data/datasets/lorem_ipsum_without_last_newline.jsonl b/tests/data/datasets/lorem_ipsum_without_last_newline.jsonl new file mode 100644 index 000000000..5409c175c --- /dev/null +++ b/tests/data/datasets/lorem_ipsum_without_last_newline.jsonl @@ -0,0 +1,9 @@ +{"text": "0 Lorem ipsum dolor sit amet, consetetur "} +{"text": "1 Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet."} +{"text": "2 Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. 
At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet."} +{"text": "3 Lorem ipsum dolor sit amet, consetetur sadipscing elitr"} +{"text": "4 Lorem ipsum dolor sit amet, consetetur "} +{"text": "5 Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet."} +{"text": "6 Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. 
Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet."} +{"text": "7 Lorem ipsum dolor sit amet, consetetur sadipscing elitr"} +{"text": "8 Lorem ipsum dolor sit amet, consetetur "} \ No newline at end of file diff --git a/tests/dataloader/test_end_to_end_indexation_and_tokenization.py b/tests/dataloader/test_end_to_end_indexation_and_tokenization.py new file mode 100644 index 000000000..f081c1657 --- /dev/null +++ b/tests/dataloader/test_end_to_end_indexation_and_tokenization.py @@ -0,0 +1,154 @@ +from collections import namedtuple +from pathlib import Path + +import pytest + +from modalities.utils.verify_tokenization_consistency import ( + TokenizerTypes, + build_hf_tokenization_components, + build_sp_tokenization_components, + verify_tokenization_consistency, +) + +TokenizerSettings = namedtuple("TokenizerSettings", "tokenizer_type tokenizer_name_or_path") +gpt2_settings = TokenizerSettings( + tokenizer_type=TokenizerTypes.hugging_face, + tokenizer_name_or_path="gpt2", +) +xlm_roberta_large_settings = TokenizerSettings( + tokenizer_type=TokenizerTypes.hugging_face, tokenizer_name_or_path="xlm-roberta-large" +) +sentence_piece_settings = TokenizerSettings( + tokenizer_type=TokenizerTypes.sentence_piece, + tokenizer_name_or_path="data/tokenizer/sentencepiece_dclm/en_32k_tokenizer.model", +) + + +@pytest.mark.parametrize( + "tokenizer_settings, src_path, jsonl_text_key, eod_token, expect_error, expected_warning", + [ + # without errors + # test with the actual eod token + (gpt2_settings, Path("tests/data/datasets/lorem_ipsum_long.jsonl"), "text", "<|endoftext|>", False, None), + (xlm_roberta_large_settings, Path("tests/data/datasets/lorem_ipsum_long.jsonl"), "text", "</s>", False, None), + (sentence_piece_settings, Path("tests/data/datasets/lorem_ipsum_long.jsonl"), "text", "</s>", False, None), + # without \n in the last line + ( + gpt2_settings, + Path("tests/data/datasets/lorem_ipsum_without_last_newline.jsonl"), + "text", + "<|endoftext|>", + False, + None, + ), + ( + xlm_roberta_large_settings, + Path("tests/data/datasets/lorem_ipsum_without_last_newline.jsonl"), + "text", + "</s>", + False, + None, + ), + ( + sentence_piece_settings, + Path("tests/data/datasets/lorem_ipsum_without_last_newline.jsonl"), + "text", + "</s>", + False, + None, + ), + (gpt2_settings, Path("tests/data/datasets/danish_test_dataset.jsonl"), "text", "<|endoftext|>", False, None), + ( + xlm_roberta_large_settings, + Path("tests/data/datasets/danish_test_dataset.jsonl"), + "text", + "</s>", + False, + None, + ), + (sentence_piece_settings, Path("tests/data/datasets/danish_test_dataset.jsonl"), "text", "</s>", False, None), + # we also accept tokens as eod token that are not the original eod token or any other special token + # A normal token such as "a" will pass through. It is the user's obligation to pick the correct eod token + # for a given tokenizer, since there is no generic way to query the true eod token + # across all tokenizer implementations!
+ (gpt2_settings, Path("tests/data/datasets/lorem_ipsum_long.jsonl"), "text", "a", False, None), + (xlm_roberta_large_settings, Path("tests/data/datasets/lorem_ipsum_long.jsonl"), "text", "a", False, None), + (sentence_piece_settings, Path("tests/data/datasets/lorem_ipsum_long.jsonl"), "text", "a", False, None), + # with errors / warnings + # eod token is not a single token + ( + gpt2_settings, + Path("tests/data/datasets/lorem_ipsum_long.jsonl"), + "text", + "abc123", + False, + "The provided eod token .* has the same token id (.*) as the unk token", + ), + ( + xlm_roberta_large_settings, + Path("tests/data/datasets/lorem_ipsum_long.jsonl"), + "text", + "abc123", + False, + "The provided eod token .* has the same token id (.*) as the unk token", + ), + # with errors + # eod token is not a single token + (sentence_piece_settings, Path("tests/data/datasets/lorem_ipsum_long.jsonl"), "text", "abc123", True, None), + ], +) +def test_end_to_end_indexation_and_tokenization_consistency( + tokenizer_settings: TokenizerSettings, + src_path: Path, + jsonl_text_key: str, + eod_token: str, + expect_error: bool, + expected_warning: str, +): + # hf + if tokenizer_settings.tokenizer_type == TokenizerTypes.hugging_face: + tokenizer_callable, tokenizer_config, eod_token_id = build_hf_tokenization_components( + tokenizer_path_or_name=tokenizer_settings.tokenizer_name_or_path, + eod_token=eod_token, + ) + print(f"{eod_token_id=}") + + # sentence piece + elif tokenizer_settings.tokenizer_type == TokenizerTypes.sentence_piece: + tokenizer_callable, tokenizer_config, eod_token_id = build_sp_tokenization_components( + tokenizer_path=tokenizer_settings.tokenizer_name_or_path, + eod_token=eod_token, + ) + + else: + raise ValueError(f"Tokenizer type {tokenizer_settings.tokenizer_type} not supported!") + + if expect_error: + with pytest.raises(Exception): + verify_tokenization_consistency( + src_path=src_path, + eod_token=eod_token, + eod_token_id=eod_token_id, + tokenizer=tokenizer_callable, + tokenizer_config=tokenizer_config, + jsonl_text_key=jsonl_text_key, + ) + elif expected_warning is not None: + with pytest.warns(UserWarning, match=expected_warning): + verify_tokenization_consistency( + src_path=src_path, + eod_token=eod_token, + eod_token_id=eod_token_id, + tokenizer=tokenizer_callable, + tokenizer_config=tokenizer_config, + jsonl_text_key=jsonl_text_key, + ) + else: + verify_tokenization_consistency( + src_path=src_path, + eod_token=eod_token, + eod_token_id=eod_token_id, + tokenizer=tokenizer_callable, + tokenizer_config=tokenizer_config, + jsonl_text_key=jsonl_text_key, + ) diff --git a/tests/dataloader/test_large_file_lines_reader.py b/tests/dataloader/test_large_file_lines_reader.py index 47afd9074..1234c5edd 100644 --- a/tests/dataloader/test_large_file_lines_reader.py +++ b/tests/dataloader/test_large_file_lines_reader.py @@ -27,7 +27,7 @@ def create_dummy_data(tmpdir_path: Path, byte_content: bytes) -> Path: b"It also includes malformatted json chars, like\n{{\nbut does not come with a trailing newline char...", ], ) -def test_index_creation(tmpdir, dummy_binary_content: bytes): +def test_index_creation(tmpdir: Path, dummy_binary_content: bytes): # dumps the dummy content to a file # e.g. the line "ø This is \na du" is represented by the hex string: # c3 b8 20 54 68 69 73 20 69 73 20 0a 61 20 64 75