Commit
test: save tokenized input
mali-git committed Jan 21, 2025
1 parent 906a264 commit 5890220
Showing 2 changed files with 36 additions and 9 deletions.
14 changes: 14 additions & 0 deletions tests/conftest.py
@@ -1,6 +1,7 @@
 import dataclasses
 import os
 import pickle
+import string
 from pathlib import Path
 from unittest.mock import MagicMock
 
@@ -12,6 +13,7 @@
 from modalities.checkpointing.checkpoint_saving import CheckpointSaving
 from modalities.config.config import load_app_config_dict
 from modalities.dataloader.create_index import IndexGenerator
+from modalities.dataloader.create_packed_data import PackedDataGenerator
 from modalities.dataloader.dataloader import LLMDataLoader
 from modalities.dataloader.large_file_lines_reader import LargeFileLinesReader
 from modalities.evaluator import Evaluator
@@ -223,3 +225,15 @@ def torch_distributed_cleanup():
     else:
         # see https://pytorch.org/docs/2.4/_modules/torch/cuda.html#device_count
         torch.cuda._cached_device_count = None
+
+
+@pytest.fixture
+def encoding_set_up():
+    # Define the vocabulary
+    vocabulary = {char: idx for idx, char in enumerate(string.ascii_lowercase)}
+
+    # Ensure num_bytes_per_token is valid
+    num_bytes_per_token = PackedDataGenerator._get_required_num_of_bytes_to_repr(len(vocabulary))
+    assert num_bytes_per_token == 1  # This assertion will fail within the test framework if incorrect
+
+    return vocabulary, num_bytes_per_token
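
Note: the fixture builds a 26-entry vocabulary (one id per lowercase ASCII letter) and asks PackedDataGenerator._get_required_num_of_bytes_to_repr for the byte width needed to store those ids; the fixture itself asserts that this is 1. A minimal standalone sketch of why a single byte suffices, making no assumption about the helper's internals, only about the vocabulary defined above:

import string

# 26 lowercase letters -> token ids 0..25
vocabulary = {char: idx for idx, char in enumerate(string.ascii_lowercase)}
assert len(vocabulary) == 26
assert max(vocabulary.values()) < 2**8  # every id fits into one unsigned byte, so num_bytes_per_token == 1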
31 changes: 22 additions & 9 deletions tests/dataloader/test_shuffle_tokenized_data.py
@@ -4,10 +4,21 @@
 from modalities.dataloader.shuffle_tokenized_data import _process_batch, shuffle_tokenized_data
 
 
-def test_process_batch_with_embedded_stream_with_memmap(tmp_path):
+def _tokenize(text: str, vocabulary: dict[str, int]) -> list[int]:
+    text = text.lower()
+    return [vocabulary[char] for char in text]
+
+
+def _convert_tokens_to_bytes(tokens: list[int], num_bytes_per_token: int) -> bytes:
+    return b"".join([token.to_bytes(num_bytes_per_token, byteorder="little", signed=False) for token in tokens])
+
+
+def test_process_batch(tmp_path, encoding_set_up):
+    vocabulary, num_bytes_per_token = encoding_set_up
     # Create a temporary file
     file_path = tmp_path / "test_data.pbin"
-    data = b"IloveModalities"  # Example data
+    data = _tokenize(text="IloveModalities", vocabulary=vocabulary)
+    data = _convert_tokens_to_bytes(data, num_bytes_per_token=num_bytes_per_token)
 
     with open(file_path, "wb") as f:
         f.write(data)
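
Note: for reference, a standalone run of the two helpers introduced above (copied verbatim) on a short input, assuming the 26-letter vocabulary from the encoding_set_up fixture and therefore num_bytes_per_token == 1:

import string

vocabulary = {char: idx for idx, char in enumerate(string.ascii_lowercase)}


def _tokenize(text: str, vocabulary: dict[str, int]) -> list[int]:
    text = text.lower()
    return [vocabulary[char] for char in text]


def _convert_tokens_to_bytes(tokens: list[int], num_bytes_per_token: int) -> bytes:
    return b"".join([token.to_bytes(num_bytes_per_token, byteorder="little", signed=False) for token in tokens])


tokens = _tokenize(text="abc", vocabulary=vocabulary)  # [0, 1, 2]
packed = _convert_tokens_to_bytes(tokens, num_bytes_per_token=1)
assert packed == b"\x00\x01\x02"  # one little-endian byte per token; "IloveModalities" packs to 15 bytes the same way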
@@ -23,25 +34,27 @@ def test_process_batch_with_embedded_stream_with_memmap(tmp_path):
     new_data, new_index = _process_batch(batch=batch, data=in_memory_data, start_position=0)
 
     # Validate the result
-    expected_data = b"IloveModalities"
-    expected_index = [(0, 1), (1, 4), (5, 10)]
+    expected_data = data
+    expected_index = batch
     assert (new_data, new_index) == (expected_data, expected_index)
 
 
-def test_shuffle_tokenized_data(tmp_path):
+def test_shuffle_tokenized_data(tmp_path, encoding_set_up):
+    vocabulary, num_bytes_per_token = encoding_set_up
     # Create test input data
-    data = b"IloveModalities"
+    data = _tokenize(text="IloveModalities", vocabulary=vocabulary)
+    data = _convert_tokens_to_bytes(data, num_bytes_per_token=num_bytes_per_token)
     data_section_length_as_bytes = len(data).to_bytes(
         EmbeddedStreamData.DATA_SECTION_LENGTH_IN_BYTES, byteorder="little"
     )
-    token_size_in_bytes = 4
-    token_size_as_bytes = token_size_in_bytes.to_bytes(
+    token_size_as_bytes = num_bytes_per_token.to_bytes(
         EmbeddedStreamData.TOKEN_SIZE_DESCRIPTOR_LENGTH_IN_BYTES, byteorder="little"
     )
     index = [(0, 1), (1, 4), (5, 10)]
 
     # Prepare the input file
     input_path = tmp_path / "input.pbin"
+    output_path = tmp_path / "output.pbin"
     with input_path.open("wb") as f:
         f.write(data_section_length_as_bytes)
         f.write(token_size_as_bytes)
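
Note: the test writes a minimal .pbin-style file by hand: the length of the data section, then the token size, each as a fixed-width little-endian integer, followed by the token bytes. A self-contained sketch of that layout; the header widths below (8 and 4 bytes) are assumptions for illustration, whereas the test takes them from EmbeddedStreamData:

import tempfile
from pathlib import Path

DATA_SECTION_LENGTH_IN_BYTES = 8  # assumed width of the data-section-length header
TOKEN_SIZE_DESCRIPTOR_LENGTH_IN_BYTES = 4  # assumed width of the token-size descriptor

data = b"\x08\x0b\x0e\x15\x04"  # token bytes for "ilove" under the 26-letter vocabulary
num_bytes_per_token = 1

with tempfile.TemporaryDirectory() as tmp:
    path = Path(tmp) / "input.pbin"
    with path.open("wb") as f:
        f.write(len(data).to_bytes(DATA_SECTION_LENGTH_IN_BYTES, byteorder="little"))
        f.write(num_bytes_per_token.to_bytes(TOKEN_SIZE_DESCRIPTOR_LENGTH_IN_BYTES, byteorder="little"))
        f.write(data)

    # Read the header back to confirm the layout.
    raw = path.read_bytes()
    data_len = int.from_bytes(raw[:DATA_SECTION_LENGTH_IN_BYTES], byteorder="little")
    token_size_end = DATA_SECTION_LENGTH_IN_BYTES + TOKEN_SIZE_DESCRIPTOR_LENGTH_IN_BYTES
    token_size = int.from_bytes(raw[DATA_SECTION_LENGTH_IN_BYTES:token_size_end], byteorder="little")
    assert (data_len, token_size) == (len(data), num_bytes_per_token)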
@@ -51,7 +64,7 @@ def test_shuffle_tokenized_data(tmp_path):
     for batch_size in [1, 2, 3]:
         # Call shuffle_tokenized_data
         output_path = tmp_path / "input_shuffled.pbin"
-        shuffle_tokenized_data(input_path, batch_size=batch_size)
+        shuffle_tokenized_data(input_data_path=input_path, output_data_path=output_path, batch_size=batch_size)
 
         # Validate the output
         assert output_path.is_file()
