diff --git a/src/delphi/dataset/dataset.py b/src/delphi/dataset/dataset.py
new file mode 100644
index 00000000..41a43a6f
--- /dev/null
+++ b/src/delphi/dataset/dataset.py
@@ -0,0 +1,16 @@
+from datasets import load_dataset
+from tqdm.auto import tqdm
+
+
+def load_clean_dataset(split: str, tokenized: bool = False) -> list[str]:
+    # checking just startswith, because you can include a slice like "train[:1000]"
+    assert split.startswith("train") or split.startswith("validation")
+    hf_ds = load_dataset(
+        f"jbrinkma/tinystories-v2-clean{'-tokenized' if tokenized else ''}",
+        split=split,
+    )
+    dataset = []
+    # hf_ds technically isn't guaranteed to be subscriptable, but it is in this case
+    for sample_txt in tqdm(hf_ds["tokens" if tokenized else "text"]):  # type: ignore
+        dataset.append(sample_txt)
+    return dataset
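
For context, a minimal usage sketch of the loader added above (not part of the diff). The dataset names and split checks come from the code itself; the slice syntax is Hugging Face's standard split-slicing convention, and the "tokens" column of the tokenized variant presumably holds token ids rather than strings.

    from delphi.dataset.dataset import load_clean_dataset

    # First 1000 raw stories from the training split, via a split slice.
    stories = load_clean_dataset("train[:1000]")
    print(stories[0])

    # Pre-tokenized validation samples (the "tokens" column instead of "text").
    tokens = load_clean_dataset("validation", tokenized=True)

Note that passing split=split to load_dataset is what makes both the slice syntax and the column indexing work: without it, load_dataset returns a DatasetDict keyed by split name, and hf_ds["text"] would raise a KeyError.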