Skip to content

Commit

Permalink
Add load_clean_dataset function to dataset.py
Browse files Browse the repository at this point in the history
  • Loading branch information
Jai committed Feb 1, 2024
1 parent cb3976e commit 55a1c98
Showing 1 changed file with 15 additions and 0 deletions.
15 changes: 15 additions & 0 deletions src/delphi/dataset/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from datasets import load_dataset
from tqdm.auto import tqdm


def load_clean_dataset(split: str, tokenized: bool = False) -> list[str]:
    """Load one split of the cleaned TinyStories v2 dataset into a list.

    Args:
        split: Split specifier forwarded to ``load_dataset``. It must start
            with ``"train"`` or ``"validation"``; a slice suffix such as
            ``"train[:1000]"`` is allowed.
        tokenized: When true, load the ``-tokenized`` variant of the dataset
            and read its ``"tokens"`` column instead of ``"text"``.

    Returns:
        Every sample of the selected column, in dataset order. With
        ``tokenized=True`` the items come from the ``"tokens"`` column
        (presumably token-id sequences rather than strings -- confirm
        against the dataset card).

    Raises:
        AssertionError: If ``split`` has an unsupported prefix.
    """
    # Only the prefix is checked, because the specifier may carry a slice
    # like "train[:1000]".
    assert split.startswith(("train", "validation")), (
        f"split must start with 'train' or 'validation', got {split!r}"
    )
    repo_suffix = "-tokenized" if tokenized else ""
    # The split has to be forwarded: without it load_dataset returns a
    # mapping keyed by split name, so the column lookup below would fail and
    # any requested slice would be ignored.
    hf_ds = load_dataset(
        f"jbrinkma/tinystories-v2-clean{repo_suffix}",
        split=split,
    )
    column = "tokens" if tokenized else "text"
    # hf_ds technically isn't guaranteed to be subscriptable, but it is in
    # this case. tqdm only adds a progress bar while the column is copied.
    return list(tqdm(hf_ds[column]))  # type: ignore

0 comments on commit 55a1c98

Please sign in to comment.