Add load_clean_dataset function to dataset.py

delphi-suite · Feb 1, 2024 · 55a1c98 · 55a1c98
1 parent cb3976e
commit 55a1c98
Showing 1 changed file with 15 additions and 0 deletions.
diff --git a/src/delphi/dataset/dataset.py b/src/delphi/dataset/dataset.py
@@ -0,0 +1,15 @@
+from datasets import load_dataset
+from tqdm.auto import tqdm
+
+
+def load_clean_dataset(split: str, tokenized: bool = False) -> list[str]:
+    """Load one split of the clean TinyStories dataset into a list.
+
+    Args:
+        split: Split specifier forwarded to ``load_dataset``. Must start with
+            "train" or "validation"; a slice suffix such as "train[:1000]"
+            is allowed.
+        tokenized: If true, load the "-tokenized" variant of the dataset and
+            return its "tokens" column instead of the "text" column.
+
+    Returns:
+        A list with one entry per sample of the selected column.
+        NOTE(review): with ``tokenized=True`` the entries are presumably
+        token-id sequences rather than ``str`` -- confirm and widen the
+        return annotation if so.
+
+    Raises:
+        AssertionError: If ``split`` has an unsupported prefix.
+    """
+    # checking just startswith, because you can include slice like "train[:1000]"
+    assert split.startswith(("train", "validation")), f"unsupported split: {split!r}"
+    hf_ds = load_dataset(
+        f"jbrinkma/tinystories-v2-clean{'-tokenized' if tokenized else ''}",
+        # Without an explicit split, load_dataset returns a DatasetDict keyed by
+        # split name, so the column lookup below would raise KeyError.
+        split=split,
+    )
+    column = "tokens" if tokenized else "text"
+    # hf_ds technically isn't guaranteed to be subscriptable, but it is in this case
+    return list(tqdm(hf_ds[column]))  # type: ignore