From 55a1c988adf24805332f6fd2c186f57e60a7cc6b Mon Sep 17 00:00:00 2001
From: Jai
Date: Wed, 31 Jan 2024 16:39:38 -0800
Subject: [PATCH 1/3] Add load_clean_dataset function to dataset.py

---
 src/delphi/dataset/dataset.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 src/delphi/dataset/dataset.py

diff --git a/src/delphi/dataset/dataset.py b/src/delphi/dataset/dataset.py
new file mode 100644
index 00000000..41a43a6f
--- /dev/null
+++ b/src/delphi/dataset/dataset.py
@@ -0,0 +1,15 @@
+from datasets import load_dataset
+from tqdm.auto import tqdm
+
+
+def load_clean_dataset(split: str, tokenized: bool = False) -> list[str]:
+    # checking just startswith, because you can include slice like "train[:1000]"
+    assert split.startswith("train") or split.startswith("validation")
+    hf_ds = load_dataset(
+        f"jbrinkma/tinystories-v2-clean{'-tokenized' if tokenized else ''}"
+    )
+    dataset = []
+    # hf_ds technically isn't guaranteed to be subscriptable, but it is in this case
+    for sample_txt in tqdm(hf_ds["tokens" if tokenized else "text"]):  # type: ignore
+        dataset.append(sample_txt)
+    return dataset

From 6d4ec8289f7ccfb36de8892f91ecf5b059f605df Mon Sep 17 00:00:00 2001
From: Jai
Date: Wed, 31 Jan 2024 16:42:02 -0800
Subject: [PATCH 2/3] Porting jettjaniak/tinyevals#19

---
 src/delphi/dataset/dataset.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/delphi/dataset/dataset.py b/src/delphi/dataset/dataset.py
index 41a43a6f..0590ff2a 100644
--- a/src/delphi/dataset/dataset.py
+++ b/src/delphi/dataset/dataset.py
@@ -6,10 +6,11 @@ def load_clean_dataset(split: str, tokenized: bool = False) -> list[str]:
     # checking just startswith, because you can include slice like "train[:1000]"
     assert split.startswith("train") or split.startswith("validation")
     hf_ds = load_dataset(
-        f"jbrinkma/tinystories-v2-clean{'-tokenized' if tokenized else ''}"
+        f"jbrinkma/tinystories-v2-clean{'-tokenized' if tokenized else ''}",
+        split=split,
     )
     dataset = []
     # hf_ds technically isn't guaranteed to be subscriptable, but it is in this case
-    for sample_txt in tqdm(hf_ds["tokens" if tokenized else "text"]):  # type: ignore
-        dataset.append(sample_txt)
+    for sample in tqdm(hf_ds["tokens" if tokenized else "text"]):  # type: ignore
+        dataset.append(sample)
     return dataset

From c682793997e82f835f34ff6b9a419d54351d3355 Mon Sep 17 00:00:00 2001
From: Jai
Date: Wed, 31 Jan 2024 16:44:07 -0800
Subject: [PATCH 3/3] Porting jettjaniak/tinyevals#23

---
 src/delphi/dataset/dataset.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/delphi/dataset/dataset.py b/src/delphi/dataset/dataset.py
index 0590ff2a..6b4bad26 100644
--- a/src/delphi/dataset/dataset.py
+++ b/src/delphi/dataset/dataset.py
@@ -1,16 +1,21 @@
 from datasets import load_dataset
 from tqdm.auto import tqdm
 
-
 def load_clean_dataset(split: str, tokenized: bool = False) -> list[str]:
     # checking just startswith, because you can include slice like "train[:1000]"
     assert split.startswith("train") or split.startswith("validation")
-    hf_ds = load_dataset(
-        f"jbrinkma/tinystories-v2-clean{'-tokenized' if tokenized else ''}",
-        split=split,
-    )
+    hf_ds = load_dataset(f"jbrinkma/tinystories-v2-clean{'-tokenized' if tokenized else ''}", split=split)
     dataset = []
     # hf_ds technically isn't guaranteed to be subscriptable, but it is in this case
     for sample in tqdm(hf_ds["tokens" if tokenized else "text"]):  # type: ignore
         dataset.append(sample)
     return dataset
+
+def token_map(tokenized_dataset: list[list[int]]) -> dict[int, list[tuple[int, int]]]:
+    mapping = {}
+
+    for prompt_idx, prompt in enumerate(tokenized_dataset):
+        for token_idx, token in enumerate(prompt):
+            mapping.setdefault(token, []).append((prompt_idx, token_idx))
+
+    return mapping
\ No newline at end of file
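Usage sketch (not part of the patch series): a minimal example of how the two helpers added above compose. The module path, function names, and the jbrinkma/tinystories-v2-clean-tokenized dataset name come from the diffs; the workflow itself is an assumption, not something the patches prescribe.

    from delphi.dataset.dataset import load_clean_dataset, token_map

    # Slices pass the startswith() check in load_clean_dataset, so part
    # of a split can be loaded; "validation[:100]" is an example value.
    tokenized_dataset = load_clean_dataset("validation[:100]", tokenized=True)

    # Invert the dataset: token id -> every (prompt_idx, token_idx) occurrence.
    mapping = token_map(tokenized_dataset)

    # All positions where token id 42 occurs, e.g. [(3, 17), (9, 0), ...]
    positions = mapping.get(42, [])

Note that with tokenized=True, load_clean_dataset yields the "tokens" column, i.e. a list of token-id lists rather than list[str], which is the shape token_map expects; the declared return annotation is looser than the runtime behavior.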