From 55a1c988adf24805332f6fd2c186f57e60a7cc6b Mon Sep 17 00:00:00 2001
From: Jai
Date: Wed, 31 Jan 2024 16:39:38 -0800
Subject: [PATCH 1/3] Add load_clean_dataset function to dataset.py

---
 src/delphi/dataset/dataset.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 src/delphi/dataset/dataset.py

diff --git a/src/delphi/dataset/dataset.py b/src/delphi/dataset/dataset.py
new file mode 100644
index 00000000..41a43a6f
--- /dev/null
+++ b/src/delphi/dataset/dataset.py
@@ -0,0 +1,15 @@
+from datasets import load_dataset
+from tqdm.auto import tqdm
+
+
+def load_clean_dataset(split: str, tokenized: bool = False) -> list[str]:
+    # checking just startswith, because you can include slice like "train[:1000]"
+    assert split.startswith("train") or split.startswith("validation")
+    hf_ds = load_dataset(
+        f"jbrinkma/tinystories-v2-clean{'-tokenized' if tokenized else ''}"
+    )
+    dataset = []
+    # hf_ds technically isn't guaranteed to be subscriptable, but it is in this case
+    for sample_txt in tqdm(hf_ds["tokens" if tokenized else "text"]):  # type: ignore
+        dataset.append(sample_txt)
+    return dataset

From 6d4ec8289f7ccfb36de8892f91ecf5b059f605df Mon Sep 17 00:00:00 2001
From: Jai
Date: Wed, 31 Jan 2024 16:42:02 -0800
Subject: [PATCH 2/3] Porting jettjaniak/tinyevals#19

---
 src/delphi/dataset/dataset.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/delphi/dataset/dataset.py b/src/delphi/dataset/dataset.py
index 41a43a6f..0590ff2a 100644
--- a/src/delphi/dataset/dataset.py
+++ b/src/delphi/dataset/dataset.py
@@ -6,10 +6,11 @@ def load_clean_dataset(split: str, tokenized: bool = False) -> list[str]:
     # checking just startswith, because you can include slice like "train[:1000]"
     assert split.startswith("train") or split.startswith("validation")
     hf_ds = load_dataset(
-        f"jbrinkma/tinystories-v2-clean{'-tokenized' if tokenized else ''}"
+        f"jbrinkma/tinystories-v2-clean{'-tokenized' if tokenized else ''}",
+        split=split,
     )
     dataset = []
     # hf_ds technically isn't guaranteed to be subscriptable, but it is in this case
-    for sample_txt in tqdm(hf_ds["tokens" if tokenized else "text"]):  # type: ignore
-        dataset.append(sample_txt)
+    for sample in tqdm(hf_ds["tokens" if tokenized else "text"]):  # type: ignore
+        dataset.append(sample)
     return dataset

From c682793997e82f835f34ff6b9a419d54351d3355 Mon Sep 17 00:00:00 2001
From: Jai
Date: Wed, 31 Jan 2024 16:44:07 -0800
Subject: [PATCH 3/3] Porting jettjaniak/tinyevals#23

---
 src/delphi/dataset/dataset.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/delphi/dataset/dataset.py b/src/delphi/dataset/dataset.py
index 0590ff2a..6b4bad26 100644
--- a/src/delphi/dataset/dataset.py
+++ b/src/delphi/dataset/dataset.py
@@ -1,16 +1,21 @@
 from datasets import load_dataset
 from tqdm.auto import tqdm
 
-
 def load_clean_dataset(split: str, tokenized: bool = False) -> list[str]:
     # checking just startswith, because you can include slice like "train[:1000]"
     assert split.startswith("train") or split.startswith("validation")
-    hf_ds = load_dataset(
-        f"jbrinkma/tinystories-v2-clean{'-tokenized' if tokenized else ''}",
-        split=split,
-    )
+    hf_ds = load_dataset(f"jbrinkma/tinystories-v2-clean{'-tokenized' if tokenized else ''}", split=split)
     dataset = []
     # hf_ds technically isn't guaranteed to be subscriptable, but it is in this case
     for sample in tqdm(hf_ds["tokens" if tokenized else "text"]):  # type: ignore
         dataset.append(sample)
     return dataset
+
+def token_map(tokenized_dataset: list[list[int]]) -> dict[int, list[tuple[int, int]]]:
+    mapping = {}
+
+    for prompt_idx, prompt in enumerate(tokenized_dataset):
+        for token_idx, token in enumerate(prompt):
+            mapping.setdefault(token, []).append((prompt_idx, token_idx))
+
+    return mapping
\ No newline at end of file
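Usage sketch (not part of the patch series): a minimal example of how the two helpers added above compose. The module path, function names, and the jbrinkma/tinystories-v2-clean-tokenized dataset name come from the diffs; the workflow itself is an assumption, not something the patches prescribe.

    from delphi.dataset.dataset import load_clean_dataset, token_map

    # Slices pass the startswith() check in load_clean_dataset, so part
    # of a split can be loaded; "validation[:100]" is an example value.
    tokenized_dataset = load_clean_dataset("validation[:100]", tokenized=True)

    # Invert the dataset: token id -> every (prompt_idx, token_idx) occurrence.
    mapping = token_map(tokenized_dataset)

    # All positions where token id 42 occurs, e.g. [(3, 17), (9, 0), ...]
    positions = mapping.get(42, [])

Note that with tokenized=True, load_clean_dataset yields the "tokens" column, i.e. a list of token-id lists rather than list[str], which is the shape token_map expects; the declared return annotation is looser than the runtime behavior.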