From 48341cf8ff40d012b3e597228c83c5b978ae1298 Mon Sep 17 00:00:00 2001
From: Kamyar Salahi
Date: Mon, 21 Oct 2024 14:04:11 -0700
Subject: [PATCH 1/4] Adding set values for input / output

---
 config/gpt2_small_fast_supervised.yaml |  2 ++
 src/levanter/data/text.py              | 14 ++++++++++----
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/config/gpt2_small_fast_supervised.yaml b/config/gpt2_small_fast_supervised.yaml
index 56ce7ea36..d71e1267e 100644
--- a/config/gpt2_small_fast_supervised.yaml
+++ b/config/gpt2_small_fast_supervised.yaml
@@ -16,6 +16,8 @@ supervised_data:
   validation_urls:
     - "gs://marin-us-central2/benchmarks/mmlu/mmlu-*-dev-evaluation.jsonl.gz"
   cache_dir: "gs://marin-us-central2/benchmarks/tokenized-gpt2/mmlu/"
+  input_field: "input"
+  output_field: "output"
 model:
   type: gpt2
   hidden_dim: 768
diff --git a/src/levanter/data/text.py b/src/levanter/data/text.py
index a1e20384f..c16676410 100644
--- a/src/levanter/data/text.py
+++ b/src/levanter/data/text.py
@@ -570,13 +570,16 @@ class LMSupervisedDatasetConfig:
     """tags for the dataset. Typically the name of the dataset in the config will be added as a tag as well"""

     name: Optional[str] = None  # name for hf dataset
+    input_field: str = "prompt" # name of the input field in the jsonl file
+    output_field: str = "response" # name of the output field in the jsonl file
+
     validation_urls: List[str] = ()  # type:ignore


-def preprocess_supervised_example(batch, tokenizer: PreTrainedTokenizerBase):
-    sources = [example["input"] for example in batch]
+def preprocess_supervised_example(batch, tokenizer: PreTrainedTokenizerBase, input_field: str, output_field: str) -> dict:
+    sources = [example[input_field] for example in batch]

-    targets = [f"{example['output']}" for example in batch]
+    targets = [f"{example[output_field]}" for example in batch]
     # TODO: this seems pretty wasteful since you end up tokenizing twice, but it's how alpaca does it.
     examples = [s + t for s, t in zip(sources, targets)]
     sources_tokenized = tokenizer(sources, padding=False, truncation=True)
@@ -623,9 +626,12 @@ def mk_supervised_dataset(config: LMSupervisedDatasetConfig, tokenizer: PreTrain
         validation_urls = [url for url_pat in config.validation_urls for url in fsspec_expand_glob(url_pat)]
         dataset = levanter.data.datasource_from_jsonl(validation_urls)

+    input_field = config.input_field
+    output_field = config.output_field
+
     output_exemplar = {"input_ids": np.zeros((0,), dtype=np.int32), "sources_len": np.zeros((), dtype=np.int32)}

-    dataset = dataset.map_batches(lambda ex: preprocess_supervised_example(ex, tokenizer), batch_size=128, num_cpus=num_cpus_used_by_tokenizer(tokenizer), output_exemplar=output_exemplar)  # type: ignore
+    dataset = dataset.map_batches(lambda ex: preprocess_supervised_example(ex, tokenizer, input_field, output_field), batch_size=128, num_cpus=num_cpus_used_by_tokenizer(tokenizer), output_exemplar=output_exemplar)  # type: ignore
     dataset = dataset.build_or_load_cache(config.cache_dir, await_finished=True)  # type: ignore
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token

From b7bbc4060d3828fc3436a451ac00da585eb9b55b Mon Sep 17 00:00:00 2001
From: Kamyar Salahi
Date: Mon, 21 Oct 2024 14:43:39 -0700
Subject: [PATCH 2/4] Making linter happy

---
 src/levanter/data/text.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/levanter/data/text.py b/src/levanter/data/text.py
index c16676410..a362f27b4 100644
--- a/src/levanter/data/text.py
+++ b/src/levanter/data/text.py
@@ -570,13 +570,15 @@ class LMSupervisedDatasetConfig:
     """tags for the dataset. Typically the name of the dataset in the config will be added as a tag as well"""

     name: Optional[str] = None  # name for hf dataset
-    input_field: str = "prompt" # name of the input field in the jsonl file
-    output_field: str = "response" # name of the output field in the jsonl file
+    input_field: str = "prompt"  # name of the input field in the jsonl file
+    output_field: str = "response"  # name of the output field in the jsonl file

     validation_urls: List[str] = ()  # type:ignore


-def preprocess_supervised_example(batch, tokenizer: PreTrainedTokenizerBase, input_field: str, output_field: str) -> dict:
+def preprocess_supervised_example(
+    batch, tokenizer: PreTrainedTokenizerBase, input_field: str, output_field: str
+) -> dict:
     sources = [example[input_field] for example in batch]

     targets = [f"{example[output_field]}" for example in batch]

From 843050c27775ac2ba62f6d5aad0c33386274ff1b Mon Sep 17 00:00:00 2001
From: Kamyar Salahi
Date: Mon, 21 Oct 2024 17:07:59 -0700
Subject: [PATCH 3/4] Fixing tests

---
 tests/test_supervised.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_supervised.py b/tests/test_supervised.py
index e1d9098d2..b8bec4f45 100644
--- a/tests/test_supervised.py
+++ b/tests/test_supervised.py
@@ -18,7 +18,7 @@ def test_supervised_eval():
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token

-    output = preprocess_supervised_example(examples, tokenizer)
+    output = preprocess_supervised_example(examples, tokenizer, "input", "output")
     assert len(output["input_ids"][0]) == output["sources_len"][0] + 1

     ex = {
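[Note] For reviewers, a minimal usage sketch (not part of the patch series) of calling the reworked preprocess_supervised_example with non-default field names, in the spirit of tests/test_supervised.py. The "question"/"answer" keys and the gpt2 tokenizer are illustrative assumptions; the config defaults remain "prompt"/"response".

    # Hypothetical usage sketch: the "question"/"answer" keys are invented
    # for illustration; the defaults are "prompt"/"response".
    from transformers import AutoTokenizer

    from levanter.data.text import preprocess_supervised_example

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    batch = [{"question": "What is the capital of France? ", "answer": "Paris"}]
    out = preprocess_supervised_example(batch, tokenizer, "question", "answer")
    # out["input_ids"] holds the prompt+response token ids for each example;
    # out["sources_len"] records how many of those tokens belong to the prompt.
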
From b1765dfa10d02602fbf717e284384b3271836444 Mon Sep 17 00:00:00 2001
From: Kamyar Salahi
Date: Mon, 21 Oct 2024 17:25:36 -0700
Subject: [PATCH 4/4] Removing redundancy

---
 src/levanter/data/text.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/levanter/data/text.py b/src/levanter/data/text.py
index a362f27b4..70c1fe4b3 100644
--- a/src/levanter/data/text.py
+++ b/src/levanter/data/text.py
@@ -581,7 +581,7 @@
 ) -> dict:
     sources = [example[input_field] for example in batch]

-    targets = [f"{example[output_field]}" for example in batch]
+    targets = [example[output_field] for example in batch]
     # TODO: this seems pretty wasteful since you end up tokenizing twice, but it's how alpaca does it.
     examples = [s + t for s, t in zip(sources, targets)]
     sources_tokenized = tokenizer(sources, padding=False, truncation=True)
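
[Note] On why the final patch is safe: an f-string wrapper around a single value has str() semantics (it calls format(x, "")), so for string-valued JSONL fields it was a no-op; a non-string field would previously have been coerced to str. A minimal illustration:

    # f"{x}" is equivalent to format(x, ""), i.e. str() semantics, so for a
    # value that is already a string the wrapper added nothing.
    example = {"output": "Paris"}
    assert f"{example['output']}" == str(example["output"]) == example["output"]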