Adding set values for input / output #774

Merged: 4 commits, Oct 22, 2024
2 changes: 2 additions & 0 deletions config/gpt2_small_fast_supervised.yaml
@@ -16,6 +16,8 @@ supervised_data:
   validation_urls:
     - "gs://marin-us-central2/benchmarks/mmlu/mmlu-*-dev-evaluation.jsonl.gz"
   cache_dir: "gs://marin-us-central2/benchmarks/tokenized-gpt2/mmlu/"
+  input_field: "input"
+  output_field: "output"
 model:
   type: gpt2
   hidden_dim: 768
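With these keys set, each record in the validation JSONL files is read using the configured field names rather than hard-coded ones. A minimal sketch of what a single record would then be expected to contain (illustrative values only; the actual contents of the MMLU files are an assumption):

    # One JSONL record, shown as a Python dict.
    # The keys must match input_field / output_field from the config above.
    record = {
        "input": "Question: What is 2 + 2?\nAnswer:",  # read via input_field
        "output": " 4",                                 # read via output_field
    }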
16 changes: 12 additions & 4 deletions src/levanter/data/text.py
@@ -570,13 +570,18 @@ class LMSupervisedDatasetConfig:
     """tags for the dataset. Typically the name of the dataset in the config will be added as a tag as well"""
     name: Optional[str] = None  # name for hf dataset

+    input_field: str = "prompt"  # name of the input field in the jsonl file
+    output_field: str = "response"  # name of the output field in the jsonl file
+
     validation_urls: List[str] = ()  # type:ignore


-def preprocess_supervised_example(batch, tokenizer: PreTrainedTokenizerBase):
-    sources = [example["input"] for example in batch]
+def preprocess_supervised_example(
+    batch, tokenizer: PreTrainedTokenizerBase, input_field: str, output_field: str
+) -> dict:
+    sources = [example[input_field] for example in batch]

-    targets = [f"{example['output']}" for example in batch]
+    targets = [example[output_field] for example in batch]
     # TODO: this seems pretty wasteful since you end up tokenizing twice, but it's how alpaca does it.
     examples = [s + t for s, t in zip(sources, targets)]
     sources_tokenized = tokenizer(sources, padding=False, truncation=True)
@@ -623,9 +628,12 @@ def mk_supervised_dataset(config: LMSupervisedDatasetConfig, tokenizer: PreTrain
     validation_urls = [url for url_pat in config.validation_urls for url in fsspec_expand_glob(url_pat)]
     dataset = levanter.data.datasource_from_jsonl(validation_urls)

+    input_field = config.input_field
+    output_field = config.output_field
+
     output_exemplar = {"input_ids": np.zeros((0,), dtype=np.int32), "sources_len": np.zeros((), dtype=np.int32)}

-    dataset = dataset.map_batches(lambda ex: preprocess_supervised_example(ex, tokenizer), batch_size=128, num_cpus=num_cpus_used_by_tokenizer(tokenizer), output_exemplar=output_exemplar)  # type: ignore
+    dataset = dataset.map_batches(lambda ex: preprocess_supervised_example(ex, tokenizer, input_field, output_field), batch_size=128, num_cpus=num_cpus_used_by_tokenizer(tokenizer), output_exemplar=output_exemplar)  # type: ignore
     dataset = dataset.build_or_load_cache(config.cache_dir, await_finished=True)  # type: ignore
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
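The hunk above cuts off the rest of preprocess_supervised_example. For context, here is a minimal self-contained sketch of the alpaca-style "tokenize twice" approach the TODO refers to, using the new input_field / output_field parameters; the exact return layout is an assumption inferred from the output_exemplar shown above, not a copy of the real function body:

    import numpy as np
    from transformers import AutoTokenizer, PreTrainedTokenizerBase

    def preprocess_supervised_example_sketch(
        batch: list, tokenizer: PreTrainedTokenizerBase, input_field: str, output_field: str
    ) -> dict:
        # Read prompts and responses using the configurable field names.
        sources = [example[input_field] for example in batch]
        targets = [example[output_field] for example in batch]
        # Tokenize prompt+response for the model input, and the prompt alone to
        # record how many tokens belong to the prompt (so the loss can be masked).
        examples = [s + t for s, t in zip(sources, targets)]
        examples_tokenized = tokenizer(examples, padding=False, truncation=True)
        sources_tokenized = tokenizer(sources, padding=False, truncation=True)
        return {
            "input_ids": [np.asarray(ids, dtype=np.int32) for ids in examples_tokenized["input_ids"]],
            "sources_len": np.asarray(
                [len(ids) for ids in sources_tokenized["input_ids"]], dtype=np.int32
            ),
        }

    # Example usage:
    tok = AutoTokenizer.from_pretrained("gpt2")
    batch = [{"input": "Question: 2 + 2 = ?\nAnswer:", "output": " 4"}]
    print(preprocess_supervised_example_sketch(batch, tok, "input", "output"))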
2 changes: 1 addition & 1 deletion tests/test_supervised.py
@@ -18,7 +18,7 @@ def test_supervised_eval():
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token

-    output = preprocess_supervised_example(examples, tokenizer)
+    output = preprocess_supervised_example(examples, tokenizer, "input", "output")
     assert len(output["input_ids"][0]) == output["sources_len"][0] + 1

     ex = {
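For completeness, a hedged usage sketch of setting the new fields when building a supervised eval set from Python rather than YAML. The bucket paths, tokenizer name, and field names are placeholders, and keyword construction assumes LMSupervisedDatasetConfig behaves like a dataclass, which this diff does not confirm:

    from transformers import AutoTokenizer
    from levanter.data.text import LMSupervisedDatasetConfig, mk_supervised_dataset

    tokenizer = AutoTokenizer.from_pretrained("gpt2")

    config = LMSupervisedDatasetConfig(
        validation_urls=["gs://my-bucket/evals/my-eval-*.jsonl.gz"],  # placeholder path
        cache_dir="gs://my-bucket/tokenized/my-eval/",                # placeholder path
        input_field="question",  # read prompts from the "question" field
        output_field="answer",   # read responses from the "answer" field
    )

    dataset = mk_supervised_dataset(config, tokenizer)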