Adding set values for input / output #774

Merged: 4 commits, Oct 22, 2024
2 changes: 2 additions & 0 deletions config/gpt2_small_fast_supervised.yaml
@@ -16,6 +16,8 @@ supervised_data:
   validation_urls:
     - "gs://marin-us-central2/benchmarks/mmlu/mmlu-*-dev-evaluation.jsonl.gz"
   cache_dir: "gs://marin-us-central2/benchmarks/tokenized-gpt2/mmlu/"
+  input_field: "input"
+  output_field: "output"
 model:
   type: gpt2
   hidden_dim: 768
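With these keys set, each record in the validation JSONL files is read using the configured field names rather than hard-coded ones. A minimal sketch of what a single record would then be expected to contain (illustrative values only; the actual contents of the MMLU files are an assumption):

    # One JSONL record, shown as a Python dict.
    # The keys must match input_field / output_field from the config above.
    record = {
        "input": "Question: What is 2 + 2?\nAnswer:",  # read via input_field
        "output": " 4",                                 # read via output_field
    }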
16 changes: 12 additions & 4 deletions src/levanter/data/text.py
@@ -570,13 +570,18 @@ class LMSupervisedDatasetConfig:
     """tags for the dataset. Typically the name of the dataset in the config will be added as a tag as well"""
     name: Optional[str] = None  # name for hf dataset

+    input_field: str = "prompt"  # name of the input field in the jsonl file
+    output_field: str = "response"  # name of the output field in the jsonl file
+
     validation_urls: List[str] = ()  # type:ignore


-def preprocess_supervised_example(batch, tokenizer: PreTrainedTokenizerBase):
-    sources = [example["input"] for example in batch]
+def preprocess_supervised_example(
+    batch, tokenizer: PreTrainedTokenizerBase, input_field: str, output_field: str
+) -> dict:
+    sources = [example[input_field] for example in batch]

-    targets = [f"{example['output']}" for example in batch]
+    targets = [example[output_field] for example in batch]
     # TODO: this seems pretty wasteful since you end up tokenizing twice, but it's how alpaca does it.
     examples = [s + t for s, t in zip(sources, targets)]
     sources_tokenized = tokenizer(sources, padding=False, truncation=True)
@@ -623,9 +628,12 @@ def mk_supervised_dataset(config: LMSupervisedDatasetConfig, tokenizer: PreTrain
     validation_urls = [url for url_pat in config.validation_urls for url in fsspec_expand_glob(url_pat)]
     dataset = levanter.data.datasource_from_jsonl(validation_urls)

+    input_field = config.input_field
+    output_field = config.output_field
+
     output_exemplar = {"input_ids": np.zeros((0,), dtype=np.int32), "sources_len": np.zeros((), dtype=np.int32)}

-    dataset = dataset.map_batches(lambda ex: preprocess_supervised_example(ex, tokenizer), batch_size=128, num_cpus=num_cpus_used_by_tokenizer(tokenizer), output_exemplar=output_exemplar)  # type: ignore
+    dataset = dataset.map_batches(lambda ex: preprocess_supervised_example(ex, tokenizer, input_field, output_field), batch_size=128, num_cpus=num_cpus_used_by_tokenizer(tokenizer), output_exemplar=output_exemplar)  # type: ignore
     dataset = dataset.build_or_load_cache(config.cache_dir, await_finished=True)  # type: ignore
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
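The hunk above cuts off the rest of preprocess_supervised_example. For context, here is a minimal self-contained sketch of the alpaca-style "tokenize twice" approach the TODO refers to, using the new input_field / output_field parameters; the exact return layout is an assumption inferred from the output_exemplar shown above, not a copy of the real function body:

    import numpy as np
    from transformers import AutoTokenizer, PreTrainedTokenizerBase

    def preprocess_supervised_example_sketch(
        batch: list, tokenizer: PreTrainedTokenizerBase, input_field: str, output_field: str
    ) -> dict:
        # Read prompts and responses using the configurable field names.
        sources = [example[input_field] for example in batch]
        targets = [example[output_field] for example in batch]
        # Tokenize prompt+response for the model input, and the prompt alone to
        # record how many tokens belong to the prompt (so the loss can be masked).
        examples = [s + t for s, t in zip(sources, targets)]
        examples_tokenized = tokenizer(examples, padding=False, truncation=True)
        sources_tokenized = tokenizer(sources, padding=False, truncation=True)
        return {
            "input_ids": [np.asarray(ids, dtype=np.int32) for ids in examples_tokenized["input_ids"]],
            "sources_len": np.asarray(
                [len(ids) for ids in sources_tokenized["input_ids"]], dtype=np.int32
            ),
        }

    # Example usage:
    tok = AutoTokenizer.from_pretrained("gpt2")
    batch = [{"input": "Question: 2 + 2 = ?\nAnswer:", "output": " 4"}]
    print(preprocess_supervised_example_sketch(batch, tok, "input", "output"))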
2 changes: 1 addition & 1 deletion tests/test_supervised.py
@@ -18,7 +18,7 @@ def test_supervised_eval():
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token

-    output = preprocess_supervised_example(examples, tokenizer)
+    output = preprocess_supervised_example(examples, tokenizer, "input", "output")
     assert len(output["input_ids"][0]) == output["sources_len"][0] + 1

     ex = {
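For completeness, a hedged usage sketch of setting the new fields when building a supervised eval set from Python rather than YAML. The bucket paths, tokenizer name, and field names are placeholders, and keyword construction assumes LMSupervisedDatasetConfig behaves like a dataclass, which this diff does not confirm:

    from transformers import AutoTokenizer
    from levanter.data.text import LMSupervisedDatasetConfig, mk_supervised_dataset

    tokenizer = AutoTokenizer.from_pretrained("gpt2")

    config = LMSupervisedDatasetConfig(
        validation_urls=["gs://my-bucket/evals/my-eval-*.jsonl.gz"],  # placeholder path
        cache_dir="gs://my-bucket/tokenized/my-eval/",                # placeholder path
        input_field="question",  # read prompts from the "question" field
        output_field="answer",   # read responses from the "answer" field
    )

    dataset = mk_supervised_dataset(config, tokenizer)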