Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

inference script #16

Merged
28 changes: 28 additions & 0 deletions scripts/generate_logprobs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/bin/bash

# Configuration for the log-probability generation run.

# Define the batch size
BATCH_SIZE=80 # This worked well in my CPU, but 200 was too much
DATASET_NAME="delphi-suite/tinystories-v2-clean-tokenized"

# Hugging Face username that owns the uploaded datasets.
# Override by exporting HF_USERNAME; the default keeps the previous behaviour.
USERNAME="${HF_USERNAME:-transcendingvictor}"

# Hugging Face API token. It is read from the environment so that no secret is
# ever stored in the repository; the script aborts with a message if it is unset.
# NOTE: any token that was previously committed in this file must be treated as
# leaked and revoked in the Hugging Face account settings.
TOKEN="${HF_TOKEN:?Set HF_TOKEN to your Hugging Face API token}"
transcendingvictor marked this conversation as resolved.
Show resolved Hide resolved


# List of models
declare -a MODEL_NAMES=(
    "delphi-suite/delphi-llama2-100k"
    "delphi-suite/delphi-llama2-200k"
    "delphi-suite/delphi-llama2-400k"
    "delphi-suite/delphi-llama2-800k"
    "delphi-suite/delphi-llama2-1.6m"
    "delphi-suite/delphi-llama2-3.2m"
    "delphi-suite/delphi-llama2-6.4m"
    "delphi-suite/delphi-llama2-12.8m"
    "delphi-suite/delphi-llama2-25.6m"
)

# Loop through each model and generate log probabilities.
# A failing model does not stop the remaining ones, but it is remembered so the
# script can report it and exit non-zero instead of claiming success.
declare -a FAILED_MODELS=()
for MODEL_NAME in "${MODEL_NAMES[@]}"
do
    echo "Processing $MODEL_NAME"
    if ! python scripts/inference.py "$MODEL_NAME" --batch-size "$BATCH_SIZE" --dataset-name "$DATASET_NAME" --username "$USERNAME" --token "$TOKEN"
    then
        echo "Failed to process $MODEL_NAME" >&2
        FAILED_MODELS+=("$MODEL_NAME")
    fi
done

if (( ${#FAILED_MODELS[@]} > 0 ))
then
    echo "Models that failed: ${FAILED_MODELS[*]}" >&2
    exit 1
fi

echo "All models processed."
119 changes: 119 additions & 0 deletions scripts/inference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import argparse
import os

import pandas as pd
import torch
from datasets import Dataset, load_dataset
from jaxtyping import Int
from tqdm.auto import tqdm
from transformers import AutoModelForCausalLM

from delphi.eval.utils import get_all_and_next_logprobs, load_validation_dataset

# This script only runs inference, so gradient tracking is switched off globally.
torch.set_grad_enabled(False)


def main(
    model_name: str,
    batch_size: int,
    dataset_name: str,
    username: str,
    token: str,
    funct_test: bool = False,
):
    """
    Outputs the log probabilities of the next token for each token in the validation dataset.
    And uploads the resulting dataset to huggingface.

    The result is pushed as the "validation" split of the public dataset repo
    "<username>/<model short name>-validation-logprobs" (with a "-funct-test"
    suffix when funct_test is True).

    Args:
    - model_name: The name of the model to use for inference
    - batch_size: The batch size for processing. 80 worked well in CPU.
    - dataset_name: The name of the dataset from which validation set will be loaded
    - username: Hugging Face API username
    - token: Hugging Face API token
    - funct_test: If True, process at most the first 320 sequences and upload
      to a separate "-funct-test" repo

    Raises:
    - ValueError: if batch_size is not positive or there are no sequences to process
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    val_ds = load_validation_dataset(dataset_name)

    model = AutoModelForCausalLM.from_pretrained(model_name)

    # A functional test only needs a small sample. Clamp to the dataset length
    # so a validation set shorter than 320 rows does not index out of range.
    total_sequences = min(len(val_ds), 320) if funct_test else len(val_ds)
    if total_sequences == 0:
        raise ValueError(f"No validation sequences found in {dataset_name!r}")

    logprobs_list = []
    for start in tqdm(range(0, total_sequences, batch_size)):
        stop = min(start + batch_size, total_sequences)
        # NOTE(review): torch.tensor needs every row to have the same length;
        # presumably the tokenized dataset guarantees this — confirm.
        batch_sequences = [val_ds[j]["tokens"] for j in range(start, stop)]
        batch_sequences_tensor = torch.tensor(batch_sequences)

        _, next_logprobs = get_all_and_next_logprobs(model, batch_sequences_tensor)
        logprobs_list.append(next_logprobs)

    accumulated_logprobs = torch.cat(logprobs_list, dim=0)

    # Prepend one NaN column per sequence (presumably because the first token
    # has no prediction — confirm). new_full keeps the dtype and device of the
    # log-prob tensor, so the concatenation cannot fail on a mismatch.
    nan_column = accumulated_logprobs.new_full(
        (accumulated_logprobs.size(0), 1), float("nan")
    )
    extended_next_logprobs = torch.cat(
        [nan_column, accumulated_logprobs], dim=1
    )  # original note: "513 tokens" — presumably the resulting row length; confirm

    df_dataset = pd.DataFrame({"logprobs": extended_next_logprobs.tolist()})
    hf_dataset = Dataset.from_pandas(df_dataset)

    # The target repo is derived from the Hugging Face username and the model's
    # short name (the part after the last "/"); the token authorises the upload.
    repo_id = f"{username}/{model_name.rsplit('/', 1)[-1]}-validation-logprobs"
    if funct_test:
        repo_id += "-funct-test"
    hf_dataset.push_to_hub(
        repo_id=repo_id,
        split="validation",
        private=False,
        token=token,
    )


def parse_cli_args(argv=None):
    """
    Parse and normalise the command-line arguments of the inference script.

    Args:
    - argv: Argument list to parse; None means use sys.argv[1:]

    Returns:
    The argparse namespace, with "delphi-suite/" prepended to model_name and
    dataset_name when they contain no "/" (as the help texts promise).

    Exits with an argparse usage error when a required value is missing or
    the batch size is not positive.
    """
    parser = argparse.ArgumentParser(
        description="Run inference and generate log probabilities."
    )
    parser.add_argument(
        "model_name", type=str, help="Model name with or without delphi-suite/ prefix"
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=80,
        help="Batch size for processing (default: 80)",
    )
    parser.add_argument(
        "--dataset-name",
        type=str,
        required=True,
        help="Dataset name with or without delphi-suite/ prefix",
    )
    parser.add_argument(
        "--username",
        type=str,
        required=True,
        help="Hugging Face API username",
    )
    parser.add_argument(
        "--token",
        type=str,
        # Falling back to the environment keeps the secret out of scripts and
        # shell history; an explicit --token still takes precedence.
        default=os.environ.get("HF_TOKEN"),
        help="Hugging Face API token (default: $HF_TOKEN)",
    )
    parser.add_argument(
        "--test-funct", action="store_true", help="Enable test function mode"
    )

    args = parser.parse_args(argv)

    if args.batch_size < 1:
        parser.error("--batch-size must be a positive integer")
    if not args.token:
        parser.error("a Hugging Face token is required: pass --token or set HF_TOKEN")

    # Both help texts say the organisation prefix is optional, so add it for both.
    if "/" not in args.model_name:
        args.model_name = "delphi-suite/" + args.model_name
    if "/" not in args.dataset_name:
        args.dataset_name = "delphi-suite/" + args.dataset_name

    return args


if __name__ == "__main__":
    args = parse_cli_args()

    main(
        args.model_name,
        args.batch_size,
        args.dataset_name,
        args.username,
        args.token,
        args.test_funct,
    )
22 changes: 22 additions & 0 deletions tests/scripts/functional_test_generate_logprobs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash
# Functional test: checks whether inference.py uploads log probabilities to Hugging Face.
# Similar to generate_logprobs.sh, but much smaller (two models, --test-funct mode).
# Exits non-zero as soon as one inference run fails, so the test can actually fail.

BATCH_SIZE=80
DATASET_NAME="delphi-suite/tinystories-v2-clean-tokenized"

# Hugging Face username that owns the uploaded test datasets.
# Override by exporting HF_USERNAME; the default keeps the previous behaviour.
USERNAME="${HF_USERNAME:-transcendingvictor}"

# Hugging Face API token, read from the environment so that nobody has to edit
# (and risk committing) a real token into this file.
TOKEN="${HF_TOKEN:?Set HF_TOKEN to your Hugging Face API token}"

# List of models
declare -a MODEL_NAMES=(
    "delphi-suite/delphi-llama2-100k"
    "delphi-suite/delphi-llama2-200k"
)

# Loop through each model and generate log probabilities
for MODEL_NAME in "${MODEL_NAMES[@]}"
do
    echo "Processing $MODEL_NAME"
    if ! python scripts/inference.py "$MODEL_NAME" --batch-size "$BATCH_SIZE" --dataset-name "$DATASET_NAME" --username "$USERNAME" --token "$TOKEN" --test-funct
    then
        echo "Functional test failed for $MODEL_NAME" >&2
        exit 1
    fi
done

echo "All models processed."
Loading