"""run_benchmark.py: benchmark sentiment-analysis inference throughput (PyTorch vs. ONNX Runtime) on CPU or GPU."""
import argparse
import time

import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from optimum.onnxruntime import ORTModelForSequenceClassification


def sentiment_analysis_batched(df, device_type, model_type, model_id, batch_size, field_name, file_name=None, gpu_id=0, num_threads=1):
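    """Run batched multi-label sentiment inference over df[field_name] and time it.

    Returns a tuple (elapsed_time, messages_per_second, combined_df), where
    combined_df is the input DataFrame with one probability column appended
    per emotion label.
    """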
    if device_type == 'gpu':
        device = torch.device(f'cuda:{gpu_id}')
    elif device_type == 'cpu':
        device = torch.device('cpu')
        # Note: this only limits PyTorch ops; ONNX Runtime threading is
        # configured separately and is left at its defaults here
        torch.set_num_threads(num_threads)
    else:
        raise ValueError(f"Unsupported device type: {device_type}")

    if model_type == 'torch':
        model = AutoModelForSequenceClassification.from_pretrained(model_id)
        model.to(device)
    elif model_type == 'onnx':
        if device_type == 'cpu':
            model = ORTModelForSequenceClassification.from_pretrained(model_id, file_name=file_name, provider="CPUExecutionProvider")
        elif device_type == 'gpu':
            model = ORTModelForSequenceClassification.from_pretrained(model_id, file_name=file_name, provider="CUDAExecutionProvider", provider_options={'device_id': gpu_id})
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    start_time = time.time()
    results = []

    # Precompute the id2label mapping (class index -> emotion label)
    id2label = model.config.id2label

    for start_idx in tqdm(range(0, len(df), batch_size), desc=f"(Batch size {batch_size})"):
        end_idx = start_idx + batch_size
        texts = df[field_name].iloc[start_idx:end_idx].tolist()
        inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        # Sigmoid rather than softmax: go_emotions is a multi-label task
        predictions = torch.sigmoid(outputs.logits)
        # Keep per-batch predictions on the device to avoid repeated transfers
        results.append(predictions)

    # Concatenate all results, then move to host memory in a single transfer
    all_predictions = torch.cat(results, dim=0).cpu().numpy()
    # One probability column per emotion label
    predictions_df = pd.DataFrame(all_predictions, columns=[id2label[i] for i in range(all_predictions.shape[1])])
    # Append the prediction columns to the original DataFrame
    combined_df = pd.concat([df.reset_index(drop=True), predictions_df], axis=1)

    elapsed_time = time.time() - start_time
    messages_per_second = len(df) / elapsed_time
    return elapsed_time, messages_per_second, combined_df
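
# A minimal sketch of calling the helper directly, outside the CLI below.
# The two-row DataFrame here is hypothetical; the script normally loads the
# bundled 10k-message CSVs instead:
#   sample = pd.DataFrame({"body": ["I love this!", "This is awful."]})
#   t, rate, scored = sentiment_analysis_batched(
#       sample, 'cpu', 'torch', "SamLowe/roberta-base-go_emotions",
#       batch_size=2, field_name="body")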

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Benchmark CPUs and GPUs running BERT-based sentiment models")
    parser.add_argument("--dataset", type=str, choices=["normal", "filtered"], default="normal", help="Dataset to use (normal or filtered)")
    parser.add_argument("--model", type=str, choices=["torch", "onnx", "onnx-fp16"], required=True, help="Model to use (torch, onnx or onnx-fp16)")
    parser.add_argument("--device", type=str, choices=["cpu", "gpu"], required=True, help="Device to use (cpu or gpu)")
    parser.add_argument("--gpu", type=int, default=0, help="GPU ID to use (default 0)")
    parser.add_argument("--batches", type=str, default="1,2,4,8,16,32", help="Comma-separated batch sizes to run")
    parser.add_argument("--threads", type=int, default=1, help="Number of CPU threads to use (default 1)")
    args = parser.parse_args()
    # Model variants to benchmark (Hugging Face repo IDs)
    model_ids = {
        "torch": {'model_type': 'torch', 'model_id': "SamLowe/roberta-base-go_emotions", 'file_name': None},
        "onnx": {'model_type': 'onnx', 'model_id': "SamLowe/roberta-base-go_emotions-onnx", 'file_name': "onnx/model.onnx"},
        "onnx-fp16": {'model_type': 'onnx', 'model_id': "joaopn/roberta-base-go_emotions-onnx-fp16", 'file_name': "model.onnx"}
    }

    if args.model == "onnx-fp16" and args.device == "cpu":
        raise ValueError("ONNX FP16 models are only supported on GPUs")
    field_name = "body"
    model_params = model_ids[args.model]

    if args.dataset == "filtered":
        str_dataset = 'data/random_sample_10k_filtered.csv.gz'
    else:
        str_dataset = 'data/random_sample_10k.csv.gz'
    df = pd.read_csv(str_dataset, compression='gzip')

    results = {}
    batch_sizes = [int(x) for x in args.batches.split(',')]
    for batch_size in batch_sizes:
        elapsed_time, messages_per_second, _ = sentiment_analysis_batched(
            df, args.device, model_params['model_type'], model_params['model_id'],
            batch_size, field_name, model_params['file_name'],
            gpu_id=args.gpu, num_threads=args.threads)
        results[batch_size] = {'elapsed_time': elapsed_time, 'messages_per_second': messages_per_second}
    if args.device == 'gpu':
        gpu_name = torch.cuda.get_device_name(args.gpu)
        print(f"\nDataset: {args.dataset}, Model: {args.model}, GPU: {gpu_name}\n")
        print("size\tmessages/s")
        for batch_size, result in results.items():
            print(f"{batch_size}\t{result['messages_per_second']:.2f}")
    else:
        print(f"\nDataset: {args.dataset}, Model: {args.model}, CPU threads: {args.threads}\n")
        print("size\tmessages/s\tmessages/s/thread")
        for batch_size, result in results.items():
            print(f"{batch_size}\t{result['messages_per_second']:.2f}\t\t{result['messages_per_second']/args.threads:.2f}")