[Bench] Add "per-gpu-workload" mode (mlc-ai#3068)
This PR introduces the per-gpu-workload mode to MLC bench.
Under this mode, the specified "num_concurrent_requests" and
"request_rate" denote the workload **per GPU**, which means the overall
workload of the entire serving system for benchmarking will be
multiplied by the number of GPUs.
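
A minimal sketch (not part of the commit) of the scaling this mode applies; the helper name
and example numbers are illustrative, mirroring the logic in the diff below:

def scale_workload(num_requests: int, request_rate: float, num_gpus: int, per_gpu_workload: bool):
    """Return the total request count and request rate the benchmark actually drives."""
    if per_gpu_workload:
        # The user-specified values are interpreted per GPU, so the serving system
        # as a whole sees them multiplied by the GPU count.
        return num_requests * num_gpus, request_rate * num_gpus
    return num_requests, request_rate

# Example: 100 requests at rate 2.0 per GPU on 4 GPUs -> 400 requests at rate 8.0 overall.
assert scale_workload(100, 2.0, 4, per_gpu_workload=True) == (400, 8.0)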

Meanwhile, this PR deprecates the argument `--testset-name` in
favor of `--dataset-path` for the Loogle dataset.
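
With this change, the LooGLE subset name travels through `--dataset-path` instead of the
removed `--testset-name`. A minimal construction sketch, assuming the illustrative subset
name "shortdep_qa" and a placeholder tokenizer (neither is taken from this commit):

from transformers import AutoTokenizer
from mlc_llm.bench.dataset import LoogleDataset

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer for illustration
# After this PR, the value passed via --dataset-path is forwarded as the LooGLE subset name.
dataset = LoogleDataset(tokenizer, testset_name="shortdep_qa")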
MasterJH5574 authored Dec 17, 2024
1 parent 88ebe6f commit 88074ea
Showing 3 changed files with 32 additions and 22 deletions.
36 changes: 21 additions & 15 deletions python/mlc_llm/bench/__main__.py
@@ -101,17 +101,18 @@ def run_pipeline(
         args.output_len_std,
     )
     request_records = pipeline(request_records)
-    assert len(request_records) == args.num_requests * args.num_gpus
-    sorted_requests: List[RequestRecord] = [None] * args.num_requests * args.num_gpus
+    num_total_requests = (
+        args.num_requests if not args.per_gpu_workload else args.num_requests * args.num_gpus
+    )
+    assert len(request_records) == num_total_requests
+    sorted_requests: List[RequestRecord] = [None] * num_total_requests
     for request_record in request_records:
         assert request_record.request_id is not None
         assert sorted_requests[request_record.request_id] is None
         sorted_requests[request_record.request_id] = request_record

     request_records = MetricAnalyzer(tokenizer)(request_records)
-    report = generate_metrics_summary(
-        request_records, args.num_requests * args.num_gpus, args.num_gpus
-    )
+    report = generate_metrics_summary(request_records, num_total_requests, args.num_gpus)
     return report, sorted_requests


@@ -221,6 +222,15 @@ def _main():
         help="The number of requests for warmup. "
         "It is optional when fixing the number of concurrent requests, and is required otherwise.",
     )
+    parser.add_argument(
+        "--per-gpu-workload",
+        default=False,
+        action="store_true",
+        help='When set to True, the specified "num_concurrent_requests"/"request_rate" '
+        "denote the workload **per GPU**, which means that the real values of "
+        '"num_concurrent_requests"/"request_rate" used in benchmark'
+        'will be multiplied by "num_gpus".',
+    )
     parser.add_argument(
         "--num-concurrent-requests",
         type=_parse_num_concurrent_requests,
@@ -354,13 +364,6 @@ def _main():
         type=_parse_mlc_engine_config,
         help="The engine config used when launch MLC server.",
     )
-    parser.add_argument(
-        "--output",
-        "-o",
-        type=str,
-        default="mlc_benchmark.csv",
-        help="The path of the output file where to dump the benchmark results.",
-    )
     parser.add_argument(
         "--cuda-profile",
         default=False,
@@ -378,13 +381,16 @@
         "--multi-round",
         default=False,
         action="store_true",
-        help="Whether to chat like mulit round conversion with history log each request. "
+        help="Whether to chat like multi round conversion with history log each request. "
         "Only enabled when benchmarked with fixed concurrent request mode."
         "The --num-concurrent-requests should be provided when enabling this option.",
     )

     parser.add_argument(
-        "--testset-name", type=str, help="The name of the testset. Only used for Loogle dataset"
+        "--output",
+        "-o",
+        type=str,
+        default="mlc_benchmark.csv",
+        help="The path of the output file where to dump the benchmark results.",
     )

     main(parser.parse_args())
5 changes: 2 additions & 3 deletions python/mlc_llm/bench/dataset.py
@@ -174,12 +174,11 @@ class LoogleDataset(Dataset): # pylint: disable=too-few-public-methods
     # pylint: enable=line-too-long
     require_fake_warmup: bool = True

-    def __init__(self, tokenizer: AutoTokenizer, testset_name) -> None:
+    def __init__(self, tokenizer: AutoTokenizer, testset_name: str) -> None:
         raw_dataset = load_dataset("bigainlco/LooGLE", testset_name, split="test")
         self.tokenizer = tokenizer
         self.dataset = []
         self.prompt_format = self.task2prompt[testset_name]
-        # self.max_gen = self.task2maxlen[testset_name]
         prompts = []
         generate_lens = []
         questions = []
@@ -806,7 +805,7 @@ def create_dataset(args: argparse.Namespace, tokenizer: AutoTokenizer) -> "Datas
         assert (
             args.apply_chat_template is False
         ), "Loogle dataset does not support applying chat template"
-        return LoogleDataset(tokenizer, args.testset_name)
+        return LoogleDataset(tokenizer, testset_name=args.dataset_path)
     if args.dataset == "react":
         assert (
             args.apply_chat_template is False
13 changes: 9 additions & 4 deletions python/mlc_llm/bench/request_processor.py
Expand Up @@ -622,22 +622,27 @@ def create_pipelines(
"Please specify the number of warmup requests via "
'"--num-warmup-requests" when fixing request rate.'
)
num_total_requests = int(
args.num_requests if not args.per_gpu_workload else args.num_requests * args.num_gpus
)
if dataset.require_fake_warmup:
num_samples = int(args.num_requests * args.num_gpus)
num_samples = num_total_requests
else:
num_samples = int(args.num_requests * args.num_gpus) + args.num_warmup_requests
num_samples = num_total_requests + args.num_warmup_requests
return [
SequentialProcessor(
LogMessage(f"Fixing request rate: {request_rate}"),
SampleRequests(num_samples),
AttachModelName(args.tokenizer),
AttachRequestRateTimestamp(request_rate * args.num_gpus),
AttachRequestRateTimestamp(
request_rate if not args.per_gpu_workload else request_rate * args.num_gpus
),
AttachStreamFlag(args.stream),
AttachSamplingOptions(args.temperature, args.top_p, args.ignore_eos),
AttachExecutionFeature({"request_rate": float(request_rate)}),
WarmupAndRun(
num_warmup_requests=args.num_warmup_requests,
num_benchmark_requests=int(args.num_requests * args.num_gpus),
num_benchmark_requests=num_total_requests,
pipeline=FixTimestampExecutor(
f_create_api_endpoint,
args.num_process_workers,