From 3700e819774af36cb1d2a403d13d783867631748 Mon Sep 17 00:00:00 2001
From: Jun Wang
Date: Tue, 29 Oct 2024 17:54:36 +0800
Subject: [PATCH] [fix] vllm-online-benchmark first token latency error (#12271)

---
 docker/llm/serving/xpu/docker/vllm_online_benchmark.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/docker/llm/serving/xpu/docker/vllm_online_benchmark.py b/docker/llm/serving/xpu/docker/vllm_online_benchmark.py
index 8d4e30954ea..cc5f2add257 100644
--- a/docker/llm/serving/xpu/docker/vllm_online_benchmark.py
+++ b/docker/llm/serving/xpu/docker/vllm_online_benchmark.py
@@ -270,13 +270,7 @@ def perform_request(session, url, payload, headers):
                         json_data = json.loads(data)
                         if 'choices' in json_data and len(json_data['choices']) > 0:
                             choice = json_data['choices'][0]
-                            if 'finish_reason' in choice and (choice['finish_reason'] == 'length' or choice['finish_reason'] == 'stop'):
-                                if 'first_token_time' in choice and isinstance(choice['first_token_time'], float):
-                                    first_token_inference_time = choice['first_token_time']
-                                if 'rest_token_time' in choice and isinstance(choice['rest_token_time'], float):
-                                    next_token_inference_time = choice['rest_token_time']
-                            else:
-                                # 记录第一个token的时间
+                            if 'text' in choice:
                                 if first_token_time is None:
                                     first_token_time = token_time
                                 else: