Skip to content

Commit

Permalink
[fix] vllm-online-benchmark first token latency error (intel-analytic…
Browse files Browse the repository at this point in the history
  • Loading branch information
ACupofAir authored Oct 29, 2024
1 parent 0bbc04b commit 3700e81
Showing 1 changed file with 1 addition and 7 deletions.
8 changes: 1 addition & 7 deletions docker/llm/serving/xpu/docker/vllm_online_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,13 +270,7 @@ def perform_request(session, url, payload, headers):
json_data = json.loads(data)
if 'choices' in json_data and len(json_data['choices']) > 0:
choice = json_data['choices'][0]
if 'finish_reason' in choice and (choice['finish_reason'] == 'length' or choice['finish_reason'] == 'stop'):
if 'first_token_time' in choice and isinstance(choice['first_token_time'], float):
first_token_inference_time = choice['first_token_time']
if 'rest_token_time' in choice and isinstance(choice['rest_token_time'], float):
next_token_inference_time = choice['rest_token_time']
else:
# Record the time of the first token
if 'text' in choice:
if first_token_time is None:
first_token_time = token_time
else:
Expand Down

0 comments on commit 3700e81

Please sign in to comment.