
Commit

update decoding time calculation (#337)
TerryT9 authored Jan 3, 2025
1 parent 070a931 commit 5104478
Showing 1 changed file with 9 additions and 3 deletions.
12 changes: 9 additions & 3 deletions nexa/gguf/server/nexa_service.py
@@ -717,8 +717,10 @@ def _resp_async_generator(streamer, start_time):
_id = str(uuid.uuid4())
ttft = 0
decoding_times = 0
first_token_time = time.perf_counter() if first_token_time==0 else first_token_time
for token in streamer:
ttft = time.perf_counter() - start_time if ttft==0 else ttft
first_token_time = time.perf_counter() if first_token_time == 0 else first_token_time
decoding_times += 1
chunk = {
"id": _id,
@@ -728,7 +730,7 @@ def _resp_async_generator(streamer, start_time):
}
yield f"data: {json.dumps(chunk)}\n\n"

yield f"metrics: {MetricsResult(ttft=ttft, decoding_speed=decoding_times / (time.perf_counter() - start_time)).to_json()}\n\n"
yield f"metrics: {MetricsResult(ttft=ttft, decoding_speed=decoding_times / (time.perf_counter() - first_token_time)).to_json()}\n\n"
yield "data: [DONE]\n\n"

@app.post("/v1/download_model", tags=["Model"])
@@ -1000,13 +1002,15 @@ async def _resp_omnivlm_async_generator(model: NexaOmniVlmInference, prompt: str
_id = str(uuid.uuid4())
ttft = 0
start_time = time.perf_counter()
first_token_time = 0
decoding_times = 0
try:
if not os.path.exists(image_path):
raise FileNotFoundError(f"Image file not found: {image_path}")

for token in model.inference_streaming(prompt, image_path):
ttft = time.perf_counter() - start_time if ttft==0 else ttft
first_token_time = time.perf_counter() if first_token_time == 0 else first_token_time
decoding_times += 1
chunk = {
"id": _id,
@@ -1019,7 +1023,7 @@ async def _resp_omnivlm_async_generator(model: NexaOmniVlmInference, prompt: str
}]
}
yield f"data: {json.dumps(chunk)}\n\n"
yield f"metrics: {MetricsResult(ttft=ttft, decoding_speed=decoding_times / (time.perf_counter() - start_time)).to_json()}\n\n"
yield f"metrics: {MetricsResult(ttft=ttft, decoding_speed=decoding_times / (time.perf_counter() - first_token_time)).to_json()}\n\n"
yield "data: [DONE]\n\n"
except Exception as e:
logging.error(f"Error in OmniVLM streaming: {e}")
@@ -1357,9 +1361,11 @@ async def audio_chat_completions(
if stream:
async def stream_with_cleanup():
nonlocal ttft, decoding_times, start_time
first_token_time = 0
try:
for token in model.inference_streaming(audio_path, prompt or ""):
ttft = time.perf_counter() - start_time if ttft==0 else ttft
first_token_time = time.perf_counter() if first_token_time==0 else first_token_time
decoding_times += 1
chunk = {
"id": str(uuid.uuid4()),
@@ -1372,7 +1378,7 @@ async def stream_with_cleanup():
}]
}
yield f"data: {json.dumps(chunk)}\n\n"
yield f"metrics: {MetricsResult(ttft=ttft, decoding_speed=decoding_times / (time.perf_counter() - start_time)).to_json()}\n\n"
yield f"metrics: {MetricsResult(ttft=ttft, decoding_speed=decoding_times / (time.perf_counter() - first_token_time)).to_json()}\n\n"
yield "data: [DONE]\n\n"
finally:
temp_file.close()
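The change above swaps the decoding-speed denominator from the request start time to the time of the first generated token, so time spent before the first token (the TTFT window) is no longer counted against the reported decoding speed. Below is a minimal sketch of the corrected pattern for context; it is illustrative only, and the simplified `MetricsResult` dataclass and the `stream_with_metrics` helper are assumptions, not code from `nexa_service.py`.

```python
import json
import time
from dataclasses import asdict, dataclass


@dataclass
class MetricsResult:
    # Simplified stand-in for the MetricsResult used in nexa_service.py (assumption).
    ttft: float            # seconds from request start to the first token
    decoding_speed: float  # tokens per second, measured after the first token

    def to_json(self) -> str:
        return json.dumps(asdict(self))


def stream_with_metrics(streamer):
    """Yield tokens and compute TTFT / decoding speed the way the patched code does.

    Assumes the streamer yields at least one token, as the original code does.
    """
    start_time = time.perf_counter()
    ttft = 0.0
    first_token_time = 0.0
    decoding_times = 0

    for token in streamer:
        # TTFT is measured against the request start time ...
        ttft = time.perf_counter() - start_time if ttft == 0 else ttft
        # ... but decoding time starts at the first token, not at the request start.
        first_token_time = time.perf_counter() if first_token_time == 0 else first_token_time
        decoding_times += 1
        yield token

    elapsed_decoding = time.perf_counter() - first_token_time
    metrics = MetricsResult(ttft=ttft, decoding_speed=decoding_times / elapsed_decoding)
    print(metrics.to_json())


# Hypothetical usage with a dummy token source:
for tok in stream_with_metrics(iter(["Hello", ",", " world"])):
    pass
```

With this denominator, a slow prompt-processing phase still shows up as a larger TTFT, but it no longer drags down the reported tokens-per-second during decoding.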
