From c46f6978298ca790dcd026f4f92608d7473b1892 Mon Sep 17 00:00:00 2001
From: T <3923106166@qq.com>
Date: Tue, 31 Dec 2024 18:10:39 +0800
Subject: [PATCH 1/5] Fix model download issue: no error message for failed task. (#330)

* fix model pull issue: no error message for failed task
* update error message
* update
---
 nexa/gguf/server/nexa_service.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/nexa/gguf/server/nexa_service.py b/nexa/gguf/server/nexa_service.py
index 3db82c8d..878c309d 100644
--- a/nexa/gguf/server/nexa_service.py
+++ b/nexa/gguf/server/nexa_service.py
@@ -722,6 +722,10 @@ async def download_model(request: DownloadModelRequest):
         elif request.model_path in NEXA_RUN_MODEL_MAP_AUDIO_LM:
             downloaded_path, model_type = pull_model(NEXA_RUN_MODEL_MAP_AUDIO_LM[request.model_path])
             projector_downloaded_path, _ = pull_model(NEXA_RUN_AUDIO_LM_PROJECTOR_MAP[request.model_path])
+
+            if not downloaded_path or not model_type:
+                return JSONResponse(content="Failed to download model. Please check whether model_path is correct.", status_code=400)
+
             return {
                 "status": "success",
                 "message": "Successfully downloaded model and projector",
@@ -732,6 +736,9 @@
             }
         else:
             downloaded_path, model_type = pull_model(request.model_path)
+            if not downloaded_path or not model_type:
+                return JSONResponse(content="Failed to download model. Please check whether model_path is correct.", status_code=400)
+
             return {
                 "status": "success",
                 "message": "Successfully downloaded model",
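
Note on patch 1: when pull_model() returns nothing, the endpoint now answers with HTTP 400 and a plain string body instead of a misleading "success" payload. A minimal client-side sketch of how that could be exercised (the host, port, and model name here are placeholder assumptions, not part of the patch):

    import requests

    # Hypothetical local server address; adjust to wherever nexa_service is running.
    resp = requests.post(
        "http://localhost:8000/v1/download_model",
        json={"model_path": "nonexistent/model"},
    )
    if resp.status_code == 400:
        # New behaviour from this patch: an explicit error instead of a silent failure.
        print("Download failed:", resp.json())
    else:
        print("Download succeeded:", resp.json())
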
From c05af95c70d693e979eb9a936e2e69412f4f7349 Mon Sep 17 00:00:00 2001
From: JoyboyBrian
Date: Tue, 31 Dec 2024 12:01:53 -0800
Subject: [PATCH 2/5] add action
---
 nexa/gguf/server/nexa_service.py | 65 ++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/nexa/gguf/server/nexa_service.py b/nexa/gguf/server/nexa_service.py
index 878c309d..2678f229 100644
--- a/nexa/gguf/server/nexa_service.py
+++ b/nexa/gguf/server/nexa_service.py
@@ -230,6 +230,9 @@ class DownloadModelRequest(BaseModel):
         "protected_namespaces": ()
     }

+class ActionRequest(BaseModel):
+    prompt: str = "query_plane_ticket(year='2024', date='12-31', time='12:00', departure='SFO', destination='JFK')"
+
 class StreamASRProcessor:
     def __init__(self, asr, task, language):
         self.asr = asr
@@ -1424,6 +1427,68 @@ async def create_embedding(request: EmbeddingRequest):
         logging.error(f"Error in embedding generation: {e}")
         raise HTTPException(status_code=500, detail=str(e))

+@app.post("/v1/action", tags=["Actions"])
+async def action(request: ActionRequest):
+    try:
+        # Extract content between and
+        prompt = request.prompt
+        import re
+
+        # Use regex to match pattern
+        match = re.match(r"(.*?)", prompt)
+        if not match:
+            raise ValueError("Invalid prompt format. Must be wrapped in and ")
+
+        # Extract the function call content
+        function_content = match.group(1)
+
+        # Parse function name and parameters
+        function_name = function_content[:function_content.index("(")]
+        params_str = function_content[function_content.index("(")+1:function_content.rindex(")")]
+
+        # Parse parameters into dictionary
+        params = {}
+        for param in params_str.split(","):
+            if "=" in param:
+                key, value = param.split("=")
+                params[key.strip()] = value.strip().strip("'").strip('"')
+
+        # Handle different function types
+        if function_name == "query_plane_ticket":
+            # Validate required parameters
+            required_params = ["year", "date", "time", "departure", "destination"]
+            for param in required_params:
+                if param not in params:
+                    raise ValueError(f"Missing required parameter: {param}")
+
+            # Construct the date string in required format
+            date_str = f"{params['date']}/{params['year']}"
+
+            # Build the URL
+            url = (f"https://www.expedia.com/Flights-Search?"
+                   f"leg1=from:{params['departure']},to:{params['destination']},"
+                   f"departure:{date_str}T&"
+                   f"passengers=adults:1&trip=oneway&mode=search")
+
+            return {
+                "status": "success",
+                "function": function_name,
+                "parameters": params,
+                "url": url
+            }
+        else:
+            # Handle other function types in the future
+            return {
+                "status": "error",
+                "message": f"Unsupported function: {function_name}"
+            }
+
+    except Exception as e:
+        return {
+            "status": "error",
+            "message": str(e)
+        }
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Run the Nexa AI Text Generation Service"

From cdb623cbf82e2fe3ebf6f4e8da1047427b77a1c1 Mon Sep 17 00:00:00 2001
From: JoyboyBrian
Date: Tue, 31 Dec 2024 12:02:57 -0800
Subject: [PATCH 3/5] remove default prompt
---
 nexa/gguf/server/nexa_service.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nexa/gguf/server/nexa_service.py b/nexa/gguf/server/nexa_service.py
index 2678f229..1b0c5b78 100644
--- a/nexa/gguf/server/nexa_service.py
+++ b/nexa/gguf/server/nexa_service.py
@@ -231,7 +231,7 @@ class DownloadModelRequest(BaseModel):
     }

 class ActionRequest(BaseModel):
-    prompt: str = "query_plane_ticket(year='2024', date='12-31', time='12:00', departure='SFO', destination='JFK')"
+    prompt: str = ""

 class StreamASRProcessor:
     def __init__(self, asr, task, language):
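
Note on patches 2-3: the /v1/action handler reduces the prompt to a function name plus a key/value dictionary and, for query_plane_ticket, turns it into an Expedia search URL. A small standalone sketch of that transformation, lifted from the handler above for illustration (it assumes the function-call text has already been unwrapped from the markers the endpoint matches with its regex):

    function_content = "query_plane_ticket(year='2024', date='12-31', time='12:00', departure='SFO', destination='JFK')"

    # Split "name(args)" into the name and the raw argument string.
    function_name = function_content[:function_content.index("(")]
    params_str = function_content[function_content.index("(") + 1:function_content.rindex(")")]

    # Build {"year": "2024", "date": "12-31", ...} exactly as the endpoint does.
    params = {}
    for param in params_str.split(","):
        if "=" in param:
            key, value = param.split("=")
            params[key.strip()] = value.strip().strip("'").strip('"')

    date_str = f"{params['date']}/{params['year']}"  # "12-31/2024"
    url = (f"https://www.expedia.com/Flights-Search?"
           f"leg1=from:{params['departure']},to:{params['destination']},"
           f"departure:{date_str}T&"
           f"passengers=adults:1&trip=oneway&mode=search")
    print(function_name, params, url, sep="\n")
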
From 413491f748009d850a4e50b99da780b641600971 Mon Sep 17 00:00:00 2001
From: T <3923106166@qq.com>
Date: Thu, 2 Jan 2025 17:01:30 +0800
Subject: [PATCH 4/5] T/localui group (#333)

* add ttft and decoding speed for omnivlm
* add ttft and decoding time
---
 nexa/gguf/server/nexa_service.py | 58 ++++++++++++++++++++++++++++----
 1 file changed, 52 insertions(+), 6 deletions(-)

diff --git a/nexa/gguf/server/nexa_service.py b/nexa/gguf/server/nexa_service.py
index 1b0c5b78..6595d3c2 100644
--- a/nexa/gguf/server/nexa_service.py
+++ b/nexa/gguf/server/nexa_service.py
@@ -288,6 +288,20 @@ def ts_words(self, segments):
                 words.append((w.start, w.end, w.word))
         return words

+class MetricsResult:
+    def __init__(self, ttft: float, decoding_speed:float):
+        self.ttft = ttft
+        self.decoding_speed = decoding_speed
+
+    def to_dict(self):
+        return {
+            'ttft': round(self.ttft, 2),
+            'decoding_speed': round(self.decoding_speed, 2)
+        }
+
+    def to_json(self):
+        return json.dumps(self.to_dict())
+
 # helper functions
 async def load_model():
     global model, chat_format, completion_template, model_path, n_ctx, is_local_path, model_type, is_huggingface, is_modelscope, projector_path
@@ -699,9 +713,13 @@ async def read_root(request: Request):
     )


-def _resp_async_generator(streamer):
+def _resp_async_generator(streamer, start_time):
     _id = str(uuid.uuid4())
+    ttft = 0
+    decoding_times = 0
     for token in streamer:
+        ttft = time.perf_counter() - start_time if ttft==0 else ttft
+        decoding_times += 1
         chunk = {
             "id": _id,
             "object": "chat.completion.chunk",
             "choices": [{"delta": {"content": token}}],
         }
         yield f"data: {json.dumps(chunk)}\n\n"
+
+    yield f"metrics: {MetricsResult(
+        ttft=ttft,
+        decoding_speed=decoding_times / (time.perf_counter() - start_time)
+    ).to_json()}\n\n"
     yield "data: [DONE]\n\n"

 @app.post("/v1/download_model", tags=["Model"])
@@ -857,8 +880,9 @@ async def generate_text(request: GenerationRequest):
     generation_kwargs = request.dict()
     if request.stream:
         # Run the generation and stream the response
+        start_time = time.perf_counter()
         streamer = nexa_run_text_generation(is_chat_completion=False, **generation_kwargs)
-        return StreamingResponse(_resp_async_generator(streamer), media_type="application/x-ndjson")
+        return StreamingResponse(_resp_async_generator(streamer, start_time), media_type="application/x-ndjson")
     else:
         # Generate text synchronously and return the response
         result = nexa_run_text_generation(is_chat_completion=False, **generation_kwargs)
@@ -902,8 +926,9 @@ async def text_chat_completions(request: ChatCompletionRequest):
         ).dict()

         if request.stream:
+            start_time = time.perf_counter()
             streamer = nexa_run_text_generation(is_chat_completion=True, **generation_kwargs)
-            return StreamingResponse(_resp_async_generator(streamer), media_type="application/x-ndjson")
+            return StreamingResponse(_resp_async_generator(streamer, start_time), media_type="application/x-ndjson")

         result = nexa_run_text_generation(is_chat_completion=True, **generation_kwargs)
         return {
@@ -951,7 +976,8 @@ async def multimodal_chat_completions(request: VLMChatCompletionRequest):
                 processed_messages.append({"role": msg.role, "content": processed_content})
             else:
                 processed_messages.append({"role": msg.role, "content": msg.content})
-
+
+        start_time = time.perf_counter()
         response = model.create_chat_completion(
             messages=processed_messages,
             max_tokens=request.max_tokens,
@@ -963,7 +989,8 @@ async def multimodal_chat_completions(request: VLMChatCompletionRequest):
         )

         if request.stream:
-            return StreamingResponse(_resp_async_generator(response), media_type="application/x-ndjson")
+
+            return StreamingResponse(_resp_async_generator(response, start_time), media_type="application/x-ndjson")

         return response
     except HTTPException as e:
@@ -972,13 +999,18 @@ async def multimodal_chat_completions(request: VLMChatCompletionRequest):
         logging.error(f"Error in multimodal chat completions: {e}")
         raise HTTPException(status_code=500, detail=str(e))

-async def _resp_omnivlm_async_generator(model, prompt: str, image_path: str):
+async def _resp_omnivlm_async_generator(model: NexaOmniVlmInference, prompt: str, image_path: str):
     _id = str(uuid.uuid4())
+    ttft = 0
+    start_time = time.perf_counter()
+    decoding_times = 0
     try:
         if not os.path.exists(image_path):
             raise FileNotFoundError(f"Image file not found: {image_path}")

         for token in model.inference_streaming(prompt, image_path):
+            ttft = time.perf_counter() - start_time if ttft==0 else ttft
+            decoding_times += 1
             chunk = {
                 "id": _id,
                 "object": "chat.completion.chunk",
                 "choices": [{
                     "delta": {"content": token},
                     "index": 0,
                     "finish_reason": None
                 }]
             }
             yield f"data: {json.dumps(chunk)}\n\n"
+        yield f"metrics: {MetricsResult(
+            ttft=ttft,
+            decoding_speed=decoding_times / (time.perf_counter() - start_time)
+        ).to_json()}\n\n"
         yield "data: [DONE]\n\n"
     except Exception as e:
         logging.error(f"Error in OmniVLM streaming: {e}")
@@ -1307,6 +1343,9 @@ async def audio_chat_completions(
     stream: Optional[bool] = Query(False, description="Whether to stream the response"),
 ):
     temp_file = None
+    ttft = 0
+    start_time = time.perf_counter()
+    decoding_times = 0
     try:
         if model_type != "AudioLM":
@@ -1323,8 +1362,11 @@
         if stream:
             async def stream_with_cleanup():
+                nonlocal ttft, decoding_times, start_time
                 try:
                     for token in model.inference_streaming(audio_path, prompt or ""):
+                        ttft = time.perf_counter() - start_time if ttft==0 else ttft
+                        decoding_times += 1
                         chunk = {
                             "id": str(uuid.uuid4()),
                             "object": "chat.completion.chunk",
                             "choices": [{
                                 "delta": {"content": token},
                                 "index": 0,
                                 "finish_reason": None
                             }]
                         }
                         yield f"data: {json.dumps(chunk)}\n\n"
+                    yield f"metrics: {MetricsResult(
+                        ttft=ttft,
+                        decoding_speed=decoding_times / (time.perf_counter() - start_time)
+                    ).to_json()}\n\n"
                     yield "data: [DONE]\n\n"
                 finally:
                     temp_file.close()
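
Note on patch 4: streaming responses now interleave the usual "data: ..." chunks with a trailing "metrics: ..." line carrying ttft (time to first token, in seconds) and decoding_speed (tokens per second). A minimal sketch of a client that separates the two kinds of lines; the URL, route, and request payload are placeholder assumptions (the route decorators are not shown in this diff), while the line prefixes and the metrics JSON keys come from the patch above:

    import json
    import requests

    # Hypothetical local endpoint; any of the streaming endpoints touched by this patch
    # emits the same "data:" / "metrics:" line pattern.
    with requests.post(
        "http://localhost:8000/v1/completions",
        json={"prompt": "Hello", "stream": True},
        stream=True,
    ) as resp:
        for raw in resp.iter_lines():
            if not raw:
                continue
            line = raw.decode("utf-8")
            if line.startswith("data: ") and line != "data: [DONE]":
                chunk = json.loads(line[len("data: "):])
                print(chunk["choices"][0]["delta"].get("content", ""), end="")
            elif line.startswith("metrics: "):
                metrics = json.loads(line[len("metrics: "):])
                print("\nttft:", metrics["ttft"], "s, decoding_speed:", metrics["decoding_speed"], "tok/s")
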
From 08f522202e68042b921001e9bfd5fc61581da5e9 Mon Sep 17 00:00:00 2001
From: JoyboyBrian
Date: Thu, 2 Jan 2025 12:07:57 -0800
Subject: [PATCH 5/5] bug fix
---
 nexa/gguf/server/nexa_service.py | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/nexa/gguf/server/nexa_service.py b/nexa/gguf/server/nexa_service.py
index 6595d3c2..580e8205 100644
--- a/nexa/gguf/server/nexa_service.py
+++ b/nexa/gguf/server/nexa_service.py
@@ -728,10 +728,7 @@ def _resp_async_generator(streamer, start_time):
         }
         yield f"data: {json.dumps(chunk)}\n\n"

-    yield f"metrics: {MetricsResult(
-        ttft=ttft,
-        decoding_speed=decoding_times / (time.perf_counter() - start_time)
-    ).to_json()}\n\n"
+    yield f"metrics: {MetricsResult(ttft=ttft, decoding_speed=decoding_times / (time.perf_counter() - start_time)).to_json()}\n\n"
     yield "data: [DONE]\n\n"

 @app.post("/v1/download_model", tags=["Model"])
@@ -1022,10 +1019,7 @@ async def _resp_omnivlm_async_generator(model: NexaOmniVlmInference, prompt: str
                 }]
             }
             yield f"data: {json.dumps(chunk)}\n\n"
-        yield f"metrics: {MetricsResult(
-            ttft=ttft,
-            decoding_speed=decoding_times / (time.perf_counter() - start_time)
-        ).to_json()}\n\n"
+        yield f"metrics: {MetricsResult(ttft=ttft, decoding_speed=decoding_times / (time.perf_counter() - start_time)).to_json()}\n\n"
         yield "data: [DONE]\n\n"
     except Exception as e:
         logging.error(f"Error in OmniVLM streaming: {e}")
@@ -1378,10 +1372,7 @@ async def stream_with_cleanup():
                             }]
                         }
                         yield f"data: {json.dumps(chunk)}\n\n"
-                    yield f"metrics: {MetricsResult(
-                        ttft=ttft,
-                        decoding_speed=decoding_times / (time.perf_counter() - start_time)
-                    ).to_json()}\n\n"
+                    yield f"metrics: {MetricsResult(ttft=ttft, decoding_speed=decoding_times / (time.perf_counter() - start_time)).to_json()}\n\n"
                     yield "data: [DONE]\n\n"
                 finally:
                     temp_file.close()
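
Note on patch 5: the metrics f-strings are collapsed onto one line. A plausible reason, not stated in the commit message, is that an f-string whose replacement field spans multiple source lines is only accepted from Python 3.12 onward, so the multi-line form added in patch 4 would raise a SyntaxError on older interpreters. For reference, a small sketch of what the emitted line looks like, reusing the MetricsResult class from patch 4 (the numbers are made up):

    import json

    class MetricsResult:
        def __init__(self, ttft: float, decoding_speed: float):
            self.ttft = ttft
            self.decoding_speed = decoding_speed

        def to_dict(self):
            return {'ttft': round(self.ttft, 2), 'decoding_speed': round(self.decoding_speed, 2)}

        def to_json(self):
            return json.dumps(self.to_dict())

    # Mirrors the single-line yield from patch 5.
    line = f"metrics: {MetricsResult(ttft=0.4187, decoding_speed=35.714).to_json()}\n\n"
    print(line)  # metrics: {"ttft": 0.42, "decoding_speed": 35.71}
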