From c46f6978298ca790dcd026f4f92608d7473b1892 Mon Sep 17 00:00:00 2001
From: T <3923106166@qq.com>
Date: Tue, 31 Dec 2024 18:10:39 +0800
Subject: [PATCH 1/5] Fix model download issue: no error message for failed task. (#330)

* fix model pull issue: no error message for failed task
* update error message
* update
---
 nexa/gguf/server/nexa_service.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/nexa/gguf/server/nexa_service.py b/nexa/gguf/server/nexa_service.py
index 3db82c8d..878c309d 100644
--- a/nexa/gguf/server/nexa_service.py
+++ b/nexa/gguf/server/nexa_service.py
@@ -722,6 +722,10 @@ async def download_model(request: DownloadModelRequest):
         elif request.model_path in NEXA_RUN_MODEL_MAP_AUDIO_LM:
             downloaded_path, model_type = pull_model(NEXA_RUN_MODEL_MAP_AUDIO_LM[request.model_path])
             projector_downloaded_path, _ = pull_model(NEXA_RUN_AUDIO_LM_PROJECTOR_MAP[request.model_path])
+
+            if not downloaded_path or not model_type:
+                return JSONResponse(content="Failed to download model. Please check whether model_path is correct.", status_code=400)
+
             return {
                 "status": "success",
                 "message": "Successfully downloaded model and projector",
@@ -732,6 +736,9 @@
             }
         else:
             downloaded_path, model_type = pull_model(request.model_path)
+            if not downloaded_path or not model_type:
+                return JSONResponse(content="Failed to download model. Please check whether model_path is correct.", status_code=400)
+
             return {
                 "status": "success",
                 "message": "Successfully downloaded model",
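
Note on patch 1: when pull_model() returns nothing, the endpoint now answers with HTTP 400 and a plain string body instead of a misleading "success" payload. A minimal client-side sketch of how that could be exercised (the host, port, and model name here are placeholder assumptions, not part of the patch):

    import requests

    # Hypothetical local server address; adjust to wherever nexa_service is running.
    resp = requests.post(
        "http://localhost:8000/v1/download_model",
        json={"model_path": "nonexistent/model"},
    )
    if resp.status_code == 400:
        # New behaviour from this patch: an explicit error instead of a silent failure.
        print("Download failed:", resp.json())
    else:
        print("Download succeeded:", resp.json())
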
From c05af95c70d693e979eb9a936e2e69412f4f7349 Mon Sep 17 00:00:00 2001
From: JoyboyBrian
Date: Tue, 31 Dec 2024 12:01:53 -0800
Subject: [PATCH 2/5] add action
---
 nexa/gguf/server/nexa_service.py | 65 ++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/nexa/gguf/server/nexa_service.py b/nexa/gguf/server/nexa_service.py
index 878c309d..2678f229 100644
--- a/nexa/gguf/server/nexa_service.py
+++ b/nexa/gguf/server/nexa_service.py
@@ -230,6 +230,9 @@ class DownloadModelRequest(BaseModel):
         "protected_namespaces": ()
     }

+class ActionRequest(BaseModel):
+    prompt: str = "query_plane_ticket(year='2024', date='12-31', time='12:00', departure='SFO', destination='JFK')"
+
 class StreamASRProcessor:
     def __init__(self, asr, task, language):
         self.asr = asr
@@ -1424,6 +1427,68 @@ async def create_embedding(request: EmbeddingRequest):
         logging.error(f"Error in embedding generation: {e}")
         raise HTTPException(status_code=500, detail=str(e))

+@app.post("/v1/action", tags=["Actions"])
+async def action(request: ActionRequest):
+    try:
+        # Extract content between and
+        prompt = request.prompt
+        import re
+
+        # Use regex to match pattern
+        match = re.match(r"(.*?)", prompt)
+        if not match:
+            raise ValueError("Invalid prompt format. Must be wrapped in and ")
+
+        # Extract the function call content
+        function_content = match.group(1)
+
+        # Parse function name and parameters
+        function_name = function_content[:function_content.index("(")]
+        params_str = function_content[function_content.index("(")+1:function_content.rindex(")")]
+
+        # Parse parameters into dictionary
+        params = {}
+        for param in params_str.split(","):
+            if "=" in param:
+                key, value = param.split("=")
+                params[key.strip()] = value.strip().strip("'").strip('"')
+
+        # Handle different function types
+        if function_name == "query_plane_ticket":
+            # Validate required parameters
+            required_params = ["year", "date", "time", "departure", "destination"]
+            for param in required_params:
+                if param not in params:
+                    raise ValueError(f"Missing required parameter: {param}")
+
+            # Construct the date string in required format
+            date_str = f"{params['date']}/{params['year']}"
+
+            # Build the URL
+            url = (f"https://www.expedia.com/Flights-Search?"
+                   f"leg1=from:{params['departure']},to:{params['destination']},"
+                   f"departure:{date_str}T&"
+                   f"passengers=adults:1&trip=oneway&mode=search")
+
+            return {
+                "status": "success",
+                "function": function_name,
+                "parameters": params,
+                "url": url
+            }
+        else:
+            # Handle other function types in the future
+            return {
+                "status": "error",
+                "message": f"Unsupported function: {function_name}"
+            }
+
+    except Exception as e:
+        return {
+            "status": "error",
+            "message": str(e)
+        }
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Run the Nexa AI Text Generation Service"

From cdb623cbf82e2fe3ebf6f4e8da1047427b77a1c1 Mon Sep 17 00:00:00 2001
From: JoyboyBrian
Date: Tue, 31 Dec 2024 12:02:57 -0800
Subject: [PATCH 3/5] remove default prompt
---
 nexa/gguf/server/nexa_service.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nexa/gguf/server/nexa_service.py b/nexa/gguf/server/nexa_service.py
index 2678f229..1b0c5b78 100644
--- a/nexa/gguf/server/nexa_service.py
+++ b/nexa/gguf/server/nexa_service.py
@@ -231,7 +231,7 @@ class DownloadModelRequest(BaseModel):
     }

 class ActionRequest(BaseModel):
-    prompt: str = "query_plane_ticket(year='2024', date='12-31', time='12:00', departure='SFO', destination='JFK')"
+    prompt: str = ""

 class StreamASRProcessor:
     def __init__(self, asr, task, language):
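
Note on patches 2-3: the /v1/action handler reduces the prompt to a function name plus a key/value dictionary and, for query_plane_ticket, turns it into an Expedia search URL. A small standalone sketch of that transformation, lifted from the handler above for illustration (it assumes the function-call text has already been unwrapped from the markers the endpoint matches with its regex):

    function_content = "query_plane_ticket(year='2024', date='12-31', time='12:00', departure='SFO', destination='JFK')"

    # Split "name(args)" into the name and the raw argument string.
    function_name = function_content[:function_content.index("(")]
    params_str = function_content[function_content.index("(") + 1:function_content.rindex(")")]

    # Build {"year": "2024", "date": "12-31", ...} exactly as the endpoint does.
    params = {}
    for param in params_str.split(","):
        if "=" in param:
            key, value = param.split("=")
            params[key.strip()] = value.strip().strip("'").strip('"')

    date_str = f"{params['date']}/{params['year']}"  # "12-31/2024"
    url = (f"https://www.expedia.com/Flights-Search?"
           f"leg1=from:{params['departure']},to:{params['destination']},"
           f"departure:{date_str}T&"
           f"passengers=adults:1&trip=oneway&mode=search")
    print(function_name, params, url, sep="\n")
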
From 413491f748009d850a4e50b99da780b641600971 Mon Sep 17 00:00:00 2001
From: T <3923106166@qq.com>
Date: Thu, 2 Jan 2025 17:01:30 +0800
Subject: [PATCH 4/5] T/localui group (#333)

* add ttft and decoding speed for omnivlm
* add ttft and decoding time
---
 nexa/gguf/server/nexa_service.py | 58 ++++++++++++++++++++++++++++----
 1 file changed, 52 insertions(+), 6 deletions(-)

diff --git a/nexa/gguf/server/nexa_service.py b/nexa/gguf/server/nexa_service.py
index 1b0c5b78..6595d3c2 100644
--- a/nexa/gguf/server/nexa_service.py
+++ b/nexa/gguf/server/nexa_service.py
@@ -288,6 +288,20 @@ def ts_words(self, segments):
                 words.append((w.start, w.end, w.word))
         return words

+class MetricsResult:
+    def __init__(self, ttft: float, decoding_speed:float):
+        self.ttft = ttft
+        self.decoding_speed = decoding_speed
+
+    def to_dict(self):
+        return {
+            'ttft': round(self.ttft, 2),
+            'decoding_speed': round(self.decoding_speed, 2)
+        }
+
+    def to_json(self):
+        return json.dumps(self.to_dict())
+
 # helper functions
 async def load_model():
     global model, chat_format, completion_template, model_path, n_ctx, is_local_path, model_type, is_huggingface, is_modelscope, projector_path
@@ -699,9 +713,13 @@ async def read_root(request: Request):
     )


-def _resp_async_generator(streamer):
+def _resp_async_generator(streamer, start_time):
     _id = str(uuid.uuid4())
+    ttft = 0
+    decoding_times = 0
     for token in streamer:
+        ttft = time.perf_counter() - start_time if ttft==0 else ttft
+        decoding_times += 1
         chunk = {
             "id": _id,
             "object": "chat.completion.chunk",
             "choices": [{"delta": {"content": token}}],
         }
         yield f"data: {json.dumps(chunk)}\n\n"
+
+    yield f"metrics: {MetricsResult(
+        ttft=ttft,
+        decoding_speed=decoding_times / (time.perf_counter() - start_time)
+    ).to_json()}\n\n"
     yield "data: [DONE]\n\n"

 @app.post("/v1/download_model", tags=["Model"])
@@ -857,8 +880,9 @@ async def generate_text(request: GenerationRequest):
     generation_kwargs = request.dict()
     if request.stream:
         # Run the generation and stream the response
+        start_time = time.perf_counter()
         streamer = nexa_run_text_generation(is_chat_completion=False, **generation_kwargs)
-        return StreamingResponse(_resp_async_generator(streamer), media_type="application/x-ndjson")
+        return StreamingResponse(_resp_async_generator(streamer, start_time), media_type="application/x-ndjson")
     else:
         # Generate text synchronously and return the response
         result = nexa_run_text_generation(is_chat_completion=False, **generation_kwargs)
@@ -902,8 +926,9 @@ async def text_chat_completions(request: ChatCompletionRequest):
         ).dict()

         if request.stream:
+            start_time = time.perf_counter()
             streamer = nexa_run_text_generation(is_chat_completion=True, **generation_kwargs)
-            return StreamingResponse(_resp_async_generator(streamer), media_type="application/x-ndjson")
+            return StreamingResponse(_resp_async_generator(streamer, start_time), media_type="application/x-ndjson")

         result = nexa_run_text_generation(is_chat_completion=True, **generation_kwargs)
         return {
@@ -951,7 +976,8 @@ async def multimodal_chat_completions(request: VLMChatCompletionRequest):
                 processed_messages.append({"role": msg.role, "content": processed_content})
             else:
                 processed_messages.append({"role": msg.role, "content": msg.content})
-
+
+        start_time = time.perf_counter()
         response = model.create_chat_completion(
             messages=processed_messages,
             max_tokens=request.max_tokens,
@@ -963,7 +989,8 @@ async def multimodal_chat_completions(request: VLMChatCompletionRequest):
         )

         if request.stream:
-            return StreamingResponse(_resp_async_generator(response), media_type="application/x-ndjson")
+
+            return StreamingResponse(_resp_async_generator(response, start_time), media_type="application/x-ndjson")

         return response
     except HTTPException as e:
@@ -972,13 +999,18 @@ async def multimodal_chat_completions(request: VLMChatCompletionRequest):
         logging.error(f"Error in multimodal chat completions: {e}")
         raise HTTPException(status_code=500, detail=str(e))

-async def _resp_omnivlm_async_generator(model, prompt: str, image_path: str):
+async def _resp_omnivlm_async_generator(model: NexaOmniVlmInference, prompt: str, image_path: str):
     _id = str(uuid.uuid4())
+    ttft = 0
+    start_time = time.perf_counter()
+    decoding_times = 0
     try:
         if not os.path.exists(image_path):
             raise FileNotFoundError(f"Image file not found: {image_path}")

         for token in model.inference_streaming(prompt, image_path):
+            ttft = time.perf_counter() - start_time if ttft==0 else ttft
+            decoding_times += 1
             chunk = {
                 "id": _id,
                 "object": "chat.completion.chunk",
                 "choices": [{
                     "delta": {"content": token},
                     "index": 0,
                     "finish_reason": None
                 }]
             }
             yield f"data: {json.dumps(chunk)}\n\n"
+        yield f"metrics: {MetricsResult(
+            ttft=ttft,
+            decoding_speed=decoding_times / (time.perf_counter() - start_time)
+        ).to_json()}\n\n"
         yield "data: [DONE]\n\n"
     except Exception as e:
         logging.error(f"Error in OmniVLM streaming: {e}")
@@ -1307,6 +1343,9 @@ async def audio_chat_completions(
     stream: Optional[bool] = Query(False, description="Whether to stream the response"),
 ):
     temp_file = None
+    ttft = 0
+    start_time = time.perf_counter()
+    decoding_times = 0
     try:
         if model_type != "AudioLM":
@@ -1323,8 +1362,11 @@
         if stream:
             async def stream_with_cleanup():
+                nonlocal ttft, decoding_times, start_time
                 try:
                     for token in model.inference_streaming(audio_path, prompt or ""):
+                        ttft = time.perf_counter() - start_time if ttft==0 else ttft
+                        decoding_times += 1
                         chunk = {
                             "id": str(uuid.uuid4()),
                             "object": "chat.completion.chunk",
                             "choices": [{
                                 "delta": {"content": token},
                                 "index": 0,
                                 "finish_reason": None
                             }]
                         }
                         yield f"data: {json.dumps(chunk)}\n\n"
+                    yield f"metrics: {MetricsResult(
+                        ttft=ttft,
+                        decoding_speed=decoding_times / (time.perf_counter() - start_time)
+                    ).to_json()}\n\n"
                     yield "data: [DONE]\n\n"
                 finally:
                     temp_file.close()
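
Note on patch 4: streaming responses now interleave the usual "data: ..." chunks with a trailing "metrics: ..." line carrying ttft (time to first token, in seconds) and decoding_speed (tokens per second). A minimal sketch of a client that separates the two kinds of lines; the URL, route, and request payload are placeholder assumptions (the route decorators are not shown in this diff), while the line prefixes and the metrics JSON keys come from the patch above:

    import json
    import requests

    # Hypothetical local endpoint; any of the streaming endpoints touched by this patch
    # emits the same "data:" / "metrics:" line pattern.
    with requests.post(
        "http://localhost:8000/v1/completions",
        json={"prompt": "Hello", "stream": True},
        stream=True,
    ) as resp:
        for raw in resp.iter_lines():
            if not raw:
                continue
            line = raw.decode("utf-8")
            if line.startswith("data: ") and line != "data: [DONE]":
                chunk = json.loads(line[len("data: "):])
                print(chunk["choices"][0]["delta"].get("content", ""), end="")
            elif line.startswith("metrics: "):
                metrics = json.loads(line[len("metrics: "):])
                print("\nttft:", metrics["ttft"], "s, decoding_speed:", metrics["decoding_speed"], "tok/s")
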
From 08f522202e68042b921001e9bfd5fc61581da5e9 Mon Sep 17 00:00:00 2001
From: JoyboyBrian
Date: Thu, 2 Jan 2025 12:07:57 -0800
Subject: [PATCH 5/5] bug fix
---
 nexa/gguf/server/nexa_service.py | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/nexa/gguf/server/nexa_service.py b/nexa/gguf/server/nexa_service.py
index 6595d3c2..580e8205 100644
--- a/nexa/gguf/server/nexa_service.py
+++ b/nexa/gguf/server/nexa_service.py
@@ -728,10 +728,7 @@ def _resp_async_generator(streamer, start_time):
         }
         yield f"data: {json.dumps(chunk)}\n\n"

-    yield f"metrics: {MetricsResult(
-        ttft=ttft,
-        decoding_speed=decoding_times / (time.perf_counter() - start_time)
-    ).to_json()}\n\n"
+    yield f"metrics: {MetricsResult(ttft=ttft, decoding_speed=decoding_times / (time.perf_counter() - start_time)).to_json()}\n\n"
     yield "data: [DONE]\n\n"

 @app.post("/v1/download_model", tags=["Model"])
@@ -1022,10 +1019,7 @@ async def _resp_omnivlm_async_generator(model: NexaOmniVlmInference, prompt: str
                 }]
             }
             yield f"data: {json.dumps(chunk)}\n\n"
-        yield f"metrics: {MetricsResult(
-            ttft=ttft,
-            decoding_speed=decoding_times / (time.perf_counter() - start_time)
-        ).to_json()}\n\n"
+        yield f"metrics: {MetricsResult(ttft=ttft, decoding_speed=decoding_times / (time.perf_counter() - start_time)).to_json()}\n\n"
         yield "data: [DONE]\n\n"
     except Exception as e:
         logging.error(f"Error in OmniVLM streaming: {e}")
@@ -1378,10 +1372,7 @@ async def stream_with_cleanup():
                             }]
                         }
                         yield f"data: {json.dumps(chunk)}\n\n"
-                    yield f"metrics: {MetricsResult(
-                        ttft=ttft,
-                        decoding_speed=decoding_times / (time.perf_counter() - start_time)
-                    ).to_json()}\n\n"
+                    yield f"metrics: {MetricsResult(ttft=ttft, decoding_speed=decoding_times / (time.perf_counter() - start_time)).to_json()}\n\n"
                     yield "data: [DONE]\n\n"
                 finally:
                     temp_file.close()
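
Note on patch 5: the metrics f-strings are collapsed onto one line. A plausible reason, not stated in the commit message, is that an f-string whose replacement field spans multiple source lines is only accepted from Python 3.12 onward, so the multi-line form added in patch 4 would raise a SyntaxError on older interpreters. For reference, a small sketch of what the emitted line looks like, reusing the MetricsResult class from patch 4 (the numbers are made up):

    import json

    class MetricsResult:
        def __init__(self, ttft: float, decoding_speed: float):
            self.ttft = ttft
            self.decoding_speed = decoding_speed

        def to_dict(self):
            return {'ttft': round(self.ttft, 2), 'decoding_speed': round(self.decoding_speed, 2)}

        def to_json(self):
            return json.dumps(self.to_dict())

    # Mirrors the single-line yield from patch 5.
    line = f"metrics: {MetricsResult(ttft=0.4187, decoding_speed=35.714).to_json()}\n\n"
    print(line)  # metrics: {"ttft": 0.42, "decoding_speed": 35.71}
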