Merge pull request #13 from DeepAuto-AI/ainl-feat-logit-bias

Hotfix: Logit masking and stress tester
DeepAuto-AI · Jun 18, 2024 · d3cab56 · d3cab56
2 parents af1d4e0 + a9a92ca
commit d3cab56
Show file tree

Hide file tree

Showing 3 changed files with 18 additions and 5 deletions.
diff --git a/examples/openai_chat_image_stress.py b/examples/openai_chat_image_stress.py
@@ -582,6 +582,10 @@ def get_message(image_file: Optional[str]):
                         "type": "text",
                         "text": f"Here is one of wikipedia page that i found. \n```\n{CONTENT}\n```\n\n\n",
                     },
+                    {
+                        "type": "text",
+                        "text": f"Here is one of wikipedia page that i found. \n```\n{CONTENT[:15000]}\n```\n\n\n",
+                    },
                     {
                         "type": "text",
                         "text": "What is similarity and difference between above documents? Describe every precisely with examples and summary both documents.",

diff --git a/examples/openai_chat_streaming.py b/examples/openai_chat_streaming.py
@@ -423,9 +423,18 @@
 """
 input = f"Hello world! Here I found the wikipedia docuemnt. Please summarize it.\n\n=== Document Starts ===\n\n{CONTENT}\n\n=== Document Ends ===\n\n"
 
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--model', default='microsoft/Phi-3-mini-128k-instruct')
+parser.add_argument('--endpoint', default='http://localhost:8888/v1')
+parser.add_argument('--text', action='store_true')
+
+args = parser.parse_args()
+
 # Prepare the request data to be sent to the GPT API
 data = {
-    'model': 'microsoft/Phi-3-mini-128k-instruct',
+    'model': args.model,
     'stream': True,
     'max_tokens': 512,
     'messages': [
@@ -452,9 +461,9 @@
 }
 
 # Send the request to the OpenAI API and process each chunk of data as it arrives
-response = requests.post('http://localhost:8888/v1/chat/completions', data=json.dumps(data), headers=headers, stream=True)
+response = requests.post(f'{args.endpoint}/chat/completions', data=json.dumps(data), headers=headers, stream=True)
 
-show_text = sys.argv[-1] == 'text'
+show_text = args.text
 
 if response.status_code == 200:
     prompt_throughput = 0
@@ -483,6 +492,6 @@
 
     if show_text and len(throughputs) > 0:
         print('prompt throughput', prompt_throughput)
-        print('decode throughput', sum(throughputs) / len(throughputs))
+        print('decode throughput', list(sorted(throughputs))[len(throughputs) // 2], sum(throughputs) / len(throughputs))
 else:
     print("Request failed with status code: ", response.status_code)
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
@@ -874,7 +874,7 @@ def execute_model(
 
         DEBUG_LOGITS_MASK = os.getenv('DEBUG_LOGITS_MASK', None)
 
-        if DEBUG_LOGITS_MASK is not None:
+        if (DEBUG_LOGITS_MASK is not None) and (logits is not None) and isinstance(logits, torch.Tensor):
             if not hasattr(ModelRunner, 'debug_logits_mask'):
                 ModelRunner.debug_logits_mask = torch.load(DEBUG_LOGITS_MASK, map_location=logits.device)
             if ModelRunner.debug_logits_mask.device != logits.device: