Skip to content

Commit

Permalink
Merge pull request #13 from DeepAuto-AI/ainl-feat-logit-bias
Browse files (browse the repository at this point in the history)
Hotfix: Logit masking and stress tester
  • Loading branch information
gmlwns2000 authored Jun 18, 2024
2 parents af1d4e0 + a9a92ca commit d3cab56
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 5 deletions.
4 changes: 4 additions & 0 deletions examples/openai_chat_image_stress.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -582,6 +582,10 @@ def get_message(image_file: Optional[str]):
"type": "text",
"text": f"Here is one of wikipedia page that i found. \n```\n{CONTENT}\n```\n\n\n",
},
{
"type": "text",
"text": f"Here is one of wikipedia page that i found. \n```\n{CONTENT[:15000]}\n```\n\n\n",
},
{
"type": "text",
"text": "What is similarity and difference between above documents? Describe every precisely with examples and summary both documents.",
Expand Down
17 changes: 13 additions & 4 deletions examples/openai_chat_streaming.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -423,9 +423,18 @@
"""
input = f"Hello world! Here I found the wikipedia docuemnt. Please summarize it.\n\n=== Document Starts ===\n\n{CONTENT}\n\n=== Document Ends ===\n\n"

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model', default='microsoft/Phi-3-mini-128k-instruct')
parser.add_argument('--endpoint', default='http://localhost:8888/v1')
parser.add_argument('--text', action='store_true')

args = parser.parse_args()

# Prepare the request data to be sent to the GPT API
data = {
'model': 'microsoft/Phi-3-mini-128k-instruct',
'model': args.model,
'stream': True,
'max_tokens': 512,
'messages': [
Expand All @@ -452,9 +461,9 @@
}

# Send the request to the OpenAI API and process each chunk of data as it arrives
response = requests.post('http://localhost:8888/v1/chat/completions', data=json.dumps(data), headers=headers, stream=True)
response = requests.post(f'{args.endpoint}/chat/completions', data=json.dumps(data), headers=headers, stream=True)

show_text = sys.argv[-1] == 'text'
show_text = args.text

if response.status_code == 200:
prompt_throughput = 0
Expand Down Expand Up @@ -483,6 +492,6 @@

if show_text and len(throughputs) > 0:
print('prompt throughput', prompt_throughput)
print('decode throughput', sum(throughputs) / len(throughputs))
print('decode throughput', list(sorted(throughputs))[len(throughputs) // 2], sum(throughputs) / len(throughputs))
else:
print("Request failed with status code: ", response.status_code)
2 changes: 1 addition & 1 deletion vllm/worker/model_runner.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -874,7 +874,7 @@ def execute_model(

DEBUG_LOGITS_MASK = os.getenv('DEBUG_LOGITS_MASK', None)

if DEBUG_LOGITS_MASK is not None:
if (DEBUG_LOGITS_MASK is not None) and (logits is not None) and isinstance(logits, torch.Tensor):
if not hasattr(ModelRunner, 'debug_logits_mask'):
ModelRunner.debug_logits_mask = torch.load(DEBUG_LOGITS_MASK, map_location=logits.device)
if ModelRunner.debug_logits_mask.device != logits.device:
Expand Down

0 comments on commit d3cab56

Please sign in to comment.