Fix vllm print error message issue (#10664)
* update chatglm readme
* Add condition to invalidInputError
* update
* update
* style
jenniew authored Apr 5, 2024
1 parent 29d97e4 commit 69bdbf5
Showing 5 changed files with 9 additions and 2 deletions.
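
The recurring change across the diffs below is the same one-line fix: pass an explicit False as the first argument of invalidInputError. Assuming the helper follows a condition-first convention (raise only when the first argument is falsy), the old qwen2.py call handed it the message string in the condition position, so the intended error was not reported as expected. A minimal stand-in sketch of that convention, not the actual ipex_llm implementation:

# Minimal stand-in for a condition-first error helper; the real ipex_llm
# invalidInputError may differ, this only illustrates the calling pattern.
# The default for err_msg exists only so the "old style" call below runs.
def invalid_input_error(condition, err_msg="invalid input"):
    if not condition:
        raise ValueError(err_msg)

# Old style: the message string lands in the condition slot; a non-empty
# string is truthy, so no error is raised and the message is lost.
invalid_input_error("`attn_output` has an unexpected size")

# Fixed style, as in this commit: the surrounding size check already failed,
# so pass False explicitly and the message is reported as intended.
try:
    invalid_input_error(False, "`attn_output` has an unexpected size")
except ValueError as err:
    print(err)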
2 changes: 1 addition & 1 deletion python/llm/example/GPU/vLLM-Serving/README.md
@@ -87,7 +87,7 @@ Then you can access the api server as follows:
 curl http://localhost:8000/v1/completions \
 -H "Content-Type: application/json" \
 -d '{
-  "model": "/MODEL_PATH/Llama-2-7b-chat-hf-ipex/",
+  "model": "/MODEL_PATH/Llama-2-7b-chat-hf/",
   "prompt": "San Francisco is a",
   "max_tokens": 128,
   "temperature": 0
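
For convenience, the same request can be issued from Python. This is a sketch under the README's assumptions: the server from the section above is running on localhost:8000 and serves the model at the path passed at startup, and the response is assumed to follow the OpenAI-style /v1/completions schema that the curl example targets.

import requests

# Mirrors the curl example above; replace /MODEL_PATH/ with the model path
# actually passed to the server.
payload = {
    "model": "/MODEL_PATH/Llama-2-7b-chat-hf/",
    "prompt": "San Francisco is a",
    "max_tokens": 128,
    "temperature": 0,
}
resp = requests.post("http://localhost:8000/v1/completions", json=payload, timeout=60)
resp.raise_for_status()
# Assumes an OpenAI-style response body with a "choices" list.
print(resp.json()["choices"][0]["text"])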
3 changes: 3 additions & 0 deletions python/llm/src/ipex_llm/transformers/models/mistral.py
@@ -117,6 +117,7 @@ def compute_attn_outputs_weights(query_states, key_states, value_states, bsz, q_

     if attn_output.size() != (bsz, num_heads, q_len, head_dim):
         invalidInputError(
+            False,
             f"`attn_output` should be of size {(bsz, num_heads, q_len, head_dim)},"
             f" but is {attn_output.size()}"
         )
@@ -326,6 +327,7 @@ def mistral_attention_forward_quantized(
     if attention_mask is not None:
         if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
             invalidInputError(
+                False,
                 f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)},"
                 f" but is {attention_mask.size()}"
             )
@@ -682,6 +684,7 @@ def mistral_attention_forward_4_36_quantized(
     if attention_mask is not None:
         if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
             invalidInputError(
+                False,
                 f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)},"
                 f" but is {attention_mask.size()}"
             )
1 change: 1 addition & 0 deletions python/llm/src/ipex_llm/transformers/models/mixtral.py
@@ -351,6 +351,7 @@ def mixtral_attention_forward(

     if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
         invalidInputError(
+            False,
             f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)},"
             f" but is {attn_output.size()}"
         )
3 changes: 2 additions & 1 deletion python/llm/src/ipex_llm/transformers/models/qwen2.py
@@ -141,7 +141,8 @@ def qwen2_model_forward_internal(
     elif inputs_embeds is not None:
         batch_size, seq_length, _ = inputs_embeds.shape
     else:
-        invalidInputError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+        invalidInputError(False,
+                          "You have to specify either decoder_input_ids or decoder_inputs_embeds")

     if self.gradient_checkpointing and self.training:
         if use_cache:
2 changes: 2 additions & 0 deletions python/llm/src/ipex_llm/vllm/config.py
@@ -407,6 +407,7 @@ def __init__(
     def _verify_args(self) -> None:
         if self.max_num_batched_tokens < self.max_model_len:
             invalidInputError(
+                False,
                 f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
                 f"smaller than max_model_len ({self.max_model_len}). "
                 "This effectively limits the maximum sequence length to "
@@ -415,6 +416,7 @@ def _verify_args(self) -> None:
                 "decrease max_model_len.")
         if self.max_num_batched_tokens < self.max_num_seqs:
             invalidInputError(
+                False,
                 f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
                 "be greater than or equal to max_num_seqs "
                 f"({self.max_num_seqs}).")
