Fix vllm print error message issue (#10664)
* update chatglm readme
* Add condition to invalidInputError
* update
* update
* style
jenniew authored Apr 5, 2024
1 parent 29d97e4 commit 69bdbf5
Showing 5 changed files with 9 additions and 2 deletions.
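
The recurring change across the diffs below is the same one-line fix: pass an explicit False as the first argument of invalidInputError. Assuming the helper follows a condition-first convention (raise only when the first argument is falsy), the old qwen2.py call handed it the message string in the condition position, so the intended error was not reported as expected. A minimal stand-in sketch of that convention, not the actual ipex_llm implementation:

# Minimal stand-in for a condition-first error helper; the real ipex_llm
# invalidInputError may differ, this only illustrates the calling pattern.
# The default for err_msg exists only so the "old style" call below runs.
def invalid_input_error(condition, err_msg="invalid input"):
    if not condition:
        raise ValueError(err_msg)

# Old style: the message string lands in the condition slot; a non-empty
# string is truthy, so no error is raised and the message is lost.
invalid_input_error("`attn_output` has an unexpected size")

# Fixed style, as in this commit: the surrounding size check already failed,
# so pass False explicitly and the message is reported as intended.
try:
    invalid_input_error(False, "`attn_output` has an unexpected size")
except ValueError as err:
    print(err)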
2 changes: 1 addition & 1 deletion python/llm/example/GPU/vLLM-Serving/README.md
@@ -87,7 +87,7 @@ Then you can access the api server as follows:
 curl http://localhost:8000/v1/completions \
 -H "Content-Type: application/json" \
 -d '{
-  "model": "/MODEL_PATH/Llama-2-7b-chat-hf-ipex/",
+  "model": "/MODEL_PATH/Llama-2-7b-chat-hf/",
   "prompt": "San Francisco is a",
   "max_tokens": 128,
   "temperature": 0
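
For convenience, the same request can be issued from Python. This is a sketch under the README's assumptions: the server from the section above is running on localhost:8000 and serves the model at the path passed at startup, and the response is assumed to follow the OpenAI-style /v1/completions schema that the curl example targets.

import requests

# Mirrors the curl example above; replace /MODEL_PATH/ with the model path
# actually passed to the server.
payload = {
    "model": "/MODEL_PATH/Llama-2-7b-chat-hf/",
    "prompt": "San Francisco is a",
    "max_tokens": 128,
    "temperature": 0,
}
resp = requests.post("http://localhost:8000/v1/completions", json=payload, timeout=60)
resp.raise_for_status()
# Assumes an OpenAI-style response body with a "choices" list.
print(resp.json()["choices"][0]["text"])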
3 changes: 3 additions & 0 deletions python/llm/src/ipex_llm/transformers/models/mistral.py
@@ -117,6 +117,7 @@ def compute_attn_outputs_weights(query_states, key_states, value_states, bsz, q_

     if attn_output.size() != (bsz, num_heads, q_len, head_dim):
         invalidInputError(
+            False,
             f"`attn_output` should be of size {(bsz, num_heads, q_len, head_dim)},"
             f" but is {attn_output.size()}"
         )
@@ -326,6 +327,7 @@ def mistral_attention_forward_quantized(
     if attention_mask is not None:
         if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
             invalidInputError(
+                False,
                 f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)},"
                 f" but is {attention_mask.size()}"
             )
@@ -682,6 +684,7 @@ def mistral_attention_forward_4_36_quantized(
     if attention_mask is not None:
         if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
             invalidInputError(
+                False,
                 f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)},"
                 f" but is {attention_mask.size()}"
             )
1 change: 1 addition & 0 deletions python/llm/src/ipex_llm/transformers/models/mixtral.py
@@ -351,6 +351,7 @@ def mixtral_attention_forward(

     if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
         invalidInputError(
+            False,
             f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)},"
             f" but is {attn_output.size()}"
         )
3 changes: 2 additions & 1 deletion python/llm/src/ipex_llm/transformers/models/qwen2.py
@@ -141,7 +141,8 @@ def qwen2_model_forward_internal(
     elif inputs_embeds is not None:
         batch_size, seq_length, _ = inputs_embeds.shape
     else:
-        invalidInputError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+        invalidInputError(False,
+                          "You have to specify either decoder_input_ids or decoder_inputs_embeds")

     if self.gradient_checkpointing and self.training:
         if use_cache:
2 changes: 2 additions & 0 deletions python/llm/src/ipex_llm/vllm/config.py
@@ -407,6 +407,7 @@ def __init__(
     def _verify_args(self) -> None:
         if self.max_num_batched_tokens < self.max_model_len:
             invalidInputError(
+                False,
                 f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
                 f"smaller than max_model_len ({self.max_model_len}). "
                 "This effectively limits the maximum sequence length to "
@@ -415,6 +416,7 @@ def _verify_args(self) -> None:
                 "decrease max_model_len.")
         if self.max_num_batched_tokens < self.max_num_seqs:
             invalidInputError(
+                False,
                 f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
                 "be greater than or equal to max_num_seqs "
                 f"({self.max_num_seqs}).")
