diff --git a/python/llm/example/GPU/vLLM-Serving/README.md b/python/llm/example/GPU/vLLM-Serving/README.md
index 8152e454..02f72379 100644
--- a/python/llm/example/GPU/vLLM-Serving/README.md
+++ b/python/llm/example/GPU/vLLM-Serving/README.md
@@ -87,7 +87,7 @@ Then you can access the api server as follows:
 curl http://localhost:8000/v1/completions \
         -H "Content-Type: application/json" \
         -d '{
-                "model": "/MODEL_PATH/Llama-2-7b-chat-hf-ipex/",
+                "model": "/MODEL_PATH/Llama-2-7b-chat-hf/",
                 "prompt": "San Francisco is a",
                 "max_tokens": 128,
                 "temperature": 0
diff --git a/python/llm/src/ipex_llm/transformers/models/mistral.py b/python/llm/src/ipex_llm/transformers/models/mistral.py
index 9a7d5c0d..818b98f3 100644
--- a/python/llm/src/ipex_llm/transformers/models/mistral.py
+++ b/python/llm/src/ipex_llm/transformers/models/mistral.py
@@ -117,6 +117,7 @@ def compute_attn_outputs_weights(query_states, key_states, value_states, bsz, q_
 
     if attn_output.size() != (bsz, num_heads, q_len, head_dim):
         invalidInputError(
+            False,
             f"`attn_output` should be of size {(bsz, num_heads, q_len, head_dim)},"
             f" but is {attn_output.size()}"
         )
@@ -326,6 +327,7 @@ def mistral_attention_forward_quantized(
     if attention_mask is not None:
         if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
             invalidInputError(
+                False,
                 f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)},"
                 f" but is {attention_mask.size()}"
             )
@@ -682,6 +684,7 @@ def mistral_attention_forward_4_36_quantized(
     if attention_mask is not None:
         if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
             invalidInputError(
+                False,
                 f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)},"
                 f" but is {attention_mask.size()}"
             )
diff --git a/python/llm/src/ipex_llm/transformers/models/mixtral.py b/python/llm/src/ipex_llm/transformers/models/mixtral.py
index 96762512..c25e1425 100644
--- a/python/llm/src/ipex_llm/transformers/models/mixtral.py
+++ b/python/llm/src/ipex_llm/transformers/models/mixtral.py
@@ -351,6 +351,7 @@ def mixtral_attention_forward(
 
     if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
         invalidInputError(
+            False,
             f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)},"
             f" but is {attn_output.size()}"
         )
diff --git a/python/llm/src/ipex_llm/transformers/models/qwen2.py b/python/llm/src/ipex_llm/transformers/models/qwen2.py
index 8e557b23..66f86692 100644
--- a/python/llm/src/ipex_llm/transformers/models/qwen2.py
+++ b/python/llm/src/ipex_llm/transformers/models/qwen2.py
@@ -141,7 +141,8 @@ def qwen2_model_forward_internal(
     elif inputs_embeds is not None:
         batch_size, seq_length, _ = inputs_embeds.shape
     else:
-        invalidInputError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+        invalidInputError(False,
+                          "You have to specify either decoder_input_ids or decoder_inputs_embeds")
 
     if self.gradient_checkpointing and self.training:
         if use_cache:
diff --git a/python/llm/src/ipex_llm/vllm/config.py b/python/llm/src/ipex_llm/vllm/config.py
index c386a90e..a5f4762b 100644
--- a/python/llm/src/ipex_llm/vllm/config.py
+++ b/python/llm/src/ipex_llm/vllm/config.py
@@ -407,6 +407,7 @@ class SchedulerConfig:
     def _verify_args(self) -> None:
         if self.max_num_batched_tokens < self.max_model_len:
             invalidInputError(
+                False,
                 f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
                 f"smaller than max_model_len ({self.max_model_len}). "
                 "This effectively limits the maximum sequence length to "
@@ -415,6 +416,7 @@ class SchedulerConfig:
                 "decrease max_model_len.")
         if self.max_num_batched_tokens < self.max_num_seqs:
             invalidInputError(
+                False,
                 f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
                 "be greater than or equal to max_num_seqs "
                 f"({self.max_num_seqs}).")