Fix vllm print error message issue (#10664)
* update chatglm readme
* Add condition to invalidInputError (see the sketch below the change summary)
* update
* update
* style
parent 29d97e4678
commit 69bdbf5806
5 changed files with 9 additions and 2 deletions
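Every hunk below makes the same change: an explicit `False` is passed as the first argument of `invalidInputError`, so the call always takes the error path and the intended message is actually reported. A minimal sketch of the assumed contract follows; the stand-in function, its `err_msg`/`fix_msg` parameter names, and the `RuntimeError` it raises are illustrative assumptions, not the real ipex-llm/BigDL implementation.

# Minimal sketch, NOT the real helper: invalidInputError is assumed to follow
# an assert-style contract -- it raises (and reports the message) only when
# its first argument is falsy. This stand-in only illustrates why every call
# site in this commit gains an explicit `False` first argument.
def invalidInputError(condition, err_msg, fix_msg=None):
    if not condition:
        raise RuntimeError(err_msg if fix_msg is None else f"{err_msg} {fix_msg}")

# After the fix the error path is taken unconditionally, so the message
# reaches the user instead of being passed where the condition belongs.
try:
    invalidInputError(False,
                      "You have to specify either decoder_input_ids or decoder_inputs_embeds")
except RuntimeError as err:
    print(err)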
@@ -87,7 +87,7 @@ Then you can access the api server as follows:
 curl http://localhost:8000/v1/completions \
      -H "Content-Type: application/json" \
      -d '{
-             "model": "/MODEL_PATH/Llama-2-7b-chat-hf-ipex/",
+             "model": "/MODEL_PATH/Llama-2-7b-chat-hf/",
              "prompt": "San Francisco is a",
              "max_tokens": 128,
              "temperature": 0
@@ -117,6 +117,7 @@ def compute_attn_outputs_weights(query_states, key_states, value_states, bsz, q_

     if attn_output.size() != (bsz, num_heads, q_len, head_dim):
         invalidInputError(
+            False,
             f"`attn_output` should be of size {(bsz, num_heads, q_len, head_dim)},"
             f" but is {attn_output.size()}"
         )
@@ -326,6 +327,7 @@ def mistral_attention_forward_quantized(
     if attention_mask is not None:
         if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
             invalidInputError(
+                False,
                 f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)},"
                 f" but is {attention_mask.size()}"
             )
@@ -682,6 +684,7 @@ def mistral_attention_forward_4_36_quantized(
     if attention_mask is not None:
         if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
             invalidInputError(
+                False,
                 f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)},"
                 f" but is {attention_mask.size()}"
             )
@@ -351,6 +351,7 @@ def mixtral_attention_forward(

     if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
         invalidInputError(
+            False,
             f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)},"
             f" but is {attn_output.size()}"
         )
@@ -141,7 +141,8 @@ def qwen2_model_forward_internal(
     elif inputs_embeds is not None:
         batch_size, seq_length, _ = inputs_embeds.shape
     else:
-        invalidInputError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+        invalidInputError(False,
+                          "You have to specify either decoder_input_ids or decoder_inputs_embeds")

     if self.gradient_checkpointing and self.training:
         if use_cache:
@@ -407,6 +407,7 @@ class SchedulerConfig:
     def _verify_args(self) -> None:
         if self.max_num_batched_tokens < self.max_model_len:
             invalidInputError(
+                False,
                 f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
                 f"smaller than max_model_len ({self.max_model_len}). "
                 "This effectively limits the maximum sequence length to "
@@ -415,6 +416,7 @@ class SchedulerConfig:
                 "decrease max_model_len.")
         if self.max_num_batched_tokens < self.max_num_seqs:
             invalidInputError(
+                False,
                 f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
                 "be greater than or equal to max_num_seqs "
                 f"({self.max_num_seqs}).")