update vllm patch (#13185)
Co-authored-by: gc-fu <guancheng.fu@intel.com>
parent 531bef2810
commit c5d919b151
1 changed file with 29 additions and 6 deletions
@@ -12307,10 +12307,10 @@ index 1b1738f88..2c2ed67b9 100644
         layer_norm_func = RMSNorm if config.rmsnorm else LayerNorm
 diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py
 new file mode 100644
-index 000000000..8e6e2c11a
+index 000000000..332af5a68
 --- /dev/null
 +++ b/vllm/model_executor/models/glm4.py
-@@ -0,0 +1,318 @@
+@@ -0,0 +1,329 @@
 +# SPDX-License-Identifier: Apache-2.0
 +
 +# Copyright 2025 The Zhipu AI team.
@@ -12339,6 +12339,7 @@ index 000000000..8e6e2c11a
 +import torch
 +from torch import nn
 +from transformers import Glm4Config
++import os
 +
 +from vllm.attention import Attention, AttentionType
 +from vllm.compilation.decorators import support_torch_compile
@@ -12427,7 +12428,6 @@ index 000000000..8e6e2c11a
 +            rope_scaling=rope_scaling,
 +            partial_rotary_factor=partial_rotary_factor,
 +            is_neox_style=False,
-+            dtype=torch.float32,
 +        )
 +        self.attn = Attention(self.num_heads,
 +                              self.head_dim,
@@ -12461,6 +12461,7 @@ index 000000000..8e6e2c11a
 +        prefix: str = "",
 +    ) -> None:
 +        super().__init__()
++        self.config = config
 +        self.hidden_size = config.hidden_size
 +        rope_theta = getattr(config, "rope_theta", 1000000)
 +        rope_scaling = getattr(config, "rope_scaling", None)
@@ -12519,6 +12520,7 @@ index 000000000..8e6e2c11a
 +        # Fully Connected
 +        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
 +        hidden_states = self.mlp(hidden_states)
++        if self.config.num_hidden_layers == 61: # GLM-4-32B-0414
 +            hidden_states = hidden_states.to(torch.float32)
 +            hidden_states = torch.clamp(hidden_states, min=-1e36, max=1e36)
 +            hidden_states = self.post_mlp_layernorm(hidden_states)
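Note: the FP32 upcast plus clamp kept above is a standard activation-overflow guard. Saturating at plus/minus 1e36, well inside FP32's finite range (about 3.4e38), keeps the following layer norm from ever seeing inf. A minimal standalone sketch of the same pattern, with made-up values rather than real GLM-4 activations:

import torch

def guard_activations(hidden_states: torch.Tensor) -> torch.Tensor:
    # Upcast first so the clamp happens at FP32 precision.
    hidden_states = hidden_states.to(torch.float32)
    # +/-1e36 is far inside FP32's finite range, so this only trips on
    # activations that were about to overflow.
    return torch.clamp(hidden_states, min=-1e36, max=1e36)

x = torch.tensor([1.0, -4e37, 2e38])
print(guard_activations(x))  # tensor([ 1.0000e+00, -1.0000e+36,  1.0000e+36])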
@@ -12562,6 +12564,15 @@ index 000000000..8e6e2c11a
 +    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 +        super().__init__()
 +        config = vllm_config.model_config.hf_config
++        if config.num_hidden_layers == 61: # GLM-4-32B-0414
++            if vllm_config.model_config.dtype != torch.float32:
++                vllm_config.model_config.dtype = torch.float32
++                logger.warning_once("vLLM xpu: GLM-4-32B-0414 model is only supported with FP32. Converting dtype to torch.float32.")
++            sdp_disabled_flag = os.getenv("IPEX_LLM_DISABLE_SDP_CAUSAL", None)
++            if sdp_disabled_flag != "1":
++                logger.warning_once("vLLM xpu: GLM-4-32B-0414 model is currently not supported with sdp_causal(). Setting IPEX_LLM_DISABLE_SDP_CAUSAL=1.")
++                os.environ["IPEX_LLM_DISABLE_SDP_CAUSAL"] = "1"
++
 +        quant_config = vllm_config.quant_config
 +        lora_config = vllm_config.lora_config
 +
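Note: the constructor guard added above follows a common startup-workaround pattern: rewrite the model config in place and export an opt-out environment variable before any kernels are selected, warning once per override. A minimal sketch of the same pattern; ModelConfig here is a stand-in dataclass, not vLLM's real class:

import os
from dataclasses import dataclass

import torch

@dataclass
class ModelConfig:
    dtype: torch.dtype

def apply_glm4_32b_workarounds(model_config: ModelConfig) -> None:
    if model_config.dtype != torch.float32:
        # Mirror the patch: the model is treated as FP32-only on XPU.
        model_config.dtype = torch.float32
    if os.getenv("IPEX_LLM_DISABLE_SDP_CAUSAL") != "1":
        # Disable the sdp_causal() path the patch says is unsupported.
        os.environ["IPEX_LLM_DISABLE_SDP_CAUSAL"] = "1"

cfg = ModelConfig(dtype=torch.float16)
apply_glm4_32b_workarounds(cfg)
print(cfg.dtype, os.environ["IPEX_LLM_DISABLE_SDP_CAUSAL"])  # torch.float32 1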
@@ -14681,7 +14692,7 @@ index 000000000..bda3de2d3
 +                loaded_params.add(name)
 +        return loaded_params
 diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
-index c0a3c59ba..8614c2273 100644
+index c0a3c59ba..b8f54e82e 100644
 --- a/vllm/model_executor/models/registry.py
 +++ b/vllm/model_executor/models/registry.py
 @@ -57,6 +57,7 @@ _TEXT_GENERATION_MODELS = {
@@ -14709,6 +14720,18 @@ index c0a3c59ba..8614c2273 100644
      "XverseForCausalLM": ("llama", "LlamaForCausalLM"),
      "Zamba2ForCausalLM": ("zamba2", "Zamba2ForCausalLM"),
      # [Encoder-decoder]
+@@ -307,8 +311,9 @@ class _LazyRegisteredModel(_BaseRegisteredModel):
+ 
+     # Performed in another process to avoid initializing CUDA
+     def inspect_model_cls(self) -> _ModelInfo:
+-        return _run_in_subprocess(
+-            lambda: _ModelInfo.from_model_cls(self.load_model_cls()))
++        # return _run_in_subprocess(
++        #     lambda: _ModelInfo.from_model_cls(self.load_model_cls()))
++        return _ModelInfo.from_model_cls(self.load_model_cls())
+ 
+     def load_model_cls(self) -> Type[nn.Module]:
+         mod = importlib.import_module(self.module_name)
 diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py
 index a09741a55..d989a12fa 100644
 --- a/vllm/model_executor/models/roberta.py
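Note on the registry hunk: upstream vLLM runs inspect_model_cls() through _run_in_subprocess so that probing a model class never initializes CUDA in the serving process; this patch swaps it for a plain in-process call, which skips one process spawn per lookup (presumably acceptable on these XPU builds). A minimal sketch of the subprocess pattern being bypassed; the helper names and returned fields are illustrative, not vLLM's internals:

import multiprocessing as mp

def _probe() -> dict:
    # Anything that might initialize the GPU runtime happens only
    # inside this short-lived child process.
    return {"arch": "Glm4ForCausalLM", "is_text_generation": True}

def _worker(q: mp.Queue) -> None:
    q.put(_probe())

def inspect_in_subprocess() -> dict:
    q: mp.Queue = mp.Queue()
    p = mp.Process(target=_worker, args=(q,))
    p.start()
    info = q.get()  # read before join() to avoid a queue-buffer deadlock
    p.join()
    return info

if __name__ == "__main__":
    print(inspect_in_subprocess())  # {'arch': 'Glm4ForCausalLM', ...}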