From 5a629ae470b524db8d0b0cfbb8dbd8bc906c6e8f Mon Sep 17 00:00:00 2001 From: Shaojun Liu <61072813+liu-shaojun@users.noreply.github.com> Date: Fri, 6 Jun 2025 17:20:45 +0800 Subject: [PATCH] update vllm patch (#13211) Co-authored-by: gc-fu --- .../xpu/docker/vllm_for_multi_arc.patch | 188 ++++++++++++++---- 1 file changed, 147 insertions(+), 41 deletions(-) diff --git a/docker/llm/serving/xpu/docker/vllm_for_multi_arc.patch b/docker/llm/serving/xpu/docker/vllm_for_multi_arc.patch index ffcad63d..93b7e779 100644 --- a/docker/llm/serving/xpu/docker/vllm_for_multi_arc.patch +++ b/docker/llm/serving/xpu/docker/vllm_for_multi_arc.patch @@ -10389,7 +10389,7 @@ index bd52fc90b..7d4e3555a 100644 if capability < quant_config.get_min_capability(): raise ValueError( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py -index 89c9b6747..a5be57ce0 100644 +index 89c9b6747..feba4f69f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -210,6 +210,8 @@ class EngineArgs: @@ -10420,7 +10420,7 @@ index 89c9b6747..a5be57ce0 100644 parser.add_argument( "--disable-cascade-attn", action="store_true", -@@ -1061,6 +1075,8 @@ class EngineArgs: +@@ -1061,10 +1075,16 @@ class EngineArgs: override_generation_config=self.override_generation_config, enable_sleep_mode=self.enable_sleep_mode, model_impl=self.model_impl, @@ -10429,7 +10429,26 @@ index 89c9b6747..a5be57ce0 100644 ) def create_load_config(self) -> LoadConfig: -@@ -1504,12 +1520,13 @@ class EngineArgs: + ++ use_low_bit_loader = False ++ ++ if self.low_bit_model_path is not None: ++ use_low_bit_loader = True + if(self.qlora_adapter_name_or_path is not None) and \ + self.quantization != "bitsandbytes": + raise ValueError( +@@ -1079,8 +1099,10 @@ class EngineArgs: + model_loader_extra_config=self.model_loader_extra_config, + ignore_patterns=self.ignore_patterns, + use_tqdm_on_load=self.use_tqdm_on_load, ++ use_low_bit_loader=use_low_bit_loader, + ) + ++ + def create_speculative_config( + self, + target_model_config: ModelConfig, +@@ -1504,12 +1526,13 @@ class EngineArgs: _raise_or_fallback(feature_name=name, recommend_to_remove=True) return False @@ -12669,6 +12688,23 @@ index c190a4585..dda2a96cc 100644 boi = self.boi.expand(x.shape[0], -1, -1) eoi = self.eoi.expand(x.shape[0], -1, -1) x = torch.cat((boi, x, eoi), dim=1) +diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py +index cb0379c10..5e8b22ab0 100644 +--- a/vllm/model_executor/models/idefics2_vision_model.py ++++ b/vllm/model_executor/models/idefics2_vision_model.py +@@ -144,8 +144,10 @@ class Idefics2VisionAttention(nn.Module): + ) + self.tp_size = get_tensor_model_parallel_world_size() + self.num_heads_per_partition = divide(self.num_heads, self.tp_size) +- self.attn = MultiHeadAttention(self.num_heads_per_partition, +- self.head_dim, self.scale) ++ # self.attn = MultiHeadAttention(self.num_heads_per_partition, ++ # self.head_dim, self.scale) ++ from vllm.model_executor.models.siglip import SelfAttention ++ self.attn = SelfAttention(self.num_heads_per_partition, self.head_dim, self.scale) + + def forward( + self, diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 5fab9df3f..f8e6fbe24 100644 --- a/vllm/model_executor/models/minicpmv.py @@ -13552,6 +13588,18 @@ index 000000000..d96085f46 + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) +diff --git a/vllm/model_executor/models/phi4mm_audio.py b/vllm/model_executor/models/phi4mm_audio.py +index db90848f9..5eabcf653 100644 +--- a/vllm/model_executor/models/phi4mm_audio.py ++++ b/vllm/model_executor/models/phi4mm_audio.py +@@ -230,6 +230,7 @@ class ConformerEncoderLayer(nn.Module): + x = x + 0.5 * self.feed_forward_in(x) + norm_x = self.layer_norm_att(x) + ++ mask = mask.to(x.device) + x = x + self.self_attn( + norm_x, + norm_x, diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index c4d02e5dd..2831a5a12 100644 --- a/vllm/model_executor/models/qwen2.py @@ -13589,41 +13637,85 @@ index c4d02e5dd..2831a5a12 100644 ) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py -index 1e6ff1fec..e2480326a 100644 +index 1e6ff1fec..90ebe5ca9 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py -@@ -304,6 +304,10 @@ class Qwen2_5_VisionAttention(nn.Module): +@@ -302,23 +302,33 @@ class Qwen2_5_VisionAttention(nn.Module): + "(b s) ... -> b s ...", + b=batch_size) elif self.attn_backend == _Backend.TORCH_SDPA: - # Execute attention entry by entry for speed & less VRAM. - outputs = [] -+ head_dim = q.shape[-1] -+ import math -+ import xe_addons -+ scale = 1 / math.sqrt(head_dim) - for i in range(1, len(cu_seqlens)): - start_idx = cu_seqlens[i - 1] - end_idx = cu_seqlens[i] -@@ -312,10 +316,16 @@ class Qwen2_5_VisionAttention(nn.Module): - v_i = v[:, start_idx:end_idx] - q_i, k_i, v_i = (rearrange(x, "b s h d -> b h s d") - for x in [q_i, k_i, v_i]) +- # Execute attention entry by entry for speed & less VRAM. +- outputs = [] +- for i in range(1, len(cu_seqlens)): +- start_idx = cu_seqlens[i - 1] +- end_idx = cu_seqlens[i] +- q_i = q[:, start_idx:end_idx] +- k_i = k[:, start_idx:end_idx] +- v_i = v[:, start_idx:end_idx] +- q_i, k_i, v_i = (rearrange(x, "b s h d -> b h s d") +- for x in [q_i, k_i, v_i]) - output_i = F.scaled_dot_product_attention(q_i, - k_i, - v_i, - dropout_p=0.0) -+ # output_i = F.scaled_dot_product_attention(q_i, -+ # k_i, -+ # v_i, -+ # dropout_p=0.0) -+ output_i = xe_addons.sdp_non_causal( -+ q_i.contiguous(), -+ k_i.contiguous(), -+ v_i.contiguous(), -+ None, -+ scale) - output_i = rearrange(output_i, "b h s d -> b s h d ") - outputs.append(output_i) - context_layer = torch.cat(outputs, dim=1) +- output_i = rearrange(output_i, "b h s d -> b s h d ") +- outputs.append(output_i) +- context_layer = torch.cat(outputs, dim=1) ++ # TODO(xiangyu): Maybe add attn_backend xpu? ++ q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) ++ from vllm._ipex_ops import ipex_ops ++ output = torch.empty( ++ (q.shape[0], q.shape[1], q.shape[2]), ++ dtype=q.dtype, ++ device=q.device) ++ import math ++ head_dim = q.shape[-1] ++ scale = 1 / math.sqrt(head_dim) ++ ipex_ops.varlen_attention(q, k, v, output, ++ cu_seqlens, ++ cu_seqlens, ++ max_seqlen, ++ max_seqlen, ++ pdropout=0, ++ softmax_scale=scale, ++ zero_tensors=False, ++ is_causal=False, ++ return_softmax=False, ++ gen_=None, ++ logits_soft_cap=0 ++ ) ++ ++ context_layer = rearrange(output, ++ "(b s) ... -> b s ...", ++ b=batch_size) + elif self.attn_backend == _Backend.XFORMERS: + from xformers import ops as xops + from xformers.ops.fmha.attn_bias import BlockDiagonalMask +@@ -613,10 +623,11 @@ class Qwen2_5_VisionTransformer(nn.Module): + cu_seqlens: torch.Tensor, + ) -> tuple[Optional[int], Optional[list[int]]]: + max_seqlen, seqlens = None, None +- if self.attn_backend == _Backend.FLASH_ATTN: +- max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() +- elif self.attn_backend == _Backend.XFORMERS: +- seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() ++ # if self.attn_backend == _Backend.FLASH_ATTN: ++ # max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() ++ # elif self.attn_backend == _Backend.XFORMERS: ++ # seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() ++ max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() + return max_seqlen, seqlens + + def forward( +@@ -1082,7 +1093,7 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, + image_input=image_input, + video_input=video_input) + input_ids = None +- ++ + hidden_states = self.language_model.model( + input_ids=input_ids, + positions=positions, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index a7800d415..26af87512 100644 --- a/vllm/model_executor/models/qwen2_vl.py @@ -15133,10 +15225,10 @@ index c271f438e..cf7180606 100755 assert sliding_window == (-1, -1), ( diff --git a/vllm/v1/attention/backends/ipex_attn.py b/vllm/v1/attention/backends/ipex_attn.py new file mode 100644 -index 000000000..f4a435eaa +index 000000000..964696cfe --- /dev/null +++ b/vllm/v1/attention/backends/ipex_attn.py -@@ -0,0 +1,392 @@ +@@ -0,0 +1,404 @@ +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Type + @@ -15152,6 +15244,10 @@ index 000000000..f4a435eaa +from vllm.attention.backends.ipex_attn import use_gqa_kernel +from vllm.utils import is_bmg_platform +import os ++from vllm.logger import init_logger ++ ++logger = init_logger(__name__) ++ + +@dataclass +class IPEXAttentionMetadata(FlashAttentionMetadata): @@ -15246,6 +15342,12 @@ index 000000000..f4a435eaa + "are not implemented for " + "IpexAttnBackendImpl") + ++ flag = os.getenv("IPEX_LLM_PREFILL_VARLEN_BACKEND", None) ++ self.ipex_varlen_attn = False ++ if flag is not None: ++ self.ipex_varlen_attn = True ++ logger.info_once(f"V1 engine using varlen_attention for prefilling.") ++ + def forward( + self, + layer: AttentionLayer, @@ -15293,6 +15395,7 @@ index 000000000..f4a435eaa + self.sliding_window, + self.alibi_slopes, + self.logits_soft_cap, ++ self.ipex_varlen_attn, + ) + return output.view(-1, self.num_heads * self.head_size) + @@ -15367,6 +15470,7 @@ index 000000000..f4a435eaa + sliding_window: Optional[List[int]] = None, + alibi_slopes: Optional[torch.Tensor] = None, + logits_soft_cap: Optional[float] = None, ++ flag: Optional[bool] = False, +) -> None: + context = get_forward_context() + current_metadata = context.attn_metadata @@ -15382,7 +15486,7 @@ index 000000000..f4a435eaa + key = key.view(-1, num_kv_heads, head_size) + value = value.view(-1, num_kv_heads, head_size) + -+ if is_bmg_platform: ++ if flag or is_bmg_platform: + key_cache, value_cache = kv_cache.unbind(0) + ipex_ops.reshape_and_cache_flash( + key[:num_actual_tokens], @@ -17087,7 +17191,7 @@ index 000000000..dffc7b367 + return (attn_metadata, encoder_input_tokens_tensor, + encoder_input_positions_tensor) diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py -index 9d49b4385..78e0c54f2 100644 +index 9d49b4385..dc5e95f4e 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -5,8 +5,8 @@ import time @@ -17735,15 +17839,17 @@ index 9d49b4385..78e0c54f2 100644 max_mm_tokens = self.mm_registry.get_max_multimodal_tokens( self.model_config) if max_mm_tokens > 0: -@@ -461,6 +820,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): +@@ -461,6 +820,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): "Computed max_num_seqs (%s) to be less than 1. " "Setting it to the minimum value of 1.", expr) max_num_seqs = 1 + ''' ++ if "phi4mm" in self.model_config.hf_config.model_type: ++ max_num_seqs = 1 batch_size = 0 for group_id in range(max_num_seqs): -@@ -479,11 +839,14 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): +@@ -479,11 +841,14 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): seq_data={group_id: dummy_data.seq_data}, sampling_params=sampling_params, block_tables=None, @@ -17759,7 +17865,7 @@ index 9d49b4385..78e0c54f2 100644 finished_requests_ids = [seq.request_id for seq in seqs] model_input = self.prepare_model_input( seqs, finished_requests_ids=finished_requests_ids) -@@ -493,25 +856,39 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): +@@ -493,25 +858,39 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): batch_size=batch_size, dtype=self.model_config.dtype, device=self.device) @@ -17810,7 +17916,7 @@ index 9d49b4385..78e0c54f2 100644 """Helper method to prepare the model input based on a given sequence group. Prepares metadata needed for the base model forward pass but not metadata for possible additional steps, e.g., sampling. -@@ -524,6 +901,22 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): +@@ -524,6 +903,22 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): return builder.build() # type: ignore @@ -17833,7 +17939,7 @@ index 9d49b4385..78e0c54f2 100644 def prepare_model_input( self, seq_group_metadata_list: List[SequenceGroupMetadata], -@@ -563,6 +956,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): +@@ -563,6 +958,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): raise ValueError( "XPUModelRunner does not support multi-step execution.") @@ -17846,7 +17952,7 @@ index 9d49b4385..78e0c54f2 100644 model_executable = self.model if (self.observability_config is not None and self.observability_config.collect_model_forward_time): -@@ -612,3 +1011,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): +@@ -612,3 +1013,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): output.model_forward_time = model_forward_time return [output]