update vllm patch (#13064)

Author: Guancheng Fu
Date:   2025-04-10 15:03:37 +08:00 (committed by GitHub)
Parent: 1d7f4a83ac
Commit: 3ee6dec0f8

@@ -39043,7 +39043,7 @@ index 3ac7fb8df..249b3ed2d 100644
and self.observability_config.collect_model_execute_time):
output.tensors["model_execute_time"] = torch.tensor(
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index 9cf253875..df6ab56c6 100644
index 9cf253875..34d098486 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -3,8 +3,8 @@ import time
@@ -39606,7 +39606,7 @@ index 9cf253875..df6ab56c6 100644
self.sampling_metadata_cache: SamplingMetadataCache = \
SamplingMetadataCache() \
@@ -415,10 +719,38 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
@@ -415,16 +719,74 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
logger.info("Loading model weights took %.4f GB",
self.model_memory_usage / float(2**30))
@@ -39643,12 +39643,19 @@ index 9cf253875..df6ab56c6 100644
+ return rope_scaling.get("type", None) == "mrope" or rope_scaling.get("mrope_section", None) is not None
+
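
For reference, the mrope check above matches rope_scaling configs like the ones below (the wrapper name is illustrative and the config values are typical Qwen2-VL-style examples, not taken from this patch):

    # Illustrative sketch of the mrope detection added above; `uses_mrope` is a
    # hypothetical wrapper name, and the config dicts are example values only.
    def uses_mrope(rope_scaling: dict) -> bool:
        return (rope_scaling.get("type", None) == "mrope"
                or rope_scaling.get("mrope_section", None) is not None)

    assert uses_mrope({"type": "mrope", "mrope_section": [16, 24, 24]})
    assert uses_mrope({"rope_type": "default", "mrope_section": [16, 24, 24]})
    assert not uses_mrope({"type": "linear", "factor": 2.0})
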
@torch.inference_mode()
def profile_run(self) -> None:
- def profile_run(self) -> None:
+ def profile_run(self, num_batched_tokens=-1, num_seqs=-1) -> None:
# Enable top-k sampling to reflect the accurate memory usage.
@@ -426,6 +758,30 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1)
max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
+ assert (num_batched_tokens == -1 or num_batched_tokens > 0)
+ assert (num_seqs == -1 or num_seqs > 0)
max_num_seqs = self.scheduler_config.max_num_seqs
+ if num_batched_tokens != -1:
+ max_num_batched_tokens = num_batched_tokens
+ if num_seqs != -1:
+ max_num_seqs = num_seqs
+
+ # This represents the maximum number of different requests
+ # that will have unique loras, and therefore the max amount of memory
+ # consumption. Create dummy lora request copies from the lora request
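
A minimal standalone sketch of the override semantics the new profile_run signature introduces: -1 keeps the scheduler-config limits, any positive value replaces them (the config class below is an illustrative stand-in, not a vLLM type):

    # Sketch of how num_batched_tokens / num_seqs interact with the scheduler
    # defaults in the patched profile_run; the numbers are examples only.
    from dataclasses import dataclass

    @dataclass
    class SchedulerCfg:                       # illustrative stand-in
        max_num_batched_tokens: int = 32768
        max_num_seqs: int = 256

    def resolve_profile_limits(cfg, num_batched_tokens=-1, num_seqs=-1):
        assert num_batched_tokens == -1 or num_batched_tokens > 0
        assert num_seqs == -1 or num_seqs > 0
        tokens = cfg.max_num_batched_tokens if num_batched_tokens == -1 else num_batched_tokens
        seqs = cfg.max_num_seqs if num_seqs == -1 else num_seqs
        return tokens, seqs

    assert resolve_profile_limits(SchedulerCfg()) == (32768, 256)        # defaults
    assert resolve_profile_limits(SchedulerCfg(), 8192, 1) == (8192, 1)  # explicit override
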
@@ -39672,27 +39679,10 @@ index 9cf253875..df6ab56c6 100644
+ dummy_lora_requests[idx % len(dummy_lora_requests)]
+ for idx in range(max_num_seqs)
+ ]
+
# Profile memory usage with max_num_sequences sequences and the total
# number of tokens equal to max_num_batched_tokens.
seqs: List[SequenceGroupMetadata] = []
@@ -450,6 +806,15 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
max_num_seqs = 1
batch_size = 0
+ import os
+ self_max_num_batched_tokens = os.getenv("IPEX_LLM_SELF_MAX_NUM_BATCHED_TOKENS", None)
+ if self_max_num_batched_tokens is not None:
+ max_num_batched_tokens = int(self_max_num_batched_tokens)
+ self_max_num_seqs = os.getenv("IPEX_LLM_SELF_MAX_NUM_SEQS", None)
+ if self_max_num_seqs is not None:
+ max_num_seqs = int(self_max_num_seqs)
+ else:
+ max_num_seqs = 1
for group_id in range(max_num_seqs):
seq_len = (max_num_batched_tokens // max_num_seqs +
(group_id < max_num_batched_tokens % max_num_seqs))
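
As a worked example of the seq_len split above (numbers are illustrative): with max_num_batched_tokens=10 and max_num_seqs=3 the remainder of 1 token goes to the first group, giving lengths 4, 3, 3 that sum back to 10.

    # Worked example of the per-group token split used during profiling.
    max_num_batched_tokens, max_num_seqs = 10, 3
    seq_lens = [
        max_num_batched_tokens // max_num_seqs
        + (group_id < max_num_batched_tokens % max_num_seqs)
        for group_id in range(max_num_seqs)
    ]
    assert seq_lens == [4, 3, 3]
    assert sum(seq_lens) == max_num_batched_tokens
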
@@ -466,7 +831,8 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
@@ -466,7 +828,8 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
seq_data={group_id: dummy_data.seq_data},
sampling_params=sampling_params,
block_tables=None,
@@ -39702,7 +39692,7 @@ index 9cf253875..df6ab56c6 100644
multi_modal_data=dummy_data.multi_modal_data,
multi_modal_placeholders=dummy_data.multi_modal_placeholders)
seqs.append(seq)
@@ -477,9 +843,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
@@ -477,9 +840,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
# it by reference, rather by specializing on the value ``None``.
# the `dtype` argument does not matter, and we use `float32` as
# a placeholder (it has wide hardware support).
@@ -39713,7 +39703,7 @@ index 9cf253875..df6ab56c6 100644
finished_requests_ids = [seq.request_id for seq in seqs]
model_input = self.prepare_model_input(
seqs, finished_requests_ids=finished_requests_ids)
@@ -493,21 +857,35 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
@@ -493,21 +854,35 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
torch.xpu.synchronize()
return
@@ -39759,7 +39749,7 @@ index 9cf253875..df6ab56c6 100644
"""Helper method to prepare the model input based on a given sequence
group. Prepares metadata needed for the base model forward pass but not
metadata for possible additional steps, e.g., sampling.
@@ -519,6 +897,22 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
@@ -519,6 +894,22 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
return builder.build() # type: ignore
@@ -39782,7 +39772,7 @@ index 9cf253875..df6ab56c6 100644
def prepare_model_input(
self,
seq_group_metadata_list: List[SequenceGroupMetadata],
@@ -558,6 +952,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
@@ -558,6 +949,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
raise ValueError(
"XPUModelRunner does not support multi-step execution.")
@@ -39795,7 +39785,7 @@ index 9cf253875..df6ab56c6 100644
model_executable = self.model
if (self.observability_config is not None
and self.observability_config.collect_model_forward_time):
@@ -607,3 +1007,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
@@ -607,3 +1004,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
output.model_forward_time = model_forward_time
return [output]
@@ -40700,7 +40690,7 @@ index 000000000..6ad951824
+ return model_input, worker_input, kwargs
\ No newline at end of file
diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py
index 129566605..43d306145 100644
index 129566605..fb7962dfe 100644
--- a/vllm/worker/xpu_worker.py
+++ b/vllm/worker/xpu_worker.py
@@ -3,7 +3,8 @@ import gc
@@ -40729,15 +40719,75 @@ index 129566605..43d306145 100644
"""A worker class that executes (a partition of) the model on a GPU.
Each worker is associated with a single XPU device. The worker is
@@ -98,6 +99,7 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
@@ -97,16 +98,74 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
"""
# Profile the memory usage of the model and get the maximum number of
# cache blocks that can be allocated with the remaining free memory.
+ flag = int(os.getenv("IPEX_LLM_FIND_MAX_LENGTH", -1))
+ if flag != -1:
+ assert flag > 0
+ torch.xpu.empty_cache()
+ before_memory = torch.xpu.memory_reserved()
+ max_num_batched_tokens = flag
+ max_num_seqs = 1
+ support_input = []
+ support_kv_cache = []
+ while True:
+ print(f"Profiling with max_num_batched_tokens {max_num_batched_tokens}...")
+ self.model_runner.profile_run(max_num_batched_tokens, max_num_seqs)
+ torch.xpu.synchronize()
+ used_memory = torch.xpu.memory_reserved()
+ total_gpu_memory = torch.xpu.get_device_properties(
+ self.local_rank).total_memory
+ free_gpu_memory = total_gpu_memory - used_memory
+ peak_memory = self.init_gpu_memory - free_gpu_memory
+ assert peak_memory > 0
+ cache_block_size = self.get_cache_block_size_bytes()
+ num_gpu_blocks = int(
+ (total_gpu_memory * self.cache_config.gpu_memory_utilization -
+ peak_memory) // cache_block_size)
+ num_cpu_blocks = int(self.cache_config.swap_space_bytes //
+ cache_block_size)
+ num_gpu_blocks = max(num_gpu_blocks, 0)
+ num_cpu_blocks = max(num_cpu_blocks, 0)
+ gc.collect()
+ torch.xpu.empty_cache()
+ # Begin to handle data...
+ if num_gpu_blocks == 0:
+ break
+ kv_cache_support_length = num_gpu_blocks * self.cache_config.block_size
+ # Too long input...
+ if max_num_batched_tokens > kv_cache_support_length:
+ break
+ support_input.append(max_num_batched_tokens)
+ support_kv_cache.append(kv_cache_support_length)
+ max_num_batched_tokens += 250
+
+ print(f"Recommended max input length: {support_input[len(support_input) - 1]}")
+ print(f"{'input length':<15} {'kv cache length':<15}")
+ print("-" * 30)
+
+ for inp, kv in zip(support_input, support_kv_cache):
+ print(f"{inp:<15} {kv:<15}")
torch.xpu.empty_cache()
+ before_memory = torch.xpu.memory_reserved()
# Execute a forward pass with dummy inputs to profile the memory usage
# of the model.
@@ -106,7 +108,7 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
- self.model_runner.profile_run()
+ self_max_num_batched_tokens = os.getenv("IPEX_LLM_SELF_MAX_NUM_BATCHED_TOKENS", None)
+ if self_max_num_batched_tokens is not None:
+ # If this is set, then profile using the max input length
+ max_num_batched_tokens = int(self_max_num_batched_tokens)
+ self_max_num_seqs = os.getenv("IPEX_LLM_SELF_MAX_NUM_SEQS", None)
+ if self_max_num_seqs is not None:
+ max_num_seqs = int(self_max_num_seqs)
+ else:
+ max_num_seqs = 1
+ self.model_runner.profile_run(max_num_batched_tokens, max_num_seqs)
+ else:
+ self.model_runner.profile_run()
# Calculate the number of blocks that can be allocated with the
# profiled peak memory.
torch.xpu.synchronize()
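
To make the stopping condition of the IPEX_LLM_FIND_MAX_LENGTH sweep above concrete, here is a small arithmetic sketch with made-up memory figures (none of these numbers come from the patch):

    # Illustrative numbers only: how a candidate input length is judged against
    # the KV-cache capacity left after the profile run.
    GiB = 1024 ** 3
    total_gpu_memory = 16 * GiB              # assumed device memory
    gpu_memory_utilization = 0.9
    peak_memory = 6 * GiB                    # assumed profile_run peak
    cache_block_size = 2 * 1024 ** 2         # assumed bytes per KV-cache block
    block_size = 16                          # tokens per block (cache_config.block_size)

    num_gpu_blocks = int((total_gpu_memory * gpu_memory_utilization - peak_memory)
                         // cache_block_size)
    kv_cache_support_length = num_gpu_blocks * block_size

    max_num_batched_tokens = 4000            # current sweep candidate
    # The sweep keeps growing the candidate by 250 tokens and stops once it
    # exceeds kv_cache_support_length (or no blocks remain), then reports the
    # last candidate that still fit.
    assert max_num_batched_tokens <= kv_cache_support_length
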
@@ -40746,7 +40796,7 @@ index 129566605..43d306145 100644
total_gpu_memory = torch.xpu.get_device_properties(
self.local_rank).total_memory
free_gpu_memory = total_gpu_memory - used_memory
@@ -130,6 +132,20 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
@@ -130,6 +189,20 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
num_cpu_blocks = max(num_cpu_blocks, 0)
gc.collect()
torch.xpu.empty_cache()
@@ -40767,7 +40817,7 @@ index 129566605..43d306145 100644
return num_gpu_blocks, num_cpu_blocks
def _warm_up_model(self) -> None:
@@ -175,4 +191,10 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
@@ -175,4 +248,10 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
parallel_config.tensor_parallel_size,
parallel_config.pipeline_parallel_size)
# global all_reduce needed for overall oneccl warm up
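
Finally, a hypothetical way to exercise the new environment variables from Python (not part of the patch; the model id and values are placeholders, and the variables must be set before the engine starts its XPU workers):

    # Hypothetical usage sketch for the knobs added in this patch.
    import os

    # Profile with a fixed input length / sequence count instead of the
    # scheduler defaults (max_num_seqs falls back to 1 when the second
    # variable is unset).
    os.environ["IPEX_LLM_SELF_MAX_NUM_BATCHED_TOKENS"] = "8192"
    os.environ["IPEX_LLM_SELF_MAX_NUM_SEQS"] = "1"

    # Or sweep from 4000 tokens in 250-token steps and let the worker print
    # the recommended max input length and KV-cache table.
    # os.environ["IPEX_LLM_FIND_MAX_LENGTH"] = "4000"

    from vllm import LLM                       # assumes the patched ipex-llm build
    llm = LLM(model="Qwen/Qwen2-7B-Instruct")  # placeholder model id
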