update vllm patch (#13064)

Guancheng Fu, 2025-04-10 15:03:37 +08:00, committed by GitHub
parent 1d7f4a83ac
commit 3ee6dec0f8
GPG key ID: B5690EEEBB952194 (no known key found for this signature in database)

@@ -39043,7 +39043,7 @@ index 3ac7fb8df..249b3ed2d 100644
                 and self.observability_config.collect_model_execute_time):
             output.tensors["model_execute_time"] = torch.tensor(
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
-index 9cf253875..df6ab56c6 100644
+index 9cf253875..34d098486 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -3,8 +3,8 @@ import time
@@ -39606,7 +39606,7 @@ index 9cf253875..df6ab56c6 100644
             self.sampling_metadata_cache: SamplingMetadataCache = \
                 SamplingMetadataCache() \
-@@ -415,10 +719,38 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -415,16 +719,74 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
         logger.info("Loading model weights took %.4f GB",
                     self.model_memory_usage / float(2**30))
@@ -39643,12 +39643,19 @@ index 9cf253875..df6ab56c6 100644
+        return rope_scaling.get("type", None) == "mrope" or rope_scaling.get("mrope_section", None) is not None
+
    @torch.inference_mode()
-    def profile_run(self) -> None:
+    def profile_run(self, num_batched_tokens=-1, num_seqs=-1) -> None:
        # Enable top-k sampling to reflect the accurate memory usage.
-@@ -426,6 +758,30 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
        sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1)
        max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
+        assert (num_batched_tokens == -1 or num_batched_tokens > 0)
+        assert (num_seqs == -1 or num_seqs > 0)
        max_num_seqs = self.scheduler_config.max_num_seqs
+        if num_batched_tokens != -1:
+            max_num_batched_tokens = num_batched_tokens
+        if num_seqs != -1:
+            max_num_seqs = num_seqs
+
+        # This represents the maximum number of different requests
+        # that will have unique loras, an therefore the max amount of memory
+        # consumption create dummy lora request copies from the lora request
+        # passed in, which contains a lora from the lora warmup path.
@@ -39672,27 +39679,10 @@ index 9cf253875..df6ab56c6 100644
+            dummy_lora_requests[idx % len(dummy_lora_requests)]
+            for idx in range(max_num_seqs)
+        ]
+
        # Profile memory usage with max_num_sequences sequences and the total
        # number of tokens equal to max_num_batched_tokens.
        seqs: List[SequenceGroupMetadata] = []
-@@ -450,6 +806,15 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
-        max_num_seqs = 1
-        batch_size = 0
-+        import os
-+        self_max_num_batched_tokens = os.getenv("IPEX_LLM_SELF_MAX_NUM_BATCHED_TOKENS", None)
-+        if self_max_num_batched_tokens is not None:
-+            max_num_batched_tokens = int(self_max_num_batched_tokens)
-+        self_max_num_seqs = os.getenv("IPEX_LLM_SELF_MAX_NUM_SEQS", None)
-+        if self_max_num_seqs is not None:
-+            max_num_seqs = int(self_max_num_seqs)
-+        else:
-+            max_num_seqs = 1
-        for group_id in range(max_num_seqs):
-            seq_len = (max_num_batched_tokens // max_num_seqs +
-                       (group_id < max_num_batched_tokens % max_num_seqs))
-@@ -466,7 +831,8 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -466,7 +828,8 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
                seq_data={group_id: dummy_data.seq_data},
                sampling_params=sampling_params,
                block_tables=None,
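Taken together, the hunks above move the choice of profiling limits out of XPUModelRunner.profile_run: the method now accepts explicit num_batched_tokens / num_seqs overrides (with -1 meaning "keep the scheduler defaults"), and the old in-runner reading of IPEX_LLM_SELF_MAX_NUM_BATCHED_TOKENS / IPEX_LLM_SELF_MAX_NUM_SEQS is dropped (that handling reappears in xpu_worker.py further down). A minimal standalone sketch of the override rule follows; resolve_profile_limits is a made-up helper name used only for illustration, not part of vLLM or this patch.

# Illustrative sketch only: mirrors the "-1 means use the default" convention added to profile_run.
def resolve_profile_limits(default_tokens, default_seqs,
                           num_batched_tokens=-1, num_seqs=-1):
    assert num_batched_tokens == -1 or num_batched_tokens > 0
    assert num_seqs == -1 or num_seqs > 0
    tokens = num_batched_tokens if num_batched_tokens != -1 else default_tokens
    seqs = num_seqs if num_seqs != -1 else default_seqs
    return tokens, seqs

print(resolve_profile_limits(8192, 256))           # (8192, 256): defaults kept
print(resolve_profile_limits(8192, 256, 4000, 1))  # (4000, 1): explicit overrides win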
@@ -39702,7 +39692,7 @@ index 9cf253875..df6ab56c6 100644
                multi_modal_data=dummy_data.multi_modal_data,
                multi_modal_placeholders=dummy_data.multi_modal_placeholders)
            seqs.append(seq)
-@@ -477,9 +843,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -477,9 +840,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
        # it by reference, rather by specializing on the value ``None``.
        # the `dtype` argument does not matter, and we use `float32` as
        # a placeholder (it has wide hardware support).
@@ -39713,7 +39703,7 @@ index 9cf253875..df6ab56c6 100644
        finished_requests_ids = [seq.request_id for seq in seqs]
        model_input = self.prepare_model_input(
            seqs, finished_requests_ids=finished_requests_ids)
-@@ -493,21 +857,35 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -493,21 +854,35 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
        torch.xpu.synchronize()
        return
@@ -39759,7 +39749,7 @@ index 9cf253875..df6ab56c6 100644
        """Helper method to prepare the model input based on a given sequence
        group. Prepares metadata needed for the base model forward pass but not
        metadata for possible additional steps, e.g., sampling.
-@@ -519,6 +897,22 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -519,6 +894,22 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
        return builder.build()  # type: ignore
@@ -39782,7 +39772,7 @@ index 9cf253875..df6ab56c6 100644
    def prepare_model_input(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
-@@ -558,6 +952,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -558,6 +949,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
            raise ValueError(
                "XPUModelRunner does not support multi-step execution.")
@@ -39795,7 +39785,7 @@ index 9cf253875..df6ab56c6 100644
        model_executable = self.model
        if (self.observability_config is not None
                and self.observability_config.collect_model_forward_time):
-@@ -607,3 +1007,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -607,3 +1004,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
            output.model_forward_time = model_forward_time
        return [output]
@@ -40700,7 +40690,7 @@ index 000000000..6ad951824
+    return model_input, worker_input, kwargs
\ No newline at end of file
diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py
-index 129566605..43d306145 100644
+index 129566605..fb7962dfe 100644
--- a/vllm/worker/xpu_worker.py
+++ b/vllm/worker/xpu_worker.py
@@ -3,7 +3,8 @@ import gc
@@ -40729,15 +40719,75 @@ index 129566605..43d306145 100644
    """A worker class that executes (a partition of) the model on a GPU.
    Each worker is associated with a single XPU device. The worker is
-@@ -98,6 +99,7 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
+@@ -97,16 +98,74 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
        """
        # Profile the memory usage of the model and get the maximum number of
        # cache blocks that can be allocated with the remaining free memory.
+        flag = int(os.getenv("IPEX_LLM_FIND_MAX_LENGTH", -1))
+        if flag != -1:
+            assert flag > 0
+            torch.xpu.empty_cache()
+            before_memory = torch.xpu.memory_reserved()
+            max_num_batched_tokens = flag
+            max_num_seqs = 1
+            support_input = []
+            support_kv_cache = []
+            while True:
+                print(f"Profiling with max_num_batched_tokens {max_num_batched_tokens}...")
+                self.model_runner.profile_run(max_num_batched_tokens, max_num_seqs)
+                torch.xpu.synchronize()
+                used_memory = torch.xpu.memory_reserved()
+                total_gpu_memory = torch.xpu.get_device_properties(
+                    self.local_rank).total_memory
+                free_gpu_memory = total_gpu_memory - used_memory
+                peak_memory = self.init_gpu_memory - free_gpu_memory
+                assert peak_memory > 0
+                cache_block_size = self.get_cache_block_size_bytes()
+                num_gpu_blocks = int(
+                    (total_gpu_memory * self.cache_config.gpu_memory_utilization -
+                     peak_memory) // cache_block_size)
+                num_cpu_blocks = int(self.cache_config.swap_space_bytes //
+                                     cache_block_size)
+                num_gpu_blocks = max(num_gpu_blocks, 0)
+                num_cpu_blocks = max(num_cpu_blocks, 0)
+                gc.collect()
+                torch.xpu.empty_cache()
+                # Begin to handle data...
+                if num_gpu_blocks == 0:
+                    break
+                kv_cache_support_length = num_gpu_blocks * self.cache_config.block_size
+                # Too long input...
+                if max_num_batched_tokens > kv_cache_support_length:
+                    break
+                support_input.append(max_num_batched_tokens)
+                support_kv_cache.append(kv_cache_support_length)
+                max_num_batched_tokens += 250
+
+            print(f"Recommended max input length: {support_input[len(support_input) - 1]}")
+            print(f"{'input length':<15} {'kv cache length':<15}")
+            print("-" * 30)
+
+            for inp, kv in zip(support_input, support_kv_cache):
+                print(f"{inp:<15} {kv:<15}")
        torch.xpu.empty_cache()
+        before_memory = torch.xpu.memory_reserved()
        # Execute a forward pass with dummy inputs to profile the memory usage
        # of the model.
-        self.model_runner.profile_run()
+        self_max_num_batched_tokens = os.getenv("IPEX_LLM_SELF_MAX_NUM_BATCHED_TOKENS", None)
+        if self_max_num_batched_tokens is not None:
+            # If this get set, then profile using max input length
+            max_num_batched_tokens = int(self_max_num_batched_tokens)
+            self_max_num_seqs = os.getenv("IPEX_LLM_SELF_MAX_NUM_SEQS", None)
+            if self_max_num_seqs is not None:
+                max_num_seqs = int(self_max_num_seqs)
+            else:
+                max_num_seqs = 1
+            self.model_runner.profile_run(max_num_batched_tokens, max_num_seqs)
+        else:
+            self.model_runner.profile_run()
        # Calculate the number of blocks that can be allocated with the
        # profiled peak memory.
        torch.xpu.synchronize()
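The block added above gives determine_num_available_blocks two environment-driven modes. With IPEX_LLM_FIND_MAX_LENGTH=<start>, the worker re-profiles repeatedly, growing the input length by 250 tokens per step, records how many tokens the leftover KV cache could hold at each size, and stops once the KV cache can no longer cover the input, printing a small table plus a recommended maximum input length. With IPEX_LLM_SELF_MAX_NUM_BATCHED_TOKENS (and optionally IPEX_LLM_SELF_MAX_NUM_SEQS), it simply forces the limits used for the single profiling pass. A rough standalone sketch of the search loop follows; find_supported_lengths, measure_gpu_blocks and the fake memory model are inventions that stand in for the profile_run / torch.xpu bookkeeping in the real code.

# Sketch of the IPEX_LLM_FIND_MAX_LENGTH search loop (illustrative only).
def find_supported_lengths(start_tokens, block_size, measure_gpu_blocks, step=250):
    support_input, support_kv_cache = [], []
    tokens = start_tokens
    while True:
        num_gpu_blocks = measure_gpu_blocks(tokens)  # stands in for a profiling pass
        if num_gpu_blocks == 0:
            break
        kv_cache_len = num_gpu_blocks * block_size
        if tokens > kv_cache_len:  # input longer than the cache could hold
            break
        support_input.append(tokens)
        support_kv_cache.append(kv_cache_len)
        tokens += step
    return support_input, support_kv_cache

# Fake memory model: spending more tokens on the input leaves fewer cache blocks.
inputs, caches = find_supported_lengths(1000, 16, lambda t: max(4000 - t // 8, 0))
print(f"Recommended max input length: {inputs[-1]}")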
@@ -40746,7 +40796,7 @@ index 129566605..43d306145 100644
        total_gpu_memory = torch.xpu.get_device_properties(
            self.local_rank).total_memory
        free_gpu_memory = total_gpu_memory - used_memory
-@@ -130,6 +132,20 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
+@@ -130,6 +189,20 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
        num_cpu_blocks = max(num_cpu_blocks, 0)
        gc.collect()
        torch.xpu.empty_cache()
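For reference, the block budget that both the search loop and the normal path work from follows the formula visible in the added code above: gpu_blocks = (total_gpu_memory * gpu_memory_utilization - peak_memory_during_profiling) // cache_block_size, clamped at zero, after which the KV cache can hold gpu_blocks * block_size tokens. A tiny numeric sketch with invented figures (only the arithmetic mirrors the patch):

# Invented numbers; only the formula matches the code above.
total_gpu_memory = 16 * 1024**3          # 16 GiB device
gpu_memory_utilization = 0.9             # cache_config.gpu_memory_utilization
peak_memory = 6 * 1024**3                # measured around the profiling pass
cache_block_size = 2 * 1024**2           # bytes per KV-cache block (model dependent)
block_size = 16                          # tokens per block (assumed)

num_gpu_blocks = max(int((total_gpu_memory * gpu_memory_utilization
                          - peak_memory) // cache_block_size), 0)
print(num_gpu_blocks, num_gpu_blocks * block_size)  # blocks and token capacity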
@@ -40767,7 +40817,7 @@ index 129566605..43d306145 100644
        return num_gpu_blocks, num_cpu_blocks
    def _warm_up_model(self) -> None:
-@@ -175,4 +191,10 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
+@@ -175,4 +248,10 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
            parallel_config.tensor_parallel_size,
            parallel_config.pipeline_parallel_size)
        # global all_reduce needed for overall oneccl warm up