update vllm patch (#13064)
This commit is contained in:
parent 1d7f4a83ac
commit 3ee6dec0f8
1 changed file with 84 additions and 34 deletions
@@ -39043,7 +39043,7 @@ index 3ac7fb8df..249b3ed2d 100644
and self.observability_config.collect_model_execute_time):
output.tensors["model_execute_time"] = torch.tensor(
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index 9cf253875..df6ab56c6 100644
index 9cf253875..34d098486 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -3,8 +3,8 @@ import time
@@ -39606,7 +39606,7 @@ index 9cf253875..df6ab56c6 100644
self.sampling_metadata_cache: SamplingMetadataCache = \
SamplingMetadataCache() \
@@ -415,10 +719,38 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
@@ -415,16 +719,74 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
logger.info("Loading model weights took %.4f GB",
self.model_memory_usage / float(2**30))
@@ -39643,12 +39643,19 @@ index 9cf253875..df6ab56c6 100644
+ return rope_scaling.get("type", None) == "mrope" or rope_scaling.get("mrope_section", None) is not None
+
@torch.inference_mode()
def profile_run(self) -> None:
- def profile_run(self) -> None:
+ def profile_run(self, num_batched_tokens=-1, num_seqs=-1) -> None:
# Enable top-k sampling to reflect the accurate memory usage.
@@ -426,6 +758,30 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1)
max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
+ assert (num_batched_tokens == -1 or num_batched_tokens > 0)
+ assert (num_seqs == -1 or num_seqs > 0)
max_num_seqs = self.scheduler_config.max_num_seqs
+ if num_batched_tokens != -1:
+ max_num_batched_tokens = num_batched_tokens
+ if num_seqs != -1:
+ max_num_seqs = num_seqs
+
+ # This represents the maximum number of different requests
+ # that will have unique loras, an therefore the max amount of memory
+ # consumption create dummy lora request copies from the lora request
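As a reading aid for the hunk above (not part of the patch): the updated profile_run takes optional num_batched_tokens and num_seqs arguments, where -1 keeps the scheduler-config defaults. A minimal sketch of that override logic, with resolve_profile_sizes being an invented helper name and the surrounding class omitted:

def resolve_profile_sizes(scheduler_max_tokens, scheduler_max_seqs,
                          num_batched_tokens=-1, num_seqs=-1):
    # -1 means "fall back to the scheduler config", matching the asserts in the diff.
    assert num_batched_tokens == -1 or num_batched_tokens > 0
    assert num_seqs == -1 or num_seqs > 0
    max_num_batched_tokens = scheduler_max_tokens
    max_num_seqs = scheduler_max_seqs
    if num_batched_tokens != -1:
        max_num_batched_tokens = num_batched_tokens
    if num_seqs != -1:
        max_num_seqs = num_seqs
    return max_num_batched_tokens, max_num_seqs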
@@ -39672,27 +39679,10 @@ index 9cf253875..df6ab56c6 100644
+ dummy_lora_requests[idx % len(dummy_lora_requests)]
+ for idx in range(max_num_seqs)
+ ]
+
# Profile memory usage with max_num_sequences sequences and the total
# number of tokens equal to max_num_batched_tokens.
seqs: List[SequenceGroupMetadata] = []
@@ -450,6 +806,15 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
max_num_seqs = 1
batch_size = 0
+ import os
+ self_max_num_batched_tokens = os.getenv("IPEX_LLM_SELF_MAX_NUM_BATCHED_TOKENS", None)
+ if self_max_num_batched_tokens is not None:
+ max_num_batched_tokens = int(self_max_num_batched_tokens)
+ self_max_num_seqs = os.getenv("IPEX_LLM_SELF_MAX_NUM_SEQS", None)
+ if self_max_num_seqs is not None:
+ max_num_seqs = int(self_max_num_seqs)
+ else:
+ max_num_seqs = 1
for group_id in range(max_num_seqs):
seq_len = (max_num_batched_tokens // max_num_seqs +
(group_id < max_num_batched_tokens % max_num_seqs))
@@ -466,7 +831,8 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
@@ -466,7 +828,8 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
seq_data={group_id: dummy_data.seq_data},
sampling_params=sampling_params,
block_tables=None,
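A note on the environment overrides introduced above (again not part of the patch): indentation is lost in this view, so the sketch below assumes the same nesting as the matching hunk in xpu_worker.py further down, and apply_env_overrides is an invented name. It shows how the two IPEX_LLM_* variables appear to feed the dummy profiling batch:

import os

def apply_env_overrides(max_num_batched_tokens, max_num_seqs):
    env_tokens = os.getenv("IPEX_LLM_SELF_MAX_NUM_BATCHED_TOKENS", None)
    if env_tokens is not None:
        max_num_batched_tokens = int(env_tokens)
        env_seqs = os.getenv("IPEX_LLM_SELF_MAX_NUM_SEQS", None)
        # Per the diff, the sequence count falls back to 1 when only the
        # token budget is pinned.
        max_num_seqs = int(env_seqs) if env_seqs is not None else 1
    return max_num_batched_tokens, max_num_seqs

# Example: a caller could pin the profiling shape before engine start-up,
# e.g. os.environ["IPEX_LLM_SELF_MAX_NUM_BATCHED_TOKENS"] = "4096".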
@@ -39702,7 +39692,7 @@ index 9cf253875..df6ab56c6 100644
multi_modal_data=dummy_data.multi_modal_data,
multi_modal_placeholders=dummy_data.multi_modal_placeholders)
seqs.append(seq)
@@ -477,9 +843,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
@@ -477,9 +840,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
# it by reference, rather by specializing on the value ``None``.
# the `dtype` argument does not matter, and we use `float32` as
# a placeholder (it has wide hardware support).
@@ -39713,7 +39703,7 @@ index 9cf253875..df6ab56c6 100644
finished_requests_ids = [seq.request_id for seq in seqs]
model_input = self.prepare_model_input(
seqs, finished_requests_ids=finished_requests_ids)
@@ -493,21 +857,35 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
@@ -493,21 +854,35 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
torch.xpu.synchronize()
return
@@ -39759,7 +39749,7 @@ index 9cf253875..df6ab56c6 100644
"""Helper method to prepare the model input based on a given sequence
group. Prepares metadata needed for the base model forward pass but not
metadata for possible additional steps, e.g., sampling.
@@ -519,6 +897,22 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
@@ -519,6 +894,22 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
return builder.build() # type: ignore
@@ -39782,7 +39772,7 @@ index 9cf253875..df6ab56c6 100644
def prepare_model_input(
self,
seq_group_metadata_list: List[SequenceGroupMetadata],
@@ -558,6 +952,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
@@ -558,6 +949,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
raise ValueError(
"XPUModelRunner does not support multi-step execution.")
@@ -39795,7 +39785,7 @@ index 9cf253875..df6ab56c6 100644
model_executable = self.model
if (self.observability_config is not None
and self.observability_config.collect_model_forward_time):
@@ -607,3 +1007,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
@@ -607,3 +1004,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
output.model_forward_time = model_forward_time
return [output]
@@ -40700,7 +40690,7 @@ index 000000000..6ad951824
+ return model_input, worker_input, kwargs
\ No newline at end of file
diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py
index 129566605..43d306145 100644
index 129566605..fb7962dfe 100644
--- a/vllm/worker/xpu_worker.py
+++ b/vllm/worker/xpu_worker.py
@@ -3,7 +3,8 @@ import gc
@@ -40729,15 +40719,75 @@ index 129566605..43d306145 100644
"""A worker class that executes (a partition of) the model on a GPU.
Each worker is associated with a single XPU device. The worker is
@@ -98,6 +99,7 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
@@ -97,16 +98,74 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
"""
# Profile the memory usage of the model and get the maximum number of
# cache blocks that can be allocated with the remaining free memory.
+ flag = int(os.getenv("IPEX_LLM_FIND_MAX_LENGTH", -1))
+ if flag != -1:
+ assert flag > 0
+ torch.xpu.empty_cache()
+ before_memory = torch.xpu.memory_reserved()
+ max_num_batched_tokens = flag
+ max_num_seqs = 1
+ support_input = []
+ support_kv_cache = []
+ while True:
+ print(f"Profiling with max_num_batched_tokens {max_num_batched_tokens}...")
+ self.model_runner.profile_run(max_num_batched_tokens, max_num_seqs)
+ torch.xpu.synchronize()
+ used_memory = torch.xpu.memory_reserved()
+ total_gpu_memory = torch.xpu.get_device_properties(
+ self.local_rank).total_memory
+ free_gpu_memory = total_gpu_memory - used_memory
+ peak_memory = self.init_gpu_memory - free_gpu_memory
+ assert peak_memory > 0
+ cache_block_size = self.get_cache_block_size_bytes()
+ num_gpu_blocks = int(
+ (total_gpu_memory * self.cache_config.gpu_memory_utilization -
+ peak_memory) // cache_block_size)
+ num_cpu_blocks = int(self.cache_config.swap_space_bytes //
+ cache_block_size)
+ num_gpu_blocks = max(num_gpu_blocks, 0)
+ num_cpu_blocks = max(num_cpu_blocks, 0)
+ gc.collect()
+ torch.xpu.empty_cache()
+ # Begin to handle data...
+ if num_gpu_blocks == 0:
+ break
+ kv_cache_support_length = num_gpu_blocks * self.cache_config.block_size
+ # Too long input...
+ if max_num_batched_tokens > kv_cache_support_length:
+ break
+ support_input.append(max_num_batched_tokens)
+ support_kv_cache.append(kv_cache_support_length)
+ max_num_batched_tokens += 250
+
+ print(f"Recommended max input length: {support_input[len(support_input) - 1]}")
+ print(f"{'input length':<15} {'kv cache length':<15}")
+ print("-" * 30)
+
+ for inp, kv in zip(support_input, support_kv_cache):
+ print(f"{inp:<15} {kv:<15}")
torch.xpu.empty_cache()
+ before_memory = torch.xpu.memory_reserved()
# Execute a forward pass with dummy inputs to profile the memory usage
# of the model.
@@ -106,7 +108,7 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
- self.model_runner.profile_run()
+ self_max_num_batched_tokens = os.getenv("IPEX_LLM_SELF_MAX_NUM_BATCHED_TOKENS", None)
+ if self_max_num_batched_tokens is not None:
+ # If this get set, then profile using max input length
+ max_num_batched_tokens = int(self_max_num_batched_tokens)
+ self_max_num_seqs = os.getenv("IPEX_LLM_SELF_MAX_NUM_SEQS", None)
+ if self_max_num_seqs is not None:
+ max_num_seqs = int(self_max_num_seqs)
+ else:
+ max_num_seqs = 1
+ self.model_runner.profile_run(max_num_batched_tokens, max_num_seqs)
+ else:
+ self.model_runner.profile_run()
# Calculate the number of blocks that can be allocated with the
# profiled peak memory.
torch.xpu.synchronize()
|
|||
total_gpu_memory = torch.xpu.get_device_properties(
|
||||
self.local_rank).total_memory
|
||||
free_gpu_memory = total_gpu_memory - used_memory
|
||||
@@ -130,6 +132,20 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
|
||||
@@ -130,6 +189,20 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
|
||||
num_cpu_blocks = max(num_cpu_blocks, 0)
|
||||
gc.collect()
|
||||
torch.xpu.empty_cache()
|
||||
|
|
@ -40767,7 +40817,7 @@ index 129566605..43d306145 100644
|
|||
return num_gpu_blocks, num_cpu_blocks
|
||||
|
||||
def _warm_up_model(self) -> None:
|
||||
@@ -175,4 +191,10 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
|
||||
@@ -175,4 +248,10 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
|
||||
parallel_config.tensor_parallel_size,
|
||||
parallel_config.pipeline_parallel_size)
|
||||
# global all_reduce needed for overall oneccl warm up