update vllm patch (#13064)
parent 1d7f4a83ac
commit 3ee6dec0f8

1 changed file with 84 additions and 34 deletions
@@ -39043,7 +39043,7 @@ index 3ac7fb8df..249b3ed2d 100644
 and self.observability_config.collect_model_execute_time):
 output.tensors["model_execute_time"] = torch.tensor(
 diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
-index 9cf253875..df6ab56c6 100644
+index 9cf253875..34d098486 100644
 --- a/vllm/worker/xpu_model_runner.py
 +++ b/vllm/worker/xpu_model_runner.py
 @@ -3,8 +3,8 @@ import time
@@ -39606,7 +39606,7 @@ index 9cf253875..df6ab56c6 100644

 self.sampling_metadata_cache: SamplingMetadataCache = \
 SamplingMetadataCache() \
-@@ -415,10 +719,38 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -415,16 +719,74 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
 logger.info("Loading model weights took %.4f GB",
 self.model_memory_usage / float(2**30))

@@ -39643,12 +39643,19 @@ index 9cf253875..df6ab56c6 100644
 + return rope_scaling.get("type", None) == "mrope" or rope_scaling.get("mrope_section", None) is not None
 +
 @torch.inference_mode()
-def profile_run(self) -> None:
+- def profile_run(self) -> None:
++ def profile_run(self, num_batched_tokens=-1, num_seqs=-1) -> None:
 # Enable top-k sampling to reflect the accurate memory usage.
-@@ -426,6 +758,30 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1)
 max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
++ assert (num_batched_tokens == -1 or num_batched_tokens > 0)
++ assert (num_seqs == -1 or num_seqs > 0)
 max_num_seqs = self.scheduler_config.max_num_seqs
++ if num_batched_tokens != -1:
++ max_num_batched_tokens = num_batched_tokens
++ if num_seqs != -1:
++ max_num_seqs = num_seqs
++
 + # This represents the maximum number of different requests
 + # that will have unique loras, an therefore the max amount of memory
 + # consumption create dummy lora request copies from the lora request
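Note: the hunk above changes the patched profile_run() signature in xpu_model_runner.py. A minimal standalone sketch of the override semantics carried by the added lines (helper name hypothetical, logic taken from the additions; -1 keeps the scheduler defaults):

    def resolve_profile_sizes(scheduler_config, num_batched_tokens=-1, num_seqs=-1):
        # -1 means "keep the scheduler_config value"; any explicit value must be positive.
        assert num_batched_tokens == -1 or num_batched_tokens > 0
        assert num_seqs == -1 or num_seqs > 0
        max_num_batched_tokens = scheduler_config.max_num_batched_tokens
        max_num_seqs = scheduler_config.max_num_seqs
        if num_batched_tokens != -1:
            max_num_batched_tokens = num_batched_tokens
        if num_seqs != -1:
            max_num_seqs = num_seqs
        return max_num_batched_tokens, max_num_seqs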
@@ -39672,27 +39679,10 @@ index 9cf253875..df6ab56c6 100644
 + dummy_lora_requests[idx % len(dummy_lora_requests)]
 + for idx in range(max_num_seqs)
 + ]
-+
+
 # Profile memory usage with max_num_sequences sequences and the total
 # number of tokens equal to max_num_batched_tokens.
-seqs: List[SequenceGroupMetadata] = []
-@@ -450,6 +806,15 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
-max_num_seqs = 1
-
-batch_size = 0
-+ import os
-+ self_max_num_batched_tokens = os.getenv("IPEX_LLM_SELF_MAX_NUM_BATCHED_TOKENS", None)
-+ if self_max_num_batched_tokens is not None:
-+ max_num_batched_tokens = int(self_max_num_batched_tokens)
-+ self_max_num_seqs = os.getenv("IPEX_LLM_SELF_MAX_NUM_SEQS", None)
-+ if self_max_num_seqs is not None:
-+ max_num_seqs = int(self_max_num_seqs)
-+ else:
-+ max_num_seqs = 1
-for group_id in range(max_num_seqs):
-seq_len = (max_num_batched_tokens // max_num_seqs +
-(group_id < max_num_batched_tokens % max_num_seqs))
-@@ -466,7 +831,8 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -466,7 +828,8 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
 seq_data={group_id: dummy_data.seq_data},
 sampling_params=sampling_params,
 block_tables=None,
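Note: the environment-variable block deleted above is not lost functionality; the same IPEX_LLM_SELF_MAX_NUM_BATCHED_TOKENS / IPEX_LLM_SELF_MAX_NUM_SEQS variables are now read in the xpu_worker.py hunk further down and forwarded to profile_run() as arguments. A minimal sketch of that lookup (wrapper name hypothetical, behavior taken from the worker hunk below):

    import os

    def env_profile_args():
        # Returns (num_batched_tokens, num_seqs) for profile_run(); (-1, -1) keeps the defaults.
        tokens = os.getenv("IPEX_LLM_SELF_MAX_NUM_BATCHED_TOKENS")
        if tokens is None:
            return -1, -1
        seqs = os.getenv("IPEX_LLM_SELF_MAX_NUM_SEQS")
        return int(tokens), int(seqs) if seqs is not None else 1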
@@ -39702,7 +39692,7 @@ index 9cf253875..df6ab56c6 100644
 multi_modal_data=dummy_data.multi_modal_data,
 multi_modal_placeholders=dummy_data.multi_modal_placeholders)
 seqs.append(seq)
-@@ -477,9 +843,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -477,9 +840,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
 # it by reference, rather by specializing on the value ``None``.
 # the `dtype` argument does not matter, and we use `float32` as
 # a placeholder (it has wide hardware support).
@@ -39713,7 +39703,7 @@ index 9cf253875..df6ab56c6 100644
 finished_requests_ids = [seq.request_id for seq in seqs]
 model_input = self.prepare_model_input(
 seqs, finished_requests_ids=finished_requests_ids)
-@@ -493,21 +857,35 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -493,21 +854,35 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
 torch.xpu.synchronize()
 return

@@ -39759,7 +39749,7 @@ index 9cf253875..df6ab56c6 100644
 """Helper method to prepare the model input based on a given sequence
 group. Prepares metadata needed for the base model forward pass but not
 metadata for possible additional steps, e.g., sampling.
-@@ -519,6 +897,22 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -519,6 +894,22 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):

 return builder.build() # type: ignore

@@ -39782,7 +39772,7 @@ index 9cf253875..df6ab56c6 100644
 def prepare_model_input(
 self,
 seq_group_metadata_list: List[SequenceGroupMetadata],
-@@ -558,6 +952,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -558,6 +949,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
 raise ValueError(
 "XPUModelRunner does not support multi-step execution.")

@@ -39795,7 +39785,7 @@ index 9cf253875..df6ab56c6 100644
 model_executable = self.model
 if (self.observability_config is not None
 and self.observability_config.collect_model_forward_time):
-@@ -607,3 +1007,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -607,3 +1004,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
 output.model_forward_time = model_forward_time

 return [output]
@@ -40700,7 +40690,7 @@ index 000000000..6ad951824
 + return model_input, worker_input, kwargs
 \ No newline at end of file
 diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py
-index 129566605..43d306145 100644
+index 129566605..fb7962dfe 100644
 --- a/vllm/worker/xpu_worker.py
 +++ b/vllm/worker/xpu_worker.py
 @@ -3,7 +3,8 @@ import gc
@@ -40729,15 +40719,75 @@ index 129566605..43d306145 100644
 """A worker class that executes (a partition of) the model on a GPU.

 Each worker is associated with a single XPU device. The worker is
-@@ -98,6 +99,7 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
+@@ -97,16 +98,74 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
+"""
 # Profile the memory usage of the model and get the maximum number of
 # cache blocks that can be allocated with the remaining free memory.
++ flag = int(os.getenv("IPEX_LLM_FIND_MAX_LENGTH", -1))
++ if flag != -1:
++ assert flag > 0
++ torch.xpu.empty_cache()
++ before_memory = torch.xpu.memory_reserved()
++ max_num_batched_tokens = flag
++ max_num_seqs = 1
++ support_input = []
++ support_kv_cache = []
++ while True:
++ print(f"Profiling with max_num_batched_tokens {max_num_batched_tokens}...")
++ self.model_runner.profile_run(max_num_batched_tokens, max_num_seqs)
++ torch.xpu.synchronize()
++ used_memory = torch.xpu.memory_reserved()
++ total_gpu_memory = torch.xpu.get_device_properties(
++ self.local_rank).total_memory
++ free_gpu_memory = total_gpu_memory - used_memory
++ peak_memory = self.init_gpu_memory - free_gpu_memory
++ assert peak_memory > 0
++ cache_block_size = self.get_cache_block_size_bytes()
++ num_gpu_blocks = int(
++ (total_gpu_memory * self.cache_config.gpu_memory_utilization -
++ peak_memory) // cache_block_size)
++ num_cpu_blocks = int(self.cache_config.swap_space_bytes //
++ cache_block_size)
++ num_gpu_blocks = max(num_gpu_blocks, 0)
++ num_cpu_blocks = max(num_cpu_blocks, 0)
++ gc.collect()
++ torch.xpu.empty_cache()
++ # Begin to handle data...
++ if num_gpu_blocks == 0:
++ break
++ kv_cache_support_length = num_gpu_blocks * self.cache_config.block_size
++ # Too long input...
++ if max_num_batched_tokens > kv_cache_support_length:
++ break
++ support_input.append(max_num_batched_tokens)
++ support_kv_cache.append(kv_cache_support_length)
++ max_num_batched_tokens += 250
++
++ print(f"Recommended max input length: {support_input[len(support_input) - 1]}")
++ print(f"{'input length':<15} {'kv cache length':<15}")
++ print("-" * 30)
++
++ for inp, kv in zip(support_input, support_kv_cache):
++ print(f"{inp:<15} {kv:<15}")
 torch.xpu.empty_cache()
 + before_memory = torch.xpu.memory_reserved()

 # Execute a forward pass with dummy inputs to profile the memory usage
 # of the model.
-@@ -106,7 +108,7 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
+- self.model_runner.profile_run()
++ self_max_num_batched_tokens = os.getenv("IPEX_LLM_SELF_MAX_NUM_BATCHED_TOKENS", None)
++ if self_max_num_batched_tokens is not None:
++ # If this get set, then profile using max input length
++ max_num_batched_tokens = int(self_max_num_batched_tokens)
++ self_max_num_seqs = os.getenv("IPEX_LLM_SELF_MAX_NUM_SEQS", None)
++ if self_max_num_seqs is not None:
++ max_num_seqs = int(self_max_num_seqs)
++ else:
++ max_num_seqs = 1
++ self.model_runner.profile_run(max_num_batched_tokens, max_num_seqs)
++ else:
++ self.model_runner.profile_run()
+
 # Calculate the number of blocks that can be allocated with the
 # profiled peak memory.
 torch.xpu.synchronize()
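Note: the hunk above adds two environment-driven profiling modes to the patched XPUWorker. A hedged usage sketch (variable names and the 250-token search step come from the hunk; the concrete values and the "start the engine afterwards" comment are only illustrative):

    import os

    # Search mode: probe growing input lengths starting from this value and print,
    # for each length that still fits in memory, the resulting KV-cache capacity.
    os.environ["IPEX_LLM_FIND_MAX_LENGTH"] = "4000"

    # Fixed mode: profile with an explicit token/sequence budget instead of the
    # scheduler defaults (max_num_seqs falls back to 1 when only the first is set).
    # os.environ["IPEX_LLM_SELF_MAX_NUM_BATCHED_TOKENS"] = "8192"
    # os.environ["IPEX_LLM_SELF_MAX_NUM_SEQS"] = "1"

    # ...then create the vLLM engine / start serving as usual; the variables are read
    # during the worker's memory-profiling step, before the KV cache is allocated.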
@@ -40746,7 +40796,7 @@ index 129566605..43d306145 100644
 total_gpu_memory = torch.xpu.get_device_properties(
 self.local_rank).total_memory
 free_gpu_memory = total_gpu_memory - used_memory
-@@ -130,6 +132,20 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
+@@ -130,6 +189,20 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
 num_cpu_blocks = max(num_cpu_blocks, 0)
 gc.collect()
 torch.xpu.empty_cache()
@@ -40767,7 +40817,7 @@ index 129566605..43d306145 100644
 return num_gpu_blocks, num_cpu_blocks

 def _warm_up_model(self) -> None:
-@@ -175,4 +191,10 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
+@@ -175,4 +248,10 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
 parallel_config.tensor_parallel_size,
 parallel_config.pipeline_parallel_size)
 # global all_reduce needed for overall oneccl warm up