diff --git a/docker/llm/serving/xpu/docker/vllm_for_multi_arc.patch b/docker/llm/serving/xpu/docker/vllm_for_multi_arc.patch index 2f4635a9..0410b8d4 100644 --- a/docker/llm/serving/xpu/docker/vllm_for_multi_arc.patch +++ b/docker/llm/serving/xpu/docker/vllm_for_multi_arc.patch @@ -39043,7 +39043,7 @@ index 3ac7fb8df..249b3ed2d 100644 and self.observability_config.collect_model_execute_time): output.tensors["model_execute_time"] = torch.tensor( diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py -index 9cf253875..df6ab56c6 100644 +index 9cf253875..34d098486 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -3,8 +3,8 @@ import time @@ -39606,7 +39606,7 @@ index 9cf253875..df6ab56c6 100644 self.sampling_metadata_cache: SamplingMetadataCache = \ SamplingMetadataCache() \ -@@ -415,10 +719,38 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): +@@ -415,16 +719,74 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): logger.info("Loading model weights took %.4f GB", self.model_memory_usage / float(2**30)) @@ -39643,12 +39643,19 @@ index 9cf253875..df6ab56c6 100644 + return rope_scaling.get("type", None) == "mrope" or rope_scaling.get("mrope_section", None) is not None + @torch.inference_mode() - def profile_run(self) -> None: +- def profile_run(self) -> None: ++ def profile_run(self, num_batched_tokens=-1, num_seqs=-1) -> None: # Enable top-k sampling to reflect the accurate memory usage. -@@ -426,6 +758,30 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): + sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens ++ assert (num_batched_tokens == -1 or num_batched_tokens > 0) ++ assert (num_seqs == -1 or num_seqs > 0) max_num_seqs = self.scheduler_config.max_num_seqs - ++ if num_batched_tokens != -1: ++ max_num_batched_tokens = num_batched_tokens ++ if num_seqs != -1: ++ max_num_seqs = num_seqs ++ + # This represents the maximum number of different requests + # that will have unique loras, an therefore the max amount of memory + # consumption create dummy lora request copies from the lora request @@ -39672,27 +39679,10 @@ index 9cf253875..df6ab56c6 100644 + dummy_lora_requests[idx % len(dummy_lora_requests)] + for idx in range(max_num_seqs) + ] -+ + # Profile memory usage with max_num_sequences sequences and the total # number of tokens equal to max_num_batched_tokens. 
- seqs: List[SequenceGroupMetadata] = [] -@@ -450,6 +806,15 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): - max_num_seqs = 1 - - batch_size = 0 -+ import os -+ self_max_num_batched_tokens = os.getenv("IPEX_LLM_SELF_MAX_NUM_BATCHED_TOKENS", None) -+ if self_max_num_batched_tokens is not None: -+ max_num_batched_tokens = int(self_max_num_batched_tokens) -+ self_max_num_seqs = os.getenv("IPEX_LLM_SELF_MAX_NUM_SEQS", None) -+ if self_max_num_seqs is not None: -+ max_num_seqs = int(self_max_num_seqs) -+ else: -+ max_num_seqs = 1 - for group_id in range(max_num_seqs): - seq_len = (max_num_batched_tokens // max_num_seqs + - (group_id < max_num_batched_tokens % max_num_seqs)) -@@ -466,7 +831,8 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): +@@ -466,7 +828,8 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): seq_data={group_id: dummy_data.seq_data}, sampling_params=sampling_params, block_tables=None, @@ -39702,7 +39692,7 @@ index 9cf253875..df6ab56c6 100644 multi_modal_data=dummy_data.multi_modal_data, multi_modal_placeholders=dummy_data.multi_modal_placeholders) seqs.append(seq) -@@ -477,9 +843,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): +@@ -477,9 +840,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): # it by reference, rather by specializing on the value ``None``. # the `dtype` argument does not matter, and we use `float32` as # a placeholder (it has wide hardware support). @@ -39713,7 +39703,7 @@ index 9cf253875..df6ab56c6 100644 finished_requests_ids = [seq.request_id for seq in seqs] model_input = self.prepare_model_input( seqs, finished_requests_ids=finished_requests_ids) -@@ -493,21 +857,35 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): +@@ -493,21 +854,35 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): torch.xpu.synchronize() return @@ -39759,7 +39749,7 @@ index 9cf253875..df6ab56c6 100644 """Helper method to prepare the model input based on a given sequence group. Prepares metadata needed for the base model forward pass but not metadata for possible additional steps, e.g., sampling. 
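Note on the profile_run() change above: the new num_batched_tokens / num_seqs parameters follow a "-1 means use the scheduler-config default" convention, with positive values overriding it. A minimal standalone sketch of that convention (the helper name and example values are illustrative, not part of the patch):

def resolve_profile_sizes(scheduler_max_tokens, scheduler_max_seqs,
                          num_batched_tokens=-1, num_seqs=-1):
    # -1 keeps the scheduler-config defaults; any positive value overrides them,
    # mirroring the asserts added to XPUModelRunner.profile_run() above.
    assert num_batched_tokens == -1 or num_batched_tokens > 0
    assert num_seqs == -1 or num_seqs > 0
    tokens = num_batched_tokens if num_batched_tokens != -1 else scheduler_max_tokens
    seqs = num_seqs if num_seqs != -1 else scheduler_max_seqs
    return tokens, seqs

# Example: resolve_profile_sizes(8192, 256)          -> (8192, 256)
#          resolve_profile_sizes(8192, 256, 4000, 1) -> (4000, 1)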
-@@ -519,6 +897,22 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): +@@ -519,6 +894,22 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): return builder.build() # type: ignore @@ -39782,7 +39772,7 @@ index 9cf253875..df6ab56c6 100644 def prepare_model_input( self, seq_group_metadata_list: List[SequenceGroupMetadata], -@@ -558,6 +952,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): +@@ -558,6 +949,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): raise ValueError( "XPUModelRunner does not support multi-step execution.") @@ -39795,7 +39785,7 @@ index 9cf253875..df6ab56c6 100644 model_executable = self.model if (self.observability_config is not None and self.observability_config.collect_model_forward_time): -@@ -607,3 +1007,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): +@@ -607,3 +1004,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): output.model_forward_time = model_forward_time return [output] @@ -40700,7 +40690,7 @@ index 000000000..6ad951824 + return model_input, worker_input, kwargs \ No newline at end of file diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py -index 129566605..43d306145 100644 +index 129566605..fb7962dfe 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -3,7 +3,8 @@ import gc @@ -40729,15 +40719,75 @@ index 129566605..43d306145 100644 """A worker class that executes (a partition of) the model on a GPU. Each worker is associated with a single XPU device. The worker is -@@ -98,6 +99,7 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker): +@@ -97,16 +98,74 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker): + """ # Profile the memory usage of the model and get the maximum number of # cache blocks that can be allocated with the remaining free memory. ++ flag = int(os.getenv("IPEX_LLM_FIND_MAX_LENGTH", -1)) ++ if flag != -1: ++ assert flag > 0 ++ torch.xpu.empty_cache() ++ before_memory = torch.xpu.memory_reserved() ++ max_num_batched_tokens = flag ++ max_num_seqs = 1 ++ support_input = [] ++ support_kv_cache = [] ++ while True: ++ print(f"Profiling with max_num_batched_tokens {max_num_batched_tokens}...") ++ self.model_runner.profile_run(max_num_batched_tokens, max_num_seqs) ++ torch.xpu.synchronize() ++ used_memory = torch.xpu.memory_reserved() ++ total_gpu_memory = torch.xpu.get_device_properties( ++ self.local_rank).total_memory ++ free_gpu_memory = total_gpu_memory - used_memory ++ peak_memory = self.init_gpu_memory - free_gpu_memory ++ assert peak_memory > 0 ++ cache_block_size = self.get_cache_block_size_bytes() ++ num_gpu_blocks = int( ++ (total_gpu_memory * self.cache_config.gpu_memory_utilization - ++ peak_memory) // cache_block_size) ++ num_cpu_blocks = int(self.cache_config.swap_space_bytes // ++ cache_block_size) ++ num_gpu_blocks = max(num_gpu_blocks, 0) ++ num_cpu_blocks = max(num_cpu_blocks, 0) ++ gc.collect() ++ torch.xpu.empty_cache() ++ # Begin to handle data... ++ if num_gpu_blocks == 0: ++ break ++ kv_cache_support_length = num_gpu_blocks * self.cache_config.block_size ++ # Too long input... 
++ if max_num_batched_tokens > kv_cache_support_length: ++ break ++ support_input.append(max_num_batched_tokens) ++ support_kv_cache.append(kv_cache_support_length) ++ max_num_batched_tokens += 250 ++ ++ print(f"Recommended max input length: {support_input[len(support_input) - 1]}") ++ print(f"{'input length':<15} {'kv cache length':<15}") ++ print("-" * 30) ++ ++ for inp, kv in zip(support_input, support_kv_cache): ++ print(f"{inp:<15} {kv:<15}") torch.xpu.empty_cache() + before_memory = torch.xpu.memory_reserved() # Execute a forward pass with dummy inputs to profile the memory usage # of the model. -@@ -106,7 +108,7 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker): +- self.model_runner.profile_run() ++ self_max_num_batched_tokens = os.getenv("IPEX_LLM_SELF_MAX_NUM_BATCHED_TOKENS", None) ++ if self_max_num_batched_tokens is not None: ++ # If this get set, then profile using max input length ++ max_num_batched_tokens = int(self_max_num_batched_tokens) ++ self_max_num_seqs = os.getenv("IPEX_LLM_SELF_MAX_NUM_SEQS", None) ++ if self_max_num_seqs is not None: ++ max_num_seqs = int(self_max_num_seqs) ++ else: ++ max_num_seqs = 1 ++ self.model_runner.profile_run(max_num_batched_tokens, max_num_seqs) ++ else: ++ self.model_runner.profile_run() + # Calculate the number of blocks that can be allocated with the # profiled peak memory. torch.xpu.synchronize() @@ -40746,7 +40796,7 @@ index 129566605..43d306145 100644 total_gpu_memory = torch.xpu.get_device_properties( self.local_rank).total_memory free_gpu_memory = total_gpu_memory - used_memory -@@ -130,6 +132,20 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker): +@@ -130,6 +189,20 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker): num_cpu_blocks = max(num_cpu_blocks, 0) gc.collect() torch.xpu.empty_cache() @@ -40767,7 +40817,7 @@ index 129566605..43d306145 100644 return num_gpu_blocks, num_cpu_blocks def _warm_up_model(self) -> None: -@@ -175,4 +191,10 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker): +@@ -175,4 +248,10 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker): parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) # global all_reduce needed for overall oneccl warm up
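Note on the IPEX_LLM_FIND_MAX_LENGTH path added to determine_num_available_blocks above: starting from the value of the environment variable, it repeatedly profiles with a growing input length (in steps of 250 tokens) and stops once the remaining memory can no longer hold a KV cache for a sequence of that length, then reports the supported (input length, KV-cache length) pairs. A standalone sketch of that search, with the XPU memory probing abstracted into a callable (helper names are illustrative assumptions, not part of the patch):

from typing import Callable, List, Tuple

def find_max_supported_length(start_tokens: int,
                              probe_kv_capacity: Callable[[int], int],
                              step: int = 250) -> List[Tuple[int, int]]:
    # probe_kv_capacity(n) stands in for: run profile_run(n, 1), measure peak
    # XPU memory, and convert the leftover budget into KV-cache token capacity.
    supported: List[Tuple[int, int]] = []
    tokens = start_tokens
    while True:
        kv_capacity = probe_kv_capacity(tokens)
        if kv_capacity <= 0 or tokens > kv_capacity:
            break
        supported.append((tokens, kv_capacity))
        tokens += step
    return supported

# Example with a fake probe that models a fixed memory budget:
# find_max_supported_length(2000, lambda n: 20000 - 4 * n)[-1] -> (4000, 4000)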
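Note on the block accounting used both in that search loop and in the unchanged path: the number of GPU KV-cache blocks is derived from the memory budget left after profiling. A hedged sketch of the arithmetic with assumed numbers (the real values come from torch.xpu queries and get_cache_block_size_bytes() at runtime):

def num_gpu_blocks(total_gpu_memory: int, gpu_memory_utilization: float,
                   peak_memory: int, cache_block_size: int) -> int:
    # Same arithmetic as the patch: spend only the configured fraction of the
    # card, subtract what profiling peaked at, and divide by the per-block size.
    return max(int((total_gpu_memory * gpu_memory_utilization - peak_memory)
                   // cache_block_size), 0)

# With assumed numbers: a 16 GiB device, gpu_memory_utilization=0.9,
# a 6 GiB profiling peak and 2 MiB KV-cache blocks give roughly
# (16*0.9 - 6) GiB / 2 MiB ~= 4300 blocks, i.e. ~8.4 GiB of KV cache.
GiB, MiB = 1 << 30, 1 << 20
print(num_gpu_blocks(16 * GiB, 0.9, 6 * GiB, 2 * MiB))  # 4300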