update vllm patch (#13072)

parent 10c30cdba9
commit 7826152f5a

1 changed file with 130 additions and 17 deletions

@@ -17548,6 +17548,16 @@ index 9033644e3..a46c67ad7 100644
 
    @property
    def url(self) -> str:
diff --git a/vllm/assets/image.py b/vllm/assets/image.py
index cb831cb0b..0a55506f8 100644
--- a/vllm/assets/image.py
+++ b/vllm/assets/image.py
@@ -26,4 +26,4 @@ class ImageAsset:
        """
        image_path = get_vllm_public_assets(filename=f"{self.name}.pt",
                                            s3_prefix=VLM_IMAGES_DIR)
-        return torch.load(image_path, map_location="cpu")
+        return torch.load(image_path, map_location="cpu", weights_only=True)
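
Note: weights_only=True restricts torch.load's unpickler to tensors and
primitive containers, rejecting payloads that would execute arbitrary
pickled code. A minimal sketch of the behavior this patch relies on
(plain torch, no vllm imports):

    import io

    import torch

    # Round-trip a tensor through an in-memory buffer; with
    # weights_only=True the restricted unpickler accepts it, while a
    # malicious object-bearing pickle would be rejected.
    buf = io.BytesIO()
    torch.save(torch.arange(4), buf)
    buf.seek(0)
    t = torch.load(buf, map_location="cpu", weights_only=True)
    assert t.tolist() == [0, 1, 2, 3]
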
diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py
index 21949874b..79ed61f35 100644
--- a/vllm/attention/backends/ipex_attn.py
@@ -18812,6 +18822,44 @@ index 3e2bb436d..6372dab72 100644
+    "MooncakeConnector",
+    "vllm.distributed.kv_transfer.kv_connector.simple_connector",
+    "SimpleConnector")
diff --git a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
index 8e4358672..69049ec76 100644
--- a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
@@ -1,12 +1,13 @@
 import json
 import os
-import pickle
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
 from typing import Optional, Union
 
 import torch
 import zmq
+from safetensors.torch import load as safetensors_load
+from safetensors.torch import save as safetensors_save
 
 from vllm.config import KVTransferConfig
 from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase
@@ -235,14 +236,13 @@ class MooncakePipe(KVPipeBase):
         return hash(tensor.data_ptr())
 
     def _send_impl(self, tensor: torch.Tensor) -> None:
-        """Implement the tensor sending logic."""
-        value_bytes = pickle.dumps(tensor)
-        self.transfer_engine.send_bytes(value_bytes)
+        """Implement the tensor sending logic using safetensors."""
+        self.transfer_engine.send_bytes(safetensors_save({"tensor": tensor}))
 
     def _recv_impl(self) -> torch.Tensor:
-        """Implement the tensor receiving logic."""
+        """Implement the tensor receiving logic using safetensors."""
         data = self.transfer_engine.recv_bytes()
-        return pickle.loads(data)
+        return safetensors_load(data)["tensor"].to(self.device)
 
     def send_tensor(self, tensor: Optional[torch.Tensor]) -> None:
         """Send tensor to the target process."""
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 5b9236f8c..a837c1dc5 100644
--- a/vllm/distributed/parallel_state.py
@@ -19375,10 +19423,19 @@ index fadf297e9..e4e0803c6 100644
    def get_engine_class() -> Type[LLMEngine]:
        if envs.VLLM_USE_V1:
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 2e45b4742..c72581e6c 100644
index 2e45b4742..b468085b5 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -16,7 +16,7 @@ from http import HTTPStatus
@@ -3,6 +3,8 @@ import atexit
 import importlib
 import inspect
 import multiprocessing
+# Fix https://avd.aquasec.com/nvd/cve-2022-42919
+multiprocessing.util.abstract_sockets_supported = False
 import os
 import re
 import signal
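
Note: CVE-2022-42919 is a local privilege escalation in CPython's
multiprocessing forkserver on Linux, reachable via abstract-namespace Unix
sockets, which are not protected by filesystem permissions. The added lines
force filesystem-backed sockets instead; a minimal sketch of the knob being
flipped:

    import multiprocessing.util

    # Abstract sockets live in a kernel namespace with no permission bits;
    # disabling support makes multiprocessing create socket files on disk,
    # which ordinary file permissions protect.
    multiprocessing.util.abstract_sockets_supported = False
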
@@ -16,7 +18,7 @@ from http import HTTPStatus
 from typing import AsyncIterator, Optional, Set, Tuple
 
 import uvloop
@@ -19387,7 +19444,7 @@ index 2e45b4742..c72581e6c 100644
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, Response, StreamingResponse
@@ -44,11 +44,15 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
@@ -44,11 +46,15 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              CompletionResponse,
                                              DetokenizeRequest,
                                              DetokenizeResponse,
@@ -19403,7 +19460,7 @@ index 2e45b4742..c72581e6c 100644
                                              PoolingRequest, PoolingResponse,
                                              ScoreRequest, ScoreResponse,
                                              TokenizeRequest,
@@ -58,7 +62,9 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
@@ -58,7 +64,9 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
@@ -19414,7 +19471,7 @@ index 2e45b4742..c72581e6c 100644
 from vllm.entrypoints.openai.serving_pooling import OpenAIServingPooling
 from vllm.entrypoints.openai.serving_score import OpenAIServingScores
 from vllm.entrypoints.openai.serving_tokenization import (
@@ -133,32 +139,21 @@ async def build_async_engine_client_from_engine_args(
@@ -133,32 +141,21 @@ async def build_async_engine_client_from_engine_args(
     Returns the Client or None if the creation failed.
     """
 
@@ -19458,7 +19515,7 @@ index 2e45b4742..c72581e6c 100644
     else:
         if "PROMETHEUS_MULTIPROC_DIR" not in os.environ:
             # Make TemporaryDirectory for prometheus multiprocessing
@@ -280,6 +275,10 @@ def base(request: Request) -> OpenAIServing:
@@ -280,6 +277,10 @@ def base(request: Request) -> OpenAIServing:
     return tokenization(request)
 
 
@@ -19469,7 +19526,7 @@ index 2e45b4742..c72581e6c 100644
 def chat(request: Request) -> Optional[OpenAIServingChat]:
     return request.app.state.openai_serving_chat
 
@@ -315,6 +314,12 @@ async def health(raw_request: Request) -> Response:
@@ -315,6 +316,12 @@ async def health(raw_request: Request) -> Response:
     return Response(status_code=200)
 
 
@@ -19482,7 +19539,7 @@ index 2e45b4742..c72581e6c 100644
 @router.post("/tokenize")
 @with_cancellation
 async def tokenize(request: TokenizeRequest, raw_request: Request):
@@ -347,10 +352,10 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request):
@@ -347,10 +354,10 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request):
 
 @router.get("/v1/models")
 async def show_available_models(raw_request: Request):
@@ -19496,7 +19553,7 @@ index 2e45b4742..c72581e6c 100644
 
 
 @router.get("/version")
@@ -488,6 +493,54 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request):
@@ -488,6 +495,54 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request):
     return await create_score(request, raw_request)
 
 
@@ -19551,7 +19608,7 @@ index 2e45b4742..c72581e6c 100644
 if envs.VLLM_TORCH_PROFILER_DIR:
     logger.warning(
         "Torch Profiler is enabled in the API server. This should ONLY be "
@@ -516,26 +569,22 @@ if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
@@ -516,26 +571,22 @@ if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
     @router.post("/v1/load_lora_adapter")
     async def load_lora_adapter(request: LoadLoraAdapterRequest,
                                 raw_request: Request):
@@ -19588,7 +19645,7 @@ index 2e45b4742..c72581e6c 100644
 
         return Response(status_code=200, content=response)
 
@@ -639,13 +688,18 @@ def init_app_state(
@@ -639,13 +690,18 @@ def init_app_state(
     resolved_chat_template = load_chat_template(args.chat_template)
     logger.info("Using supplied chat template:\n%s", resolved_chat_template)
 
@@ -19610,7 +19667,7 @@ index 2e45b4742..c72581e6c 100644
         request_logger=request_logger,
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
@@ -657,16 +711,14 @@ def init_app_state(
@@ -657,16 +713,14 @@ def init_app_state(
     state.openai_serving_completion = OpenAIServingCompletion(
         engine_client,
         model_config,
@@ -19629,7 +19686,7 @@ index 2e45b4742..c72581e6c 100644
         request_logger=request_logger,
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
@@ -674,7 +726,7 @@ def init_app_state(
@@ -674,7 +728,7 @@ def init_app_state(
     state.openai_serving_embedding = OpenAIServingEmbedding(
         engine_client,
         model_config,
@@ -19638,7 +19695,7 @@ index 2e45b4742..c72581e6c 100644
         request_logger=request_logger,
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
@@ -682,18 +734,18 @@ def init_app_state(
@@ -682,18 +736,18 @@ def init_app_state(
     state.openai_serving_scores = OpenAIServingScores(
         engine_client,
         model_config,
@@ -19660,7 +19717,7 @@ index 2e45b4742..c72581e6c 100644
 
 
 def create_server_socket(addr: Tuple[str, int]) -> socket.socket:
@@ -715,11 +767,11 @@ async def run_server(args, **uvicorn_kwargs) -> None:
@@ -715,11 +769,11 @@ async def run_server(args, **uvicorn_kwargs) -> None:
     if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
         ToolParserManager.import_tool_parser(args.tool_parser_plugin)
 
@@ -19675,7 +19732,7 @@ index 2e45b4742..c72581e6c 100644
 
     # workaround to make sure that we bind the port before the engine is set up.
     # This avoids race conditions with ray.
@@ -765,6 +817,8 @@ async def run_server(args, **uvicorn_kwargs) -> None:
@@ -765,6 +819,8 @@ async def run_server(args, **uvicorn_kwargs) -> None:
 if __name__ == "__main__":
     # NOTE(simon):
     # This section should be in sync with vllm/scripts.py for CLI entrypoints.
@@ -21049,7 +21106,7 @@ index dde347b78..93ad4651f 100644
     @classmethod
     def create_dummy_lora_weights(
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 5c0e4e5cb..5b7225bdc 100644
index 5c0e4e5cb..7e57d9c85 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -4,7 +4,7 @@ import math
@@ -21078,6 +21135,16 @@ index 5c0e4e5cb..5b7225bdc 100644
         if os.path.isfile(lora_tensor_path):
             tensors: Dict[str, torch.Tensor] = {}
             # Find unexpected modules.
@@ -280,7 +281,8 @@ class LoRAModel(AdapterModel):
                 new_embeddings_tensor_path)
         elif os.path.isfile(new_embeddings_bin_file_path):
             embeddings = torch.load(new_embeddings_bin_file_path,
-                                    map_location=device)
+                                    map_location=device,
+                                    weights_only=True)
 
         return cls.from_lora_tensors(
             lora_model_id=get_lora_id()
diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py
index edf4ba565..ddd42ae93 100644
--- a/vllm/lora/peft_helper.py
@@ -22555,6 +22622,39 @@ index f2d9293b3..b3d7d6977 100644
     if isinstance(load_config.load_format, type):
         return load_config.load_format(load_config)
 
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 8aa0c98df..34c86a23a 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -93,7 +93,7 @@ def convert_bin_to_safetensor_file(
     pt_filename: str,
     sf_filename: str,
 ) -> None:
-    loaded = torch.load(pt_filename, map_location="cpu")
+    loaded = torch.load(pt_filename, map_location="cpu", weights_only=True)
     if "state_dict" in loaded:
         loaded = loaded["state_dict"]
     shared = _shared_pointers(loaded)
@@ -381,7 +381,9 @@ def np_cache_weights_iterator(
                     disable=not enable_tqdm,
                     bar_format=_BAR_FORMAT,
             ):
-                state = torch.load(bin_file, map_location="cpu")
+                state = torch.load(bin_file,
+                                   map_location="cpu",
+                                   weights_only=True)
                 for name, param in state.items():
                     param_path = os.path.join(np_folder, name)
                     with open(param_path, "wb") as f:
@@ -447,7 +449,7 @@ def pt_weights_iterator(
             disable=not enable_tqdm,
             bar_format=_BAR_FORMAT,
     ):
-        state = torch.load(bin_file, map_location="cpu")
+        state = torch.load(bin_file, map_location="cpu", weights_only=True)
         yield from state.items()
         del state
         torch.cuda.empty_cache()
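
Note: pt_weights_iterator streams checkpoint files one at a time so only a
single state dict is resident in memory. A hypothetical stripped-down
version of the pattern (iter_bin_weights is an illustrative name, not vllm
API):

    from typing import Iterable, Iterator, Tuple

    import torch

    def iter_bin_weights(
            bin_files: Iterable[str]) -> Iterator[Tuple[str, torch.Tensor]]:
        for bin_file in bin_files:
            # weights_only=True keeps the legacy .bin path free of
            # arbitrary pickle execution.
            state = torch.load(bin_file, map_location="cpu",
                               weights_only=True)
            yield from state.items()
            del state  # drop the dict before loading the next file
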
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 9437ad968..2e649f10c 100644
--- a/vllm/model_executor/models/aria.py
@@ -34955,6 +35055,19 @@ index 17f604ea0..c50eb2cef 100644
+    # general plugins, we only need to execute the loaded functions
+    for func in plugins.values():
+        func()
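
Note: the added loop assumes plugins maps entry-point names to callables;
general plugins take effect simply by being invoked. A toy illustration
(the registry dict here is made up, not vllm's):

    from typing import Callable, Dict

    def activate_all(plugins: Dict[str, Callable[[], None]]) -> None:
        # A general plugin carries its effect in the call itself,
        # e.g. registering a model class on import.
        for func in plugins.values():
            func()

    activate_all({"demo": lambda: print("plugin ran")})
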
diff --git a/vllm/prompt_adapter/utils.py b/vllm/prompt_adapter/utils.py
index 473b87c89..8b2732923 100644
--- a/vllm/prompt_adapter/utils.py
+++ b/vllm/prompt_adapter/utils.py
@@ -89,6 +89,7 @@ def load_peft_weights(model_id: str,
         adapters_weights = safe_load_file(filename, device=device)
     else:
         adapters_weights = torch.load(filename,
-                                      map_location=torch.device(device))
+                                      map_location=torch.device(device),
+                                      weights_only=True)
 
     return adapters_weights
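
Note: load_peft_weights prefers the safetensors file and only falls back to
torch.load for legacy .bin adapters; the patch constrains that fallback. A
hedged sketch of the overall shape (load_adapter_weights is an illustrative
name, not the vllm function):

    import torch
    from safetensors.torch import load_file as safe_load_file

    def load_adapter_weights(filename: str, device: str = "cpu"):
        if filename.endswith(".safetensors"):
            # No pickle involved at all.
            return safe_load_file(filename, device=device)
        # Legacy .bin path: restrict the unpickler to plain weights.
        return torch.load(filename, map_location=torch.device(device),
                          weights_only=True)
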
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index fc77f3ca5..605c09b8d 100644
--- a/vllm/sampling_params.py