update vllm patch (#13072)

Shaojun Liu 2025-04-14 14:56:10 +08:00 committed by GitHub
parent 10c30cdba9
commit 7826152f5a

@@ -17548,6 +17548,16 @@ index 9033644e3..a46c67ad7 100644
@property
def url(self) -> str:
diff --git a/vllm/assets/image.py b/vllm/assets/image.py
index cb831cb0b..0a55506f8 100644
--- a/vllm/assets/image.py
+++ b/vllm/assets/image.py
@@ -26,4 +26,4 @@ class ImageAsset:
"""
image_path = get_vllm_public_assets(filename=f"{self.name}.pt",
s3_prefix=VLM_IMAGES_DIR)
- return torch.load(image_path, map_location="cpu")
+ return torch.load(image_path, map_location="cpu", weights_only=True)
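Note: the weights_only=True argument added here (and in several hunks below) restricts torch.load to plain tensor and container payloads instead of arbitrary pickled objects. A minimal sketch of the behaviour, using a made-up file name rather than the real vLLM asset:

import torch

# Illustrative round-trip: save a plain tensor, then reload it with the
# restricted unpickler; non-tensor pickle payloads would be rejected.
image = torch.rand(3, 224, 224)
torch.save(image, "example_image.pt")  # hypothetical path, not a vLLM asset
loaded = torch.load("example_image.pt", map_location="cpu", weights_only=True)
assert torch.equal(image, loaded)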
diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py
index 21949874b..79ed61f35 100644
--- a/vllm/attention/backends/ipex_attn.py
@@ -18812,6 +18822,44 @@ index 3e2bb436d..6372dab72 100644
+ "MooncakeConnector",
+ "vllm.distributed.kv_transfer.kv_connector.simple_connector",
+ "SimpleConnector")
diff --git a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
index 8e4358672..69049ec76 100644
--- a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
@@ -1,12 +1,13 @@
import json
import os
-import pickle
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from typing import Optional, Union
import torch
import zmq
+from safetensors.torch import load as safetensors_load
+from safetensors.torch import save as safetensors_save
from vllm.config import KVTransferConfig
from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase
@@ -235,14 +236,13 @@ class MooncakePipe(KVPipeBase):
return hash(tensor.data_ptr())
def _send_impl(self, tensor: torch.Tensor) -> None:
- """Implement the tensor sending logic."""
- value_bytes = pickle.dumps(tensor)
- self.transfer_engine.send_bytes(value_bytes)
+ """Implement the tensor sending logic using safetensors."""
+ self.transfer_engine.send_bytes(safetensors_save({"tensor": tensor}))
def _recv_impl(self) -> torch.Tensor:
- """Implement the tensor receiving logic."""
+ """Implement the tensor receiving logic using safetensors."""
data = self.transfer_engine.recv_bytes()
- return pickle.loads(data)
+ return safetensors_load(data)["tensor"].to(self.device)
def send_tensor(self, tensor: Optional[torch.Tensor]) -> None:
"""Send tensor to the target process."""
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 5b9236f8c..a837c1dc5 100644
--- a/vllm/distributed/parallel_state.py
@@ -19375,10 +19423,19 @@ index fadf297e9..e4e0803c6 100644
def get_engine_class() -> Type[LLMEngine]:
if envs.VLLM_USE_V1:
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 2e45b4742..b468085b5 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -3,6 +3,8 @@ import atexit
import importlib
import inspect
import multiprocessing
+# Fix https://avd.aquasec.com/nvd/cve-2022-42919
+multiprocessing.util.abstract_sockets_supported = False
import os
import re
import signal
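The two lines added above mitigate CVE-2022-42919 by disabling Linux abstract-namespace sockets in multiprocessing, so its helper sockets fall back to filesystem paths that respect normal permissions. A small interpreter-level sketch of the same override (not vLLM code):

import multiprocessing.util

# On Linux this flag defaults to True; the patch flips it before vLLM
# spawns any multiprocessing machinery.
multiprocessing.util.abstract_sockets_supported = False
print(multiprocessing.util.abstract_sockets_supported)  # -> False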
@@ -16,7 +18,7 @@ from http import HTTPStatus
from typing import AsyncIterator, Optional, Set, Tuple from typing import AsyncIterator, Optional, Set, Tuple
import uvloop import uvloop
@ -19387,7 +19444,7 @@ index 2e45b4742..c72581e6c 100644
from fastapi.exceptions import RequestValidationError from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, Response, StreamingResponse from fastapi.responses import JSONResponse, Response, StreamingResponse
@@ -44,11 +44,15 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, @@ -44,11 +46,15 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
CompletionResponse, CompletionResponse,
DetokenizeRequest, DetokenizeRequest,
DetokenizeResponse, DetokenizeResponse,
@ -19403,7 +19460,7 @@ index 2e45b4742..c72581e6c 100644
PoolingRequest, PoolingResponse, PoolingRequest, PoolingResponse,
ScoreRequest, ScoreResponse, ScoreRequest, ScoreResponse,
TokenizeRequest, TokenizeRequest,
@@ -58,7 +62,9 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, @@ -58,7 +64,9 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
@ -19414,7 +19471,7 @@ index 2e45b4742..c72581e6c 100644
from vllm.entrypoints.openai.serving_pooling import OpenAIServingPooling from vllm.entrypoints.openai.serving_pooling import OpenAIServingPooling
from vllm.entrypoints.openai.serving_score import OpenAIServingScores from vllm.entrypoints.openai.serving_score import OpenAIServingScores
from vllm.entrypoints.openai.serving_tokenization import ( from vllm.entrypoints.openai.serving_tokenization import (
@@ -133,32 +139,21 @@ async def build_async_engine_client_from_engine_args( @@ -133,32 +141,21 @@ async def build_async_engine_client_from_engine_args(
Returns the Client or None if the creation failed. Returns the Client or None if the creation failed.
""" """
@@ -19458,7 +19515,7 @@ index 2e45b4742..c72581e6c 100644
else:
if "PROMETHEUS_MULTIPROC_DIR" not in os.environ:
# Make TemporaryDirectory for prometheus multiprocessing
@@ -280,6 +277,10 @@ def base(request: Request) -> OpenAIServing:
return tokenization(request)
@@ -19469,7 +19526,7 @@ index 2e45b4742..c72581e6c 100644
def chat(request: Request) -> Optional[OpenAIServingChat]:
return request.app.state.openai_serving_chat
@@ -315,6 +316,12 @@ async def health(raw_request: Request) -> Response:
return Response(status_code=200)
@@ -19482,7 +19539,7 @@ index 2e45b4742..c72581e6c 100644
@router.post("/tokenize")
@with_cancellation
async def tokenize(request: TokenizeRequest, raw_request: Request):
@@ -347,10 +354,10 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request):
@router.get("/v1/models")
async def show_available_models(raw_request: Request):
@@ -19496,7 +19553,7 @@ index 2e45b4742..c72581e6c 100644
@router.get("/version")
@@ -488,6 +495,54 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request):
return await create_score(request, raw_request)
@@ -19551,7 +19608,7 @@ index 2e45b4742..c72581e6c 100644
if envs.VLLM_TORCH_PROFILER_DIR:
logger.warning(
"Torch Profiler is enabled in the API server. This should ONLY be "
@@ -516,26 +571,22 @@ if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
@router.post("/v1/load_lora_adapter")
async def load_lora_adapter(request: LoadLoraAdapterRequest,
raw_request: Request):
@@ -19588,7 +19645,7 @@ index 2e45b4742..c72581e6c 100644
return Response(status_code=200, content=response)
@@ -639,13 +690,18 @@ def init_app_state(
resolved_chat_template = load_chat_template(args.chat_template)
logger.info("Using supplied chat template:\n%s", resolved_chat_template)
@@ -19610,7 +19667,7 @@ index 2e45b4742..c72581e6c 100644
request_logger=request_logger,
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
@@ -657,16 +713,14 @@ def init_app_state(
state.openai_serving_completion = OpenAIServingCompletion(
engine_client,
model_config,
@@ -19629,7 +19686,7 @@ index 2e45b4742..c72581e6c 100644
request_logger=request_logger,
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
@@ -674,7 +728,7 @@ def init_app_state(
state.openai_serving_embedding = OpenAIServingEmbedding(
engine_client,
model_config,
@@ -19638,7 +19695,7 @@ index 2e45b4742..c72581e6c 100644
request_logger=request_logger,
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
@@ -682,18 +736,18 @@ def init_app_state(
state.openai_serving_scores = OpenAIServingScores(
engine_client,
model_config,
@@ -19660,7 +19717,7 @@ index 2e45b4742..c72581e6c 100644
def create_server_socket(addr: Tuple[str, int]) -> socket.socket:
@@ -715,11 +769,11 @@ async def run_server(args, **uvicorn_kwargs) -> None:
if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
ToolParserManager.import_tool_parser(args.tool_parser_plugin)
@@ -19675,7 +19732,7 @@ index 2e45b4742..c72581e6c 100644
# workaround to make sure that we bind the port before the engine is set up.
# This avoids race conditions with ray.
@@ -765,6 +819,8 @@ async def run_server(args, **uvicorn_kwargs) -> None:
if __name__ == "__main__":
# NOTE(simon):
# This section should be in sync with vllm/scripts.py for CLI entrypoints.
@@ -21049,7 +21106,7 @@ index dde347b78..93ad4651f 100644
@classmethod
def create_dummy_lora_weights(
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 5c0e4e5cb..7e57d9c85 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -4,7 +4,7 @@ import math
@@ -21078,6 +21135,16 @@ index 5c0e4e5cb..5b7225bdc 100644
if os.path.isfile(lora_tensor_path):
tensors: Dict[str, torch.Tensor] = {}
# Find unexpected modules.
@@ -280,7 +281,8 @@ class LoRAModel(AdapterModel):
new_embeddings_tensor_path)
elif os.path.isfile(new_embeddings_bin_file_path):
embeddings = torch.load(new_embeddings_bin_file_path,
- map_location=device)
+ map_location=device,
+ weights_only=True)
return cls.from_lora_tensors(
lora_model_id=get_lora_id()
diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py
index edf4ba565..ddd42ae93 100644
--- a/vllm/lora/peft_helper.py
@@ -22555,6 +22622,39 @@ index f2d9293b3..b3d7d6977 100644
if isinstance(load_config.load_format, type):
return load_config.load_format(load_config)
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 8aa0c98df..34c86a23a 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -93,7 +93,7 @@ def convert_bin_to_safetensor_file(
pt_filename: str,
sf_filename: str,
) -> None:
- loaded = torch.load(pt_filename, map_location="cpu")
+ loaded = torch.load(pt_filename, map_location="cpu", weights_only=True)
if "state_dict" in loaded:
loaded = loaded["state_dict"]
shared = _shared_pointers(loaded)
@@ -381,7 +381,9 @@ def np_cache_weights_iterator(
disable=not enable_tqdm,
bar_format=_BAR_FORMAT,
):
- state = torch.load(bin_file, map_location="cpu")
+ state = torch.load(bin_file,
+ map_location="cpu",
+ weights_only=True)
for name, param in state.items():
param_path = os.path.join(np_folder, name)
with open(param_path, "wb") as f:
@@ -447,7 +449,7 @@ def pt_weights_iterator(
disable=not enable_tqdm,
bar_format=_BAR_FORMAT,
):
- state = torch.load(bin_file, map_location="cpu")
+ state = torch.load(bin_file, map_location="cpu", weights_only=True)
yield from state.items()
del state
torch.cuda.empty_cache()
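The weight_utils.py hunks apply the same weights_only=True hardening to the .bin checkpoint iterators. A small sketch of that iterator pattern, with an invented file name and state dict:

import torch

# Illustrative checkpoint: an ordinary state dict of plain tensors.
torch.save({"linear.weight": torch.zeros(4, 4), "linear.bias": torch.zeros(4)},
           "example_shard.bin")

def iter_pt_weights(path: str):
    # Mirrors the pattern in the hunk: CPU load with the restricted unpickler,
    # then stream (name, tensor) pairs to the caller.
    state = torch.load(path, map_location="cpu", weights_only=True)
    yield from state.items()

for name, param in iter_pt_weights("example_shard.bin"):
    print(name, tuple(param.shape))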
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 9437ad968..2e649f10c 100644
--- a/vllm/model_executor/models/aria.py
@@ -34955,6 +35055,19 @@ index 17f604ea0..c50eb2cef 100644
+ # general plugins, we only need to execute the loaded functions
+ for func in plugins.values():
+ func()
diff --git a/vllm/prompt_adapter/utils.py b/vllm/prompt_adapter/utils.py
index 473b87c89..8b2732923 100644
--- a/vllm/prompt_adapter/utils.py
+++ b/vllm/prompt_adapter/utils.py
@@ -89,6 +89,7 @@ def load_peft_weights(model_id: str,
adapters_weights = safe_load_file(filename, device=device)
else:
adapters_weights = torch.load(filename,
- map_location=torch.device(device))
+ map_location=torch.device(device),
+ weights_only=True)
return adapters_weights
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index fc77f3ca5..605c09b8d 100644
--- a/vllm/sampling_params.py