update vllm patch (#13072)

Shaojun Liu 2025-04-14 14:56:10 +08:00 committed by GitHub
parent 10c30cdba9
commit 7826152f5a

@@ -17548,6 +17548,16 @@ index 9033644e3..a46c67ad7 100644
@property
def url(self) -> str:
diff --git a/vllm/assets/image.py b/vllm/assets/image.py
index cb831cb0b..0a55506f8 100644
--- a/vllm/assets/image.py
+++ b/vllm/assets/image.py
@@ -26,4 +26,4 @@ class ImageAsset:
"""
image_path = get_vllm_public_assets(filename=f"{self.name}.pt",
s3_prefix=VLM_IMAGES_DIR)
- return torch.load(image_path, map_location="cpu")
+ return torch.load(image_path, map_location="cpu", weights_only=True)
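Note: the weights_only=True argument added here (and in several hunks below) restricts torch.load to plain tensor and container payloads instead of arbitrary pickled objects. A minimal sketch of the behaviour, using a made-up file name rather than the real vLLM asset:

import torch

# Illustrative round-trip: save a plain tensor, then reload it with the
# restricted unpickler; non-tensor pickle payloads would be rejected.
image = torch.rand(3, 224, 224)
torch.save(image, "example_image.pt")  # hypothetical path, not a vLLM asset
loaded = torch.load("example_image.pt", map_location="cpu", weights_only=True)
assert torch.equal(image, loaded)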
diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py
index 21949874b..79ed61f35 100644
--- a/vllm/attention/backends/ipex_attn.py
@@ -18812,6 +18822,44 @@ index 3e2bb436d..6372dab72 100644
+ "MooncakeConnector",
+ "vllm.distributed.kv_transfer.kv_connector.simple_connector",
+ "SimpleConnector")
diff --git a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
index 8e4358672..69049ec76 100644
--- a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
@@ -1,12 +1,13 @@
import json
import os
-import pickle
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from typing import Optional, Union
import torch
import zmq
+from safetensors.torch import load as safetensors_load
+from safetensors.torch import save as safetensors_save
from vllm.config import KVTransferConfig
from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase
@@ -235,14 +236,13 @@ class MooncakePipe(KVPipeBase):
return hash(tensor.data_ptr())
def _send_impl(self, tensor: torch.Tensor) -> None:
- """Implement the tensor sending logic."""
- value_bytes = pickle.dumps(tensor)
- self.transfer_engine.send_bytes(value_bytes)
+ """Implement the tensor sending logic using safetensors."""
+ self.transfer_engine.send_bytes(safetensors_save({"tensor": tensor}))
def _recv_impl(self) -> torch.Tensor:
- """Implement the tensor receiving logic."""
+ """Implement the tensor receiving logic using safetensors."""
data = self.transfer_engine.recv_bytes()
- return pickle.loads(data)
+ return safetensors_load(data)["tensor"].to(self.device)
def send_tensor(self, tensor: Optional[torch.Tensor]) -> None:
"""Send tensor to the target process."""
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 5b9236f8c..a837c1dc5 100644
--- a/vllm/distributed/parallel_state.py
@@ -19375,10 +19423,19 @@ index fadf297e9..e4e0803c6 100644
def get_engine_class() -> Type[LLMEngine]:
if envs.VLLM_USE_V1:
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 2e45b4742..b468085b5 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -3,6 +3,8 @@ import atexit
import importlib
import inspect
import multiprocessing
+# Fix https://avd.aquasec.com/nvd/cve-2022-42919
+multiprocessing.util.abstract_sockets_supported = False
import os
import re
import signal
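The two lines added above mitigate CVE-2022-42919 by disabling Linux abstract-namespace sockets in multiprocessing, so its helper sockets fall back to filesystem paths that respect normal permissions. A small interpreter-level sketch of the same override (not vLLM code):

import multiprocessing.util

# On Linux this flag defaults to True; the patch flips it before vLLM
# spawns any multiprocessing machinery.
multiprocessing.util.abstract_sockets_supported = False
print(multiprocessing.util.abstract_sockets_supported)  # -> False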
@@ -16,7 +18,7 @@ from http import HTTPStatus
from typing import AsyncIterator, Optional, Set, Tuple from typing import AsyncIterator, Optional, Set, Tuple
import uvloop import uvloop
@ -19387,7 +19444,7 @@ index 2e45b4742..c72581e6c 100644
from fastapi.exceptions import RequestValidationError from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, Response, StreamingResponse from fastapi.responses import JSONResponse, Response, StreamingResponse
@@ -44,11 +44,15 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, @@ -44,11 +46,15 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
CompletionResponse, CompletionResponse,
DetokenizeRequest, DetokenizeRequest,
DetokenizeResponse, DetokenizeResponse,
@ -19403,7 +19460,7 @@ index 2e45b4742..c72581e6c 100644
PoolingRequest, PoolingResponse, PoolingRequest, PoolingResponse,
ScoreRequest, ScoreResponse, ScoreRequest, ScoreResponse,
TokenizeRequest, TokenizeRequest,
@@ -58,7 +62,9 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, @@ -58,7 +64,9 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
@ -19414,7 +19471,7 @@ index 2e45b4742..c72581e6c 100644
from vllm.entrypoints.openai.serving_pooling import OpenAIServingPooling from vllm.entrypoints.openai.serving_pooling import OpenAIServingPooling
from vllm.entrypoints.openai.serving_score import OpenAIServingScores from vllm.entrypoints.openai.serving_score import OpenAIServingScores
from vllm.entrypoints.openai.serving_tokenization import ( from vllm.entrypoints.openai.serving_tokenization import (
@@ -133,32 +139,21 @@ async def build_async_engine_client_from_engine_args( @@ -133,32 +141,21 @@ async def build_async_engine_client_from_engine_args(
Returns the Client or None if the creation failed. Returns the Client or None if the creation failed.
""" """
@@ -19458,7 +19515,7 @@ index 2e45b4742..c72581e6c 100644
else:
if "PROMETHEUS_MULTIPROC_DIR" not in os.environ:
# Make TemporaryDirectory for prometheus multiprocessing
@@ -280,6 +277,10 @@ def base(request: Request) -> OpenAIServing:
return tokenization(request)
@@ -19469,7 +19526,7 @@ index 2e45b4742..c72581e6c 100644
def chat(request: Request) -> Optional[OpenAIServingChat]:
return request.app.state.openai_serving_chat
@@ -315,6 +316,12 @@ async def health(raw_request: Request) -> Response:
return Response(status_code=200)
@@ -19482,7 +19539,7 @@ index 2e45b4742..c72581e6c 100644
@router.post("/tokenize")
@with_cancellation
async def tokenize(request: TokenizeRequest, raw_request: Request):
@@ -347,10 +354,10 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request):
@router.get("/v1/models")
async def show_available_models(raw_request: Request):
@@ -19496,7 +19553,7 @@ index 2e45b4742..c72581e6c 100644
@router.get("/version")
@@ -488,6 +495,54 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request):
return await create_score(request, raw_request)
@@ -19551,7 +19608,7 @@ index 2e45b4742..c72581e6c 100644
if envs.VLLM_TORCH_PROFILER_DIR:
logger.warning(
"Torch Profiler is enabled in the API server. This should ONLY be "
@@ -516,26 +571,22 @@ if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
@router.post("/v1/load_lora_adapter")
async def load_lora_adapter(request: LoadLoraAdapterRequest,
raw_request: Request):
@@ -19588,7 +19645,7 @@ index 2e45b4742..c72581e6c 100644
return Response(status_code=200, content=response)
@@ -639,13 +690,18 @@ def init_app_state(
resolved_chat_template = load_chat_template(args.chat_template)
logger.info("Using supplied chat template:\n%s", resolved_chat_template)
@@ -19610,7 +19667,7 @@ index 2e45b4742..c72581e6c 100644
request_logger=request_logger,
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
@@ -657,16 +713,14 @@ def init_app_state(
state.openai_serving_completion = OpenAIServingCompletion(
engine_client,
model_config,
@@ -19629,7 +19686,7 @@ index 2e45b4742..c72581e6c 100644
request_logger=request_logger,
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
@@ -674,7 +728,7 @@ def init_app_state(
state.openai_serving_embedding = OpenAIServingEmbedding(
engine_client,
model_config,
@@ -19638,7 +19695,7 @@ index 2e45b4742..c72581e6c 100644
request_logger=request_logger,
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
@@ -682,18 +736,18 @@ def init_app_state(
state.openai_serving_scores = OpenAIServingScores(
engine_client,
model_config,
@@ -19660,7 +19717,7 @@ index 2e45b4742..c72581e6c 100644
def create_server_socket(addr: Tuple[str, int]) -> socket.socket:
@@ -715,11 +769,11 @@ async def run_server(args, **uvicorn_kwargs) -> None:
if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
ToolParserManager.import_tool_parser(args.tool_parser_plugin)
@@ -19675,7 +19732,7 @@ index 2e45b4742..c72581e6c 100644
# workaround to make sure that we bind the port before the engine is set up.
# This avoids race conditions with ray.
@@ -765,6 +819,8 @@ async def run_server(args, **uvicorn_kwargs) -> None:
if __name__ == "__main__":
# NOTE(simon):
# This section should be in sync with vllm/scripts.py for CLI entrypoints.
@@ -21049,7 +21106,7 @@ index dde347b78..93ad4651f 100644
@classmethod
def create_dummy_lora_weights(
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 5c0e4e5cb..7e57d9c85 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -4,7 +4,7 @@ import math
@@ -21078,6 +21135,16 @@ index 5c0e4e5cb..5b7225bdc 100644
if os.path.isfile(lora_tensor_path):
tensors: Dict[str, torch.Tensor] = {}
# Find unexpected modules.
@@ -280,7 +281,8 @@ class LoRAModel(AdapterModel):
new_embeddings_tensor_path)
elif os.path.isfile(new_embeddings_bin_file_path):
embeddings = torch.load(new_embeddings_bin_file_path,
- map_location=device)
+ map_location=device,
+ weights_only=True)
return cls.from_lora_tensors(
lora_model_id=get_lora_id()
diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py
index edf4ba565..ddd42ae93 100644
--- a/vllm/lora/peft_helper.py
@@ -22555,6 +22622,39 @@ index f2d9293b3..b3d7d6977 100644
if isinstance(load_config.load_format, type):
return load_config.load_format(load_config)
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 8aa0c98df..34c86a23a 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -93,7 +93,7 @@ def convert_bin_to_safetensor_file(
pt_filename: str,
sf_filename: str,
) -> None:
- loaded = torch.load(pt_filename, map_location="cpu")
+ loaded = torch.load(pt_filename, map_location="cpu", weights_only=True)
if "state_dict" in loaded:
loaded = loaded["state_dict"]
shared = _shared_pointers(loaded)
@@ -381,7 +381,9 @@ def np_cache_weights_iterator(
disable=not enable_tqdm,
bar_format=_BAR_FORMAT,
):
- state = torch.load(bin_file, map_location="cpu")
+ state = torch.load(bin_file,
+ map_location="cpu",
+ weights_only=True)
for name, param in state.items():
param_path = os.path.join(np_folder, name)
with open(param_path, "wb") as f:
@@ -447,7 +449,7 @@ def pt_weights_iterator(
disable=not enable_tqdm,
bar_format=_BAR_FORMAT,
):
- state = torch.load(bin_file, map_location="cpu")
+ state = torch.load(bin_file, map_location="cpu", weights_only=True)
yield from state.items()
del state
torch.cuda.empty_cache()
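The weight_utils.py hunks apply the same weights_only=True hardening to the .bin checkpoint iterators. A small sketch of that iterator pattern, with an invented file name and state dict:

import torch

# Illustrative checkpoint: an ordinary state dict of plain tensors.
torch.save({"linear.weight": torch.zeros(4, 4), "linear.bias": torch.zeros(4)},
           "example_shard.bin")

def iter_pt_weights(path: str):
    # Mirrors the pattern in the hunk: CPU load with the restricted unpickler,
    # then stream (name, tensor) pairs to the caller.
    state = torch.load(path, map_location="cpu", weights_only=True)
    yield from state.items()

for name, param in iter_pt_weights("example_shard.bin"):
    print(name, tuple(param.shape))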
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 9437ad968..2e649f10c 100644
--- a/vllm/model_executor/models/aria.py
@@ -34955,6 +35055,19 @@ index 17f604ea0..c50eb2cef 100644
+ # general plugins, we only need to execute the loaded functions
+ for func in plugins.values():
+ func()
diff --git a/vllm/prompt_adapter/utils.py b/vllm/prompt_adapter/utils.py
index 473b87c89..8b2732923 100644
--- a/vllm/prompt_adapter/utils.py
+++ b/vllm/prompt_adapter/utils.py
@@ -89,6 +89,7 @@ def load_peft_weights(model_id: str,
adapters_weights = safe_load_file(filename, device=device)
else:
adapters_weights = torch.load(filename,
- map_location=torch.device(device))
+ map_location=torch.device(device),
+ weights_only=True)
return adapters_weights
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index fc77f3ca5..605c09b8d 100644
--- a/vllm/sampling_params.py