update vllm patch (#13072)

parent 10c30cdba9
commit 7826152f5a

1 changed file with 130 additions and 17 deletions

@@ -17548,6 +17548,16 @@ index 9033644e3..a46c67ad7 100644
 
    @property
    def url(self) -> str:
diff --git a/vllm/assets/image.py b/vllm/assets/image.py
index cb831cb0b..0a55506f8 100644
--- a/vllm/assets/image.py
+++ b/vllm/assets/image.py
@@ -26,4 +26,4 @@ class ImageAsset:
        """
        image_path = get_vllm_public_assets(filename=f"{self.name}.pt",
                                            s3_prefix=VLM_IMAGES_DIR)
-        return torch.load(image_path, map_location="cpu")
+        return torch.load(image_path, map_location="cpu", weights_only=True)
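
Note: weights_only=True restricts torch.load's unpickler to tensors and
primitive containers, rejecting payloads that would execute arbitrary
pickled code. A minimal sketch of the behavior this patch relies on
(plain torch, no vllm imports):

    import io

    import torch

    # Round-trip a tensor through an in-memory buffer; with
    # weights_only=True the restricted unpickler accepts it, while a
    # malicious object-bearing pickle would be rejected.
    buf = io.BytesIO()
    torch.save(torch.arange(4), buf)
    buf.seek(0)
    t = torch.load(buf, map_location="cpu", weights_only=True)
    assert t.tolist() == [0, 1, 2, 3]
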
diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py
index 21949874b..79ed61f35 100644
--- a/vllm/attention/backends/ipex_attn.py
@@ -18812,6 +18822,44 @@ index 3e2bb436d..6372dab72 100644
+    "MooncakeConnector",
+    "vllm.distributed.kv_transfer.kv_connector.simple_connector",
+    "SimpleConnector")
diff --git a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
index 8e4358672..69049ec76 100644
--- a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
@@ -1,12 +1,13 @@
 import json
 import os
-import pickle
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
 from typing import Optional, Union
 
 import torch
 import zmq
+from safetensors.torch import load as safetensors_load
+from safetensors.torch import save as safetensors_save
 
 from vllm.config import KVTransferConfig
 from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase
@@ -235,14 +236,13 @@ class MooncakePipe(KVPipeBase):
         return hash(tensor.data_ptr())
 
     def _send_impl(self, tensor: torch.Tensor) -> None:
-        """Implement the tensor sending logic."""
-        value_bytes = pickle.dumps(tensor)
-        self.transfer_engine.send_bytes(value_bytes)
+        """Implement the tensor sending logic using safetensors."""
+        self.transfer_engine.send_bytes(safetensors_save({"tensor": tensor}))
 
     def _recv_impl(self) -> torch.Tensor:
-        """Implement the tensor receiving logic."""
+        """Implement the tensor receiving logic using safetensors."""
         data = self.transfer_engine.recv_bytes()
-        return pickle.loads(data)
+        return safetensors_load(data)["tensor"].to(self.device)
 
     def send_tensor(self, tensor: Optional[torch.Tensor]) -> None:
         """Send tensor to the target process."""
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 5b9236f8c..a837c1dc5 100644
--- a/vllm/distributed/parallel_state.py
@@ -19375,10 +19423,19 @@ index fadf297e9..e4e0803c6 100644
    def get_engine_class() -> Type[LLMEngine]:
        if envs.VLLM_USE_V1:
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 2e45b4742..c72581e6c 100644
index 2e45b4742..b468085b5 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -16,7 +16,7 @@ from http import HTTPStatus
@@ -3,6 +3,8 @@ import atexit
 import importlib
 import inspect
 import multiprocessing
+# Fix https://avd.aquasec.com/nvd/cve-2022-42919
+multiprocessing.util.abstract_sockets_supported = False
 import os
 import re
 import signal
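
Note: CVE-2022-42919 is a local privilege escalation in CPython's
multiprocessing forkserver on Linux, reachable via abstract-namespace Unix
sockets, which are not protected by filesystem permissions. The added lines
force filesystem-backed sockets instead; a minimal sketch of the knob being
flipped:

    import multiprocessing.util

    # Abstract sockets live in a kernel namespace with no permission bits;
    # disabling support makes multiprocessing create socket files on disk,
    # which ordinary file permissions protect.
    multiprocessing.util.abstract_sockets_supported = False
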
@@ -16,7 +18,7 @@ from http import HTTPStatus
 from typing import AsyncIterator, Optional, Set, Tuple
 
 import uvloop
@@ -19387,7 +19444,7 @@ index 2e45b4742..c72581e6c 100644
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, Response, StreamingResponse
@@ -44,11 +44,15 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
@@ -44,11 +46,15 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              CompletionResponse,
                                              DetokenizeRequest,
                                              DetokenizeResponse,
@@ -19403,7 +19460,7 @@ index 2e45b4742..c72581e6c 100644
                                              PoolingRequest, PoolingResponse,
                                              ScoreRequest, ScoreResponse,
                                              TokenizeRequest,
@@ -58,7 +62,9 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
@@ -58,7 +64,9 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
@@ -19414,7 +19471,7 @@ index 2e45b4742..c72581e6c 100644
 from vllm.entrypoints.openai.serving_pooling import OpenAIServingPooling
 from vllm.entrypoints.openai.serving_score import OpenAIServingScores
 from vllm.entrypoints.openai.serving_tokenization import (
@@ -133,32 +139,21 @@ async def build_async_engine_client_from_engine_args(
@@ -133,32 +141,21 @@ async def build_async_engine_client_from_engine_args(
     Returns the Client or None if the creation failed.
     """
 
@@ -19458,7 +19515,7 @@ index 2e45b4742..c72581e6c 100644
     else:
         if "PROMETHEUS_MULTIPROC_DIR" not in os.environ:
             # Make TemporaryDirectory for prometheus multiprocessing
@@ -280,6 +275,10 @@ def base(request: Request) -> OpenAIServing:
@@ -280,6 +277,10 @@ def base(request: Request) -> OpenAIServing:
     return tokenization(request)
 
 
@@ -19469,7 +19526,7 @@ index 2e45b4742..c72581e6c 100644
 def chat(request: Request) -> Optional[OpenAIServingChat]:
     return request.app.state.openai_serving_chat
 
@@ -315,6 +314,12 @@ async def health(raw_request: Request) -> Response:
@@ -315,6 +316,12 @@ async def health(raw_request: Request) -> Response:
     return Response(status_code=200)
 
 
@@ -19482,7 +19539,7 @@ index 2e45b4742..c72581e6c 100644
 @router.post("/tokenize")
 @with_cancellation
 async def tokenize(request: TokenizeRequest, raw_request: Request):
@@ -347,10 +352,10 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request):
@@ -347,10 +354,10 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request):
 
 @router.get("/v1/models")
 async def show_available_models(raw_request: Request):
@@ -19496,7 +19553,7 @@ index 2e45b4742..c72581e6c 100644
 
 
 @router.get("/version")
@@ -488,6 +493,54 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request):
@@ -488,6 +495,54 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request):
     return await create_score(request, raw_request)
 
 
@@ -19551,7 +19608,7 @@ index 2e45b4742..c72581e6c 100644
 if envs.VLLM_TORCH_PROFILER_DIR:
     logger.warning(
         "Torch Profiler is enabled in the API server. This should ONLY be "
@@ -516,26 +569,22 @@ if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
@@ -516,26 +571,22 @@ if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
     @router.post("/v1/load_lora_adapter")
     async def load_lora_adapter(request: LoadLoraAdapterRequest,
                                 raw_request: Request):
@@ -19588,7 +19645,7 @@ index 2e45b4742..c72581e6c 100644
 
         return Response(status_code=200, content=response)
 
@@ -639,13 +688,18 @@ def init_app_state(
@@ -639,13 +690,18 @@ def init_app_state(
     resolved_chat_template = load_chat_template(args.chat_template)
     logger.info("Using supplied chat template:\n%s", resolved_chat_template)
 
@@ -19610,7 +19667,7 @@ index 2e45b4742..c72581e6c 100644
         request_logger=request_logger,
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
@@ -657,16 +711,14 @@ def init_app_state(
@@ -657,16 +713,14 @@ def init_app_state(
     state.openai_serving_completion = OpenAIServingCompletion(
         engine_client,
         model_config,
@@ -19629,7 +19686,7 @@ index 2e45b4742..c72581e6c 100644
         request_logger=request_logger,
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
@@ -674,7 +726,7 @@ def init_app_state(
@@ -674,7 +728,7 @@ def init_app_state(
     state.openai_serving_embedding = OpenAIServingEmbedding(
         engine_client,
         model_config,
@@ -19638,7 +19695,7 @@ index 2e45b4742..c72581e6c 100644
         request_logger=request_logger,
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
@@ -682,18 +734,18 @@ def init_app_state(
@@ -682,18 +736,18 @@ def init_app_state(
     state.openai_serving_scores = OpenAIServingScores(
         engine_client,
         model_config,
@@ -19660,7 +19717,7 @@ index 2e45b4742..c72581e6c 100644
 
 
 def create_server_socket(addr: Tuple[str, int]) -> socket.socket:
@@ -715,11 +767,11 @@ async def run_server(args, **uvicorn_kwargs) -> None:
@@ -715,11 +769,11 @@ async def run_server(args, **uvicorn_kwargs) -> None:
     if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
         ToolParserManager.import_tool_parser(args.tool_parser_plugin)
 
@@ -19675,7 +19732,7 @@ index 2e45b4742..c72581e6c 100644
 
     # workaround to make sure that we bind the port before the engine is set up.
     # This avoids race conditions with ray.
@@ -765,6 +817,8 @@ async def run_server(args, **uvicorn_kwargs) -> None:
@@ -765,6 +819,8 @@ async def run_server(args, **uvicorn_kwargs) -> None:
 if __name__ == "__main__":
     # NOTE(simon):
     # This section should be in sync with vllm/scripts.py for CLI entrypoints.
@@ -21049,7 +21106,7 @@ index dde347b78..93ad4651f 100644
     @classmethod
     def create_dummy_lora_weights(
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 5c0e4e5cb..5b7225bdc 100644
index 5c0e4e5cb..7e57d9c85 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -4,7 +4,7 @@ import math
@@ -21078,6 +21135,16 @@ index 5c0e4e5cb..5b7225bdc 100644
         if os.path.isfile(lora_tensor_path):
             tensors: Dict[str, torch.Tensor] = {}
             # Find unexpected modules.
@@ -280,7 +281,8 @@ class LoRAModel(AdapterModel):
                 new_embeddings_tensor_path)
         elif os.path.isfile(new_embeddings_bin_file_path):
             embeddings = torch.load(new_embeddings_bin_file_path,
-                                    map_location=device)
+                                    map_location=device,
+                                    weights_only=True)
 
         return cls.from_lora_tensors(
             lora_model_id=get_lora_id()
diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py
index edf4ba565..ddd42ae93 100644
--- a/vllm/lora/peft_helper.py
@@ -22555,6 +22622,39 @@ index f2d9293b3..b3d7d6977 100644
     if isinstance(load_config.load_format, type):
         return load_config.load_format(load_config)
 
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 8aa0c98df..34c86a23a 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -93,7 +93,7 @@ def convert_bin_to_safetensor_file(
     pt_filename: str,
     sf_filename: str,
 ) -> None:
-    loaded = torch.load(pt_filename, map_location="cpu")
+    loaded = torch.load(pt_filename, map_location="cpu", weights_only=True)
     if "state_dict" in loaded:
         loaded = loaded["state_dict"]
     shared = _shared_pointers(loaded)
@@ -381,7 +381,9 @@ def np_cache_weights_iterator(
                     disable=not enable_tqdm,
                     bar_format=_BAR_FORMAT,
             ):
-                state = torch.load(bin_file, map_location="cpu")
+                state = torch.load(bin_file,
+                                   map_location="cpu",
+                                   weights_only=True)
                 for name, param in state.items():
                     param_path = os.path.join(np_folder, name)
                     with open(param_path, "wb") as f:
@@ -447,7 +449,7 @@ def pt_weights_iterator(
             disable=not enable_tqdm,
             bar_format=_BAR_FORMAT,
     ):
-        state = torch.load(bin_file, map_location="cpu")
+        state = torch.load(bin_file, map_location="cpu", weights_only=True)
         yield from state.items()
         del state
         torch.cuda.empty_cache()
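
Note: pt_weights_iterator streams checkpoint files one at a time so only a
single state dict is resident in memory. A hypothetical stripped-down
version of the pattern (iter_bin_weights is an illustrative name, not vllm
API):

    from typing import Iterable, Iterator, Tuple

    import torch

    def iter_bin_weights(
            bin_files: Iterable[str]) -> Iterator[Tuple[str, torch.Tensor]]:
        for bin_file in bin_files:
            # weights_only=True keeps the legacy .bin path free of
            # arbitrary pickle execution.
            state = torch.load(bin_file, map_location="cpu",
                               weights_only=True)
            yield from state.items()
            del state  # drop the dict before loading the next file
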
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 9437ad968..2e649f10c 100644
--- a/vllm/model_executor/models/aria.py
@@ -34955,6 +35055,19 @@ index 17f604ea0..c50eb2cef 100644
+    # general plugins, we only need to execute the loaded functions
+    for func in plugins.values():
+        func()
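
Note: the added loop assumes plugins maps entry-point names to callables;
general plugins take effect simply by being invoked. A toy illustration
(the registry dict here is made up, not vllm's):

    from typing import Callable, Dict

    def activate_all(plugins: Dict[str, Callable[[], None]]) -> None:
        # A general plugin carries its effect in the call itself,
        # e.g. registering a model class on import.
        for func in plugins.values():
            func()

    activate_all({"demo": lambda: print("plugin ran")})
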
diff --git a/vllm/prompt_adapter/utils.py b/vllm/prompt_adapter/utils.py
index 473b87c89..8b2732923 100644
--- a/vllm/prompt_adapter/utils.py
+++ b/vllm/prompt_adapter/utils.py
@@ -89,6 +89,7 @@ def load_peft_weights(model_id: str,
         adapters_weights = safe_load_file(filename, device=device)
     else:
         adapters_weights = torch.load(filename,
-                                      map_location=torch.device(device))
+                                      map_location=torch.device(device),
+                                      weights_only=True)
 
     return adapters_weights
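
Note: load_peft_weights prefers the safetensors file and only falls back to
torch.load for legacy .bin adapters; the patch constrains that fallback. A
hedged sketch of the overall shape (load_adapter_weights is an illustrative
name, not the vllm function):

    import torch
    from safetensors.torch import load_file as safe_load_file

    def load_adapter_weights(filename: str, device: str = "cpu"):
        if filename.endswith(".safetensors"):
            # No pickle involved at all.
            return safe_load_file(filename, device=device)
        # Legacy .bin path: restrict the unpickler to plain weights.
        return torch.load(filename, map_location=torch.device(device),
                          weights_only=True)
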
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index fc77f3ca5..605c09b8d 100644
--- a/vllm/sampling_params.py