#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import torch
from typing import Optional, Tuple, Union

from vllm.distributed import (tensor_model_parallel_gather,
                              tensor_model_parallel_all_gather)
from vllm.logger import init_logger
from vllm.model_executor.models.llama import LlamaMLP, LlamaAttention, LlamaForCausalLM
from vllm.model_executor.models.qwen2 import Qwen2MLP, Qwen2Attention, Qwen2ForCausalLM
from vllm.model_executor.models.qwen import QWenMLP, QWenAttention, QWenLMHeadModel
from vllm.model_executor.models.baichuan import BaiChuanMLP, BaiChuanAttention
from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM
from vllm.model_executor.models.chatglm import GLMMLP, GLMAttention, ChatGLMForCausalLM
from vllm.model_executor.model_loader import get_model
from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding)
from vllm.attention import AttentionMetadata
from vllm.config import DeviceConfig

from ipex_llm.transformers.low_bit_linear import LowBitLinear


def _sample_get_logits(
    self,
    hidden_states: torch.Tensor,
    lm_head: Union[VocabParallelEmbedding, LowBitLinear],
    embedding_bias: Optional[torch.Tensor],
) -> torch.Tensor:
    # HINT: we do not support other types of quantization for now
    # TODO: we may encounter tie-word-embedding problems
    if isinstance(lm_head, VocabParallelEmbedding):
        logits = lm_head.linear_method.apply(lm_head,
                                             hidden_states,
                                             bias=embedding_bias)
    else:
        logits = lm_head(hidden_states)
        if embedding_bias is not None:
            logits += embedding_bias
    if self.use_gather:
        logits = tensor_model_parallel_gather(logits)
    else:
        logits = tensor_model_parallel_all_gather(logits)
    if logits is not None:
        logits = logits[:, :self.org_vocab_size]
    return logits


def _model_sample_convert():
    from vllm.model_executor.layers.logits_processor import LogitsProcessor
    setattr(LogitsProcessor, "_get_logits", _sample_get_logits)


def _ipex_llm_convert(load_in_low_bit):
    from vllm.worker.xpu_model_runner import XPUModelRunner
    from ipex_llm.vllm.xpu.ipex_llm_wrapper import get_ipex_llm_wrapper
    from ipex_llm.vllm.xpu.ipex_llm_v1_wrapper import get_ipex_llm_v1_wrapper
    import vllm.executor.ray_utils as ray_utils_v0
    import vllm.v1.executor.ray_utils as ray_utils_v1
    from vllm.v1.worker.gpu_model_runner import GPUModelRunner
    setattr(XPUModelRunner, "load_model", get_load_function(load_in_low_bit))
    setattr(GPUModelRunner, "load_model", get_load_function(load_in_low_bit))
    setattr(ray_utils_v0, "RayWorkerWrapper", get_ipex_llm_wrapper(load_in_low_bit))
    setattr(ray_utils_v1, "RayWorkerWrapper", get_ipex_llm_v1_wrapper(load_in_low_bit))

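# Illustrative usage sketch (not an official entry point; the module path below is
# assumed from this package layout, and "sym_int4" is just one low-bit format
# accepted by ipex_llm.optimize_model). The patch must be applied before the vLLM
# engine is constructed so the patched load_model and RayWorkerWrapper take effect:
#
#     from ipex_llm.vllm.xpu.model_convert import _ipex_llm_convert
#     _ipex_llm_convert(load_in_low_bit="sym_int4")
#     # ... then build the vLLM engine as usual; each worker's load_model will
#     # route weight loading through ipex_llm.optimize_model.
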

def get_load_function(low_bit):
    def _ipex_llm_load_model(self) -> None:
        if "gemma-3" not in self.model_config.model.lower():
            _model_sample_convert()

        # from vllm.utils import measure_device_memory
        from vllm.utils import DeviceMemoryProfiler
        with DeviceMemoryProfiler() as m:
            import os
            from dataclasses import replace
            new_device_config = DeviceConfig("cpu")
            new_vllm_config = replace(self.vllm_config, device_config=new_device_config)
            # When loading a low-bit model, all the optimizations should have been
            # applied already, so the conversions below can be skipped.
            self.model = get_model(
                vllm_config=new_vllm_config
            )
            if self.vllm_config.model_config.low_bit_model_path is None:
                if ("qwen" in self.vllm_config.model_config.model.lower() or
                        "baichuan" in self.vllm_config.model_config.model.lower() or
                        "codegeex4-all" in self.vllm_config.model_config.model.lower() or
                        "chatglm" in self.vllm_config.model_config.model.lower()) and \
                        "gptq" not in self.model_config.model.lower() and \
                        "awq" not in self.model_config.model.lower():
                    self.model.apply(padding_mlp)
                from ipex_llm import optimize_model
                not_convert_last_mlp = os.getenv("IPEX_LLM_NOT_CONVERT_LAST_MLP", None)
                if not_convert_last_mlp is not None:
                    # only used to avoid NaN values in the last MLP forward when running glm4-9b-chat
                    modules = ["35.mlp", "36.mlp", "37.mlp", "38.mlp", "39.mlp"]
                else:
                    modules = None
                not_convert_o_proj = os.getenv("IPEX_LLM_NOT_CONVERT_O_PROJ", None)
                if not_convert_o_proj is not None:
                    # only used to avoid NaN values in o_proj when running DeepSeek-R1-Distill-Qwen-14B
                    modules = ["o_proj"]
                if "minicpm" in self.vllm_config.model_config.model.lower():
                    modules = ["vpm", "resampler"]
                if "internvl2" in self.vllm_config.model_config.model.lower():
                    modules = ["vision_model", "mlp1"]
                if "deepseek-v2" in self.vllm_config.model_config.model.lower():
                    modules = ["down_proj"]
                optimize_model(self.model,
                               low_bit=low_bit,
                               torch_dtype=self.vllm_config.model_config.dtype,
                               modules_to_not_convert=modules)
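                # Submodules whose names match entries in modules_to_not_convert are kept
                # in the original dtype, while the remaining supported layers are converted
                # to the requested low-bit format.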
local_rank = os.environ["LOCAL_RANK"] saved_path = os.path.join(self.vllm_config.model_config.low_bit_save_path, str(local_rank)) self.model.save_low_bit(saved_path) self.model = self.model.to(device=self.vllm_config.device_config.device, dtype=self.vllm_config.model_config.dtype) self.model_memory_usage = m.consumed_memory logger = init_logger(__name__) logger.info("Loading model weights took %.4f GB", self.model_memory_usage / float(2**30)) return _ipex_llm_load_model def padding_mlp(module: torch.nn.Module): mlp_gate_up_name = None mlp_down_name = None if isinstance(module, Qwen2MLP): mlp_gate_up_name = "gate_up_proj" mlp_down_name = "down_proj" elif isinstance(module, GLMMLP): mlp_gate_up_name = "dense_h_to_4h" mlp_down_name = "dense_4h_to_h" elif isinstance(module, BaiChuanMLP): mlp_gate_up_name = "gate_up_proj" mlp_down_name = "down_proj" else: return hidden_size = getattr(module, mlp_down_name).output_size # devide by rank intermediate_size = getattr(module, mlp_down_name).input_size_per_partition padding_size = 256 padding_intermediate_size = \ (intermediate_size + padding_size - 1) // padding_size * padding_size if intermediate_size % padding_size == 0: return gate_up_weight = getattr(module, mlp_gate_up_name).weight.data new_gate_up_weight = torch.zeros([padding_intermediate_size * 2, hidden_size], dtype=gate_up_weight.dtype, device=gate_up_weight.device) # merge_gate_up_weight new_gate_up_weight[:intermediate_size, :] = gate_up_weight[:intermediate_size, :] new_gate_up_weight[padding_intermediate_size:padding_intermediate_size+intermediate_size, :] = gate_up_weight[intermediate_size:, :] # noqa getattr(module, mlp_gate_up_name).output_size_per_partition = padding_intermediate_size * 2 getattr(module, mlp_gate_up_name).output_size = padding_intermediate_size * 2 getattr(module, mlp_gate_up_name).weight = \ torch.nn.Parameter(new_gate_up_weight, requires_grad=False) down_weight = getattr(module, mlp_down_name).weight.data new_down_weight = torch.zeros([hidden_size, padding_intermediate_size], dtype=down_weight.dtype, device=down_weight.device) new_down_weight[:, :intermediate_size] = down_weight getattr(module, mlp_down_name).input_size_per_partition = padding_intermediate_size getattr(module, mlp_down_name).input_size = padding_intermediate_size getattr(module, mlp_down_name).weight = torch.nn.Parameter(new_down_weight, requires_grad=False)