From 278b191dc16991f86064e5b2e150e1edb0738e8e Mon Sep 17 00:00:00 2001
From: Guancheng Fu <110874468+gc-fu@users.noreply.github.com>
Date: Thu, 22 Aug 2024 17:45:26 +0800
Subject: [PATCH] Fix optimize lm head error (#11899)

---
 python/llm/src/ipex_llm/transformers/convert.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/llm/src/ipex_llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py
index 4f44b058..a36d668e 100644
--- a/python/llm/src/ipex_llm/transformers/convert.py
+++ b/python/llm/src/ipex_llm/transformers/convert.py
@@ -225,6 +225,8 @@ def convert_vllm(module, qtype, in_features, out_features, mp_group, cur_qtype,
     from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
     from ipex_llm.transformers.low_bit_linear import LowBitLinear, \
         FP16Linear, BF16Linear, vLLMLowBitLinear, vLLMFP16Linear, vLLMBF16Linear
+    # Currently, vLLM does not support optimize_lm_head = True
+    optimize_lm_head = False
     if isinstance(module, ParallelLMHead):
         if qtype == ggml_tensor_qtype["fp16"]:
             new_linear = FP16Linear(
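
For illustration only (not part of the patch): a minimal, self-contained sketch of the pattern the fix applies, namely force-disabling an option the vLLM path does not support before dispatching on the module type. ToyLMHead, ToyLinear, and convert_module are hypothetical stand-ins for vLLM's ParallelLMHead and ipex-llm's low-bit linear classes; they are assumptions made so the sketch runs without vLLM or ipex-llm installed.

# Hypothetical sketch of the fix pattern: override an unsupported flag
# before constructing the replacement layer, mirroring the convert_vllm change.
from dataclasses import dataclass


@dataclass
class ToyLMHead:          # stand-in for vLLM's ParallelLMHead
    in_features: int
    out_features: int


@dataclass
class ToyLinear:          # stand-in for ipex-llm's low-bit linear layers
    in_features: int
    out_features: int
    optimize_lm_head: bool


def convert_module(module, optimize_lm_head=True):
    # Currently, the vLLM path does not support optimize_lm_head = True,
    # so the flag is overridden unconditionally on this code path.
    optimize_lm_head = False
    if isinstance(module, ToyLMHead):
        return ToyLinear(module.in_features, module.out_features,
                         optimize_lm_head=optimize_lm_head)
    return module


if __name__ == "__main__":
    head = ToyLMHead(in_features=4096, out_features=32000)
    converted = convert_module(head, optimize_lm_head=True)
    print(converted)  # optimize_lm_head is always False after conversion

In the real patch the override sits inside convert_vllm, after the vLLM and ipex-llm imports and before the isinstance(module, ParallelLMHead) check, so every caller is affected regardless of the value it passed in.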