From 154af7d7f75e40fb89c48347673df04461f437ca Mon Sep 17 00:00:00 2001 From: Xiangyu Tian <109123695+xiangyuT@users.noreply.github.com> Date: Wed, 21 May 2025 18:41:28 +0800 Subject: [PATCH] vLLM: set convert_to_half to False by default (#13172) * init * remove * fix --- python/llm/src/ipex_llm/transformers/convert.py | 2 ++ python/llm/src/ipex_llm/transformers/low_bit_linear.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/python/llm/src/ipex_llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py index 4db9176e..c3395cef 100644 --- a/python/llm/src/ipex_llm/transformers/convert.py +++ b/python/llm/src/ipex_llm/transformers/convert.py @@ -293,6 +293,7 @@ def convert_vllm(module, qtype, in_features, out_features, mp_group, cur_qtype, mp_group=mp_group, optimize_lm_head=optimize_lm_head, enable_scale_search=enable_scale_search, + conver_to_half=False, ) return new_linear @@ -589,6 +590,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None, optimize_lm_head=False, act_order=act_order, enable_scale_search=enable_scale_search, + conver_to_half=False, ) device = module.qweight.data.device invalidInputError(device.type != "meta", diff --git a/python/llm/src/ipex_llm/transformers/low_bit_linear.py b/python/llm/src/ipex_llm/transformers/low_bit_linear.py index 2712d82f..292c765a 100644 --- a/python/llm/src/ipex_llm/transformers/low_bit_linear.py +++ b/python/llm/src/ipex_llm/transformers/low_bit_linear.py @@ -654,7 +654,7 @@ class LowBitLinear(nn.Linear): else: w = self.weight.data - if use_batch_forward(x_2d, self.weight.qtype, self.out_len): + if use_batch_forward(x_2d, self.weight.qtype, self.out_len) and self.conver_to_half: import xe_batch result = xe_batch.batch_forward(x_2d, w, self.qtype) elif not is_training and self.conver_to_half \