From 154af7d7f75e40fb89c48347673df04461f437ca Mon Sep 17 00:00:00 2001
From: Xiangyu Tian <109123695+xiangyuT@users.noreply.github.com>
Date: Wed, 21 May 2025 18:41:28 +0800
Subject: [PATCH] vLLM: set `conver_to_half` to False by default (#13172)

* init

* remove

* fix
---
 python/llm/src/ipex_llm/transformers/convert.py        | 2 ++
 python/llm/src/ipex_llm/transformers/low_bit_linear.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/llm/src/ipex_llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py
index 4db9176e..c3395cef 100644
--- a/python/llm/src/ipex_llm/transformers/convert.py
+++ b/python/llm/src/ipex_llm/transformers/convert.py
@@ -293,6 +293,7 @@ def convert_vllm(module, qtype, in_features, out_features, mp_group, cur_qtype,
                 mp_group=mp_group,
                 optimize_lm_head=optimize_lm_head,
                 enable_scale_search=enable_scale_search,
+                conver_to_half=False,
             )
     return new_linear
 
@@ -589,6 +590,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                             optimize_lm_head=False,
                             act_order=act_order,
                             enable_scale_search=enable_scale_search,
+                            conver_to_half=False,
                         )
                         device = module.qweight.data.device
                         invalidInputError(device.type != "meta",
diff --git a/python/llm/src/ipex_llm/transformers/low_bit_linear.py b/python/llm/src/ipex_llm/transformers/low_bit_linear.py
index 2712d82f..292c765a 100644
--- a/python/llm/src/ipex_llm/transformers/low_bit_linear.py
+++ b/python/llm/src/ipex_llm/transformers/low_bit_linear.py
@@ -654,7 +654,7 @@ class LowBitLinear(nn.Linear):
                 else:
                     w = self.weight.data
 
-                if use_batch_forward(x_2d, self.weight.qtype, self.out_len):
+                if use_batch_forward(x_2d, self.weight.qtype, self.out_len) and self.conver_to_half:
                     import xe_batch
                     result = xe_batch.batch_forward(x_2d, w, self.qtype)
                 elif not is_training and self.conver_to_half \