vLLM: set convert_to_half to False by default (#13172)

* init * remove * fix
2025-05-21 18:41:28 +08:00 · 2025-05-21 18:41:28 +08:00 · 154af7d7f7
commit 154af7d7f7
parent 1576347892
2 changed files with 3 additions and 1 deletion
--- a/python/llm/src/ipex_llm/transformers/convert.py
+++ b/python/llm/src/ipex_llm/transformers/convert.py
@@ -293,6 +293,7 @@ def convert_vllm(module, qtype, in_features, out_features, mp_group, cur_qtype,
                mp_group=mp_group,
                optimize_lm_head=optimize_lm_head,
                enable_scale_search=enable_scale_search,
+                conver_to_half=False,
            )
    return new_linear

@@ -589,6 +590,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                            optimize_lm_head=False,
                            act_order=act_order,
                            enable_scale_search=enable_scale_search,
+                            conver_to_half=False,
                        )
                        device = module.qweight.data.device
                        invalidInputError(device.type != "meta",
--- a/python/llm/src/ipex_llm/transformers/low_bit_linear.py
+++ b/python/llm/src/ipex_llm/transformers/low_bit_linear.py
@@ -654,7 +654,7 @@ class LowBitLinear(nn.Linear):
                else:
                    w = self.weight.data

-                if use_batch_forward(x_2d, self.weight.qtype, self.out_len):
+                if use_batch_forward(x_2d, self.weight.qtype, self.out_len) and self.conver_to_half:
                    import xe_batch
                    result = xe_batch.batch_forward(x_2d, w, self.qtype)
                elif not is_training and self.conver_to_half \