From 777e61d8c85122c6aefd3dbbc9020db741c0510a Mon Sep 17 00:00:00 2001
From: binbin Deng <108676127+plusbang@users.noreply.github.com>
Date: Wed, 24 Jul 2024 13:14:39 +0800
Subject: [PATCH] Fix qwen2 & int4 on NPU (#11646)

---
 python/llm/src/ipex_llm/transformers/npu_models/convert.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert.py b/python/llm/src/ipex_llm/transformers/npu_models/convert.py
index 7129cc54..cd4b5fed 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/convert.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/convert.py
@@ -57,6 +57,12 @@ def replace_with_QuantizedLinear(layer, qtype, device):
     from ipex_llm.ggml.quantize import ggml_tensor_qtype
     iqtype = ggml_tensor_qtype[qtype]
     if isinstance(layer, torch.nn.Linear):
+        if qtype == "sym_int4_rtn":
+            # workaround for qwen2 & int4
+            if (layer.in_features == 3584 and layer.out_features == 152064) or \
+                    (layer.in_features == 18944 and layer.out_features == 3584):
+                qtype = "sym_int8_rtn"
+                iqtype = ggml_tensor_qtype[qtype]
         qweights, scale = ggml_convert_qtype(layer.weight.data, iqtype,
                                              device=device)
         return QuantizedLinear(qweights, scale, layer.bias)
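
Note on the hunk above: for models requested as sym_int4_rtn, it keeps two specific linear shapes at sym_int8_rtn instead. The 3584 x 152064 and 18944 x 3584 shapes match Qwen2-7B's lm_head and down_proj projections; the patch itself only labels this a "workaround for qwen2 & int4". Below is a minimal standalone sketch of that fallback rule, for illustration only. The helper name select_npu_qtype is hypothetical; in the patch the check lives inline in replace_with_QuantizedLinear and reads the shapes from the torch.nn.Linear layer being converted.

def select_npu_qtype(in_features: int, out_features: int, qtype: str) -> str:
    # Mirror of the patch's workaround: under sym_int4_rtn, fall back to
    # sym_int8_rtn for the Qwen2-7B lm_head (3584 -> 152064) and
    # down_proj (18944 -> 3584) shapes; every other layer keeps the
    # requested quantization type.
    if qtype == "sym_int4_rtn":
        if (in_features == 3584 and out_features == 152064) or \
                (in_features == 18944 and out_features == 3584):
            return "sym_int8_rtn"
    return qtype

# Usage: the lm_head shape triggers the int8 fallback, a regular
# attention projection does not.
print(select_npu_qtype(3584, 152064, "sym_int4_rtn"))  # sym_int8_rtn
print(select_npu_qtype(3584, 3584, "sym_int4_rtn"))    # sym_int4_rtn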