Fix qwen2 & int4 on NPU (#11646)
parent 1b3b46e54d
commit 777e61d8c8
1 changed file with 6 additions and 0 deletions
@@ -57,6 +57,12 @@ def replace_with_QuantizedLinear(layer, qtype, device):
     from ipex_llm.ggml.quantize import ggml_tensor_qtype
     iqtype = ggml_tensor_qtype[qtype]
     if isinstance(layer, torch.nn.Linear):
+        if qtype == "sym_int4_rtn":
+            # workaround for qwen2 & int4
+            if (layer.in_features == 3584 and layer.out_features == 152064) or \
+                    (layer.in_features == 18944 and layer.out_features == 3584):
+                qtype = "sym_int8_rtn"
+                iqtype = ggml_tensor_qtype[qtype]
         qweights, scale = ggml_convert_qtype(layer.weight.data, iqtype, device=device)
         return QuantizedLinear(qweights, scale, layer.bias)
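The added branch only affects the sym_int4_rtn path: linear layers with shapes 3584 -> 152064 or 18944 -> 3584 (which appear to correspond to Qwen2-7B's lm_head and MLP down projection) are requantized with sym_int8_rtn instead of int4. Below is a minimal, self-contained sketch of that shape-based fallback, separate from the ipex_llm code above; the helper name pick_npu_qtype is hypothetical, and the mapping of these shapes to specific Qwen2-7B layers is an assumption.

# Minimal sketch (not the ipex_llm implementation) of the shape-based fallback
# introduced by this commit. Layers matching the two listed shapes are assumed
# to be the ones that misbehave with sym_int4_rtn on the NPU, so they fall back
# to sym_int8_rtn; everything else keeps its requested qtype.

def pick_npu_qtype(in_features: int, out_features: int, qtype: str) -> str:
    # Only the int4 RTN path needs the workaround; other qtypes pass through.
    if qtype != "sym_int4_rtn":
        return qtype
    int8_fallback_shapes = {(3584, 152064), (18944, 3584)}
    if (in_features, out_features) in int8_fallback_shapes:
        return "sym_int8_rtn"
    return qtype

if __name__ == "__main__":
    print(pick_npu_qtype(3584, 152064, "sym_int4_rtn"))  # -> sym_int8_rtn
    print(pick_npu_qtype(3584, 3584, "sym_int4_rtn"))    # -> sym_int4_rtn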