diff --git a/python/llm/src/bigdl/llm/transformers/convert.py b/python/llm/src/bigdl/llm/transformers/convert.py
index de9127ed..38910d3d 100644
--- a/python/llm/src/bigdl/llm/transformers/convert.py
+++ b/python/llm/src/bigdl/llm/transformers/convert.py
@@ -238,6 +238,9 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                             new_linear._parameters['bias'] = nn.Parameter(module.bias.data)\
                                 .to(device)
                     elif qtype not in [ggml_tensor_qtype["fp16"], ggml_tensor_qtype["bf16"]]:
+                        if in_features % 64 != 0:
+                            # now our kernel requires in_features is a multiple of 64
+                            continue
                         new_linear = LowBitLinear(
                             in_features,
                             out_features,
diff --git a/python/llm/src/bigdl/llm/transformers/low_bit_linear.py b/python/llm/src/bigdl/llm/transformers/low_bit_linear.py
index ffeb6cba..08dbab8f 100644
--- a/python/llm/src/bigdl/llm/transformers/low_bit_linear.py
+++ b/python/llm/src/bigdl/llm/transformers/low_bit_linear.py
@@ -478,7 +478,11 @@ class LowBitLinear(nn.Linear):
             if x_2d.is_contiguous() is False:
                 x_2d = x_2d.contiguous()
 
-            input_seq_size = x_shape[1]
+            if len(x_shape) == 3:
+                input_seq_size = x_shape[1]
+            elif len(x_shape) < 3:
+                input_seq_size = 1
+
             if is_training:
                 # training path
                 if x_2d.requires_grad:
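
Note (illustrative only, not part of the patch): the sketch below restates the intent of the two new guards outside the diff context. The helper names should_convert and infer_input_seq_size are hypothetical stand-ins, not BigDL APIs, and it assumes the usual (batch, seq_len, hidden) layout for 3-D inputs.

# Standalone sketch of the two guards introduced above (hypothetical helpers).
import torch
import torch.nn as nn


def should_convert(linear: nn.Linear) -> bool:
    # Mirrors the convert.py guard: the low-bit kernel currently requires
    # in_features to be a multiple of 64, so other layers are skipped.
    return linear.in_features % 64 == 0


def infer_input_seq_size(x: torch.Tensor) -> int:
    # Mirrors the low_bit_linear.py change: a 3-D input is assumed to be
    # (batch, seq_len, hidden), so seq_len is dim 1; a 2-D (or 1-D) input
    # has no sequence dimension and falls back to 1.
    if x.dim() == 3:
        return x.shape[1]
    return 1


if __name__ == "__main__":
    print(should_convert(nn.Linear(4096, 4096)))            # True  (4096 % 64 == 0)
    print(should_convert(nn.Linear(1000, 4096)))            # False (1000 % 64 != 0)
    print(infer_input_seq_size(torch.randn(2, 16, 4096)))   # 16
    print(infer_input_seq_size(torch.randn(8, 4096)))       # 1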