parent
a54cd767b1
commit
20e9742fa0
3 changed files with 62 additions and 57 deletions
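Summary of the change (as read from the hunks below): the use_flash_attention and use_esimd_sdp helpers are moved out of the LLaMA module into bigdl.llm.transformers.models.utils (use_flash_attention now accepts both 3-D and 4-D query tensors), the LLaMA code re-imports them from the new location, and core_attn_forward_8eb45c now gates its scaled_dot_product_attention path with use_flash_attention while dropping the redundant attention_mask argument.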
@@ -23,6 +23,7 @@ from typing import Optional, Tuple, List
 import torch.nn.functional as F
 from transformers.modeling_outputs import BaseModelOutputWithPast
 from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache
+from bigdl.llm.transformers.models.utils import use_flash_attention
 from bigdl.llm.transformers.models.llama import get_ipex_version
 
 
@@ -365,11 +366,10 @@ def core_attn_forward_8eb45c(self, query_layer, key_layer, value_layer, attentio
     pytorch_major_version = int(torch.__version__.split('.')[0])
     if pytorch_major_version >= 2 and (query_layer.device.type == 'xpu' or query_layer.size(0) > 1):
         query_layer = query_layer.permute(1, 2, 0, 3)
-        if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]:
+        if attention_mask is None and use_flash_attention(query_layer):
             context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer,
                                                                              key_layer,
                                                                              value_layer,
-                                                                             attention_mask,
                                                                              is_causal=True)
         elif attention_mask is None:
             scaling_factor = 1 / math.sqrt(query_layer.size(-1))
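In the hunk above, the flash path is now gated by use_flash_attention (added to models/utils.py later in this diff) instead of a bare sequence-length comparison, and the attention_mask argument is dropped from the SDPA call. A minimal sketch of why that argument is unnecessary on this path; the tensor shapes here are assumptions for illustration only (the [batch, heads, seq, head_dim] layout produced by the permute above):

import torch
import torch.nn.functional as F

# Illustrative tensors only; 1 batch x 32 heads x 128 tokens x 64 head_dim is an assumption.
q = torch.randn(1, 32, 128, 64)
k = torch.randn(1, 32, 128, 64)
v = torch.randn(1, 32, 128, 64)

# is_causal=True tells SDPA to build the causal mask itself. PyTorch 2.x rejects a
# non-None attn_mask combined with is_causal=True, and this branch only runs when
# attention_mask is None, so the explicit mask argument can simply be removed.
out = F.scaled_dot_product_attention(q, k, v, is_causal=True)

# Passing a real mask together with is_causal=True would raise an error instead:
# F.scaled_dot_product_attention(q, k, v, attn_mask=mask, is_causal=True)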
@@ -42,6 +42,7 @@ from bigdl.llm.utils.common import invalidInputError
 from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache
 from bigdl.llm.transformers.models.utils import is_enough_kv_cache_room_4_31, apply_rotary_pos_emb
 from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu
+from bigdl.llm.transformers.models.utils import use_flash_attention, use_esimd_sdp
 from transformers.modeling_outputs import BaseModelOutputWithPast
 from bigdl.llm.transformers.low_bit_linear import SYM_INT4
 from bigdl.llm.ggml.quantize import ggml_tensor_qtype
@@ -508,61 +509,6 @@ def llama_attention_selective_batching_forward_4_31(
     return attn_output.to(original_dtype), attn_weights, updated_past_key_values
 
 
-def use_flash_attention(query):
-    bsz, q_len, _ = query.size()
-    # check whether ipex flash attention can be used
-    if bsz > 1:
-        # only use flash attention for batch_size = 1 now
-        # as flash attention doesn't support attn_mask in ipex 2.1,
-        # so it will cause output error for padded batch input
-        return False
-    if q_len == 1:
-        # now only use flash attention for first token
-        # as it seems have no performance benifit for rest token now
-        return False
-    if query.device.type != "xpu":
-        # ipex flash attention only support for xpu
-        return False
-    ipex_version = get_ipex_version()
-    if ipex_version <= "2.0.110+xpu":
-        # ipex flash attention is supported from ipex 2.1
-        return False
-    if not torch.xpu.has_xetla():
-        # ipex flash attention is only supported for xetla
-        # may update this later
-        return False
-    if query.dtype not in [torch.float32, torch.float16]:
-        # only use flash attention for fp32/fp16 input
-        return False
-    return True
-
-
-def use_esimd_sdp(q_len, head_dim, query_states):
-    if head_dim != 128:
-        # esimd_sdp only support head_dim = 128 now
-        return False
-    elif q_len != 1:
-        # esimd_sdp only support rest token now
-        return False
-    elif query_states.device.type != "xpu":
-        # esimd_sdp only support GPU now
-        return False
-    elif query_states.dtype != torch.float16:
-        # esimd_sdp only has optimization for FP16 now
-        return False
-    else:
-        device_name = torch.xpu.get_device_name(query_states.device.index)
-        if device_name.startswith("Intel(R) Arc(TM) A") or \
-                device_name.startswith("Intel(R) Data Center GPU Flex"):
-            import linear_fp16_esimd
-            if hasattr(linear_fp16_esimd, "sdp_forward"):
-                return True
-            else:
-                return False
-        else:
-            return False
-
-
 def native_sdp(query, key, value, attention_mask,
                bsz, q_len, kv_seq_len, head_dim, num_heads):
     attn_weights = torch.matmul(query,
@@ -16,6 +16,7 @@
 
 import torch
 from bigdl.llm.utils.common import invalidInputError
+from bigdl.llm.transformers.utils import get_ipex_version
 
 
 def init_kv_cache(batch_size, num_heads, head_dim, current_length, max_length, dtype, device):
@@ -119,3 +120,61 @@ def is_enough_kv_cache_room_4_31(past_key_value):
     # to determinate if is enough kv cache room in transformers between 4.31 and 4.35
     return past_key_value is not None and \
         past_key_value[0].stride()[1] > past_key_value[0].size(2) * past_key_value[0].size(3)
+
+
+def use_flash_attention(query):
+    if query.dim() == 3:
+        bsz, q_len, _ = query.size()
+    elif query.dim() == 4:
+        bsz, _, q_len, _ = query.size()
+    # check whether ipex flash attention can be used
+    if bsz > 1:
+        # only use flash attention for batch_size = 1 now,
+        # as flash attention doesn't support attn_mask in ipex 2.1
+        # and would produce wrong output for padded batch input
+        return False
+    if q_len == 1:
+        # only use flash attention for the first token for now,
+        # as it seems to have no performance benefit for rest tokens
+        return False
+    if query.device.type != "xpu":
+        # ipex flash attention only supports xpu
+        return False
+    ipex_version = get_ipex_version()
+    if ipex_version <= "2.0.110+xpu":
+        # ipex flash attention is supported from ipex 2.1
+        return False
+    if not torch.xpu.has_xetla():
+        # ipex flash attention is only supported for xetla
+        # may update this later
+        return False
+    if query.dtype not in [torch.float32, torch.float16]:
+        # only use flash attention for fp32/fp16 input
+        return False
+    return True
+
+
+def use_esimd_sdp(q_len, head_dim, query_states):
+    if head_dim != 128:
+        # esimd_sdp only supports head_dim = 128 now
+        return False
+    elif q_len != 1:
+        # esimd_sdp only supports rest tokens now
+        return False
+    elif query_states.device.type != "xpu":
+        # esimd_sdp only supports GPU now
+        return False
+    elif query_states.dtype != torch.float16:
+        # esimd_sdp only has an optimization for FP16 now
+        return False
+    else:
+        device_name = torch.xpu.get_device_name(query_states.device.index)
+        if device_name.startswith("Intel(R) Arc(TM) A") or \
+                device_name.startswith("Intel(R) Data Center GPU Flex"):
+            import linear_fp16_esimd
+            if hasattr(linear_fp16_esimd, "sdp_forward"):
+                return True
+            else:
+                return False
+        else:
+            return False
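Compared with the version removed from the LLaMA module, the relocated use_flash_attention also accepts a 4-D [batch, heads, seq, head_dim] query in addition to the 3-D [batch, seq, hidden] layout, which is what lets the core_attn_forward_8eb45c path above pass its permuted query directly. As a rough illustration of how a caller might combine the two gates, here is a hypothetical dispatcher; sdp_dispatch and its fallback path are sketches under assumed shapes and are not part of this commit, only the two imported helpers are:

import torch
import torch.nn.functional as F

from bigdl.llm.transformers.models.utils import use_flash_attention, use_esimd_sdp


def sdp_dispatch(query, key, value, attention_mask):
    # Hypothetical helper: query/key/value assumed to be [batch, heads, seq, head_dim].
    _, _, q_len, head_dim = query.size()
    if attention_mask is None and use_flash_attention(query):
        # first-token (prefill) path on XPU: let SDPA/IPEX pick a flash kernel
        return F.scaled_dot_product_attention(query, key, value, is_causal=True)
    if use_esimd_sdp(q_len, head_dim, query):
        # rest-token (decode) path on Arc/Flex in FP16: the commit only adds this gate;
        # the actual linear_fp16_esimd.sdp_forward call is made in the model code and is
        # not shown in this diff, so this sketch simply falls through.
        pass
    # generic fallback: plain scaled dot-product attention with an optional additive mask
    scale = 1.0 / (head_dim ** 0.5)
    scores = torch.matmul(query, key.transpose(-2, -1)) * scale
    if attention_mask is not None:
        scores = scores + attention_mask
    weights = torch.softmax(scores, dim=-1)
    return torch.matmul(weights, value)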