Support vpm and resampler modules of minicpm-v on NPU (#12375)
parent 85c9279e6e
commit 7a97fbb779

5 changed files with 592 additions and 15 deletions

@@ -633,7 +633,7 @@ def transformers_int4_npu_win(repo_id,
         model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=optimize_model,
                                           trust_remote_code=True, use_cache=True, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
                                           quantization_group_size=npu_group_size, transpose_value_cache=transpose_value_cache,
-                                          attn_implementation="eager", modules_to_not_convert=["vpm", "resampler"]).eval()
+                                          attn_implementation="eager", torch_dtype=torch.float16).eval()
         model = model.llm
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:

@@ -56,7 +56,7 @@ if __name__ == "__main__":

     model = AutoModelForCausalLM.from_pretrained(
         model_path,
-        torch_dtype=torch.float32,
+        torch_dtype=torch.float16,
         trust_remote_code=True,
         attn_implementation="eager",
         load_in_low_bit="sym_int4",

@@ -66,7 +66,6 @@ if __name__ == "__main__":
         intra_pp=args.intra_pp,
         inter_pp=args.inter_pp,
         transpose_value_cache=not args.disable_transpose_value_cache,
-        modules_to_not_convert=['vpm', 'resampler']
     )
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
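
Note: taken together, the two hunks above switch this example to float16 and drop the explicit exclusion of the vision modules, since vpm and resampler are now converted for NPU as well. A minimal sketch of the resulting load call, based only on the arguments visible in this diff; the import path and model path are assumptions, and the extra NPU pipeline arguments from the second hunk (intra_pp, inter_pp, transpose_value_cache) are omitted:

    import torch
    from transformers import AutoTokenizer
    from ipex_llm.transformers.npu_model import AutoModelForCausalLM  # assumed import for the NPU examples

    model_path = "path/to/MiniCPM-V-checkpoint"  # placeholder

    # 'vpm' and 'resampler' no longer need to be listed in modules_to_not_convert;
    # the whole model is loaded in float16 and optimized for NPU.
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        trust_remote_code=True,
        attn_implementation="eager",
        load_in_low_bit="sym_int4",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)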

@@ -47,7 +47,7 @@ if __name__ == '__main__':
     image_path = args.image_url_or_path

     model = AutoModel.from_pretrained(model_path,
-                                      torch_dtype=torch.float32,
+                                      torch_dtype=torch.float16,
                                       trust_remote_code=True,
                                       attn_implementation="eager",
                                       load_in_low_bit="sym_int4",

@@ -57,8 +57,7 @@ if __name__ == '__main__':
                                       intra_pp=args.intra_pp,
                                       inter_pp=args.inter_pp,
                                       transpose_value_cache=not args.disable_transpose_value_cache,
-                                      modules_to_not_convert=['vpm', 'resampler']
-                                      )
+                                      )
     tokenizer = AutoTokenizer.from_pretrained(model_path,
                                               trust_remote_code=True)
     model.eval()

@@ -46,12 +46,7 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision,
         from ipex_llm.transformers.models.baichuan import pre_compute_inv_freq
         model.apply(pre_compute_inv_freq)

-    # MiniCPM-V 2.6 must put lm_head on CPU now
-    cpu_lm_head = (
-        (model.config.model_type == "minicpmv" and model.config.hidden_size == 3584 and
-         model.config.vocab_size == 151666)
-        or os.environ.get("IPEX_LLM_CPU_LM_HEAD", "0") != "0"
-    )
+    cpu_lm_head = os.environ.get("IPEX_LLM_CPU_LM_HEAD", "0") != "0"

     # workaround for MiniCPM-2B
     if model.config.model_type == "minicpm" and model.config.num_hidden_layers == 40:
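
Note: after this hunk, MiniCPM-V 2.6 no longer forces lm_head onto the CPU; the CPU fallback is now opt-in via the IPEX_LLM_CPU_LM_HEAD environment variable only. A small illustrative snippet (the variable name comes from the diff, everything else is just an example):

    import os

    # Any value other than "0" keeps lm_head on the CPU instead of the padded
    # NPU lm_head path that this PR adds for MiniCPM-V 2.6.
    os.environ["IPEX_LLM_CPU_LM_HEAD"] = "1"
    # ...then load the model, so optimize_llm_pre() sees the variable.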

@@ -76,6 +71,48 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision,

     if model.config.model_type == "minicpmv" and hasattr(model, "llm"):
         # MiniCPM-V
+        # convert conv2d and layernorm
+        from ipex_llm.transformers.npu_models.minicpmv_mp import MinicpmVPatchEmbedding, \
+            replace_with_Layernorm
+        origin_conv = model.vpm.embeddings.patch_embedding
+        new_conv = MinicpmVPatchEmbedding(
+            weight=origin_conv.weight.to(torch.float16),
+            bias=origin_conv.bias.to(torch.float16),
+            strides=model.config.vision_config.patch_size,
+        )
+        model.vpm.embeddings.patch_embedding = new_conv
+        del new_conv
+        replace_with_Layernorm(model, qtype=None, device='NPU',
+                               modules_to_not_convert=[], group_size=0)
+
+        # replace forward function
+        from ipex_llm.transformers.npu_models.minicpmv_mp import pad_mlp_fc2, pad_mlp_forward, \
+            encoder_attn_forward, multi_head_attn_forward, resampler_forward
+        model.apply(pad_mlp_fc2)  # pad mlp.fc2 to avoid compile error
+        modeling_module_name = model.__class__.__module__
+        module = importlib.import_module(modeling_module_name)
+        setattr(module.Resampler, "forward", resampler_forward)
+        module = importlib.import_module(modeling_module_name.replace("modeling_minicpmv",
+                                                                      "resampler"))
+        setattr(module.MultiheadAttention, "multi_head_attention_forward", multi_head_attn_forward)
+        if model.config.hidden_size == 3584 and model.config.vocab_size == 151666:
+            # MiniCPM-V 2.6
+            module = importlib.import_module(modeling_module_name.replace("modeling_minicpmv",
+                                                                          "modeling_navit_siglip"))
+            setattr(module.SiglipAttention, "forward", encoder_attn_forward)
+            setattr(module.SiglipMLP, "forward", pad_mlp_forward)
+
+            # workaround for lm_head on NPU
+            from ipex_llm.transformers.npu_models.minicpmv_mp import pad_lm_head, lm_head_forward
+            model.apply(pad_lm_head)  # pad lm_head to avoid compile error
+            setattr(model.llm.lm_head, "forward", lm_head_forward)
+        elif model.config.hidden_size == 4096 and model.config.vocab_size == 128256:
+            # MiniCPM-V 2.5
+            from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionMLP, \
+                Idefics2VisionAttention
+            convert_forward(model, Idefics2VisionAttention, encoder_attn_forward)
+            convert_forward(model, Idefics2VisionMLP, pad_mlp_forward)
+
         if model.config.hidden_size == 2304 and model.config.vocab_size == 122753:
             # MiniCPM-V 2
             model.llm.config.model_type = "minicpm"

@@ -126,9 +163,9 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision,
             model.lm_head = new_lm_head

     if model.config.model_type == "qwen2":
-        # for Qwen2-7B-Insturct, divide lm_head into 14 parts
-        if model.config.hidden_size == 3584 and model.config.vocab_size == 152064 and \
-                not cpu_lm_head:
+        # for Qwen2-7B-Insturct and MiniCPM-V 2.6, divide lm_head into 14 parts
+        if model.config.hidden_size == 3584 and (model.config.vocab_size == 152064 or
+                                                 model.config.vocab_size == 151666) and not cpu_lm_head:
             # Do not split lm_head and use sym_int8 instead when mixed_precison is True
             if quantization_group_size == 0:
                 # Do not split lm_head and use sym_int8 instead when mixed_precison is True

python/llm/src/ipex_llm/transformers/npu_models/minicpmv_mp.py  (new file, 542 additions)

@@ -0,0 +1,542 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Some parts of this file is adapted from
# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/idefics2/modeling_idefics2.py
# which is licensed under Apache License 2.0:
#
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Some parts of this file is adapted from
# https://huggingface.co/openbmb/MiniCPM-V-2_6/blob/main/resampler.py
# which is licensed under Apache License 2.0:
#
# Copyright 2024 OpenBMB

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
from torch import nn
import torch.nn.functional as F
from typing import Optional, Tuple
from ipex_llm.utils.common.log4Error import invalidInputError
from torch import Tensor
import warnings
from torch.nn.functional import *
from torch.nn.modules.activation import *
from intel_npu_acceleration_library.backend.factory import NNFactory
import numpy as np
from functools import partial
import uuid
from ipex_llm.transformers.npu_models.mp_models_base import run_model
from ipex_llm.transformers.npu_models.convert import module_optimization


class MinicpmVConv2d(NNFactory):
    def __init__(
        self,
        input_shape,
        weight_shape,
        bias,
        strides,
        padding,
        dilation,
        groups,
        device: str = "NPU",
    ):
        super().__init__(False, device)

        # define input
        input = self.parameter(input_shape, dtype=np.float16)
        weight = self.parameter(weight_shape, dtype=np.float16)
        if bias is not None:
            bias_node = self.parameter((1, weight_shape[0], 1, 1), dtype=np.float16)
        else:
            bias_node = None

        input = self.concat(input, input, axis=2)  # current workaround for compile error
        res = self.convolution(input_node=input,
                               weights_node=weight,
                               bias=bias_node,
                               strides=strides,
                               padding=padding,
                               dilation=dilation,
                               groups=groups)
        res = self.slice(res, begin=[0, 0, 0, 0],
                         end=[res.shape[0], res.shape[1], 1, res.shape[3]])
        # define outputs
        res = self.convert_to_fp16(res)

        print("start compiling")
        self.compile()


class MinicpmVPatchEmbedding(torch.nn.Module):
    def __init__(
        self,
        weight,
        bias,
        strides=1,
        padding=0,
        dilation=1,
        groups=1,
    ):
        super().__init__()

        self.op_id = str(uuid.uuid4())
        self.parameters = [weight]
        if bias is not None:
            self.parameters.append(bias)
        self.backend_cls = partial(
            MinicpmVConv2d,
            weight_shape=weight.shape,
            bias=bias,
            strides=strides,
            padding=padding,
            dilation=dilation,
            groups=groups,
        )

    def forward(self, x):
        x = x.to(torch.float16)
        return run_model(x, self.parameters, self.backend_cls, self.op_id)
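
Note: MinicpmVPatchEmbedding is what optimize_llm_pre() (in the hunk above) swaps in for the vision tower's Conv2d patch embedding. A sketch of that call site, mirroring the diff; model is assumed to be an already-loaded MiniCPM-V model:

    import torch

    origin_conv = model.vpm.embeddings.patch_embedding  # the original torch.nn.Conv2d
    model.vpm.embeddings.patch_embedding = MinicpmVPatchEmbedding(
        weight=origin_conv.weight.to(torch.float16),
        bias=origin_conv.bias.to(torch.float16),
        strides=model.config.vision_config.patch_size,  # stride taken from the patch size, as in the diff
    )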


class LayerNorm(NNFactory):
    def __init__(
        self,
        input_shape,
        weight_shape,
        bias_shape,
        eps,
        device: str = "NPU",
    ):
        super().__init__(False, device)

        # define input
        input = self.parameter(input_shape, dtype=np.float16)
        weight = self.parameter(weight_shape, dtype=np.float16)
        bias = self.parameter(bias_shape, dtype=np.float16)

        input = self.convert_to_fp32(input)
        mean_res = self.reduce_mean(input, -1, keep_dims=True,)
        variance = self.reduce_mean(
            self.power(input - mean_res, self.constant(np.array([[2]], dtype=np.float32))),
            -1,
            keep_dims=True,
        )
        eps = self.constant(eps)
        input = self.eltwise_div(input - mean_res, self.sqrt(self.eltwise_add(variance, eps)))
        weight = self.convert_to_fp32(weight)
        input = self.eltwise_mul(weight, input)
        bias = self.convert_to_fp32(bias)
        input = self.eltwise_add(bias, input)

        # define outputs
        input = self.convert_to_fp16(input)

        print("start compiling")
        self.compile()


class MinicpmVLayerNorm(torch.nn.Module):
    def __init__(
        self,
        weight,
        bias,
        eps=1e-6,
    ):
        super().__init__()
        self.op_id = str(uuid.uuid4())
        self.parameters = [weight, bias]
        self.backend_cls = partial(
            LayerNorm,
            weight_shape=weight.shape,
            bias_shape=bias.shape,
            eps=eps,
        )

    def forward(self, x):
        x = x.to(torch.float16)
        return run_model(x, self.parameters, self.backend_cls, self.op_id)


@module_optimization
def replace_with_Layernorm(layer, qtype=None, device='NPU',
                           modules_to_not_convert=[], group_size=0):
    if isinstance(layer, torch.nn.LayerNorm):
        return MinicpmVLayerNorm(
            weight=layer.weight.to(torch.float16),
            bias=layer.bias.to(torch.float16),
        )
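
Note: replace_with_Layernorm is wrapped in ipex-llm's module_optimization decorator, so (as it is used in optimize_llm_pre above) it is applied across the model's submodules and every torch.nn.LayerNorm it visits is swapped for the NPU-backed MinicpmVLayerNorm. The call, copied from the diff:

    replace_with_Layernorm(model, qtype=None, device='NPU',
                           modules_to_not_convert=[], group_size=0)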


def pad_mlp_fc2(module: torch.nn.Module):
    if hasattr(module, 'fc2') and module.fc2.in_features == 4304:
        new_linear = torch.nn.Linear(0, 0, bias=True)
        padded_weight = torch.cat((module.fc2.weight, module.fc2.weight[:, :(1152*4-4304)]), dim=1)
        new_weight = torch.nn.Parameter(padded_weight, requires_grad=False)
        new_linear.weight = new_weight
        new_linear.bias = module.fc2.bias
        new_linear.in_features = new_weight.size(1)
        new_linear.out_features = new_weight.size(0)
        module.fc2 = new_linear
        del new_linear


def pad_mlp_forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
    hidden_states = self.fc1(hidden_states)
    hidden_states = self.activation_fn(hidden_states)
    hidden_states = F.pad(hidden_states,
                          (0, (1152*4-4304), 0, 0, 0, 0))
    hidden_states = self.fc2(hidden_states)
    return hidden_states
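
Note: the two helpers above keep the vision MLP numerically unchanged while giving fc2 an NPU-friendly width: the input dimension is padded from 4304 to 1152*4 = 4608 by appending a copy of the first 304 weight columns, and the activations are padded with 304 zeros, so the duplicated columns only ever multiply zeros. A small self-contained check of that identity (toy weights, double precision for an exact comparison):

    import torch
    import torch.nn.functional as F

    w = torch.randn(16, 4304, dtype=torch.float64)      # stand-in for fc2.weight
    x = torch.randn(2, 7, 4304, dtype=torch.float64)    # stand-in for the activations
    w_pad = torch.cat((w, w[:, :1152 * 4 - 4304]), dim=1)
    x_pad = F.pad(x, (0, 1152 * 4 - 4304, 0, 0, 0, 0))
    assert torch.allclose(x @ w.t(), x_pad @ w_pad.t())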


def pad_lm_head(module: torch.nn.Module):
    if hasattr(module, 'lm_head') and module.lm_head.in_features == 3584 \
            and module.lm_head.out_features == 151666:
        new_linear = torch.nn.Linear(0, 0, bias=False)
        padded_weight = F.pad(module.lm_head.weight,
                              (0, 0, 0, 152064-151666))  # 152064 is qwen2-7b vocab_size
        new_weight = torch.nn.Parameter(padded_weight, requires_grad=False)
        new_linear.weight = new_weight
        new_linear.in_features = new_weight.size(1)
        new_linear.out_features = new_weight.size(0)
        module.lm_head = new_linear
        del new_linear


def lm_head_forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
    hidden_states = self(hidden_states)
    hidden_states = hidden_states[:, :, :151666]
    return hidden_states
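
Note: pad_lm_head and lm_head_forward play the same padding trick for the MiniCPM-V 2.6 language head: the 151666-row vocabulary projection is zero-padded to 152064 rows (the Qwen2-7B vocabulary size already handled by the NPU lm_head path), and the extra logits are sliced off again, so callers still see 151666 entries. A toy-sized illustration of why the slice recovers the original output:

    import torch
    import torch.nn.functional as F

    w = torch.randn(10, 4, dtype=torch.float64)   # stands in for the (151666, 3584) lm_head weight
    w_pad = F.pad(w, (0, 0, 0, 3))                # append zero rows, like 152064 - 151666 in pad_lm_head
    x = torch.randn(2, 4, dtype=torch.float64)
    assert torch.allclose((x @ w_pad.t())[:, :10], x @ w.t())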


def encoder_attn_forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    """Input shape: Batch x Time x Channel"""

    batch_size, q_len, _ = hidden_states.size()

    query_states = self.q_proj(hidden_states)
    key_states = self.k_proj(hidden_states)
    value_states = self.v_proj(hidden_states)

    query_states = query_states.view(batch_size, q_len,
                                     self.num_heads, self.head_dim).transpose(1, 2)
    key_states = key_states.view(batch_size, q_len,
                                 self.num_heads, self.head_dim).transpose(1, 2)
    value_states = value_states.view(batch_size, q_len,
                                     self.num_heads, self.head_dim).transpose(1, 2)

    k_v_seq_len = key_states.shape[-2]
    # ipex-llm change starts
    attn_weights = torch.matmul(query_states.float(),
                                key_states.float().transpose(2, 3)) * self.scale
    # ipex-llm change ends

    if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len):
        invalidInputError(False,
                          f"Attention weights should be of size ({batch_size, self.num_heads, }"
                          f"{q_len, k_v_seq_len}), but is {attn_weights.size()}")

    if attention_mask is not None:
        invalidInputError(attention_mask.size() == (batch_size, 1, q_len, k_v_seq_len),
                          f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}"
                          f", but is {attention_mask.size()}")
        attn_weights = attn_weights + attention_mask

    # upcast attention to fp32
    attn_weights = nn.functional.softmax(attn_weights, dim=-1,
                                         dtype=torch.float32).to(query_states.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
    # ipex-llm change starts
    attn_output = torch.matmul(attn_weights.float(), value_states.float())
    # ipex-llm change ends

    invalidInputError(attn_output.size() == (batch_size, self.num_heads, q_len, self.head_dim),
                      f"`attn_output` should be of size ({batch_size, self.num_heads, }"
                      f"{q_len, self.head_dim}), but is {attn_output.size()}")

    attn_output = attn_output.transpose(1, 2).contiguous()
    attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)
    attn_output = self.out_proj(attn_output)

    return attn_output, attn_weights


def _in_projection_packed(
    q: Tensor,
    k: Tensor,
    v: Tensor,
    w: Tensor,
    b: Optional[Tensor] = None,
) -> List[Tensor]:
    w_q, w_k, w_v = w.chunk(3)
    if b is None:
        b_q = b_k = b_v = None
    else:
        b_q, b_k, b_v = b.chunk(3)
    return linear(q.float(), w_q.float(), b_q.float()), \
        linear(k.float(), w_k.float(), b_k.float()), \
        linear(v.float(), w_v.float(), b_v.float())


def multi_head_attn_forward(
    self,
    query: Tensor,
    key: Tensor,
    value: Tensor,
    embed_dim_to_check: int,
    num_heads: int,
    in_proj_weight: Optional[Tensor],
    in_proj_bias: Optional[Tensor],
    bias_k: Optional[Tensor],
    bias_v: Optional[Tensor],
    add_zero_attn: bool,
    dropout_p: float,
    out_proj_weight: Tensor,
    out_proj_bias: Optional[Tensor],
    training: bool = True,
    key_padding_mask: Optional[Tensor] = None,
    need_weights: bool = True,
    attn_mask: Optional[Tensor] = None,
    use_separate_proj_weight: bool = False,
    q_proj_weight: Optional[Tensor] = None,
    k_proj_weight: Optional[Tensor] = None,
    v_proj_weight: Optional[Tensor] = None,
    static_k: Optional[Tensor] = None,
    static_v: Optional[Tensor] = None,
    average_attn_weights: bool = True,
    is_causal: bool = False,
) -> Tuple[Tensor, Optional[Tensor]]:
    # port from https://huggingface.co/openbmb/MiniCPM-V-2_6/blob/main/resampler.py#L338
    # to solve conflict of fp16 and fp32 dtype
    is_batched = True if query.dim() == 3 else False

    # For unbatched input, we unsqueeze at the expected batch-dim to pretend that the input
    # is batched, run the computation and before returning squeeze the
    # batch dimension so that the output doesn't carry this temporary batch dimension.
    if not is_batched:
        # unsqueeze if the input is unbatched
        query = query.unsqueeze(1)
        key = key.unsqueeze(1)
        value = value.unsqueeze(1)
        if key_padding_mask is not None:
            key_padding_mask = key_padding_mask.unsqueeze(0)

    # set up shape vars
    tgt_len, bsz, embed_dim = query.shape
    src_len, _, _ = key.shape

    if isinstance(embed_dim, torch.Tensor):
        # embed_dim can be a tensor when JIT tracing
        head_dim = embed_dim.div(num_heads, rounding_mode='trunc')
    else:
        head_dim = embed_dim // num_heads

    # compute in-projection
    q, k, v = _in_projection_packed(query, key, value, in_proj_weight, in_proj_bias)

    # prep attention mask
    if attn_mask is not None:
        # ensure attn_mask's dim is 3
        if attn_mask.dim() == 2:
            correct_2d_size = (tgt_len, src_len)
            invalidInputError(attn_mask.shape == correct_2d_size,
                              f"The shape of the 2D attn_mask is {attn_mask.shape},"
                              f"but should be {correct_2d_size}.")
            attn_mask = attn_mask.unsqueeze(0)
        elif attn_mask.dim() == 3:
            correct_3d_size = (bsz * num_heads, tgt_len, src_len)
            invalidInputError(attn_mask.shape == correct_3d_size,
                              f"The shape of the 3D attn_mask is {attn_mask.shape},"
                              f" but should be {correct_3d_size}.")
        else:
            invalidInputError(False, f"attn_mask's dimension {attn_mask.dim()} is not supported")

    # add bias along batch dimension (currently second)
    if bias_k is not None and bias_v is not None:
        k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
        v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
        if attn_mask is not None:
            attn_mask = pad(attn_mask, (0, 1))
        if key_padding_mask is not None:
            key_padding_mask = pad(key_padding_mask, (0, 1))

    #
    # reshape q, k, v for multi head attention and make em batch first
    #
    q = q.view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
    if static_k is None:
        k = k.view(k.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
    else:
        k = static_k
    if static_v is None:
        v = v.view(v.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
    else:
        v = static_v

    # add zero attention along batch dimension (now first)
    if add_zero_attn:
        zero_attn_shape = (bsz * num_heads, 1, head_dim)
        k = torch.cat([k, torch.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=1)
        v = torch.cat([v, torch.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], dim=1)
        if attn_mask is not None:
            attn_mask = pad(attn_mask, (0, 1))
        if key_padding_mask is not None:
            key_padding_mask = pad(key_padding_mask, (0, 1))

    # update source sequence length after adjustments
    src_len = k.size(1)

    # merge key padding and attention masks
    if key_padding_mask is not None:
        key_padding_mask = key_padding_mask.view(bsz, 1, 1, src_len). \
            expand(-1, num_heads, -1, -1).reshape(bsz * num_heads, 1, src_len)
        if attn_mask is None:
            attn_mask = key_padding_mask
        else:
            attn_mask = attn_mask + key_padding_mask

    # adjust dropout probability
    if not training:
        dropout_p = 0.0

    # (deep breath) calculate attention and out projection
    if need_weights:
        B, Nt, E = q.shape
        q_scaled = q / math.sqrt(E)

        if attn_mask is not None:
            attn_output_weights = torch.baddbmm(attn_mask.float(),
                                                q_scaled.float(), k.transpose(-2, -1))
        else:
            attn_output_weights = torch.bmm(q_scaled, k.transpose(-2, -1))
        attn_output_weights = softmax(attn_output_weights, dim=-1)
        if dropout_p > 0.0:
            attn_output_weights = dropout(attn_output_weights, p=dropout_p)

        attn_output = torch.bmm(attn_output_weights.float(), v.float())

        attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len * bsz, embed_dim)
        attn_output = self.out_proj(attn_output)
        attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))

        # optionally average attention weights over heads
        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
        if average_attn_weights:
            attn_output_weights = attn_output_weights.mean(dim=1)

        if not is_batched:
            # squeeze the output if input was unbatched
            attn_output = attn_output.squeeze(1)
            attn_output_weights = attn_output_weights.squeeze(0)
        return attn_output, attn_output_weights
    else:
        # attn_mask can be either (L,S) or (N*num_heads, L, S)
        # if attn_mask's shape is (1, L, S) we need to unsqueeze to (1, 1, L, S)
        # in order to match the input for SDPA of (N, num_heads, L, S)
        if attn_mask is not None:
            if attn_mask.size(0) == 1 and attn_mask.dim() == 3:
                attn_mask = attn_mask.unsqueeze(0)
            else:
                attn_mask = attn_mask.view(bsz, num_heads, -1, src_len)

        q = q.view(bsz, num_heads, tgt_len, head_dim)
        k = k.view(bsz, num_heads, src_len, head_dim)
        v = v.view(bsz, num_heads, src_len, head_dim)

        attn_output = F.scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
        attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim)

        attn_output = self.out_proj(attn_output)
        attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
        if not is_batched:
            # squeeze the output if input was unbatched
            attn_output = attn_output.squeeze(1)
        return attn_output, None


def resampler_forward(self, x, tgt_sizes=None):
    # port from https://huggingface.co/openbmb/MiniCPM-V-2_6/blob/main/resampler.py#L130
    bs = x.shape[0]

    device = x.device
    dtype = x.dtype

    patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1]

    self._adjust_pos_cache(tgt_sizes, device=device)

    max_patch_len = torch.max(patch_len)
    key_padding_mask = torch.zeros((bs, max_patch_len), dtype=torch.bool, device=device)

    pos_embed = []
    for i in range(bs):
        tgt_h, tgt_w = tgt_sizes[i]
        pos_embed.append(self.pos_embed[:tgt_h, :tgt_w, :].reshape((tgt_h * tgt_w, -1)).to(dtype))
        key_padding_mask[i, patch_len[i]:] = True

    pos_embed = torch.nn.utils.rnn.pad_sequence(
        pos_embed, batch_first=True, padding_value=0.0).permute(1, 0, 2)  # BLD => L * B * D

    x = self.kv_proj(x)  # B * L * D
    x = self.ln_kv(x).permute(1, 0, 2)  # L * B * D

    q = self.ln_q(self.query)  # Q * D

    out = self.attn(
        self._repeat(q, bs),  # Q * B * D
        x + pos_embed,  # L * B * D + L * B * D
        x,
        key_padding_mask=key_padding_mask)[0]
    # out: Q * B * D
    x = out.permute(1, 0, 2)  # B * Q * D

    x = self.ln_post(x)
    # ipex-llm change starts
    x = x.float() @ self.proj.float()
    x = x.to(torch.float16)
    # ipex-llm change ends
    return x
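
Note: the '# ipex-llm change' block above exists because, on this NPU path, the resampler activations are float16 while self.proj is presumably still float32, and torch.matmul does not accept mixed dtypes; upcasting both operands and casting the result back keeps the rest of the pipeline in float16. A tiny illustration of the failure mode being avoided (toy shapes, not the real projection):

    import torch

    x = torch.randn(1, 64, 128, dtype=torch.float16)
    proj = torch.randn(128, 128, dtype=torch.float32)
    # x @ proj                                    # would raise a dtype-mismatch RuntimeError
    out = (x.float() @ proj).to(torch.float16)    # what resampler_forward does instead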