LLM: fix qwen-vl interpolation gpu abnormal results. (#10457)
* fix qwen-vl interpolation gpu abnormal results.
* fix style.
* update qwen-vl gpu example.
* fix comment and update example.
* fix style.
This commit is contained in:
parent e9055c32f9
commit 463a86cd5d

4 changed files with 82 additions and 21 deletions
In the Qwen-VL GPU example, the manual CPU-interpolation hooks are removed and the modules excluded from 4-bit conversion are kept in float32 instead:

@@ -43,18 +43,9 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  trust_remote_code=True,
-                                                 modules_to_not_convert=['c_fc', 'out_proj'])
+                                                 modules_to_not_convert=['c_fc', 'out_proj'],
+                                                 torch_dtype=torch.float32)
     model = model.to('xpu')
 
-    # Due to issue https://github.com/intel/intel-extension-for-pytorch/issues/454,
-    # currently put interpolation execution into cpu
-    def to_cpu(module, input, output):
-        return output.to("cpu")
-
-    def to_xpu(module, input):
-        return (input[0].to("xpu"),)
-
-    model.transformer.visual.ln_pre.register_forward_hook(to_cpu)
-    model.transformer.visual.transformer.register_forward_pre_hook(to_xpu)
     # Specify hyperparameters for generation (No need to do this if you are using transformers>=4.32.0)
     model.generation_config = GenerationConfig.from_pretrained(model_path, trust_remote_code=True)
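For context, the deleted lines were an instance of PyTorch's generic forward-hook device-routing pattern. Below is a minimal, self-contained sketch of that pattern on a toy module; the Toy class and the accel_device fallback are illustrative stand-ins, not Qwen-VL's real structure (the real script hooked model.transformer.visual.ln_pre and model.transformer.visual.transformer and used "xpu"):

import torch
import torch.nn as nn

# On a machine with intel_extension_for_pytorch, accel_device would be
# "xpu"; we fall back to "cpu" so this sketch runs anywhere.
accel_device = "xpu" if hasattr(torch, "xpu") and torch.xpu.is_available() else "cpu"

class Toy(nn.Module):
    # Stand-in for the visual tower: `pre` plays ln_pre, `body` plays
    # the inner transformer block.
    def __init__(self):
        super().__init__()
        self.pre = nn.LayerNorm(8)
        self.body = nn.Linear(8, 8)

    def forward(self, x):
        return self.body(self.pre(x))

model = Toy().to(accel_device)

# A forward hook that returns a value replaces the module's output,
# so whatever runs between `pre` and `body` sees CPU tensors.
def to_cpu(module, input, output):
    return output.to("cpu")

# A forward pre-hook that returns a tuple replaces the module's input,
# shipping it back to the accelerator before `body` executes.
def to_accel(module, input):
    return (input[0].to(accel_device),)

model.pre.register_forward_hook(to_cpu)
model.body.register_forward_pre_hook(to_accel)

print(model(torch.randn(2, 8, device=accel_device)).shape)  # torch.Size([2, 8])

With this commit the same routing happens inside the library (see the get_abs_pos hunk below), so user scripts no longer need the hooks.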
The same hooks are removed from the second Qwen-VL GPU example (sym_int4 low-bit loading):

@@ -47,16 +47,6 @@ if __name__ == '__main__':
                                               low_bit='sym_int4',
                                               modules_to_not_convert=['c_fc', 'out_proj'])
     model = model.to('xpu')
 
-    # Due to issue https://github.com/intel/intel-extension-for-pytorch/issues/454,
-    # currently put interpolation execution into cpu
-    def to_cpu(module, input, output):
-        return output.to("cpu")
-
-    def to_xpu(module, input):
-        return (input[0].to("xpu"),)
-
-    model.transformer.visual.ln_pre.register_forward_hook(to_cpu)
-    model.transformer.visual.transformer.register_forward_pre_hook(to_xpu)
     # Specify hyperparameters for generation (No need to do this if you are using transformers>=4.32.0)
     model.generation_config = GenerationConfig.from_pretrained(model_path, trust_remote_code=True)
In ggml_convert_low_bit, the library now patches Qwen-VL's visual modules itself, so user scripts no longer carry the workaround:

@@ -689,6 +689,23 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
 
     if optimize_model:
         model = _optimize_post(model, lightweight_bmm)
 
+    if model.config.model_type == "qwen" and hasattr(model.config, "visual"):
+        # for Qwen-VL-Chat
+        # Due to issue https://github.com/intel/intel-extension-for-pytorch/issues/454,
+        # currently put interpolation execution into cpu
+        visual_module_name = model.transformer.visual.__class__.__module__
+        visual_module = importlib.import_module(visual_module_name)
+        from bigdl.llm.transformers.models.qwen_vl import qwen_vl_vision_transformer_forward
+        from bigdl.llm.transformers.models.qwen_vl import qwen_vl_resampler_forward
+        convert_forward(model,
+                        visual_module.VisionTransformer,
+                        qwen_vl_vision_transformer_forward
+                        )
+        convert_forward(model,
+                        visual_module.Resampler,
+                        qwen_vl_resampler_forward
+                        )
+
     return model
 
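convert_forward is the library's existing helper for swapping in a patched forward; the hunk reuses it for the two visual classes. As a rough illustration of the mechanism (a hypothetical sketch, not BigDL's actual implementation), such a helper can rebind the forward of every matching submodule:

import types
import torch.nn as nn

def convert_forward_sketch(model: nn.Module, target_cls, new_forward):
    """Bind new_forward as the forward of every target_cls instance."""
    for module in model.modules():
        if isinstance(module, target_cls):
            # MethodType makes `self` resolve to this module instance.
            module.forward = types.MethodType(new_forward, module)

Resolving visual_module through model.transformer.visual.__class__.__module__ matters because Qwen-VL is loaded with trust_remote_code: its VisionTransformer and Resampler classes live in a dynamically downloaded module rather than in transformers itself, so they cannot be imported by a fixed path.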
In bigdl.llm.transformers.models.qwen_vl, a new get_abs_pos helper stages the positional-embedding interpolation on CPU:

@@ -47,6 +47,27 @@ def apply_rotary_pos_emb(t, freqs):
     return torch.cat((t_, t_pass_), dim=-1).type_as(t)
 
 
+def get_abs_pos(abs_pos, tgt_size):
+    # abs_pos: L, C
+    # tgt_size: M
+    # return: M, C
+    src_size = int(math.sqrt(abs_pos.size(0)))
+    tgt_size = int(math.sqrt(tgt_size))
+    dtype = abs_pos.dtype
+
+    if src_size != tgt_size:
+        # Due to issue https://github.com/intel/intel-extension-for-pytorch/issues/454,
+        # currently put interpolation execution into cpu
+        return F.interpolate(
+            abs_pos.to("cpu").float().reshape(1, src_size, src_size, -1).permute(0, 3, 1, 2),
+            size=(tgt_size, tgt_size),
+            mode="bicubic",
+            align_corners=False,
+        ).permute(0, 2, 3, 1).flatten(0, 2).to(dtype=dtype).to(abs_pos.device)
+    else:
+        return abs_pos
+
+
 def qwen_attention_forward_vl(
     self,
     hidden_states: Optional[Tuple[torch.FloatTensor]],
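The helper's numerics are easy to check in isolation. A self-contained run of the same CPU-staged bicubic resize with toy sizes (a 16x16 positional grid widened to 24x24, 64 channels; all values illustrative):

import math
import torch
import torch.nn.functional as F

abs_pos = torch.randn(16 * 16, 64)     # L = 256 positions, C = 64 channels
tgt_len = 24 * 24                      # target sequence length M = 576

src = int(math.sqrt(abs_pos.size(0)))  # 16
tgt = int(math.sqrt(tgt_len))          # 24

# (L, C) -> (1, C, src, src) -> bicubic resize on CPU -> (M, C)
out = F.interpolate(
    abs_pos.to("cpu").float().reshape(1, src, src, -1).permute(0, 3, 1, 2),
    size=(tgt, tgt), mode="bicubic", align_corners=False,
).permute(0, 2, 3, 1).flatten(0, 2)

print(out.shape)  # torch.Size([576, 64]) -- M x C, matching the comment above

Before this commit the interpolation ran on the XPU tensor directly, which triggered the abnormal results the title refers to (IPEX issue 454); staging it on CPU and casting back via .to(dtype=dtype).to(abs_pos.device) sidesteps the bug.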
The two replacement forwards route every positional-embedding resize through get_abs_pos:

@@ -151,3 +172,45 @@ def qwen_attention_forward_vl(
         outputs += (attn_weight,)
 
     return outputs
+
+
+def qwen_vl_resampler_forward(self, x, attn_mask=None):
+
+    pos_embed = get_abs_pos(self.pos_embed, x.size(1))
+
+    x = self.kv_proj(x)
+    x = self.ln_kv(x).permute(1, 0, 2)
+
+    N = x.shape[1]
+    q = self.ln_q(self.query)
+    out = self.attn(
+        self._repeat(q, N) + self.pos_embed.unsqueeze(1),
+        x + pos_embed.unsqueeze(1),
+        x,
+        attn_mask=attn_mask)[0]
+    return out.permute(1, 0, 2)
+
+
+def qwen_vl_vision_transformer_forward(self, x: torch.Tensor):
+    x = x.to(
+        dtype=self.transformer.get_cast_dtype(),
+        device=self.transformer.get_cast_device(),
+    )
+    # to patches
+    x = self.conv1(x)  # shape = [*, width, grid, grid]
+    x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
+    x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
+
+    x = x + get_abs_pos(self.positional_embedding, x.size(1))
+
+    x = self.ln_pre(x)
+
+    x = x.permute(1, 0, 2)  # NLD -> LND
+    x = self.transformer(x)
+    x = x.permute(1, 0, 2)  # LND -> NLD
+
+    x = self.attn_pool(x)
+    x = self.ln_post(x)
+    x = x @ self.proj
+
+    return x
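Both patched forwards permute between batch-first (NLD) and sequence-first (LND) layouts because torch.nn.MultiheadAttention expects sequence-first tensors by default (assuming, as in upstream Qwen-VL, that the resampler's self.attn is an nn.MultiheadAttention). A toy sketch of that cross-attention shape flow, with illustrative sizes rather than Qwen-VL's real dimensions:

import torch
import torch.nn as nn

N, L, E, Q = 2, 576, 64, 256                 # batch, kv length, width, queries
attn = nn.MultiheadAttention(E, num_heads=8) # batch_first=False: wants (L, N, E)

x = torch.randn(N, L, E).permute(1, 0, 2)    # NLD -> LND, as in ln_kv(...).permute
q = torch.randn(Q, 1, E).repeat(1, N, 1)     # learned queries repeated per batch,
                                             # mirroring self._repeat(q, N)

out, _ = attn(q, x, x)                       # queries cross-attend to the kv stream
print(out.permute(1, 0, 2).shape)            # LND -> NLD: torch.Size([2, 256, 64])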