From e1809a6295362d49c5581e950c03c920dac1c3e5 Mon Sep 17 00:00:00 2001
From: "Wang, Jian4" <61138589+hzjane@users.noreply.github.com>
Date: Wed, 19 Feb 2025 10:04:42 +0800
Subject: [PATCH] Update multimodal on vllm 0.6.6 (#12816)

* add glm4v and minicpmv example

* fix
---
 .../vllm_offline_inference_vision_language.py | 34 +++++++++++++++++++
 .../src/ipex_llm/vllm/xpu/model_convert.py    |  4 ---
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/docker/llm/serving/xpu/docker/vllm_offline_inference_vision_language.py b/docker/llm/serving/xpu/docker/vllm_offline_inference_vision_language.py
index f860c760..bff9c60e 100644
--- a/docker/llm/serving/xpu/docker/vllm_offline_inference_vision_language.py
+++ b/docker/llm/serving/xpu/docker/vllm_offline_inference_vision_language.py
@@ -6,8 +6,39 @@ import requests
 
 model_path = "/llm/models/MiniCPM-V-2_6"
 model_path = "/llm/models/Qwen2-VL-7B-Instruct"
+model_path = "/llm/models/glm-4v-9b"
+model_path = "/llm/models/InternVL2-8B"
+
 prompt = "What is in the image?"
 
+def run_internvl(question: str, modality: str):
+    assert modality == "image"
+
+    tokenizer = AutoTokenizer.from_pretrained(model_path,
+                                              trust_remote_code=True)
+    messages = [{'role': 'user', 'content': f"<image>\n{question}"}]
+    prompt = tokenizer.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+
+    # Stop tokens for InternVL
+    # model variants may have different stop tokens;
+    # please refer to the model card for the correct "stop words":
+    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
+    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
+    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+    return prompt, stop_token_ids
+
+def run_glm4v(question: str, modality: str):
+    assert modality == "image"
+    model_name = "THUDM/glm-4v-9b"
+
+    prompt = f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\
+        {question}<|assistant|>"
+
+    stop_token_ids = [151329, 151336, 151338]
+    return prompt, stop_token_ids
+
 def run_minicpmv(question, modality):
     assert modality == "image"
     tokenizer = AutoTokenizer.from_pretrained(model_path,
@@ -38,6 +69,9 @@ def run_qwen2_vl(question, modality):
 model_example_map = {
     "minicpmv": run_minicpmv,
     "qwen2_vl": run_qwen2_vl,
+    # only for glm4v
+    "chatglm": run_glm4v,
+    "internvl_chat": run_internvl,
 }
 
 llm = LLM(
diff --git a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py
index 7cce3c38..12963b72 100644
--- a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py
+++ b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py
@@ -103,10 +103,6 @@ def get_load_function(low_bit):
         modules = None
         if "minicpm" in self.vllm_config.model_config.model.lower():
             modules = ["vpm", "resampler"]
-        # only for minicpm_2_6
-        if "minicpm-v" in self.vllm_config.model_config.model.lower():
-            from ipex_llm.transformers.models.minicpmv import merge_qkv
-            self.model.vpm.apply(merge_qkv)
         if "internvl2" in self.vllm_config.model_config.model.lower():
            modules = ["vision_model", "mlp1"]
         if "deepseek-v2" in self.vllm_config.model_config.model.lower():
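
A minimal sketch of how the prompt builders added by this patch are consumed by the rest of vllm_offline_inference_vision_language.py. The model path, image URL, sampling settings, and the plain vllm.LLM constructor below are assumptions for illustration; the actual docker example script may build the engine with IPEX-LLM/XPU-specific arguments.

# Minimal usage sketch (assumed names: model path, image URL, plain vllm.LLM
# constructor). It inlines the prompt template and stop-token ids that
# run_glm4v() in the patch returns, then feeds the prompt plus a PIL image
# to vLLM's multimodal generate() API.
from io import BytesIO

import requests
from PIL import Image
from vllm import LLM, SamplingParams

model_path = "/llm/models/glm-4v-9b"        # assumed local model path
image_url = "https://example.com/demo.jpg"  # assumed: any RGB image URL
question = "What is in the image?"

# Same prompt template and stop-token ids that run_glm4v() returns.
prompt = (f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>"
          f"{question}<|assistant|>")
stop_token_ids = [151329, 151336, 151338]

llm = LLM(model=model_path, trust_remote_code=True)
sampling_params = SamplingParams(temperature=0.0,
                                 max_tokens=128,
                                 stop_token_ids=stop_token_ids)

# Load the image as an RGB PIL image.
image = Image.open(BytesIO(requests.get(image_url).content)).convert("RGB")

# vLLM multimodal input: the text prompt plus the image under the "image" key.
outputs = llm.generate({"prompt": prompt,
                        "multi_modal_data": {"image": image}},
                       sampling_params=sampling_params)
print(outputs[0].outputs[0].text)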