From 16fa778e6531a494103dde0c39d192b75d68fb5a Mon Sep 17 00:00:00 2001
From: "Wang, Jian4" <61138589+hzjane@users.noreply.github.com>
Date: Sun, 27 Apr 2025 17:10:56 +0800
Subject: [PATCH] enable glm4v and gemma-3 on vllm 083 (#13114)

* enable glm4v and gemma-3

* update

* add qwen2.5-vl
---
 .../vllm_offline_inference_vision_language.py | 34 +++++++++++++++++--
 .../src/ipex_llm/vllm/xpu/model_convert.py    |  3 +-
 2 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/docker/llm/serving/xpu/docker/vllm_offline_inference_vision_language.py b/docker/llm/serving/xpu/docker/vllm_offline_inference_vision_language.py
index bff9c60e..5ccaa6ff 100644
--- a/docker/llm/serving/xpu/docker/vllm_offline_inference_vision_language.py
+++ b/docker/llm/serving/xpu/docker/vllm_offline_inference_vision_language.py
@@ -8,9 +8,20 @@ model_path = "/llm/models/MiniCPM-V-2_6"
 model_path = "/llm/models/Qwen2-VL-7B-Instruct"
 model_path = "/llm/models/glm-4v-9b"
 model_path = "/llm/models/InternVL2-8B"
+model_path = "/llm/models/gemma-3-12b-it"
+model_path = "/llm/models/Qwen2.5-VL-7B-Instruct"
 
 prompt = "What is in the image?"
 
+def run_gemma3(question: str, modality: str):
+    assert modality == "image"
+
+    prompt = ("<start_of_turn>user\n"
+              f"<start_of_image>{question}<end_of_turn>\n"
+              "<start_of_turn>model\n")
+    stop_token_ids = None
+    return prompt, stop_token_ids
+
 def run_internvl(question: str, modality: str):
     assert modality == "image"
 
@@ -69,18 +80,35 @@ def run_qwen2_vl(question, modality):
 model_example_map = {
     "minicpmv": run_minicpmv,
     "qwen2_vl": run_qwen2_vl,
+    "qwen2_5_vl": run_qwen2_vl,
     # only for glm4v
     "chatglm": run_glm4v,
     "internvl_chat": run_internvl,
+    "gemma3": run_gemma3,
 }
 
+if "glm-4v" in model_path:
+    hf_override = {"architectures": ["GLM4VForCausalLM"]}
+else:
+    hf_override = None
+
+dtype = "float16"
+if "gemma-3" in model_path:
+    mm_processor_kwarg = {"do_pan_and_scan": True}
+    dtype = "float32"
+else:
+    mm_processor_kwarg = None
+
+
 llm = LLM(
     model=model_path,
     device="xpu",
-    dtype="float16",
+    dtype=dtype,
     enforce_eager=True,
-    load_in_low_bit="fp8",
-    tensor_parallel_size=1,
+    hf_overrides=hf_override,
+    mm_processor_kwargs=mm_processor_kwarg,
+    load_in_low_bit="sym_int4",
+    tensor_parallel_size=2,
     disable_async_output_proc=True,
     distributed_executor_backend="ray",
     max_model_len=4000,
diff --git a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py
index 1237d2b2..7345a0ba 100644
--- a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py
+++ b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py
@@ -77,7 +77,8 @@ def _ipex_llm_convert(load_in_low_bit):
 
 def get_load_function(low_bit):
     def _ipex_llm_load_model(self) -> None:
-        _model_sample_convert()
+        if "gemma-3" not in self.model_config.model.lower():
+            _model_sample_convert()
 
         # from vllm.utils import measure_device_memory
         from vllm.utils import DeviceMemoryProfiler
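
Reviewer note (not part of the patch): the sketch below illustrates how the modified offline script is expected to exercise the newly enabled gemma-3 path, building the prompt the same way as the run_gemma3 helper added above. It is a minimal sketch, assuming the ipex-llm XPU docker image (which provides the load_in_low_bit extension to vllm.LLM), a local checkpoint at /llm/models/gemma-3-12b-it, and a placeholder test image path; both paths are assumptions, not part of the patch.

from PIL import Image
from vllm import LLM, SamplingParams

model_path = "/llm/models/gemma-3-12b-it"   # assumed local checkpoint

# Per the patch: gemma-3 runs in float32 and enables pan-and-scan preprocessing.
llm = LLM(
    model=model_path,
    device="xpu",
    dtype="float32",
    enforce_eager=True,
    mm_processor_kwargs={"do_pan_and_scan": True},
    load_in_low_bit="sym_int4",          # ipex-llm extension to vllm.LLM
    tensor_parallel_size=2,
    disable_async_output_proc=True,
    distributed_executor_backend="ray",
    max_model_len=4000,
)

question = "What is in the image?"
# Same prompt format as the run_gemma3 helper added by this patch.
prompt = ("<start_of_turn>user\n"
          f"<start_of_image>{question}<end_of_turn>\n"
          "<start_of_turn>model\n")
image = Image.open("/llm/test_image.jpg").convert("RGB")   # placeholder image path

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(temperature=0, max_tokens=64),
)
print(outputs[0].outputs[0].text)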