enable glm4v and gemma-3 on vllm 083 (#13114)

* enable glm4v and gemma-3
* update
* add qwen2.5-vl
parent cf97d8f1d7
commit 16fa778e65

2 changed files with 33 additions and 4 deletions
@@ -8,9 +8,20 @@ model_path = "/llm/models/MiniCPM-V-2_6"
 model_path = "/llm/models/Qwen2-VL-7B-Instruct"
 model_path = "/llm/models/glm-4v-9b"
 model_path = "/llm/models/InternVL2-8B"
+model_path = "/llm/models/gemma-3-12b-it"
+model_path = "/llm/models/Qwen2.5-VL-7B-Instruct"
 
 prompt = "What is in the image?"
 
+def run_gemma3(question: str, modality: str):
+    assert modality == "image"
+
+    prompt = ("<bos><start_of_turn>user\n"
+              f"<start_of_image>{question}<end_of_turn>\n"
+              "<start_of_turn>model\n")
+    stop_token_ids = None
+    return prompt, stop_token_ids
+
 def run_internvl(question: str, modality: str):
     assert modality == "image"
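For reference (not part of the diff), a quick sketch of what the new run_gemma3 helper returns; the call below reuses the script's own prompt text, and the printed result is the standard Gemma-3 chat template:

full_prompt, stop_token_ids = run_gemma3("What is in the image?", "image")
print(full_prompt)
# <bos><start_of_turn>user
# <start_of_image>What is in the image?<end_of_turn>
# <start_of_turn>model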
@@ -69,18 +80,35 @@ def run_qwen2_vl(question, modality):
 model_example_map = {
     "minicpmv": run_minicpmv,
     "qwen2_vl": run_qwen2_vl,
+    "qwen2_5_vl": run_qwen2_vl,
     # only for glm4v
     "chatglm": run_glm4v,
     "internvl_chat": run_internvl,
+    "gemma3": run_gemma3,
 }
 
+if "glm-4v" in model_path:
+    hf_override = {"architectures": ["GLM4VForCausalLM"]}
+else:
+    hf_override = None
+
+dtype = "float16"
+if "gemma-3" in model_path:
+    mm_processor_kwarg = {"do_pan_and_scan": True}
+    dtype = "float32"
+else:
+    mm_processor_kwarg = None
+
+
 llm = LLM(
           model=model_path,
           device="xpu",
-          dtype="float16",
+          dtype=dtype,
           enforce_eager=True,
-          load_in_low_bit="fp8",
-          tensor_parallel_size=1,
+          hf_overrides=hf_override,
+          mm_processor_kwargs=mm_processor_kwarg,
+          load_in_low_bit="sym_int4",
+          tensor_parallel_size=2,
           disable_async_output_proc=True,
           distributed_executor_backend="ray",
           max_model_len=4000,
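A minimal sketch of how the pieces above typically come together in an offline vision-language run; the image file, the question text, and the SamplingParams values are assumptions for illustration, not part of this commit:

from PIL import Image
from vllm import SamplingParams

image = Image.open("test.jpg").convert("RGB")  # hypothetical input image

# Build the model-specific prompt via the map extended above.
full_prompt, stop_token_ids = model_example_map["gemma3"](prompt, "image")
sampling_params = SamplingParams(temperature=0.0,  # assumed values
                                 max_tokens=128,
                                 stop_token_ids=stop_token_ids)

outputs = llm.generate(
    {"prompt": full_prompt, "multi_modal_data": {"image": image}},
    sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)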
@@ -77,7 +77,8 @@ def _ipex_llm_convert(load_in_low_bit):
 
 def get_load_function(low_bit):
     def _ipex_llm_load_model(self) -> None:
-        _model_sample_convert()
+        if "gemma-3" not in self.model_config.model.lower():
+            _model_sample_convert()
 
         # from vllm.utils import measure_device_memory
         from vllm.utils import DeviceMemoryProfiler
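The second file gates ipex-llm's sampler conversion so it is skipped for gemma-3 checkpoints while every other model keeps the existing behavior. A standalone sketch of the predicate, with hypothetical model paths:

def should_sample_convert(model: str) -> bool:
    # Mirrors the check added above: apply _model_sample_convert()
    # unless the model path points at a gemma-3 checkpoint.
    return "gemma-3" not in model.lower()

assert should_sample_convert("/llm/models/Qwen2.5-VL-7B-Instruct")
assert not should_sample_convert("/llm/models/gemma-3-12b-it")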