enable glm4v and gemma-3 on vllm 083 (#13114)
* enable glm4v and gemma-3
* update
* add qwen2.5-vl
parent cf97d8f1d7
commit 16fa778e65
2 changed files with 33 additions and 4 deletions
@@ -8,9 +8,20 @@ model_path = "/llm/models/MiniCPM-V-2_6"
 model_path = "/llm/models/Qwen2-VL-7B-Instruct"
 model_path = "/llm/models/glm-4v-9b"
 model_path = "/llm/models/InternVL2-8B"
+model_path = "/llm/models/gemma-3-12b-it"
+model_path = "/llm/models/Qwen2.5-VL-7B-Instruct"
 
 prompt = "What is in the image?"
 
+
+def run_gemma3(question: str, modality: str):
+    assert modality == "image"
+
+    prompt = ("<bos><start_of_turn>user\n"
+              f"<start_of_image>{question}<end_of_turn>\n"
+              "<start_of_turn>model\n")
+    stop_token_ids = None
+    return prompt, stop_token_ids
 
 def run_internvl(question: str, modality: str):
     assert modality == "image"
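For reference, a minimal sketch (not part of the diff) of how the new run_gemma3 helper is exercised; the question string is just the example prompt already defined in this file:

# Illustrative only: run_gemma3 wraps the question in Gemma-3's chat template.
prompt, stop_token_ids = run_gemma3("What is in the image?", "image")
# prompt is:
#   <bos><start_of_turn>user
#   <start_of_image>What is in the image?<end_of_turn>
#   <start_of_turn>model
# stop_token_ids is None, so generation stops at the model's default EOS token.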
@@ -69,18 +80,35 @@ def run_qwen2_vl(question, modality):
 model_example_map = {
     "minicpmv": run_minicpmv,
     "qwen2_vl": run_qwen2_vl,
+    "qwen2_5_vl": run_qwen2_vl,
     # only for glm4v
     "chatglm": run_glm4v,
     "internvl_chat": run_internvl,
+    "gemma3": run_gemma3,
 }
 
+
+if "glm-4v" in model_path:
+    hf_override = {"architectures": ["GLM4VForCausalLM"]}
+else:
+    hf_override = None
+
+dtype = "float16"
+if "gemma-3" in model_path:
+    mm_processor_kwarg = {"do_pan_and_scan": True}
+    dtype = "float32"
+else:
+    mm_processor_kwarg = None
+
 llm = LLM(
     model=model_path,
     device="xpu",
-    dtype="float16",
+    dtype=dtype,
     enforce_eager=True,
-    load_in_low_bit="fp8",
-    tensor_parallel_size=1,
+    hf_overrides=hf_override,
+    mm_processor_kwargs=mm_processor_kwarg,
+    load_in_low_bit="sym_int4",
+    tensor_parallel_size=2,
     disable_async_output_proc=True,
     distributed_executor_backend="ray",
     max_model_len=4000,
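To make the new branching easier to follow, here is a hedged sketch of how the model-specific settings above resolve before being passed to LLM(). The helper name resolve_vllm_kwargs is invented for illustration; only the values come from the diff:

def resolve_vllm_kwargs(model_path: str) -> dict:
    # The glm-4v checkpoint does not report vLLM's GLM4V architecture name,
    # so the example overrides it explicitly via hf_overrides.
    hf_override = None
    if "glm-4v" in model_path:
        hf_override = {"architectures": ["GLM4VForCausalLM"]}

    # gemma-3 gets pan-and-scan image preprocessing and runs in float32;
    # every other model keeps float16 and no extra processor kwargs.
    dtype, mm_processor_kwarg = "float16", None
    if "gemma-3" in model_path:
        mm_processor_kwarg = {"do_pan_and_scan": True}
        dtype = "float32"

    return {"hf_overrides": hf_override,
            "dtype": dtype,
            "mm_processor_kwargs": mm_processor_kwarg}

# resolve_vllm_kwargs("/llm/models/gemma-3-12b-it")
# -> {'hf_overrides': None, 'dtype': 'float32',
#     'mm_processor_kwargs': {'do_pan_and_scan': True}}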
@@ -77,7 +77,8 @@ def _ipex_llm_convert(load_in_low_bit):
 
 def get_load_function(low_bit):
     def _ipex_llm_load_model(self) -> None:
-        _model_sample_convert()
+        if "gemma-3" not in self.model_config.model.lower():
+            _model_sample_convert()
 
         # from vllm.utils import measure_device_memory
         from vllm.utils import DeviceMemoryProfiler
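The guard above is a case-insensitive substring match on the configured model path, so any path containing "gemma-3" skips ipex-llm's sample-convert hook. A minimal sketch of the check in isolation; ModelConfigStub is a stand-in used only for illustration, not vLLM's real ModelConfig:

class ModelConfigStub:
    # stand-in for the loader's self.model_config; only .model matters here
    model = "/llm/models/gemma-3-12b-it"

model_config = ModelConfigStub()
if "gemma-3" not in model_config.model.lower():
    _model_sample_convert()   # unchanged path for all non-gemma-3 models
# gemma-3 falls through and is loaded without the sample-convert step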