diff --git a/docker/llm/serving/xpu/docker/vllm_offline_inference_vision_language.py b/docker/llm/serving/xpu/docker/vllm_offline_inference_vision_language.py
index 5ccaa6ff..96d93a1f 100644
--- a/docker/llm/serving/xpu/docker/vllm_offline_inference_vision_language.py
+++ b/docker/llm/serving/xpu/docker/vllm_offline_inference_vision_language.py
@@ -94,10 +94,9 @@ else:
     dtype = "float16"
 
 if "gemma-3" in model_path:
-    mm_processor_kwarg = {"do_pan_and_scan": True}
     dtype = "float32"
 else:
-    mm_processor_kwarg = None
+    pass
 
 
 llm = LLM(
@@ -106,7 +105,7 @@ llm = LLM(
     dtype=dtype,
     enforce_eager=True,
     hf_overrides=hf_override,
-    mm_processor_kwargs=mm_processor_kwarg,
+    mm_processor_kwargs=None,
     load_in_low_bit="sym_int4",
     tensor_parallel_size=2,
     disable_async_output_proc=True,
diff --git a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py
index 3d3315ab..346ecb70 100644
--- a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py
+++ b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py
@@ -41,9 +41,9 @@ def _sample_get_logits(
     # HINT: we do not support other types of quantization for now
     # TODO: we may encounter tie-word-embedding problems
     if isinstance(lm_head, VocabParallelEmbedding):
-        logits = lm_head.linear_method.apply(lm_head,
-                                             hidden_states,
-                                             bias=embedding_bias)
+        logits = lm_head.quant_method.apply(lm_head,
+                                            hidden_states,
+                                            bias=embedding_bias)
     else:
         logits = lm_head(hidden_states)
     if embedding_bias is not None:
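
Note: the second hunk renames the attribute used to compute logits from linear_method to quant_method, matching the attribute name vLLM exposes on quantized layers in newer releases. A minimal sketch of a version-tolerant lookup, assuming code may run against vLLM builds using either spelling (lm_head, hidden_states, and embedding_bias are as in _sample_get_logits above):

    # Prefer the current attribute name; fall back to the legacy one
    # on older vLLM versions where it was called linear_method.
    method = getattr(lm_head, "quant_method", None) or lm_head.linear_method
    logits = method.apply(lm_head, hidden_states, bias=embedding_bias)

The diff itself hard-codes quant_method, which pins this code path to the newer vLLM attribute name rather than supporting both.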