Add vllm 0.6.2 vision offline example (#12721)

* add vision offline example

* add to docker
Wang, Jian4 2025-01-21 09:58:01 +08:00 committed by GitHub
parent 085974e307
commit 716d4fe563
3 changed files with 87 additions and 0 deletions

@@ -101,6 +101,7 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
COPY ./vllm_online_benchmark.py /llm/
COPY ./vllm_offline_inference.py /llm/
COPY ./vllm_offline_inference_vision_language.py /llm/
COPY ./payload-1024.lua /llm/
COPY ./start-vllm-service.sh /llm/
COPY ./benchmark_vllm_throughput.py /llm/

@@ -148,6 +148,7 @@ We have included multiple example files in `/llm/`:
2. `benchmark_vllm_throughput.py`: Used for benchmarking throughput
3. `payload-1024.lua`: Used for testing requests per second with 1k-128 requests
4. `start-vllm-service.sh`: Used as a template for starting the vLLM service
5. `vllm_offline_inference_vision_language.py`: Used as an example of vLLM offline inference with vision-language models
##### Online benchmark through api_server

@@ -0,0 +1,85 @@
from ipex_llm.vllm.xpu.engine import IPEXLLMClass as LLM
from vllm import SamplingParams
from transformers import AutoTokenizer
import requests

# Select one model path; uncomment the MiniCPM line instead if desired.
# model_path = "/llm/models/MiniCPM-V-2_6"
model_path = "/llm/models/Qwen2-VL-7B-Instruct"

prompt = "What is in the image?"


def run_minicpmv(question, modality):
    assert modality == "image"
    tokenizer = AutoTokenizer.from_pretrained(model_path,
                                              trust_remote_code=True)
    # Stop tokens for MiniCPM-V-2.6
    stop_tokens = ['<|im_end|>', '<|endoftext|>']
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
    messages = [{
        'role': 'user',
        'content': f'(<image>./</image>)\n{question}'
    }]
    prompt = tokenizer.apply_chat_template(messages,
                                           tokenize=False,
                                           add_generation_prompt=True)
    return prompt, stop_token_ids


def run_qwen2_vl(question, modality):
    assert modality == "image"
    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
              "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
              f"{question}<|im_end|>\n"
              "<|im_start|>assistant\n")
    stop_token_ids = None
    return prompt, stop_token_ids


# Map hf_config.model_type to the prompt builder for that model family.
model_example_map = {
    "minicpmv": run_minicpmv,
    "qwen2_vl": run_qwen2_vl,
}

llm = LLM(
    model=model_path,
    device="xpu",
    dtype="float16",
    enforce_eager=True,
    load_in_low_bit="fp8",
    tensor_parallel_size=1,
    disable_async_output_proc=True,
    distributed_executor_backend="ray",
    max_model_len=4000,
    trust_remote_code=True,
    block_size=8,
    max_num_batched_tokens=4000)

# Build the model-specific chat prompt from the plain question.
model_type = llm.llm_engine.model_config.hf_config.model_type
prompt, stop_token_ids = model_example_map[model_type](prompt, "image")

# Load the image using PIL.Image
from PIL import Image
image_url = "http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg"
image = Image.open(requests.get(image_url, stream=True).raw).convert('RGB')

sampling_params = SamplingParams(temperature=0.1,
                                 top_p=0.001,
                                 repetition_penalty=1.05,
                                 max_tokens=64,
                                 stop_token_ids=stop_token_ids)

# Single prompt inference
outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {"image": image},
}, sampling_params=sampling_params)

for o in outputs:
    generated_text = o.outputs[0].text
    print(generated_text)
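
For reference, below is a minimal sketch of how the same engine could answer several image prompts in one call. It is not part of the committed example: the image URLs are placeholders, and it assumes `IPEXLLMClass.generate` accepts a list of prompt dicts the same way upstream vLLM's `LLM.generate` does, reusing the `llm`, `model_example_map`, `model_type`, and `sampling_params` objects defined above.

# Illustrative batch variant (not part of the committed file).
from PIL import Image
import requests

image_urls = [
    "http://example.com/cat.jpg",   # placeholder URL
    "http://example.com/dog.jpg",   # placeholder URL
]
question = "What is in the image?"

batch = []
for url in image_urls:
    img = Image.open(requests.get(url, stream=True).raw).convert('RGB')
    # Reuse the prompt builder chosen above; stop tokens are already in sampling_params.
    batched_prompt, _ = model_example_map[model_type](question, "image")
    batch.append({
        "prompt": batched_prompt,
        "multi_modal_data": {"image": img},
    })

# One generate call processes the whole batch with shared sampling parameters.
outputs = llm.generate(batch, sampling_params=sampling_params)
for o in outputs:
    print(o.outputs[0].text)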