From 716d4fe5632b3159fdb904ee31af46f062efd8c1 Mon Sep 17 00:00:00 2001
From: "Wang, Jian4" <61138589+hzjane@users.noreply.github.com>
Date: Tue, 21 Jan 2025 09:58:01 +0800
Subject: [PATCH] Add vllm 0.6.2 vision offline example (#12721)

* add vision offline example

* add to docker
---
 docker/llm/serving/xpu/docker/Dockerfile      |  1 +
 docker/llm/serving/xpu/docker/README.md       |  1 +
 .../vllm_offline_inference_vision_language.py | 85 +++++++++++++++++++
 3 files changed, 87 insertions(+)
 create mode 100644 docker/llm/serving/xpu/docker/vllm_offline_inference_vision_language.py

diff --git a/docker/llm/serving/xpu/docker/Dockerfile b/docker/llm/serving/xpu/docker/Dockerfile
index b33e787c..8e56921d 100644
--- a/docker/llm/serving/xpu/docker/Dockerfile
+++ b/docker/llm/serving/xpu/docker/Dockerfile
@@ -101,6 +101,7 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
 
 COPY ./vllm_online_benchmark.py /llm/
 COPY ./vllm_offline_inference.py /llm/
+COPY ./vllm_offline_inference_vision_language.py /llm/
 COPY ./payload-1024.lua /llm/
 COPY ./start-vllm-service.sh /llm/
 COPY ./benchmark_vllm_throughput.py /llm/
diff --git a/docker/llm/serving/xpu/docker/README.md b/docker/llm/serving/xpu/docker/README.md
index e746ebf5..64ef9d3f 100644
--- a/docker/llm/serving/xpu/docker/README.md
+++ b/docker/llm/serving/xpu/docker/README.md
@@ -148,6 +148,7 @@ We have included multiple example files in `/llm/`:
 2. `benchmark_vllm_throughput.py`: Used for benchmarking throughput
 3. `payload-1024.lua`: Used for testing request per second using 1k-128 request
 4. `start-vllm-service.sh`: Used for template for starting vLLM service
+5. `vllm_offline_inference_vision_language.py`: Used for vLLM offline vision-language inference example
 
 ##### Online benchmark throurgh api_server
 
diff --git a/docker/llm/serving/xpu/docker/vllm_offline_inference_vision_language.py b/docker/llm/serving/xpu/docker/vllm_offline_inference_vision_language.py
new file mode 100644
index 00000000..f860c760
--- /dev/null
+++ b/docker/llm/serving/xpu/docker/vllm_offline_inference_vision_language.py
@@ -0,0 +1,85 @@
+from ipex_llm.vllm.xpu.engine import IPEXLLMClass as LLM
+from vllm import SamplingParams
+from transformers import AutoTokenizer
+import requests
+
+
+# model_path = "/llm/models/MiniCPM-V-2_6"  # uncomment (and comment out the next line) to run MiniCPM-V-2.6
+model_path = "/llm/models/Qwen2-VL-7B-Instruct"
+prompt = "What is in the image?"
+
+
+def run_minicpmv(question, modality):
+    assert modality == "image"
+    tokenizer = AutoTokenizer.from_pretrained(model_path,
+                                              trust_remote_code=True)
+    # Stop tokens for MiniCPM-V-2.6
+    stop_tokens = ['<|im_end|>', '<|endoftext|>']
+    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+
+    messages = [{
+        'role': 'user',
+        'content': f'(<image>./</image>)\n{question}'
+    }]
+    prompt = tokenizer.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+    return prompt, stop_token_ids
+
+def run_qwen2_vl(question, modality):
+    assert modality == "image"
+
+    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+              "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
+              f"{question}<|im_end|>\n"
+              "<|im_start|>assistant\n")
+    stop_token_ids = None
+    return prompt, stop_token_ids
+
+model_example_map = {
+    "minicpmv": run_minicpmv,
+    "qwen2_vl": run_qwen2_vl,
+}
+
+llm = LLM(
+    model=model_path,
+    device="xpu",
+    dtype="float16",
+    enforce_eager=True,
+    load_in_low_bit="fp8",
+    tensor_parallel_size=1,
+    disable_async_output_proc=True,
+    distributed_executor_backend="ray",
+    max_model_len=4000,
+    trust_remote_code=True,
+    block_size=8,
+    max_num_batched_tokens=4000)
+
+
+model_type = llm.llm_engine.model_config.hf_config.model_type
+prompt, stop_token_ids = model_example_map[model_type](prompt, "image")
+
+
+# Load the image using PIL.Image
+from PIL import Image
+image_url = "http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg"
+image = Image.open(requests.get(image_url, stream=True).raw).convert('RGB')
+
+
+sampling_params = SamplingParams(temperature=0.1,
+                                 top_p=0.001,
+                                 repetition_penalty=1.05,
+                                 max_tokens=64,
+                                 stop_token_ids=stop_token_ids)
+
+
+# Single prompt inference
+outputs = llm.generate({
+    "prompt": prompt,
+    "multi_modal_data": {"image": image},
+}, sampling_params=sampling_params)
+
+for o in outputs:
+    generated_text = o.outputs[0].text
+    print(generated_text)
+
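Inside the container, the example can be run directly with `python /llm/vllm_offline_inference_vision_language.py`, provided the model weights are mounted at the `model_path` set in the script. As an illustrative extension (a sketch, not part of the patch): vLLM's `LLM.generate` also accepts a list of prompt dictionaries, so several images can be scored in a single call. The code below reuses `llm`, `prompt`, and `sampling_params` from the script above; the image list simply repeats the script's sample URL as a stand-in for your own images or local paths.

# Batched variant (sketch): process several images in one generate() call.
# Assumes `llm`, `prompt`, and `sampling_params` from the example script are in scope.
from PIL import Image
import requests

image_urls = [
    "http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg",
    # append more image URLs or local file paths here
]
images = [Image.open(requests.get(url, stream=True).raw).convert('RGB')
          for url in image_urls]

batch_inputs = [{"prompt": prompt, "multi_modal_data": {"image": img}}
                for img in images]
outputs = llm.generate(batch_inputs, sampling_params=sampling_params)
for o in outputs:
    print(o.outputs[0].text)

Because every entry in the batch reuses the same templated prompt, this only varies the image; per-image questions would simply build a different prompt per dictionary.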