Add vllm 0.6.2 vision offline example (#12721)

* add vision offline example
* add to docker

parent 085974e307
commit 716d4fe563

3 changed files with 87 additions and 0 deletions
@@ -101,6 +101,7 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
COPY ./vllm_online_benchmark.py        /llm/
COPY ./vllm_offline_inference.py       /llm/
COPY ./vllm_offline_inference_vision_language.py  /llm/
COPY ./payload-1024.lua                /llm/
COPY ./start-vllm-service.sh           /llm/
COPY ./benchmark_vllm_throughput.py    /llm/
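With the new script copied into the image, it can be launched from inside a running container. A trivial sketch (the `python` interpreter name inside the container is an assumption, not part of this commit):

```python
# Sketch only: launch the newly copied offline vision example from inside the
# container; the path matches the COPY destination above.
import subprocess

subprocess.run(
    ["python", "/llm/vllm_offline_inference_vision_language.py"],
    check=True,  # raise if the example exits with a non-zero status
)
```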
@@ -148,6 +148,7 @@ We have included multiple example files in `/llm/`:
2. `benchmark_vllm_throughput.py`: Used for benchmarking throughput
3. `payload-1024.lua`: Used for testing requests per second with 1k-128 requests
4. `start-vllm-service.sh`: Used as a template for starting the vLLM service
5. `vllm_offline_inference_vision_language.py`: Used for the vLLM offline inference vision-language example

##### Online benchmark through api_server
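To complement the offline example added by this commit, here is a rough sketch of what a single online vision request to the api_server could look like once the service from `start-vllm-service.sh` is running. The port (8000), the served model name, and the OpenAI-compatible endpoint shape are assumptions, not part of this commit:

```python
# Sketch only: send one vision question to a running vLLM api_server via its
# OpenAI-compatible chat endpoint. Host, port, and model name are assumed.
import requests

payload = {
    "model": "Qwen2-VL-7B-Instruct",  # assumed served model name
    "messages": [{
        "role": "user",
        "content": [
            {"type": "text", "text": "What is in the image?"},
            {"type": "image_url", "image_url": {
                "url": "http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg"}},
        ],
    }],
    "max_tokens": 64,
}

resp = requests.post("http://localhost:8000/v1/chat/completions", json=payload)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```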
@@ -0,0 +1,85 @@
from ipex_llm.vllm.xpu.engine import IPEXLLMClass as LLM
from vllm import SamplingParams
from transformers import AutoTokenizer
import requests


# Pick the model to run (the MiniCPM-V-2_6 path is kept as an alternative).
# model_path = "/llm/models/MiniCPM-V-2_6"
model_path = "/llm/models/Qwen2-VL-7B-Instruct"
prompt = "What is in the image?"

def run_minicpmv(question, modality):
    assert modality == "image"
    tokenizer = AutoTokenizer.from_pretrained(model_path,
                                              trust_remote_code=True)
    # Stop tokens for MiniCPM-V 2.6
    stop_tokens = ['<|im_end|>', '<|endoftext|>']
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    messages = [{
        'role': 'user',
        'content': f'(<image>./</image>)\n{question}'
    }]
    prompt = tokenizer.apply_chat_template(messages,
                                           tokenize=False,
                                           add_generation_prompt=True)
    return prompt, stop_token_ids

def run_qwen2_vl(question, modality):
    assert modality == "image"

    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
              "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
              f"{question}<|im_end|>\n"
              "<|im_start|>assistant\n")
    stop_token_ids = None
    return prompt, stop_token_ids

model_example_map = {
    "minicpmv": run_minicpmv,
    "qwen2_vl": run_qwen2_vl,
}

llm = LLM(
          model=model_path,
          device="xpu",
          dtype="float16",
          enforce_eager=True,
          load_in_low_bit="fp8",
          tensor_parallel_size=1,
          disable_async_output_proc=True,
          distributed_executor_backend="ray",
          max_model_len=4000,
          trust_remote_code=True,
          block_size=8,
          max_num_batched_tokens=4000)


# Pick the prompt builder that matches the loaded model.
model_type = llm.llm_engine.model_config.hf_config.model_type
prompt, stop_token_ids = model_example_map[model_type](prompt, "image")


# Load the image using PIL.Image
from PIL import Image
image_url = "http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg"
image = Image.open(requests.get(image_url, stream=True).raw).convert('RGB')


sampling_params = SamplingParams(temperature=0.1,
                                 top_p=0.001,
                                 repetition_penalty=1.05,
                                 max_tokens=64,
                                 stop_token_ids=stop_token_ids)


# Single prompt inference
outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {"image": image},
}, sampling_params=sampling_params)

for o in outputs:
    generated_text = o.outputs[0].text
    print(generated_text)
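Since the example builds a reusable `llm` engine, `prompt`, and `sampling_params`, a natural follow-up is to batch several images in one call; `llm.generate` also accepts a list of prompt dictionaries. A minimal sketch reusing the objects defined above (the URL list is an assumption, shown with the same sample image):

```python
# Sketch only: batch several images with the same question in one generate() call,
# reusing `llm`, `prompt`, `sampling_params`, `Image`, and `requests` from above.
image_urls = [
    "http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg",
    # append more image URLs here
]

batch_inputs = []
for url in image_urls:
    img = Image.open(requests.get(url, stream=True).raw).convert('RGB')
    batch_inputs.append({
        "prompt": prompt,
        "multi_modal_data": {"image": img},
    })

outputs = llm.generate(batch_inputs, sampling_params=sampling_params)
for o in outputs:
    print(o.outputs[0].text)
```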