Add vllm 0.6.2 vision offline example (#12721)
* add vision offline example
* add to docker
parent 085974e307
commit 716d4fe563

3 changed files with 87 additions and 0 deletions

@@ -101,6 +101,7 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
 COPY ./vllm_online_benchmark.py        /llm/
 COPY ./vllm_offline_inference.py       /llm/
+COPY ./vllm_offline_inference_vision_language.py  /llm/
 COPY ./payload-1024.lua                /llm/
 COPY ./start-vllm-service.sh           /llm/
 COPY ./benchmark_vllm_throughput.py   /llm/

@@ -148,6 +148,7 @@ We have included multiple example files in `/llm/`:
 2. `benchmark_vllm_throughput.py`: Used for benchmarking throughput
 3. `payload-1024.lua`: Used for testing request per second using 1k-128 request
 4. `start-vllm-service.sh`: Used for template for starting vLLM service
+5. `vllm_offline_inference_vision_language.py`: Used for vLLM offline inference vision example

 ##### Online benchmark throurgh api_server

vllm_offline_inference_vision_language.py (new file):
@@ -0,0 +1,85 @@
+from ipex_llm.vllm.xpu.engine import IPEXLLMClass as LLM
+from vllm import SamplingParams
+from transformers import AutoTokenizer
+import requests
+
+
+model_path = "/llm/models/MiniCPM-V-2_6"
+model_path = "/llm/models/Qwen2-VL-7B-Instruct"
+prompt = "What is in the image?"
+
+def run_minicpmv(question, modality):
+    assert modality == "image"
+    tokenizer = AutoTokenizer.from_pretrained(model_path,
+                                              trust_remote_code=True)
+    # 2.6
+    stop_tokens = ['<|im_end|>', '<|endoftext|>']
+    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+
+    messages = [{
+        'role': 'user',
+        'content': f'(<image>./</image>)\n{question}'
+    }]
+    prompt = tokenizer.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+    return prompt, stop_token_ids
+
+def run_qwen2_vl(question, modality):
+    assert modality == "image"
+
+    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+              "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
+              f"{question}<|im_end|>\n"
+              "<|im_start|>assistant\n")
+    stop_token_ids = None
+    return prompt, stop_token_ids
+
+model_example_map = {
+    "minicpmv": run_minicpmv,
+    "qwen2_vl": run_qwen2_vl,
+}
+
+llm = LLM(
+          model=model_path,
+          device="xpu",
+          dtype="float16",
+          enforce_eager=True,
+          load_in_low_bit="fp8",
+          tensor_parallel_size=1,
+          disable_async_output_proc=True,
+          distributed_executor_backend="ray",
+          max_model_len=4000,
+          trust_remote_code=True,
+          block_size=8,
+          max_num_batched_tokens=4000)
+
+
+model_type = llm.llm_engine.model_config.hf_config.model_type
+prompt, stop_token_ids = model_example_map[model_type](prompt, "image")
+
+
+# Load the image using PIL.Image
+from PIL import Image
+image_url="http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg"
+image = Image.open(requests.get(image_url, stream=True).raw).convert('RGB')
+
+
+sampling_params = SamplingParams(temperature=0.1,
+                                 top_p=0.001,
+                                 repetition_penalty=1.05,
+                                 max_tokens=64,
+                                 stop_token_ids=stop_token_ids)
+
+
+# Single prompt inference
+outputs = llm.generate({
+    "prompt": prompt,
+    "multi_modal_data": {"image": image},
+}, sampling_params=sampling_params)
+
+for o in outputs:
+    generated_text = o.outputs[0].text
+    print(generated_text)
+
+
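Note that the example assigns model_path twice, so the second assignment (Qwen2-VL-7B-Instruct) is the one that takes effect; comment out whichever model you are not running. As a usage follow-up, below is a minimal sketch of how the same multi-modal input format could batch several images through one LLM instance. It assumes the llm, prompt, and sampling_params objects built by the example above, and the local image paths are hypothetical placeholders rather than files shipped in the Docker image.

# Minimal batching sketch: reuses `llm`, `prompt`, and `sampling_params`
# from vllm_offline_inference_vision_language.py above.
from PIL import Image

# Hypothetical local files; replace with images available in your container.
image_paths = ["/llm/images/sample1.jpg", "/llm/images/sample2.jpg"]

batch_inputs = [
    {
        "prompt": prompt,  # same chat-templated prompt for every image
        "multi_modal_data": {"image": Image.open(p).convert("RGB")},
    }
    for p in image_paths
]

# vLLM accepts a list of prompt dicts and runs them as one batch;
# outputs come back in the same order as the inputs.
outputs = llm.generate(batch_inputs, sampling_params=sampling_params)
for path, o in zip(image_paths, outputs):
    print(f"{path}: {o.outputs[0].text}")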