Support lightweight-serving with internlm-xcomposer2-vl-7b multimodal input (#11703)

* init image_list
* enable internlm-xcomposer2 image input
* update style
* add readme
* update model
* update readme
parent aa98ef96fe
commit 493cbd9a36

4 changed files with 111 additions and 22 deletions
README (installation):

@@ -18,6 +18,10 @@ pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-exte
 pip install fastapi uvicorn openai
 pip install gradio # for gradio web UI
 conda install -c conda-forge -y gperftools=2.10 # to enable tcmalloc
+
+# for internlm-xcomposer2-vl-7b
+pip install transformers==4.31.0
+pip install accelerate timm==0.4.12 sentencepiece==0.1.99 gradio==3.44.4 markdown2==2.4.10 xlsxwriter==3.1.2 einops
 ```
 
 #### 1.2 Installation on Windows
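The transformers pin above is easy to lose when other packages later upgrade it. A purely illustrative sanity check (not part of this commit) before launching the server:

```python
# Illustrative check: the README above documents that internlm-xcomposer2-vl-7b
# needs transformers==4.31.0.
import transformers

assert transformers.__version__ == "4.31.0", (
    f"expected transformers==4.31.0 for internlm-xcomposer2-vl-7b, "
    f"got {transformers.__version__}"
)
```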
README (image input example for /v1/chat/completions):

@@ -172,10 +176,39 @@ curl http://localhost:8000/v1/chat/completions \
   }'
 ```
+
+##### Image input
+
+Image input is currently only supported for [internlm-xcomposer2-vl-7b](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b), and `transformers==4.31.0` must be installed to run it.
+```bash
+wget -O ./test.jpg http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "internlm-xcomposer2-vl-7b",
+    "messages": [
+      {
+        "role": "user",
+        "content": [
+          {
+            "type": "text",
+            "text": "What'\''s in this image?"
+          },
+          {
+            "type": "image_url",
+            "image_url": {
+              "url": "./test.jpg"
+            }
+          }
+        ]
+      }
+    ],
+    "max_tokens": 128
+  }'
+```
+
 #### /v1/completions
 
 ```bash
 curl http://localhost:8000/v1/completions \
   -H "Content-Type: application/json" \
   -d '{
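The same request can also be sent with the `openai` Python client installed earlier. A minimal sketch, assuming the server is running on localhost:8000 and returns OpenAI-compatible responses; the `base_url` and dummy `api_key` are assumptions for illustration, not part of the commit:

```python
# Illustrative: multimodal chat completion via the openai client (>= 1.0).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="none")  # dummy key

response = client.chat.completions.create(
    model="internlm-xcomposer2-vl-7b",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What's in this image?"},
            # The server resolves this path locally, matching the wget step above.
            {"type": "image_url", "image_url": {"url": "./test.jpg"}},
        ],
    }],
    max_tokens=128,
)
print(response.choices[0].message.content)
```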
Lightweight-serving API script (request models):

@@ -47,12 +47,17 @@ logger = logging.get_logger(__name__)
 class InputsRequest(BaseModel):
     inputs: str
     parameters: Optional[Parameters] = None
+    image_list: Optional[list] = None
     stream: Optional[bool] = False
     req_type: str = 'completion'
 
 
 class ChatCompletionRequest(BaseModel):
-    messages: List[ChatMessage]
+    messages: Union[
+        str,
+        List[Dict[str, str]],
+        List[Dict[str, Union[str, List[Dict[str, Union[str, Dict[str, str]]]]]]],
+    ]
     model: str
     max_tokens: Optional[int] = None
     min_tokens: Optional[int] = None
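The widened `messages` field accepts three shapes: a bare prompt string, plain role/content dicts, and the OpenAI-style form where `content` is itself a list of text and image_url parts. A self-contained sketch of the same annotation (a standalone pydantic model for illustration, not the project's class):

```python
# Standalone sketch mirroring the new `messages` annotation.
from typing import Dict, List, Union
from pydantic import BaseModel


class ChatRequestSketch(BaseModel):
    messages: Union[
        str,                                # bare prompt
        List[Dict[str, str]],               # [{"role": ..., "content": "..."}]
        List[Dict[str, Union[str, List[Dict[str, Union[str, Dict[str, str]]]]]]],
    ]


# Multimodal form: content is a list of typed parts, as in the README example.
ChatRequestSketch(messages=[{
    "role": "user",
    "content": [
        {"type": "text", "text": "What's in this image?"},
        {"type": "image_url", "image_url": {"url": "./test.jpg"}},
    ],
}])
```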
Lightweight-serving API script (get_prompt):

@@ -266,7 +271,7 @@ async def generate_stream(inputs_request: InputsRequest):
 
 def get_prompt(messages) -> str:
     if "codegeex" in local_model.model_name.lower():
-        query = messages[-1].content
+        query = messages[-1]["content"]
         if len(messages) <= 1:
             history = []
         else:
@@ -277,9 +282,24 @@ def get_prompt(messages) -> str:
         return inputs
     else:
         prompt = ""
+        image_list = []
         for msg in messages:
-            role = msg.role
-            content = msg.content
+            role = msg["role"]
+            content = msg["content"]
+            if type(content) == list:
+                image_list1 = [
+                    item["image_url"]["url"]
+                    for item in content
+                    if item["type"] == "image_url"
+                ]
+                image_list.extend(image_list1)
+                text_list = [
+                    item["text"]
+                    for item in content
+                    if item["type"] == "text"
+                ]
+                prompt = "".join(text_list)
+            else:
                 if role == "system":
                     prompt += f"<<SYS>>\n{content}\n<</SYS>>\n\n"
                 elif role == "user":
@@ -288,7 +308,7 @@ def get_prompt(messages) -> str:
                     prompt += f"{content} "
                 else:
                     invalidInputError(False, f"Unknown role: {role}")
-        return prompt.strip()
+        return prompt.strip(), image_list
 
 
 def set_parameters(req):
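With these hunks `get_prompt` now returns a `(prompt, image_list)` pair: multimodal content lists are split into the joined text parts plus the collected image URLs, while plain string content still goes through the role-tagged prompt path. A reduced, standalone sketch of the new flattening behaviour (it omits the role templates and codegeex handling of the real function):

```python
# Reduced sketch of the flattening get_prompt now performs for multimodal content.
def flatten_multimodal(messages):
    prompt, image_list = "", []
    for msg in messages:
        content = msg["content"]
        if isinstance(content, list):
            # Collect every image_url part and join the text parts into the prompt.
            image_list.extend(item["image_url"]["url"]
                              for item in content if item["type"] == "image_url")
            prompt = "".join(item["text"]
                             for item in content if item["type"] == "text")
        else:
            prompt += f"{content} "  # simplified; the real code applies role-specific templates
    return prompt.strip(), image_list


assert flatten_multimodal([{
    "role": "user",
    "content": [{"type": "text", "text": "What's in this image?"},
                {"type": "image_url", "image_url": {"url": "./test.jpg"}}],
}]) == ("What's in this image?", ["./test.jpg"])
```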
Lightweight-serving API script (/v1/chat/completions handler):

@@ -313,11 +333,12 @@ def set_parameters(req):
 
 @app.post("/v1/chat/completions")
 async def create_chat_completion(request: ChatCompletionRequest):
-    print(request)
     model_name = local_model.model_name
+    prompt, image_list = get_prompt(request.messages)
     inputs_request = InputsRequest(
-        inputs=get_prompt(request.messages),
+        inputs=prompt,
         parameters=set_parameters(request),
+        image_list=image_list if len(image_list) >= 1 else None,
         stream=request.stream,
         req_type="chat"
     )
Model worker (ModelWorker.add_request / process_step):

@@ -60,15 +60,40 @@ class ModelWorker:
         tmp_result = await self.waiting_requests.get()
         request_id, prompt_request = tmp_result
         plain_texts = prompt_request.inputs
+        input_ids = None
+        inputs_embeds = None
+        if "internlm-xcomposer2-vl-7b" in self.model_name.lower():
+            lines = [
+                "You are an AI assistant whose name is InternLM-XComposer (浦语·灵笔).",
+                "- InternLM-XComposer (浦语·灵笔) is a multi-modality conversational language "
+                "model that is developed by Shanghai AI Laboratory (上海人工智能实验室). "
+                "It is designed to be helpful, honest, and harmless.",
+                "- InternLM-XComposer (浦语·灵笔) can understand and communicate fluently in "
+                "the language chosen by the user such as English and 中文.",
+                "- InternLM-XComposer (浦语·灵笔) is capable of comprehending and "
+                "articulating responses effectively based on the provided image."
+            ]
+            meta_instruction = "\n".join(lines)
+            if prompt_request.image_list is None:
+                inputs = self.model.build_inputs(tokenizer, plain_texts, [], meta_instruction)
+                im_mask = torch.zeros(inputs['input_ids'].shape[:2]).bool()
+                input_ids = inputs["input_ids"].to('xpu')
+            else:
+                image = self.model.encode_img(prompt_request.image_list[0])
+                plain_texts = "<ImageHere>" + plain_texts
+                inputs, im_mask = self.model.interleav_wrap_chat(tokenizer, plain_texts,
+                                                                 image, [], meta_instruction)
+                inputs_embeds = inputs["inputs_embeds"].to('xpu').to(self.dtype)
+        else:
             inputs = tokenizer(plain_texts, return_tensors="pt", padding=True)
             input_ids = inputs.input_ids.to('xpu')
         parameters = prompt_request.parameters
-        return input_ids, parameters, request_id
+        return input_ids, parameters, request_id, inputs_embeds
 
     @torch.no_grad()
     async def process_step(self, tokenizer, result_dict):
         if not self.waiting_requests.empty():
-            input_ids, parameters, request_id = await self.add_request(tokenizer)
+            input_ids, parameters, request_id, inputs_embeds = await self.add_request(tokenizer)
             self.streamer[request_id] = TextIteratorStreamer(tokenizer, skip_prompt=True)
 
             def model_generate():
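For internlm-xcomposer2-vl-7b the worker now prepares either token ids (text-only request) or precomputed input embeddings (request with an image); the image path goes through the model's own remote-code helpers `encode_img` and `interleav_wrap_chat`. A compressed, illustrative sketch of that image path (the helper name and scaffolding are assumptions for illustration; only the method calls come from the diff above):

```python
# Illustrative sketch of the image path: image file -> inputs_embeds for generate().
import torch


def build_image_inputs_embeds(model, tokenizer, prompt, image_path, meta_instruction, dtype):
    # Encode the image with the model's vision tower (remote-code method on the HF model).
    image = model.encode_img(image_path)
    # Interleave the image embedding with the text at the <ImageHere> placeholder.
    prompt = "<ImageHere>" + prompt
    inputs, im_mask = model.interleav_wrap_chat(tokenizer, prompt, image, [], meta_instruction)
    # The worker later calls generate(inputs_embeds=...) instead of passing input_ids.
    return inputs["inputs_embeds"].to('xpu').to(dtype)
```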
Model worker (model_generate):

@@ -78,8 +103,18 @@ class ModelWorker:
                                     tokenizer.convert_tokens_to_ids("<|user|>"),
                                     tokenizer.convert_tokens_to_ids("<|observation|>")]
                     generate_kwargs["eos_token_id"] = eos_token_id
+                elif "internlm-xcomposer2-vl-7b" in self.model_name.lower():
+                    eos_token_id = [
+                        tokenizer.eos_token_id,
+                        tokenizer.convert_tokens_to_ids(['[UNUSED_TOKEN_145]'])[0]
+                    ]
+                    generate_kwargs["eos_token_id"] = eos_token_id
+                if input_ids is not None:
                     self.model.generate(input_ids,
                                         streamer=self.streamer[request_id], **generate_kwargs)
+                elif inputs_embeds is not None:
+                    self.model.generate(inputs_embeds=inputs_embeds,
+                                        streamer=self.streamer[request_id], **generate_kwargs)
                 torch.xpu.empty_cache()
                 torch.xpu.synchronize()
 
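Generation is dispatched on whichever of `input_ids`/`inputs_embeds` was prepared, with `[UNUSED_TOKEN_145]` added as an extra stop token for this model, and the output flows through the `TextIteratorStreamer` created in `process_step`. The loop that drains the streamer is not part of this diff; below is a minimal, self-contained sketch of the standard transformers streaming pattern this code plugs into (function and argument names here are illustrative):

```python
# Illustrative: how a TextIteratorStreamer is typically drained while generate() runs.
from threading import Thread

from transformers import TextIteratorStreamer


def stream_generate(model, tokenizer, inputs_embeds, max_new_tokens=128):
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
    kwargs = dict(inputs_embeds=inputs_embeds, streamer=streamer,
                  max_new_tokens=max_new_tokens)
    # generate() blocks, so it runs in a worker thread while the caller consumes the stream.
    Thread(target=model.generate, kwargs=kwargs).start()
    for text_chunk in streamer:
        yield text_chunk
```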
Benchmark wrapper (BenchmarkWrapper):

@@ -574,7 +574,7 @@ class BenchmarkWrapper:
         if input_name == "input_ids" and "inputs_embeds" in model_kwargs:
             if not self.config.is_encoder_decoder:
                 has_inputs_embeds_forwarding = "inputs_embeds" in set(
-                    inspect.signature(self.prepare_inputs_for_generation).parameters.keys()
+                    inspect.signature(self.model.prepare_inputs_for_generation).parameters.keys()
                 )
                 if not has_inputs_embeds_forwarding:
                     raise ValueError(
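This one-line change matters for the new `generate(inputs_embeds=...)` path: `BenchmarkWrapper` wraps the real model, and inspecting the wrapper's own `prepare_inputs_for_generation` presumably does not expose the wrapped model's parameters, so the check could wrongly conclude that `inputs_embeds` cannot be forwarded. A small standalone sketch of that difference (class and method bodies here are illustrative, not the project's code):

```python
# Illustrative: why the signature check must look at the wrapped model.
import inspect


class Wrapped:
    def prepare_inputs_for_generation(self, input_ids=None, inputs_embeds=None, **kwargs):
        return {}


class Wrapper:
    def __init__(self, model):
        self.model = model

    def prepare_inputs_for_generation(self, *args, **kwargs):  # generic pass-through
        return self.model.prepare_inputs_for_generation(*args, **kwargs)


w = Wrapper(Wrapped())
print("inputs_embeds" in inspect.signature(w.prepare_inputs_for_generation).parameters)        # False
print("inputs_embeds" in inspect.signature(w.model.prepare_inputs_for_generation).parameters)  # True
```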