Support lightweight-serving with internlm-xcomposer2-vl-7b multimodal input (#11703)

* init image_list
* enable internlm-xcomposer2 image input
* update style
* add readme
* update model
* update readme
parent aa98ef96fe
commit 493cbd9a36

4 changed files with 111 additions and 22 deletions
README (installation):

@@ -18,6 +18,10 @@ pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-exte
 pip install fastapi uvicorn openai
 pip install gradio # for gradio web UI
 conda install -c conda-forge -y gperftools=2.10 # to enable tcmalloc
+
+# for internlm-xcomposer2-vl-7b
+pip install transformers==4.31.0
+pip install accelerate timm==0.4.12 sentencepiece==0.1.99 gradio==3.44.4 markdown2==2.4.10 xlsxwriter==3.1.2 einops
 ```
 
 #### 1.2 Installation on Windows
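The transformers pin above is easy to lose when other packages later upgrade it. A purely illustrative sanity check (not part of this commit) before launching the server:

```python
# Illustrative check: the README above documents that internlm-xcomposer2-vl-7b
# needs transformers==4.31.0.
import transformers

assert transformers.__version__ == "4.31.0", (
    f"expected transformers==4.31.0 for internlm-xcomposer2-vl-7b, "
    f"got {transformers.__version__}"
)
```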
README (image input example for /v1/chat/completions):

@@ -172,10 +176,39 @@ curl http://localhost:8000/v1/chat/completions \
   }'
 ```
+
+##### Image input
+
+Image input is currently only supported for [internlm-xcomposer2-vl-7b](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b), and `transformers==4.31.0` must be installed to run it.
+```bash
+wget -O ./test.jpg http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "internlm-xcomposer2-vl-7b",
+    "messages": [
+      {
+        "role": "user",
+        "content": [
+          {
+            "type": "text",
+            "text": "What'\''s in this image?"
+          },
+          {
+            "type": "image_url",
+            "image_url": {
+              "url": "./test.jpg"
+            }
+          }
+        ]
+      }
+    ],
+    "max_tokens": 128
+  }'
+```
+
 #### /v1/completions
 
 ```bash
 curl http://localhost:8000/v1/completions \
   -H "Content-Type: application/json" \
   -d '{
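The same request can also be sent with the `openai` Python client installed earlier. A minimal sketch, assuming the server is running on localhost:8000 and returns OpenAI-compatible responses; the `base_url` and dummy `api_key` are assumptions for illustration, not part of the commit:

```python
# Illustrative: multimodal chat completion via the openai client (>= 1.0).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="none")  # dummy key

response = client.chat.completions.create(
    model="internlm-xcomposer2-vl-7b",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What's in this image?"},
            # The server resolves this path locally, matching the wget step above.
            {"type": "image_url", "image_url": {"url": "./test.jpg"}},
        ],
    }],
    max_tokens=128,
)
print(response.choices[0].message.content)
```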
Lightweight-serving API script (request models):

@@ -47,12 +47,17 @@ logger = logging.get_logger(__name__)
 class InputsRequest(BaseModel):
     inputs: str
     parameters: Optional[Parameters] = None
+    image_list: Optional[list] = None
     stream: Optional[bool] = False
     req_type: str = 'completion'
 
 
 class ChatCompletionRequest(BaseModel):
-    messages: List[ChatMessage]
+    messages: Union[
+        str,
+        List[Dict[str, str]],
+        List[Dict[str, Union[str, List[Dict[str, Union[str, Dict[str, str]]]]]]],
+    ]
     model: str
     max_tokens: Optional[int] = None
     min_tokens: Optional[int] = None
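The widened `messages` field accepts three shapes: a bare prompt string, plain role/content dicts, and the OpenAI-style form where `content` is itself a list of text and image_url parts. A self-contained sketch of the same annotation (a standalone pydantic model for illustration, not the project's class):

```python
# Standalone sketch mirroring the new `messages` annotation.
from typing import Dict, List, Union
from pydantic import BaseModel


class ChatRequestSketch(BaseModel):
    messages: Union[
        str,                                # bare prompt
        List[Dict[str, str]],               # [{"role": ..., "content": "..."}]
        List[Dict[str, Union[str, List[Dict[str, Union[str, Dict[str, str]]]]]]],
    ]


# Multimodal form: content is a list of typed parts, as in the README example.
ChatRequestSketch(messages=[{
    "role": "user",
    "content": [
        {"type": "text", "text": "What's in this image?"},
        {"type": "image_url", "image_url": {"url": "./test.jpg"}},
    ],
}])
```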
Lightweight-serving API script (get_prompt):

@@ -266,7 +271,7 @@ async def generate_stream(inputs_request: InputsRequest):
 
 def get_prompt(messages) -> str:
     if "codegeex" in local_model.model_name.lower():
-        query = messages[-1].content
+        query = messages[-1]["content"]
         if len(messages) <= 1:
             history = []
         else:
@@ -277,9 +282,24 @@ def get_prompt(messages) -> str:
         return inputs
     else:
         prompt = ""
+        image_list = []
         for msg in messages:
-            role = msg.role
-            content = msg.content
+            role = msg["role"]
+            content = msg["content"]
+            if type(content) == list:
+                image_list1 = [
+                    item["image_url"]["url"]
+                    for item in content
+                    if item["type"] == "image_url"
+                ]
+                image_list.extend(image_list1)
+                text_list = [
+                    item["text"]
+                    for item in content
+                    if item["type"] == "text"
+                ]
+                prompt = "".join(text_list)
+            else:
                 if role == "system":
                     prompt += f"<<SYS>>\n{content}\n<</SYS>>\n\n"
                 elif role == "user":
@@ -288,7 +308,7 @@ def get_prompt(messages) -> str:
                     prompt += f"{content} "
                 else:
                     invalidInputError(False, f"Unknown role: {role}")
-        return prompt.strip()
+        return prompt.strip(), image_list
 
 
 def set_parameters(req):
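With these hunks `get_prompt` now returns a `(prompt, image_list)` pair: multimodal content lists are split into the joined text parts plus the collected image URLs, while plain string content still goes through the role-tagged prompt path. A reduced, standalone sketch of the new flattening behaviour (it omits the role templates and codegeex handling of the real function):

```python
# Reduced sketch of the flattening get_prompt now performs for multimodal content.
def flatten_multimodal(messages):
    prompt, image_list = "", []
    for msg in messages:
        content = msg["content"]
        if isinstance(content, list):
            # Collect every image_url part and join the text parts into the prompt.
            image_list.extend(item["image_url"]["url"]
                              for item in content if item["type"] == "image_url")
            prompt = "".join(item["text"]
                             for item in content if item["type"] == "text")
        else:
            prompt += f"{content} "  # simplified; the real code applies role-specific templates
    return prompt.strip(), image_list


assert flatten_multimodal([{
    "role": "user",
    "content": [{"type": "text", "text": "What's in this image?"},
                {"type": "image_url", "image_url": {"url": "./test.jpg"}}],
}]) == ("What's in this image?", ["./test.jpg"])
```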
Lightweight-serving API script (/v1/chat/completions handler):

@@ -313,11 +333,12 @@ def set_parameters(req):
 
 @app.post("/v1/chat/completions")
 async def create_chat_completion(request: ChatCompletionRequest):
-    print(request)
     model_name = local_model.model_name
+    prompt, image_list = get_prompt(request.messages)
     inputs_request = InputsRequest(
-        inputs=get_prompt(request.messages),
+        inputs=prompt,
         parameters=set_parameters(request),
+        image_list=image_list if len(image_list) >= 1 else None,
         stream=request.stream,
         req_type="chat"
     )
Model worker (ModelWorker.add_request / process_step):

@@ -60,15 +60,40 @@ class ModelWorker:
         tmp_result = await self.waiting_requests.get()
         request_id, prompt_request = tmp_result
         plain_texts = prompt_request.inputs
+        input_ids = None
+        inputs_embeds = None
+        if "internlm-xcomposer2-vl-7b" in self.model_name.lower():
+            lines = [
+                "You are an AI assistant whose name is InternLM-XComposer (浦语·灵笔).",
+                "- InternLM-XComposer (浦语·灵笔) is a multi-modality conversational language "
+                "model that is developed by Shanghai AI Laboratory (上海人工智能实验室). "
+                "It is designed to be helpful, honest, and harmless.",
+                "- InternLM-XComposer (浦语·灵笔) can understand and communicate fluently in "
+                "the language chosen by the user such as English and 中文.",
+                "- InternLM-XComposer (浦语·灵笔) is capable of comprehending and "
+                "articulating responses effectively based on the provided image."
+            ]
+            meta_instruction = "\n".join(lines)
+            if prompt_request.image_list is None:
+                inputs = self.model.build_inputs(tokenizer, plain_texts, [], meta_instruction)
+                im_mask = torch.zeros(inputs['input_ids'].shape[:2]).bool()
+                input_ids = inputs["input_ids"].to('xpu')
+            else:
+                image = self.model.encode_img(prompt_request.image_list[0])
+                plain_texts = "<ImageHere>" + plain_texts
+                inputs, im_mask = self.model.interleav_wrap_chat(tokenizer, plain_texts,
+                                                                 image, [], meta_instruction)
+                inputs_embeds = inputs["inputs_embeds"].to('xpu').to(self.dtype)
+        else:
             inputs = tokenizer(plain_texts, return_tensors="pt", padding=True)
             input_ids = inputs.input_ids.to('xpu')
         parameters = prompt_request.parameters
-        return input_ids, parameters, request_id
+        return input_ids, parameters, request_id, inputs_embeds
 
     @torch.no_grad()
     async def process_step(self, tokenizer, result_dict):
         if not self.waiting_requests.empty():
-            input_ids, parameters, request_id = await self.add_request(tokenizer)
+            input_ids, parameters, request_id, inputs_embeds = await self.add_request(tokenizer)
             self.streamer[request_id] = TextIteratorStreamer(tokenizer, skip_prompt=True)
 
             def model_generate():
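For internlm-xcomposer2-vl-7b the worker now prepares either token ids (text-only request) or precomputed input embeddings (request with an image); the image path goes through the model's own remote-code helpers `encode_img` and `interleav_wrap_chat`. A compressed, illustrative sketch of that image path (the helper name and scaffolding are assumptions for illustration; only the method calls come from the diff above):

```python
# Illustrative sketch of the image path: image file -> inputs_embeds for generate().
import torch


def build_image_inputs_embeds(model, tokenizer, prompt, image_path, meta_instruction, dtype):
    # Encode the image with the model's vision tower (remote-code method on the HF model).
    image = model.encode_img(image_path)
    # Interleave the image embedding with the text at the <ImageHere> placeholder.
    prompt = "<ImageHere>" + prompt
    inputs, im_mask = model.interleav_wrap_chat(tokenizer, prompt, image, [], meta_instruction)
    # The worker later calls generate(inputs_embeds=...) instead of passing input_ids.
    return inputs["inputs_embeds"].to('xpu').to(dtype)
```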
Model worker (model_generate):

@@ -78,8 +103,18 @@ class ModelWorker:
                                     tokenizer.convert_tokens_to_ids("<|user|>"),
                                     tokenizer.convert_tokens_to_ids("<|observation|>")]
                     generate_kwargs["eos_token_id"] = eos_token_id
+                elif "internlm-xcomposer2-vl-7b" in self.model_name.lower():
+                    eos_token_id = [
+                        tokenizer.eos_token_id,
+                        tokenizer.convert_tokens_to_ids(['[UNUSED_TOKEN_145]'])[0]
+                    ]
+                    generate_kwargs["eos_token_id"] = eos_token_id
+                if input_ids is not None:
                     self.model.generate(input_ids,
                                         streamer=self.streamer[request_id], **generate_kwargs)
+                elif inputs_embeds is not None:
+                    self.model.generate(inputs_embeds=inputs_embeds,
+                                        streamer=self.streamer[request_id], **generate_kwargs)
                 torch.xpu.empty_cache()
                 torch.xpu.synchronize()
 
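Generation is dispatched on whichever of `input_ids`/`inputs_embeds` was prepared, with `[UNUSED_TOKEN_145]` added as an extra stop token for this model, and the output flows through the `TextIteratorStreamer` created in `process_step`. The loop that drains the streamer is not part of this diff; below is a minimal, self-contained sketch of the standard transformers streaming pattern this code plugs into (function and argument names here are illustrative):

```python
# Illustrative: how a TextIteratorStreamer is typically drained while generate() runs.
from threading import Thread

from transformers import TextIteratorStreamer


def stream_generate(model, tokenizer, inputs_embeds, max_new_tokens=128):
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
    kwargs = dict(inputs_embeds=inputs_embeds, streamer=streamer,
                  max_new_tokens=max_new_tokens)
    # generate() blocks, so it runs in a worker thread while the caller consumes the stream.
    Thread(target=model.generate, kwargs=kwargs).start()
    for text_chunk in streamer:
        yield text_chunk
```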
Benchmark wrapper (BenchmarkWrapper):

@@ -574,7 +574,7 @@ class BenchmarkWrapper:
         if input_name == "input_ids" and "inputs_embeds" in model_kwargs:
             if not self.config.is_encoder_decoder:
                 has_inputs_embeds_forwarding = "inputs_embeds" in set(
-                    inspect.signature(self.prepare_inputs_for_generation).parameters.keys()
+                    inspect.signature(self.model.prepare_inputs_for_generation).parameters.keys()
                 )
                 if not has_inputs_embeds_forwarding:
                     raise ValueError(
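This one-line change matters for the new `generate(inputs_embeds=...)` path: `BenchmarkWrapper` wraps the real model, and inspecting the wrapper's own `prepare_inputs_for_generation` presumably does not expose the wrapped model's parameters, so the check could wrongly conclude that `inputs_embeds` cannot be forwarded. A small standalone sketch of that difference (class and method bodies here are illustrative, not the project's code):

```python
# Illustrative: why the signature check must look at the wrapped model.
import inspect


class Wrapped:
    def prepare_inputs_for_generation(self, input_ids=None, inputs_embeds=None, **kwargs):
        return {}


class Wrapper:
    def __init__(self, model):
        self.model = model

    def prepare_inputs_for_generation(self, *args, **kwargs):  # generic pass-through
        return self.model.prepare_inputs_for_generation(*args, **kwargs)


w = Wrapper(Wrapped())
print("inputs_embeds" in inspect.signature(w.prepare_inputs_for_generation).parameters)        # False
print("inputs_embeds" in inspect.signature(w.model.prepare_inputs_for_generation).parameters)  # True
```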