From b3b2cd64b437f2c91659ef4a02290ef15b48b0e8 Mon Sep 17 00:00:00 2001
From: "Wang, Jian4" <61138589+hzjane@users.noreply.github.com>
Date: Thu, 5 Sep 2024 09:25:08 +0800
Subject: [PATCH] Support lightweight-serving glm-4v-9b  (#11994)
* enable glm-4v-9b serving
* update readme
* update for no image input
---
 .../example/GPU/Lightweight-Serving/README.md |  8 ++-
 .../ipex_llm/serving/fastapi/api_server.py    |  5 +-
 .../ipex_llm/serving/fastapi/model_worker.py  | 57 +++++++++++++++++--
 3 files changed, 61 insertions(+), 9 deletions(-)
diff --git a/python/llm/example/GPU/Lightweight-Serving/README.md b/python/llm/example/GPU/Lightweight-Serving/README.md
index c21aa880..1a1f7f5c 100644
--- a/python/llm/example/GPU/Lightweight-Serving/README.md
+++ b/python/llm/example/GPU/Lightweight-Serving/README.md
@@ -40,6 +40,9 @@ pip install fastapi uvicorn openai
 pip install gradio # for gradio web UI
 conda install -c conda-forge -y gperftools=2.10 # to enable tcmalloc
 
+# for glm-4v-9b
+pip install transformers==4.42.4 trl
+
 # for internlm-xcomposer2-vl-7b
 pip install transformers==4.31.0
 pip install accelerate timm==0.4.12 sentencepiece==0.1.99 gradio==3.44.4 markdown2==2.4.10 xlsxwriter==3.1.2 einops
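Note that the two transformers pins above conflict: one environment can satisfy either 4.42.4 (glm-4v-9b) or 4.31.0 (internlm-xcomposer2-vl-7b), not both, so keep a separate conda env per model. A quick sanity check before launching the server (an illustrative snippet, not part of the patch):

```python
# Confirm the env matches the model you intend to serve:
# 4.42.4 for glm-4v-9b, 4.31.0 for internlm-xcomposer2-vl-7b.
import transformers
print(transformers.__version__)
```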
@@ -190,9 +193,8 @@ curl http://localhost:8000/v1/chat/completions \
 
 ##### Image input
 
-image input only supports [internlm-xcomposer2-vl-7b](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b) now, and it must install transformers==4.31.0 to run.
+Image input is only supported for [internlm-xcomposer2-vl-7b](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b) and [glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b) now, and each model requires its own pinned transformers version (see the install commands above), so serve them from separate environments.
 ```bash
-wget -O /llm/lightweight_serving/test.jpg http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg
 curl http://localhost:8000/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
@@ -208,7 +210,7 @@ curl http://localhost:8000/v1/chat/completions \
           {
             "type": "image_url",
             "image_url": {
-              "url": "./test.jpg"
+              "url": "http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg"
             }
           }
         ]
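Since the server exposes an OpenAI-compatible `/v1/chat/completions` route (the README installs `openai` above), the same image request can also be issued from Python. A minimal sketch; the model name and the dummy `api_key` are assumptions about this setup:

```python
from openai import OpenAI

# Point the client at the lightweight-serving endpoint; the api_key is a
# placeholder, assuming the example server does not validate it.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="none")

response = client.chat.completions.create(
    model="glm-4v-9b",  # assumed to match the served checkpoint
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What is in the image?"},
            {"type": "image_url", "image_url": {
                "url": "http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg"}},
        ],
    }],
)
print(response.choices[0].message.content)
```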
diff --git a/python/llm/src/ipex_llm/serving/fastapi/api_server.py b/python/llm/src/ipex_llm/serving/fastapi/api_server.py
index 86fc6bce..387d47e5 100644
--- a/python/llm/src/ipex_llm/serving/fastapi/api_server.py
+++ b/python/llm/src/ipex_llm/serving/fastapi/api_server.py
@@ -317,7 +317,10 @@ def get_prompt(messages) -> str:
                 if role == "system":
                     prompt += f"<>\n{content}\n<>\n\n"
                 elif role == "user":
-                    prompt += f"[INST] {content} [/INST] "
+                    if "glm" in local_model.model_name.lower():
+                        prompt += f"<|user|>\n{content}\n<|assistant|>"
+                    else:
+                        prompt += f"[INST] {content} [/INST] "
                 elif role == "assistant":
                     prompt += f"{content} "
                 else:
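For a single user turn, the two branches above produce prompts in the ChatGLM and Llama-2 conventions respectively. A side-by-side sketch with the surrounding loop and system-message handling stripped:

```python
content = "What is AI?"

# glm-* models: ChatGLM role tags; generation continues after <|assistant|>
glm_prompt = f"<|user|>\n{content}\n<|assistant|>"

# every other model keeps the existing Llama-2 [INST] convention
default_prompt = f"[INST] {content} [/INST] "
```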
diff --git a/python/llm/src/ipex_llm/serving/fastapi/model_worker.py b/python/llm/src/ipex_llm/serving/fastapi/model_worker.py
index 9a7b2b0b..e339bcc1 100644
--- a/python/llm/src/ipex_llm/serving/fastapi/model_worker.py
+++ b/python/llm/src/ipex_llm/serving/fastapi/model_worker.py
@@ -16,8 +16,11 @@
 
 import torch
 from transformers.utils import logging
+import os
 import time
 import asyncio
+from PIL import Image
+import requests
 from transformers import TextIteratorStreamer
 logger = logging.get_logger(__name__)
 
@@ -30,8 +33,12 @@ class ModelWorker:
             self.model = self.load_model(checkpoint, low_bit, "audio")
         else:
             model = self.load_model(checkpoint, low_bit)
-            from ipex_llm.utils import BenchmarkWrapper
-            self.model = BenchmarkWrapper(model, do_print=True)
+            if "glm-4v" not in checkpoint.lower():
+                from ipex_llm.utils import BenchmarkWrapper
+                self.model = BenchmarkWrapper(model, do_print=True)
+            else:
+                # glm-4v-9b does not work with BenchmarkWrapper for now
+                self.model = model
         end = time.perf_counter()
         logger.info(f"Time to load weights: {end - start:.2f}s")
         self.waiting_requests = asyncio.Queue()
@@ -49,12 +56,18 @@ class ModelWorker:
                                                               use_cache=True)
         else:
             from ipex_llm.transformers import AutoModelForCausalLM, AutoModel
+            modules = None
+            if "glm-4" in model_path.lower():
+                modules = ["encoder.layers.35.mlp", "encoder.layers.36.mlp",
+                           "encoder.layers.37.mlp", "encoder.layers.38.mlp",
+                           "encoder.layers.39.mlp"]
             try:
                 model = AutoModelForCausalLM.from_pretrained(model_path,
                                                              load_in_low_bit=low_bit,
                                                              torch_dtype=self.dtype,
                                                              optimize_model=True,
                                                              trust_remote_code=True,
+                                                             modules_to_not_convert=modules,
                                                              use_cache=True,)
             except:
                 model = AutoModel.from_pretrained(model_path,
@@ -62,10 +75,25 @@ class ModelWorker:
                                                   torch_dtype=self.dtype,
                                                   optimize_model=True,
                                                   trust_remote_code=True,
+                                                  modules_to_not_convert=modules,
                                                   use_cache=True,)
         model = model.eval().to("xpu")
         return model
 
+    def get_local_image_path(self, image_path):
+        # Use image_path if it is a local file; otherwise download and cache it
+        if os.path.exists(image_path):
+            return image_path
+        local_dir = './local_images/'
+        local_path = os.path.join(local_dir, os.path.basename(image_path))
+        if not os.path.exists(local_path):
+            response = requests.get(image_path)
+            if response.status_code == 200:
+                os.makedirs(local_dir, exist_ok=True)
+                with open(local_path, 'wb') as file:
+                    file.write(response.content)
+        return local_path
+
     async def add_asr_request(self, processor):
         if self.waiting_requests.empty():
             return
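`modules_to_not_convert` tells ipex-llm to keep the listed submodules in their original precision rather than converting them to the requested low-bit format; here the last five glm-4 MLP blocks stay unconverted, presumably to preserve output quality. To see which names a model exposes for such an exclusion list, a generic PyTorch walk works (an illustrative snippet, not part of the patch; `model` is any loaded transformers model):

```python
# Print the module paths that entries like "encoder.layers.39.mlp"
# are matched against.
for name, _ in model.named_modules():
    if ".mlp" in name:
        print(name)
```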
@@ -94,6 +122,7 @@ class ModelWorker:
         plain_texts = prompt_request.inputs
         input_ids = None
         inputs_embeds = None
+        inputs = None
         if "internlm-xcomposer2-vl-7b" in self.model_name.lower():
             lines = [
                 "You are an AI assistant whose name is InternLM-XComposer (浦语·灵笔).",
@@ -111,16 +140,30 @@ class ModelWorker:
                 im_mask = torch.zeros(inputs['input_ids'].shape[:2]).bool()
                 input_ids = inputs["input_ids"].to('xpu')
             else:
-                image = self.model.encode_img(prompt_request.image_list[0])
+                # only the first image in the request is processed for now
+                local_path = self.get_local_image_path(prompt_request.image_list[0])
+                image = self.model.encode_img(local_path)
                 plain_texts = "" + plain_texts
                 inputs, im_mask = self.model.interleav_wrap_chat(tokenizer, plain_texts,
                                                                  image, [], meta_instruction)
                 inputs_embeds = inputs["inputs_embeds"].to('xpu').to(self.dtype)
+        elif "glm-4v" in self.model_name.lower() and prompt_request.image_list is not None:
+            # only the first image in the request is processed for now
+            local_path = self.get_local_image_path(prompt_request.image_list[0])
+            image = Image.open(local_path)
+
+            inputs = tokenizer.apply_chat_template([{"role": "user", "image": image,
+                                                   "content": plain_texts}],
+                                                   add_generation_prompt=True,
+                                                   tokenize=True,
+                                                   return_tensors="pt",
+                                                   return_dict=True)
+            inputs = inputs.to('xpu')
         else:
             inputs = tokenizer(plain_texts, return_tensors="pt", padding=True)
             input_ids = inputs.input_ids.to('xpu')
         parameters = prompt_request.parameters
-        return input_ids, parameters, request_id, inputs_embeds
+        return input_ids, parameters, request_id, inputs_embeds, inputs
 
     @torch.no_grad()
     async def process_step(self, tokenizer, result_dict, processor=None):
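The glm-4v branch relies on the chat template shipped inside the THUDM/glm-4v-9b checkpoint, which packs the image into the tokenized dict alongside the text. A standalone sketch of the same call (the checkpoint id and image file are placeholders):

```python
from transformers import AutoTokenizer
from PIL import Image

# trust_remote_code is required: the template ships with the checkpoint
tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4v-9b", trust_remote_code=True)
image = Image.open("test.jpg")  # placeholder image file

inputs = tokenizer.apply_chat_template(
    [{"role": "user", "image": image, "content": "What is in the image?"}],
    add_generation_prompt=True, tokenize=True,
    return_tensors="pt", return_dict=True)
print(list(inputs.keys()))  # expect input_ids plus the image tensor(s)
```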
@@ -134,7 +177,8 @@ class ModelWorker:
                                         streamer=self.streamer[request_id],
                                         forced_decoder_ids=decoder_ids)
             else:
-                input_ids, parameters, request_id, inputs_embeds = await self.add_request(tokenizer)
+                input_ids, parameters, request_id, inputs_embeds, inputs = \
+                    await self.add_request(tokenizer)
                 self.streamer[request_id] = TextIteratorStreamer(tokenizer, skip_prompt=True)
 
                 def model_generate():
@@ -156,6 +200,9 @@ class ModelWorker:
                     elif inputs_embeds is not None:
                         self.model.generate(inputs_embeds=inputs_embeds,
                                             streamer=self.streamer[request_id], **generate_kwargs)
+                    else:
+                        self.model.generate(**inputs,
+                                            streamer=self.streamer[request_id], **generate_kwargs)
             torch.xpu.empty_cache()
             torch.xpu.synchronize()
             from threading import Thread
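The trailing `from threading import Thread` belongs to the pattern this patch extends: `generate()` runs in a background thread while `TextIteratorStreamer` feeds tokens to the response loop. A generic, self-contained sketch of that pattern with plain transformers (gpt2 is only a small stand-in model):

```python
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tokenizer("The quick brown fox", return_tensors="pt")

# generate() pushes decoded text into the streamer from a worker thread,
# while the main thread consumes tokens as they arrive.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
thread = Thread(target=model.generate,
                kwargs=dict(**inputs, streamer=streamer, max_new_tokens=32))
thread.start()
for text in streamer:
    print(text, end="", flush=True)
thread.join()
```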