LLM: reduce GPU 1st token latency and update example (#8763)
* reduce 1st token latency
* update example
* fix
* fix style
* update readme of gpu benchmark
parent 06609d9260
commit e9aa2bd890

4 changed files with 15 additions and 6 deletions
```diff
@@ -54,6 +54,13 @@ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 prompt = "今天睡不着怎么办"
 with torch.inference_mode():
-    input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
-    output = model.generate(input_ids, do_sample=False, max_new_tokens=32)
-    output_str = tokenizer.decode(output[0], skip_special_tokens=True)
+    # warm up two times since ipex is used
+    for i in range(2):
+        input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
+        output = model.generate(input_ids, do_sample=False, max_new_tokens=32)
+        output_str = tokenizer.decode(output[0], skip_special_tokens=True)
+    # collect performance data now
+    for i in range(5):
+        input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
+        output = model.generate(input_ids, do_sample=False, max_new_tokens=32)
+        output_str = tokenizer.decode(output[0], skip_special_tokens=True)
 
```
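The README change above (the prompt roughly means "What should I do if I can't sleep tonight?") introduces a warmup-then-measure pattern: two throwaway generations so ipex finishes JIT compilation and cache setup, then five measured runs. As a rough illustration of how that loop could record first-token-relevant latency, here is a hedged sketch; the `timed_generate` helper, its `n_warmup`/`n_runs` parameters, and the `torch.xpu.synchronize()` calls are illustrative assumptions, not code from this commit.

```python
import time

import torch
# intel_extension_for_pytorch is assumed to be imported elsewhere so that
# the 'xpu' device and torch.xpu.synchronize() are available.


def timed_generate(model, tokenizer, prompt, n_warmup=2, n_runs=5):
    """Hypothetical helper mirroring the README's warmup/measure loops."""
    with torch.inference_mode():
        # Warm up so ipex JIT compilation and kernel caches do not
        # pollute the measurements (the README warms up two times).
        for _ in range(n_warmup):
            input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
            model.generate(input_ids, do_sample=False, max_new_tokens=32)
        # Collect performance data now.
        latencies = []
        for _ in range(n_runs):
            input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
            torch.xpu.synchronize()
            start = time.perf_counter()
            output = model.generate(input_ids, do_sample=False, max_new_tokens=32)
            torch.xpu.synchronize()
            latencies.append(time.perf_counter() - start)
    return tokenizer.decode(output[0], skip_special_tokens=True), latencies
```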
```diff
@@ -46,7 +46,7 @@ if __name__ == '__main__':
                                                  load_in_4bit=True,
                                                  optimize_model=False,
                                                  trust_remote_code=True)
-    model = model.to('xpu')
+    model = model.half().to('xpu')
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path,
```
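The one-line change in both example scripts is to cast the 4-bit model to fp16 before moving it to the XPU, so the first (prompt) pass no longer runs in fp32. A minimal sketch of the resulting load flow, assuming the bigdl-llm `AutoModelForCausalLM` API shown in the diff; `model_path` is a placeholder:

```python
from transformers import AutoTokenizer
from bigdl.llm.transformers import AutoModelForCausalLM

model_path = "path/to/your/model"  # placeholder

model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             optimize_model=False,
                                             trust_remote_code=True)
# The commit's change: half() first, then move to the XPU, so activations
# flow through the model in fp16 rather than fp32.
model = model.half().to('xpu')

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
```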
```diff
@@ -49,7 +49,7 @@ if __name__ == '__main__':
                                                  load_in_4bit=True,
                                                  optimize_model=False,
                                                  trust_remote_code=True)
-    model = model.to('xpu')
+    model = model.half().to('xpu')
 
     # Load tokenizer
     tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
```
```diff
@@ -169,7 +169,6 @@ class ParamsQuant(torch.nn.Parameter):
                                  quantized=self.quantized,
                                  _shape=self._shape,
                                  qtype=self.qtype)
 
         return new_param
-
 
```
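For readers unfamiliar with `ParamsQuant`, the hunk above shows the tail of a method that rebuilds the parameter so its quantization metadata (`quantized`, `_shape`, `qtype`) survives a tensor operation. A minimal sketch of that general pattern, with a hypothetical `QuantizedParam` class standing in for the real implementation:

```python
import torch


class QuantizedParam(torch.nn.Parameter):
    """Hypothetical sketch of a Parameter subclass carrying quantization
    metadata, in the spirit of ParamsQuant (not the BigDL implementation)."""

    def __new__(cls, data=None, requires_grad=False,
                quantized=False, _shape=None, qtype=None):
        if data is None:
            data = torch.empty(0)
        self = torch.Tensor._make_subclass(cls, data, requires_grad)
        self.quantized = quantized
        self._shape = _shape
        self.qtype = qtype
        return self

    def to(self, *args, **kwargs):
        # A plain Tensor.to() would return a bare tensor and drop the
        # metadata, so re-wrap the moved data the way the hunk above
        # rebuilds new_param with quantized/_shape/qtype.
        new_data = self.data.to(*args, **kwargs)
        return QuantizedParam(new_data,
                              requires_grad=self.requires_grad,
                              quantized=self.quantized,
                              _shape=self._shape,
                              qtype=self.qtype)
```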
```diff
@@ -244,6 +243,9 @@ class LinearQuant(nn.Linear):
 
         if x_2d.is_contiguous() is False:
             x_2d = x_2d.contiguous()
+        # current workaround to reduce first token latency of fp32 input
+        if x_2d.shape[0] > 1 and x_2d.dtype == torch.float32:
+            x_2d = x_2d.half()
         # input format of linear_q4.forward is 1: input, 2: weight
         result = linear_q4_0.forward(x_2d, x0)
         new_shape = x_shape[:-1] + (self.out_len,)
```
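The core latency fix sits in `LinearQuant.forward`: during prompt processing the flattened activation has more than one row, and if it arrives as fp32 it is cast to fp16 before the quantized matmul. A hedged sketch of that normalization step as a standalone helper (the function name is hypothetical; only the two `if` checks come from the commit):

```python
import torch


def prepare_q4_input(x_2d: torch.Tensor) -> torch.Tensor:
    """Hypothetical helper isolating the workaround from LinearQuant.forward."""
    # The low-level quantized kernel expects a contiguous buffer.
    if not x_2d.is_contiguous():
        x_2d = x_2d.contiguous()
    # shape[0] > 1 means the prompt (first token) pass, where the whole
    # sequence goes through at once; casting fp32 -> fp16 here is the
    # commit's workaround for high first-token latency.
    if x_2d.shape[0] > 1 and x_2d.dtype == torch.float32:
        x_2d = x_2d.half()
    return x_2d
```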