From 0186f3ab2fe572f1676b5b3c5f56766be09d2865 Mon Sep 17 00:00:00 2001 From: Ruonan Wang <105281011+rnwang04@users.noreply.github.com> Date: Fri, 25 Aug 2023 15:26:10 +0800 Subject: [PATCH] llm: update all ARC int4 examples (#8809) * update GPU examples * update other examples * fix * update based on comment --- .../GPU/baichuan/generate.py | 13 ++-- .../transformers_int4/GPU/chatglm2/README.md | 41 +++++++++++ .../GPU/chatglm2/generate.py | 7 +- .../GPU/chatglm2/streamchat.py | 72 +++++++++++++++++++ .../transformers_int4/GPU/falcon/generate.py | 10 ++- .../GPU/internlm/generate.py | 9 ++- .../transformers_int4/GPU/llama2/generate.py | 7 +- .../transformers_int4/GPU/mpt/generate.py | 9 ++- .../transformers_int4/GPU/qwen/generate.py | 13 ++-- .../GPU/whisper/recognize.py | 9 +-- 10 files changed, 166 insertions(+), 24 deletions(-) create mode 100644 python/llm/example/transformers/transformers_int4/GPU/chatglm2/streamchat.py diff --git a/python/llm/example/transformers/transformers_int4/GPU/baichuan/generate.py b/python/llm/example/transformers/transformers_int4/GPU/baichuan/generate.py index 8d1ed267..cc7bee1d 100644 --- a/python/llm/example/transformers/transformers_int4/GPU/baichuan/generate.py +++ b/python/llm/example/transformers/transformers_int4/GPU/baichuan/generate.py @@ -15,12 +15,12 @@ # import torch +import intel_extension_for_pytorch as ipex import time import argparse from bigdl.llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer -import intel_extension_for_pytorch as ipex # you could tune the prompt based on your own model, BAICHUAN_PROMPT_FORMAT = "{prompt} " @@ -44,7 +44,7 @@ if __name__ == '__main__': load_in_4bit=True, optimize_model=False, trust_remote_code=True) - model = model.half().to('xpu') + model = model.to('xpu') # Load tokenizer tokenizer = AutoTokenizer.from_pretrained(model_path, @@ -54,16 +54,17 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = BAICHUAN_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') + # ipex model needs a warmup, then inference time can be accurate + output = model.generate(input_ids, + max_new_tokens=args.n_predict) + + # start inference st = time.time() # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function # to obtain optimal performance with BigDL-LLM INT4 optimizations - # if your selected model has `"do_sample": true` in its generation config, - # it is important to set `do_sample=False` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations output = model.generate(input_ids, - do_sample=False, max_new_tokens=args.n_predict) torch.xpu.synchronize() end = time.time() diff --git a/python/llm/example/transformers/transformers_int4/GPU/chatglm2/README.md b/python/llm/example/transformers/transformers_int4/GPU/chatglm2/README.md index 7f11a344..55bf6b17 100644 --- a/python/llm/example/transformers/transformers_int4/GPU/chatglm2/README.md +++ b/python/llm/example/transformers/transformers_int4/GPU/chatglm2/README.md @@ -65,3 +65,44 @@ Inference time: xxxx s 答: Artificial Intelligence (AI) refers to the ability of a computer or machine to perform tasks that typically require human-like intelligence, such as understanding language, recognizing patterns ``` + +## Example 2: Stream Chat using 
`stream_chat()` API
+In the example [streamchat.py](./streamchat.py), we show a basic use case for a ChatGLM2 model to stream chat, with BigDL-LLM INT4 optimizations.
+### 1. Install
+We suggest using conda to manage the environment:
+```bash
+conda create -n llm python=3.9
+conda activate llm
+# the below command will install intel_extension_for_pytorch==2.0.110+xpu by default
+# you can install a specific ipex/torch version for your needs
+pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu
+```
+
+### 2. Configure OneAPI environment variables
+```bash
+source /opt/intel/oneapi/setvars.sh
+```
+
+### 3. Run
+
+For optimal performance on Arc, it is recommended to set several environment variables:
+
+```bash
+export USE_XETLA=OFF
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+```
+
+**Stream Chat using `stream_chat()` API**:
+```
+python ./streamchat.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH --question QUESTION
+```
+
+**Chat using `chat()` API**:
+```
+python ./streamchat.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH --question QUESTION --disable-stream
+```
+
+Arguments info:
+- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the ChatGLM2 model to be downloaded, or the path to the huggingface checkpoint folder. It defaults to `'THUDM/chatglm2-6b'`.
+- `--question QUESTION`: argument defining the question to ask. It defaults to `"晚上睡不着应该怎么办"` ("What should I do if I can't sleep at night?").
+- `--disable-stream`: argument defining whether to disable stream chat. If `--disable-stream` is included when running the script, stream chat is disabled and the `chat()` API is used instead.
diff --git a/python/llm/example/transformers/transformers_int4/GPU/chatglm2/generate.py b/python/llm/example/transformers/transformers_int4/GPU/chatglm2/generate.py
index 20c27a25..b52057a8 100644
--- a/python/llm/example/transformers/transformers_int4/GPU/chatglm2/generate.py
+++ b/python/llm/example/transformers/transformers_int4/GPU/chatglm2/generate.py
@@ -15,13 +15,13 @@
 #
 
 import torch
+import intel_extension_for_pytorch as ipex
 import time
 import argparse
 import numpy as np
 
 from bigdl.llm.transformers import AutoModel
 from transformers import AutoTokenizer
-import intel_extension_for_pytorch as ipex
 
 # you could tune the prompt based on your own model,
 # here the prompt tuning refers to https://huggingface.co/THUDM/chatglm2-6b/blob/main/modeling_chatglm.py#L1007
@@ -56,6 +56,11 @@ if __name__ == '__main__':
     with torch.inference_mode():
         prompt = CHATGLM_V2_PROMPT_FORMAT.format(prompt=args.prompt)
         input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
+        # ipex model needs a warmup, then inference time can be accurate
+        output = model.generate(input_ids,
+                                max_new_tokens=args.n_predict)
+
+        # start inference
         st = time.time()
         # if your selected model is capable of utilizing previous key/value attentions
         # to enhance decoding speed, but has `"use_cache": false` in its model config,
diff --git a/python/llm/example/transformers/transformers_int4/GPU/chatglm2/streamchat.py b/python/llm/example/transformers/transformers_int4/GPU/chatglm2/streamchat.py
new file mode 100644
index 00000000..eea3655c
--- /dev/null
+++ b/python/llm/example/transformers/transformers_int4/GPU/chatglm2/streamchat.py
@@ -0,0 +1,72 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import torch
+import intel_extension_for_pytorch as ipex
+import time
+import argparse
+import numpy as np
+
+from bigdl.llm.transformers import AutoModel
+from transformers import AutoTokenizer
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Stream Chat for ChatGLM2 model')
+    parser.add_argument('--repo-id-or-model-path', type=str, default="THUDM/chatglm2-6b",
+                        help='The huggingface repo id for the ChatGLM2 model to be downloaded'
+                             ', or the path to the huggingface checkpoint folder')
+    parser.add_argument('--question', type=str, default="晚上睡不着应该怎么办",
+                        help='Question you want to ask')
+    parser.add_argument('--disable-stream', action="store_true",
+                        help='Disable stream chat')
+
+    args = parser.parse_args()
+    model_path = args.repo_id_or_model_path
+    disable_stream = args.disable_stream
+
+    # Load model in 4 bit,
+    # which converts the relevant layers in the model into INT4 format
+    model = AutoModel.from_pretrained(model_path,
+                                      load_in_4bit=True,
+                                      trust_remote_code=True,
+                                      optimize_model=False)
+    model.to('xpu')
+
+    # Load tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(model_path,
+                                              trust_remote_code=True)
+
+    with torch.inference_mode():
+        prompt = args.question
+        input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
+        # ipex model needs a warmup, then inference time can be accurate
+        output = model.generate(input_ids,
+                                max_new_tokens=32)
+
+        # start inference
+        if disable_stream:
+            # Chat
+            response, history = model.chat(tokenizer, args.question, history=[])
+            print('-'*20, 'Chat Output', '-'*20)
+            print(response)
+        else:
+            # Stream chat
+            response_ = ""
+            print('-'*20, 'Stream Chat Output', '-'*20)
+            for response, history in model.stream_chat(tokenizer, args.question, history=[]):
+                print(response.replace(response_, ""), end="")
+                response_ = response
diff --git a/python/llm/example/transformers/transformers_int4/GPU/falcon/generate.py b/python/llm/example/transformers/transformers_int4/GPU/falcon/generate.py
index 8ffe846a..f84b3869 100644
--- a/python/llm/example/transformers/transformers_int4/GPU/falcon/generate.py
+++ b/python/llm/example/transformers/transformers_int4/GPU/falcon/generate.py
@@ -15,12 +15,12 @@
 #
 
 import torch
+import intel_extension_for_pytorch as ipex
 import time
 import argparse
 from bigdl.llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer
-import intel_extension_for_pytorch as ipex
 
 # you could tune the prompt based on your own model,
 FALCON_PROMPT_FORMAT = " {prompt} "
@@ -46,7 +46,7 @@ if __name__ == '__main__':
                                                  load_in_4bit=True,
                                                  optimize_model=False,
                                                  trust_remote_code=True)
-    model = model.half().to('xpu')
+    model = model.to('xpu')
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path,
@@ -56,6 +56,12 @@ if __name__ == '__main__':
     with torch.inference_mode():
         prompt = FALCON_PROMPT_FORMAT.format(prompt=args.prompt)
         input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
+
+        # ipex model needs a warmup, then inference time can be accurate
+        output = model.generate(input_ids,
+                                max_new_tokens=args.n_predict)
+
+        # start 
inference st = time.time() # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, diff --git a/python/llm/example/transformers/transformers_int4/GPU/internlm/generate.py b/python/llm/example/transformers/transformers_int4/GPU/internlm/generate.py index e20e66c2..92b610be 100644 --- a/python/llm/example/transformers/transformers_int4/GPU/internlm/generate.py +++ b/python/llm/example/transformers/transformers_int4/GPU/internlm/generate.py @@ -15,12 +15,12 @@ # import torch +import intel_extension_for_pytorch as ipex import time import argparse from bigdl.llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer -import intel_extension_for_pytorch as ipex # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/internlm/internlm-chat-7b-8k/blob/main/modeling_internlm.py#L768 @@ -45,7 +45,7 @@ if __name__ == '__main__': load_in_4bit=True, optimize_model=False, trust_remote_code=True) - model = model.half().to('xpu') + model = model.to('xpu') # Load tokenizer tokenizer = AutoTokenizer.from_pretrained(model_path, @@ -55,6 +55,11 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = INTERNLM_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') + # ipex model needs a warmup, then inference time can be accurate + output = model.generate(input_ids, + max_new_tokens=args.n_predict) + + # start inference st = time.time() # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, diff --git a/python/llm/example/transformers/transformers_int4/GPU/llama2/generate.py b/python/llm/example/transformers/transformers_int4/GPU/llama2/generate.py index 0a97a67a..ac66c963 100644 --- a/python/llm/example/transformers/transformers_int4/GPU/llama2/generate.py +++ b/python/llm/example/transformers/transformers_int4/GPU/llama2/generate.py @@ -15,12 +15,12 @@ # import torch +import intel_extension_for_pytorch as ipex import time import argparse from bigdl.llm.transformers import AutoModelForCausalLM from transformers import LlamaTokenizer -import intel_extension_for_pytorch as ipex # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/georgesung/llama2_7b_chat_uncensored#prompt-style @@ -58,6 +58,11 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = LLAMA2_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') + # ipex model needs a warmup, then inference time can be accurate + output = model.generate(input_ids, + max_new_tokens=args.n_predict) + + # start inference st = time.time() # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, diff --git a/python/llm/example/transformers/transformers_int4/GPU/mpt/generate.py b/python/llm/example/transformers/transformers_int4/GPU/mpt/generate.py index c5412073..a64148e5 100644 --- a/python/llm/example/transformers/transformers_int4/GPU/mpt/generate.py +++ b/python/llm/example/transformers/transformers_int4/GPU/mpt/generate.py @@ -15,12 +15,12 @@ # import torch +import intel_extension_for_pytorch as ipex import time import argparse from bigdl.llm.transformers import AutoModelForCausalLM from transformers 
import AutoTokenizer, GenerationConfig -import intel_extension_for_pytorch as ipex # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/spaces/mosaicml/mpt-30b-chat/blob/main/app.py @@ -46,7 +46,7 @@ if __name__ == '__main__': load_in_4bit=True, optimize_model=False, trust_remote_code=True) - model = model.half().to('xpu') + model = model.to('xpu') # Load tokenizer tokenizer = AutoTokenizer.from_pretrained(model_path, @@ -56,6 +56,11 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = MPT_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') + # ipex model needs a warmup, then inference time can be accurate + output = model.generate(input_ids, + max_new_tokens=args.n_predict) + + # start inference # enabling `use_cache=True` allows the model to utilize the previous # key/values attentions to speed up decoding; # to obtain optimal performance with BigDL-LLM INT4 optimizations, diff --git a/python/llm/example/transformers/transformers_int4/GPU/qwen/generate.py b/python/llm/example/transformers/transformers_int4/GPU/qwen/generate.py index 2244ac0d..7418ddf0 100644 --- a/python/llm/example/transformers/transformers_int4/GPU/qwen/generate.py +++ b/python/llm/example/transformers/transformers_int4/GPU/qwen/generate.py @@ -15,12 +15,12 @@ # import torch +import intel_extension_for_pytorch as ipex import time import argparse from bigdl.llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer -import intel_extension_for_pytorch as ipex # you could tune the prompt based on your own model QWEN_PROMPT_FORMAT = "{prompt} " @@ -44,7 +44,7 @@ if __name__ == '__main__': load_in_4bit=True, optimize_model=False, trust_remote_code=True) - model = model.half().to('xpu') + model = model.to('xpu') # Load tokenizer tokenizer = AutoTokenizer.from_pretrained(model_path, @@ -54,16 +54,17 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = QWEN_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') + # ipex model needs a warmup, then inference time can be accurate + output = model.generate(input_ids, + max_new_tokens=args.n_predict) + + # start inference st = time.time() # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function # to obtain optimal performance with BigDL-LLM INT4 optimizations - # if your selected model has `"do_sample": true` in its generation config, - # it is important to set `do_sample=False` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations output = model.generate(input_ids, - do_sample=False, max_new_tokens=args.n_predict) torch.xpu.synchronize() end = time.time() diff --git a/python/llm/example/transformers/transformers_int4/GPU/whisper/recognize.py b/python/llm/example/transformers/transformers_int4/GPU/whisper/recognize.py index 9ae001c7..f31f02ec 100644 --- a/python/llm/example/transformers/transformers_int4/GPU/whisper/recognize.py +++ b/python/llm/example/transformers/transformers_int4/GPU/whisper/recognize.py @@ -15,13 +15,14 @@ # import torch +import intel_extension_for_pytorch as ipex import time import argparse from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq from transformers import WhisperProcessor from datasets 
import load_dataset -import intel_extension_for_pytorch as ipex + if __name__ == '__main__': parser = argparse.ArgumentParser(description='Recognize Tokens using `generate()` API for Whisper model') @@ -45,7 +46,7 @@ if __name__ == '__main__': model = AutoModelForSpeechSeq2Seq.from_pretrained(model_path, load_in_4bit=True, optimize_model=False) - model.half().to('xpu') + model.to('xpu') model.config.forced_decoder_ids = None # Load processor @@ -61,7 +62,7 @@ if __name__ == '__main__': input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], - return_tensors="pt").input_features.half().to('xpu') + return_tensors="pt").input_features.to('xpu') st = time.time() # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, @@ -73,4 +74,4 @@ if __name__ == '__main__': output_str = processor.batch_decode(predicted_ids, skip_special_tokens=True) print(f'Inference time: {end-st} s') print('-'*20, 'Output', '-'*20) - print(output_str) \ No newline at end of file + print(output_str)
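
For quick reference, below is a minimal, self-contained sketch of the pattern this patch applies across the GPU examples: import `intel_extension_for_pytorch` right after `torch`, load the model in INT4 with BigDL-LLM, move it to `'xpu'` without calling `half()`, run one warmup `generate()` call, and only then time the real generation. The model path and prompt used here are placeholders, not part of the patch.

```python
import torch
import intel_extension_for_pytorch as ipex  # noqa: F401, must be imported for XPU support
import time

from bigdl.llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

# placeholder: any supported huggingface repo id or local checkpoint folder
model_path = "REPO_ID_OR_MODEL_PATH"

# Load the model with BigDL-LLM INT4 optimizations and move it to the Intel GPU
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             optimize_model=False,
                                             trust_remote_code=True)
model = model.to('xpu')
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

with torch.inference_mode():
    input_ids = tokenizer.encode("What is AI?", return_tensors="pt").to('xpu')

    # ipex model needs a warmup, then inference time can be accurate
    model.generate(input_ids, max_new_tokens=32)

    # timed inference
    st = time.time()
    output = model.generate(input_ids, max_new_tokens=32)
    torch.xpu.synchronize()
    end = time.time()

    print(f'Inference time: {end - st} s')
    print(tokenizer.decode(output[0], skip_special_tokens=True))
```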