llm: update all ARC int4 examples (#8809)

* update GPU examples

* update other examples

* fix

* update based on comment
Ruonan Wang 2023-08-25 15:26:10 +08:00 committed by GitHub
parent b8b1b6888b
commit 0186f3ab2f
10 changed files with 166 additions and 24 deletions

View file

@@ -15,12 +15,12 @@
 #
 
 import torch
+import intel_extension_for_pytorch as ipex
 import time
 import argparse
 
 from bigdl.llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer
-import intel_extension_for_pytorch as ipex
 
 # you could tune the prompt based on your own model,
 BAICHUAN_PROMPT_FORMAT = "<human>{prompt} <bot>"
@@ -44,7 +44,7 @@ if __name__ == '__main__':
                                                  load_in_4bit=True,
                                                  optimize_model=False,
                                                  trust_remote_code=True)
-    model = model.half().to('xpu')
+    model = model.to('xpu')
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path,
@@ -54,16 +54,17 @@ if __name__ == '__main__':
     with torch.inference_mode():
         prompt = BAICHUAN_PROMPT_FORMAT.format(prompt=args.prompt)
         input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
+        # ipex model needs a warmup, then inference time can be accurate
+        output = model.generate(input_ids,
+                                max_new_tokens=args.n_predict)
+
+        # start inference
         st = time.time()
         # if your selected model is capable of utilizing previous key/value attentions
         # to enhance decoding speed, but has `"use_cache": false` in its model config,
         # it is important to set `use_cache=True` explicitly in the `generate` function
         # to obtain optimal performance with BigDL-LLM INT4 optimizations
-        # if your selected model has `"do_sample": true` in its generation config,
-        # it is important to set `do_sample=False` explicitly in the `generate` function
-        # to obtain optimal performance with BigDL-LLM INT4 optimizations
         output = model.generate(input_ids,
-                                do_sample=False,
                                 max_new_tokens=args.n_predict)
         torch.xpu.synchronize()
         end = time.time()
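For readers skimming the diff, below is a minimal, self-contained sketch of the warmup-then-measure pattern that the hunks above introduce: one untimed `generate()` call first, then a timed call followed by `torch.xpu.synchronize()`. The checkpoint id, prompt string, and token count are placeholders for illustration, not part of this commit.

```python
import time
import torch
import intel_extension_for_pytorch as ipex  # noqa: F401  (registers the 'xpu' device)
from bigdl.llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

MODEL_PATH = "baichuan-inc/Baichuan-13B-Chat"  # placeholder: any supported checkpoint

# load the model with BigDL-LLM INT4 optimizations, as in the example above
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH,
                                             load_in_4bit=True,
                                             optimize_model=False,
                                             trust_remote_code=True)
model = model.to('xpu')
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

with torch.inference_mode():
    input_ids = tokenizer.encode("<human>What is AI? <bot>", return_tensors="pt").to('xpu')

    # warmup run: the first generate on XPU includes one-time setup cost,
    # so it is excluded from the timing below
    model.generate(input_ids, max_new_tokens=32)

    st = time.time()
    output = model.generate(input_ids, max_new_tokens=32)
    torch.xpu.synchronize()  # wait for XPU work to finish before reading the clock
    print(f'Inference time: {time.time() - st} s')
    print(tokenizer.decode(output[0].cpu(), skip_special_tokens=True))
```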

View file

@@ -65,3 +65,44 @@ Inference time: xxxx s
 答: Artificial Intelligence (AI) refers to the ability of a computer or machine to perform tasks that typically require human-like intelligence, such as understanding language, recognizing patterns
 ```
+
+## Example 2: Stream Chat using `stream_chat()` API
+In the example [streamchat.py](./streamchat.py), we show a basic use case for a ChatGLM2 model to stream chat, with BigDL-LLM INT4 optimizations.
+### 1. Install
+We suggest using conda to manage the environment:
+```bash
+conda create -n llm python=3.9
+conda activate llm
+# below command will install intel_extension_for_pytorch==2.0.110+xpu by default
+# you can install a specific ipex/torch version for your needs
+pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu
+```
+### 2. Configure OneAPI environment variables
+```bash
+source /opt/intel/oneapi/setvars.sh
+```
+### 3. Run
+For optimal performance on Arc, it is recommended to set several environment variables:
+```bash
+export USE_XETLA=OFF
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+```
+**Stream Chat using `stream_chat()` API**:
+```
+python ./streamchat.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH --question QUESTION
+```
+**Chat using `chat()` API**:
+```
+python ./streamchat.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH --question QUESTION --disable-stream
+```
+Arguments info:
+- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the ChatGLM2 model to be downloaded, or the path to the huggingface checkpoint folder. The default value is `'THUDM/chatglm2-6b'`.
+- `--question QUESTION`: argument defining the question to ask. The default value is `"晚上睡不着应该怎么办"`.
+- `--disable-stream`: argument defining whether to disable stream chat. If `--disable-stream` is included when running the script, stream chat is disabled and the `chat()` API is used.
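
As a quick illustration of the `stream_chat()` and `chat()` calls that `streamchat.py` drives, here is a minimal sketch mirroring the new script added later in this commit; the model path and question are placeholders, and both APIs are the ones exposed by the ChatGLM2 checkpoint via `trust_remote_code=True`.

```python
import torch
import intel_extension_for_pytorch as ipex  # noqa: F401  (registers the 'xpu' device)
from bigdl.llm.transformers import AutoModel
from transformers import AutoTokenizer

model_path = "THUDM/chatglm2-6b"  # placeholder: repo id or local checkpoint folder
model = AutoModel.from_pretrained(model_path, load_in_4bit=True,
                                  trust_remote_code=True, optimize_model=False)
model.to('xpu')
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

question = "晚上睡不着应该怎么办"
with torch.inference_mode():
    # streaming: print only the newly generated suffix of each partial response
    printed = ""
    for response, history in model.stream_chat(tokenizer, question, history=[]):
        print(response.replace(printed, ""), end="")
        printed = response
    print()

    # non-streaming alternative: a one-shot chat() call
    response, history = model.chat(tokenizer, question, history=[])
    print(response)
```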

View file

@@ -15,13 +15,13 @@
 #
 
 import torch
+import intel_extension_for_pytorch as ipex
 import time
 import argparse
 import numpy as np
 
 from bigdl.llm.transformers import AutoModel
 from transformers import AutoTokenizer
-import intel_extension_for_pytorch as ipex
 
 # you could tune the prompt based on your own model,
 # here the prompt tuning refers to https://huggingface.co/THUDM/chatglm2-6b/blob/main/modeling_chatglm.py#L1007
@@ -56,6 +56,11 @@ if __name__ == '__main__':
     with torch.inference_mode():
         prompt = CHATGLM_V2_PROMPT_FORMAT.format(prompt=args.prompt)
         input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
+        # ipex model needs a warmup, then inference time can be accurate
+        output = model.generate(input_ids,
+                                max_new_tokens=args.n_predict)
+
+        # start inference
         st = time.time()
         # if your selected model is capable of utilizing previous key/value attentions
         # to enhance decoding speed, but has `"use_cache": false` in its model config,

View file

@@ -0,0 +1,72 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import torch
+import intel_extension_for_pytorch as ipex
+import time
+import argparse
+import numpy as np
+
+from bigdl.llm.transformers import AutoModel
+from transformers import AutoTokenizer
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Stream Chat for ChatGLM2 model')
+    parser.add_argument('--repo-id-or-model-path', type=str, default="THUDM/chatglm2-6b",
+                        help='The huggingface repo id for the ChatGLM2 model to be downloaded'
+                             ', or the path to the huggingface checkpoint folder')
+    parser.add_argument('--question', type=str, default="晚上睡不着应该怎么办",
+                        help='Question you want to ask')
+    parser.add_argument('--disable-stream', action="store_true",
+                        help='Disable stream chat')
+
+    args = parser.parse_args()
+    model_path = args.repo_id_or_model_path
+    disable_stream = args.disable_stream
+
+    # Load model in 4 bit,
+    # which converts the relevant layers in the model into INT4 format
+    model = AutoModel.from_pretrained(model_path,
+                                      load_in_4bit=True,
+                                      trust_remote_code=True,
+                                      optimize_model=False)
+    model.to('xpu')
+
+    # Load tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(model_path,
+                                              trust_remote_code=True)
+
+    with torch.inference_mode():
+        prompt = args.question
+        input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
+        # ipex model needs a warmup, then inference time can be accurate
+        output = model.generate(input_ids,
+                                max_new_tokens=32)
+
+        # start inference
+        if disable_stream:
+            # Chat
+            response, history = model.chat(tokenizer, args.question, history=[])
+            print('-'*20, 'Chat Output', '-'*20)
+            print(response)
+        else:
+            # Stream chat
+            response_ = ""
+            print('-'*20, 'Stream Chat Output', '-'*20)
+            for response, history in model.stream_chat(tokenizer, args.question, history=[]):
+                print(response.replace(response_, ""), end="")
+                response_ = response

View file

@@ -15,12 +15,12 @@
 #
 
 import torch
+import intel_extension_for_pytorch as ipex
 import time
 import argparse
 
 from bigdl.llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer
-import intel_extension_for_pytorch as ipex
 
 # you could tune the prompt based on your own model,
 FALCON_PROMPT_FORMAT = "<human> {prompt} <bot>"
@@ -46,7 +46,7 @@ if __name__ == '__main__':
                                                  load_in_4bit=True,
                                                  optimize_model=False,
                                                  trust_remote_code=True)
-    model = model.half().to('xpu')
+    model = model.to('xpu')
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path,
@@ -56,6 +56,12 @@ if __name__ == '__main__':
     with torch.inference_mode():
         prompt = FALCON_PROMPT_FORMAT.format(prompt=args.prompt)
         input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
+        # ipex model needs a warmup, then inference time can be accurate
+        output = model.generate(input_ids,
+                                max_new_tokens=args.n_predict)
+
+        # start inference
         st = time.time()
         # if your selected model is capable of utilizing previous key/value attentions
         # to enhance decoding speed, but has `"use_cache": false` in its model config,

View file

@@ -15,12 +15,12 @@
 #
 
 import torch
+import intel_extension_for_pytorch as ipex
 import time
 import argparse
 
 from bigdl.llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer
-import intel_extension_for_pytorch as ipex
 
 # you could tune the prompt based on your own model,
 # here the prompt tuning refers to https://huggingface.co/internlm/internlm-chat-7b-8k/blob/main/modeling_internlm.py#L768
@@ -45,7 +45,7 @@ if __name__ == '__main__':
                                                  load_in_4bit=True,
                                                  optimize_model=False,
                                                  trust_remote_code=True)
-    model = model.half().to('xpu')
+    model = model.to('xpu')
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path,
@@ -55,6 +55,11 @@ if __name__ == '__main__':
     with torch.inference_mode():
         prompt = INTERNLM_PROMPT_FORMAT.format(prompt=args.prompt)
         input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
+        # ipex model needs a warmup, then inference time can be accurate
+        output = model.generate(input_ids,
+                                max_new_tokens=args.n_predict)
+
+        # start inference
         st = time.time()
         # if your selected model is capable of utilizing previous key/value attentions
         # to enhance decoding speed, but has `"use_cache": false` in its model config,

View file

@@ -15,12 +15,12 @@
 #
 
 import torch
+import intel_extension_for_pytorch as ipex
 import time
 import argparse
 
 from bigdl.llm.transformers import AutoModelForCausalLM
 from transformers import LlamaTokenizer
-import intel_extension_for_pytorch as ipex
 
 # you could tune the prompt based on your own model,
 # here the prompt tuning refers to https://huggingface.co/georgesung/llama2_7b_chat_uncensored#prompt-style
@@ -58,6 +58,11 @@ if __name__ == '__main__':
     with torch.inference_mode():
         prompt = LLAMA2_PROMPT_FORMAT.format(prompt=args.prompt)
         input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
+        # ipex model needs a warmup, then inference time can be accurate
+        output = model.generate(input_ids,
+                                max_new_tokens=args.n_predict)
+
+        # start inference
         st = time.time()
         # if your selected model is capable of utilizing previous key/value attentions
         # to enhance decoding speed, but has `"use_cache": false` in its model config,

View file

@@ -15,12 +15,12 @@
 #
 
 import torch
+import intel_extension_for_pytorch as ipex
 import time
 import argparse
 
 from bigdl.llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer, GenerationConfig
-import intel_extension_for_pytorch as ipex
 
 # you could tune the prompt based on your own model,
 # here the prompt tuning refers to https://huggingface.co/spaces/mosaicml/mpt-30b-chat/blob/main/app.py
@@ -46,7 +46,7 @@ if __name__ == '__main__':
                                                  load_in_4bit=True,
                                                  optimize_model=False,
                                                  trust_remote_code=True)
-    model = model.half().to('xpu')
+    model = model.to('xpu')
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path,
@@ -56,6 +56,11 @@ if __name__ == '__main__':
     with torch.inference_mode():
         prompt = MPT_PROMPT_FORMAT.format(prompt=args.prompt)
         input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
+        # ipex model needs a warmup, then inference time can be accurate
+        output = model.generate(input_ids,
+                                max_new_tokens=args.n_predict)
+
+        # start inference
         # enabling `use_cache=True` allows the model to utilize the previous
         # key/values attentions to speed up decoding;
         # to obtain optimal performance with BigDL-LLM INT4 optimizations,

View file

@@ -15,12 +15,12 @@
 #
 
 import torch
+import intel_extension_for_pytorch as ipex
 import time
 import argparse
 
 from bigdl.llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer
-import intel_extension_for_pytorch as ipex
 
 # you could tune the prompt based on your own model
 QWEN_PROMPT_FORMAT = "<human>{prompt} <bot>"
@@ -44,7 +44,7 @@ if __name__ == '__main__':
                                                  load_in_4bit=True,
                                                  optimize_model=False,
                                                  trust_remote_code=True)
-    model = model.half().to('xpu')
+    model = model.to('xpu')
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path,
@@ -54,16 +54,17 @@ if __name__ == '__main__':
     with torch.inference_mode():
         prompt = QWEN_PROMPT_FORMAT.format(prompt=args.prompt)
         input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
+        # ipex model needs a warmup, then inference time can be accurate
+        output = model.generate(input_ids,
+                                max_new_tokens=args.n_predict)
+
+        # start inference
         st = time.time()
         # if your selected model is capable of utilizing previous key/value attentions
         # to enhance decoding speed, but has `"use_cache": false` in its model config,
         # it is important to set `use_cache=True` explicitly in the `generate` function
         # to obtain optimal performance with BigDL-LLM INT4 optimizations
-        # if your selected model has `"do_sample": true` in its generation config,
-        # it is important to set `do_sample=False` explicitly in the `generate` function
-        # to obtain optimal performance with BigDL-LLM INT4 optimizations
         output = model.generate(input_ids,
-                                do_sample=False,
                                 max_new_tokens=args.n_predict)
         torch.xpu.synchronize()
         end = time.time()

View file

@@ -15,13 +15,14 @@
 #
 
 import torch
+import intel_extension_for_pytorch as ipex
 import time
 import argparse
 from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq
 from transformers import WhisperProcessor
 from datasets import load_dataset
-import intel_extension_for_pytorch as ipex
 
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Recognize Tokens using `generate()` API for Whisper model')
@@ -45,7 +46,7 @@ if __name__ == '__main__':
     model = AutoModelForSpeechSeq2Seq.from_pretrained(model_path,
                                                       load_in_4bit=True,
                                                       optimize_model=False)
-    model.half().to('xpu')
+    model.to('xpu')
     model.config.forced_decoder_ids = None
 
     # Load processor
@@ -61,7 +62,7 @@ if __name__ == '__main__':
         input_features = processor(sample["array"],
                                    sampling_rate=sample["sampling_rate"],
-                                   return_tensors="pt").input_features.half().to('xpu')
+                                   return_tensors="pt").input_features.to('xpu')
         st = time.time()
         # if your selected model is capable of utilizing previous key/value attentions
         # to enhance decoding speed, but has `"use_cache": false` in its model config,
@@ -73,4 +74,4 @@ if __name__ == '__main__':
         output_str = processor.batch_decode(predicted_ids, skip_special_tokens=True)
         print(f'Inference time: {end-st} s')
         print('-'*20, 'Output', '-'*20)
         print(output_str)
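
For completeness, a minimal sketch of the Whisper recognition flow after this change, assembled from the pieces visible in the hunks above. The model id and dataset name are placeholder assumptions for illustration, not values taken from the actual script.

```python
import torch
import intel_extension_for_pytorch as ipex  # noqa: F401  (registers the 'xpu' device)
from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq
from transformers import WhisperProcessor
from datasets import load_dataset

model_path = "openai/whisper-tiny"  # placeholder model id
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False)
model.to('xpu')  # INT4-optimized model is used as loaded; no explicit .half() after this change
model.config.forced_decoder_ids = None

processor = WhisperProcessor.from_pretrained(model_path)
# placeholder dataset: any source yielding audio dicts with "array" and "sampling_rate" works
dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
sample = dataset[0]["audio"]

with torch.inference_mode():
    input_features = processor(sample["array"],
                               sampling_rate=sample["sampling_rate"],
                               return_tensors="pt").input_features.to('xpu')
    predicted_ids = model.generate(input_features).cpu()
    print(processor.batch_decode(predicted_ids, skip_special_tokens=True))
```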