From 428b7105f6cc67b9a520e51bba639404072feac0 Mon Sep 17 00:00:00 2001 From: ivy-lv11 <59141989+ivy-lv11@users.noreply.github.com> Date: Sun, 4 Feb 2024 10:25:55 +0800 Subject: [PATCH] Add HF and PyTorch example InternLM2 (#10061) --- README.md | 1 + python/llm/README.md | 1 + .../Model/internlm2/README.md | 72 ++++++++++ .../Model/internlm2/generate.py | 69 ++++++++++ .../PyTorch-Models/Model/internlm2/README.md | 71 ++++++++++ .../Model/internlm2/generate.py | 70 ++++++++++ .../Model/internlm2/README.md | 127 ++++++++++++++++++ .../Model/internlm2/generate.py | 88 ++++++++++++ .../PyTorch-Models/Model/internlm2/README.md | 127 ++++++++++++++++++ .../Model/internlm2/generate.py | 80 +++++++++++ .../llm/src/bigdl/llm/transformers/convert.py | 29 ++-- .../bigdl/llm/transformers/models/internlm.py | 115 ++++++++++++++++ 12 files changed, 842 insertions(+), 8 deletions(-) create mode 100644 python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm2/README.md create mode 100644 python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm2/generate.py create mode 100644 python/llm/example/CPU/PyTorch-Models/Model/internlm2/README.md create mode 100644 python/llm/example/CPU/PyTorch-Models/Model/internlm2/generate.py create mode 100644 python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2/README.md create mode 100644 python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2/generate.py create mode 100644 python/llm/example/GPU/PyTorch-Models/Model/internlm2/README.md create mode 100644 python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py diff --git a/README.md b/README.md index 70d91cb5..b5746d9e 100644 --- a/README.md +++ b/README.md @@ -177,6 +177,7 @@ Over 20 models have been optimized/verified on `bigdl-llm`, including *LLaMA/LLa | Yi | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/yi) | [link](python/llm/example/GPU/HF-Transformers-AutoModels/Model/yi) | | BlueLM | 
[link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/bluelm) | [link](python/llm/example/GPU/HF-Transformers-AutoModels/Model/bluelm) | | SOLAR | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/solar) | [link](python/llm/example/GPU/HF-Transformers-AutoModels/Model/solar) | +| InternLM2 | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm2) | [link](python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2) | ***For more details, please refer to the `bigdl-llm` [Document](https://test-bigdl-llm.readthedocs.io/en/main/doc/LLM/index.html), [Readme](python/llm), [Tutorial](https://github.com/intel-analytics/bigdl-llm-tutorial) and [API Doc](https://bigdl.readthedocs.io/en/latest/doc/PythonAPI/LLM/index.html).*** diff --git a/python/llm/README.md b/python/llm/README.md index 185d72d8..2a8369be 100644 --- a/python/llm/README.md +++ b/python/llm/README.md @@ -75,6 +75,7 @@ Over 20 models have been optimized/verified on `bigdl-llm`, including *LLaMA/LLa | Yi | [link](example/CPU/HF-Transformers-AutoModels/Model/yi) | [link](example/GPU/HF-Transformers-AutoModels/Model/yi) | | BlueLM | [link](example/CPU/HF-Transformers-AutoModels/Model/bluelm) | [link](example/GPU/HF-Transformers-AutoModels/Model/bluelm) | | SOLAR | [link](example/CPU/HF-Transformers-AutoModels/Model/solar) | [link](example/GPU/HF-Transformers-AutoModels/Model/solar) | +| InternLM2 | [link](example/CPU/HF-Transformers-AutoModels/Model/internlm2) | [link](example/GPU/HF-Transformers-AutoModels/Model/internlm2) | ### Working with `bigdl-llm` diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm2/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm2/README.md new file mode 100644 index 00000000..139dc6f4 --- /dev/null +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm2/README.md @@ -0,0 +1,72 @@ +# InternLM2 + +In this directory, you will find examples on how you could apply BigDL-LLM 
INT4 optimizations on InternLM2 models. For illustration purposes, we utilize the [internlm/internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) as a reference InternLM2 model. + +## 0. Requirements +To run these examples with BigDL-LLM, we have some recommended requirements for your machine; please refer to [here](../README.md#recommended-requirements) for more information. + +## Example: Predict Tokens using `generate()` API +In the example [generate.py](./generate.py), we show a basic use case for an InternLM2 model to predict the next N tokens using `generate()` API, with BigDL-LLM INT4 optimizations. +### 1. Install +We suggest using conda to manage environment: +```bash +conda create -n llm python=3.9 +conda activate llm + +pip install --pre --upgrade bigdl-llm[all] # install bigdl-llm with 'all' option +``` + +### 2. Run +``` +python ./generate.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH --prompt PROMPT --n-predict N_PREDICT +``` + +Arguments info: +- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the InternLM2 model to be downloaded, or the path to the huggingface checkpoint folder. It is default to be `'internlm/internlm2-chat-7b'`. +- `--prompt PROMPT`: argument defining the prompt to be inferred (with integrated prompt format for chat). It is default to be `'AI是什么?'`. +- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`. + +> **Note**: When loading the model in 4-bit, BigDL-LLM converts linear layers in the model into INT4 format. In theory, an *X*B model saved in 16-bit requires approximately 2*X* GB of memory for loading, and ~0.5*X* GB memory for further inference. +> +> Please select the appropriate size of the InternLM2 model based on the capabilities of your machine. 
+ +#### 2.1 Client +On client Windows machine, it is recommended to run directly with full utilization of all cores: +```powershell +python ./generate.py +``` + +#### 2.2 Server +For optimal performance on server, it is recommended to set several environment variables (refer to [here](../README.md#best-known-configuration-on-linux) for more information), and run the example with all the physical cores of a single socket. + +E.g. on Linux, +```bash +# set BigDL-LLM env variables +source bigdl-llm-init + +# e.g. for a server with 48 cores per socket +export OMP_NUM_THREADS=48 +numactl -C 0-47 -m 0 python ./generate.py +``` + +#### 2.3 Sample Output +#### [internlm/internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) +```log +Inference time: xxxx s +-------------------- Prompt -------------------- +<|User|>:AI是什么? +<|Bot|>: +-------------------- Output -------------------- +<|User|>:AI是什么? +<|Bot|>:AI是人工智能的缩写,是计算机科学的一个分支,旨在使计算机能够像人类一样思考、学习和执行任务。AI技术包括机器学习、自然 +``` + +```log +Inference time: xxxx s +-------------------- Prompt -------------------- +<|User|>:What is AI? +<|Bot|>: +-------------------- Output -------------------- +<|User|>:What is AI? +<|Bot|>:AI is the ability of machines to perform tasks that would normally require human intelligence, such as perception, reasoning, learning, and decision-making. AI is made possible +``` diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm2/generate.py new file mode 100644 index 00000000..00ffec57 --- /dev/null +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm2/generate.py @@ -0,0 +1,69 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import torch +import time +import argparse +import numpy as np + +from transformers import AutoTokenizer + +# you could tune the prompt based on your own model, +# here the prompt tuning refers to https://huggingface.co/internlm/internlm-chat-7b-8k/blob/main/modeling_internlm.py#L768 +INTERNLM_PROMPT_FORMAT = "<|User|>:{prompt}\n<|Bot|>:" + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for InternLM model') + parser.add_argument('--repo-id-or-model-path', type=str, default="internlm/internlm2-chat-7b", + help='The huggingface repo id for the InternLM model to be downloaded' + ', or the path to the huggingface checkpoint folder') + parser.add_argument('--prompt', type=str, default="AI是什么?", + help='Prompt to infer') + parser.add_argument('--n-predict', type=int, default=32, + help='Max tokens to predict') + + args = parser.parse_args() + model_path = args.repo_id_or_model_path + + + from bigdl.llm.transformers import AutoModelForCausalLM + model = AutoModelForCausalLM.from_pretrained(model_path, + load_in_4bit=True, + trust_remote_code=True) + + # Load tokenizer + tokenizer = AutoTokenizer.from_pretrained(model_path, + trust_remote_code=True) + + # Generate predicted tokens + with torch.inference_mode(): + prompt = INTERNLM_PROMPT_FORMAT.format(prompt=args.prompt) + input_ids = tokenizer.encode(prompt, return_tensors="pt") + st = time.time() + # if your selected model is capable of utilizing previous key/value attentions + # to enhance decoding speed, but has `"use_cache": false` in its 
model config, + # it is important to set `use_cache=True` explicitly in the `generate` function + # to obtain optimal performance with BigDL-LLM INT4 optimizations + output = model.generate(input_ids, + max_new_tokens=args.n_predict) + end = time.time() + output_str = tokenizer.decode(output[0], skip_special_tokens=True) + output_str = output_str.split("<eoa>")[0] + print(f'Inference time: {end-st} s') + print('-'*20, 'Prompt', '-'*20) + print(prompt) + print('-'*20, 'Output', '-'*20) + print(output_str) diff --git a/python/llm/example/CPU/PyTorch-Models/Model/internlm2/README.md b/python/llm/example/CPU/PyTorch-Models/Model/internlm2/README.md new file mode 100644 index 00000000..1f4b8731 --- /dev/null +++ b/python/llm/example/CPU/PyTorch-Models/Model/internlm2/README.md @@ -0,0 +1,71 @@ +# InternLM2 +In this directory, you will find examples on how you could use BigDL-LLM `optimize_model` API to accelerate InternLM2 models. For illustration purposes, we utilize the [internlm/internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) as a reference InternLM2 model. + +## Requirements +To run these examples with BigDL-LLM, we have some recommended requirements for your machine; please refer to [here](../README.md#recommended-requirements) for more information. + +## Example: Predict Tokens using `generate()` API +In the example [generate.py](./generate.py), we show a basic use case for an InternLM2 model to predict the next N tokens using `generate()` API, with BigDL-LLM INT4 optimizations. +### 1. Install +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). + +After installing conda, create a Python environment for BigDL-LLM: +```bash +conda create -n llm python=3.9 # recommend to use Python 3.9 +conda activate llm + +pip install --pre --upgrade bigdl-llm[all] # install the latest bigdl-llm nightly build with 'all' option +``` + +### 2. 
Run +After setting up the Python environment, you could run the example by following the steps below. + +#### 2.1 Client +On client Windows machines, it is recommended to run directly with full utilization of all cores: +```powershell +python ./generate.py --prompt 'What is AI?' +``` +More information about arguments can be found in the [Arguments Info](#23-arguments-info) section. The expected output can be found in the [Sample Output](#24-sample-output) section. + +#### 2.2 Server +For optimal performance on server, it is recommended to set several environment variables (refer to [here](../README.md#best-known-configuration-on-linux) for more information), and run the example with all the physical cores of a single socket. + +E.g. on Linux, +```bash +# set BigDL-LLM env variables +source bigdl-llm-init + +# e.g. for a server with 48 cores per socket +export OMP_NUM_THREADS=48 +numactl -C 0-47 -m 0 python ./generate.py --prompt 'What is AI?' +``` +More information about arguments can be found in the [Arguments Info](#23-arguments-info) section. The expected output can be found in the [Sample Output](#24-sample-output) section. + +#### 2.3 Arguments Info +In the example, several arguments can be passed to satisfy your requirements: + +- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the InternLM2 model to be downloaded, or the path to the huggingface checkpoint folder. It is default to be `'internlm/internlm2-chat-7b'`. +- `--prompt PROMPT`: argument defining the prompt to be inferred (with integrated prompt format for chat). It is default to be `'AI是什么?'`. +- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`. + +#### 2.4 Sample Output +#### [internlm/internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) +```log +Inference time: xxxx s +-------------------- Prompt -------------------- +<|User|>:AI是什么? +<|Bot|>: +-------------------- Output -------------------- +<|User|>:AI是什么? 
+<|Bot|>:AI是人工智能的缩写,是计算机科学的一个分支,旨在使计算机能够像人类一样思考、学习和执行任务。AI技术包括机器学习、自然 +``` + +```log +Inference time: xxxx s +-------------------- Prompt -------------------- +<|User|>:What is AI? +<|Bot|>: +-------------------- Output -------------------- +<|User|>:What is AI? +<|Bot|>:AI is the ability of machines to perform tasks that would normally require human intelligence, such as perception, reasoning, learning, and decision-making. AI is made possible +``` diff --git a/python/llm/example/CPU/PyTorch-Models/Model/internlm2/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/internlm2/generate.py new file mode 100644 index 00000000..afca8397 --- /dev/null +++ b/python/llm/example/CPU/PyTorch-Models/Model/internlm2/generate.py @@ -0,0 +1,70 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import torch +import time +import argparse +import numpy as np + +from bigdl.llm import optimize_model +from transformers import AutoTokenizer + +# you could tune the prompt based on your own model, +# here the prompt tuning refers to https://huggingface.co/internlm/internlm-chat-7b-8k/blob/main/modeling_internlm.py#L768 +INTERNLM_PROMPT_FORMAT = "<|User|>:{prompt}\n<|Bot|>:" + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for InternLM model') + parser.add_argument('--repo-id-or-model-path', type=str, default="internlm/internlm2-chat-7b", + help='The huggingface repo id for the InternLM model to be downloaded' + ', or the path to the huggingface checkpoint folder') + parser.add_argument('--prompt', type=str, default="AI是什么?", + help='Prompt to infer') + parser.add_argument('--n-predict', type=int, default=32, + help='Max tokens to predict') + + args = parser.parse_args() + model_path = args.repo_id_or_model_path + + + from bigdl.llm import optimize_model + from transformers import AutoModelForCausalLM + model = AutoModelForCausalLM.from_pretrained(model_path, + trust_remote_code=True) + model = optimize_model(model) + + + + # Load tokenizer + tokenizer = AutoTokenizer.from_pretrained(model_path, + trust_remote_code=True) + + # Generate predicted tokens + with torch.inference_mode(): + prompt = INTERNLM_PROMPT_FORMAT.format(prompt=args.prompt) + input_ids = tokenizer.encode(prompt, return_tensors="pt") + st = time.time() + + output = model.generate(input_ids, + max_new_tokens=args.n_predict) + end = time.time() + output_str = tokenizer.decode(output[0], skip_special_tokens=True) + output_str = output_str.split("<eoa>")[0] + print(f'Inference time: {end-st} s') + print('-'*20, 'Prompt', '-'*20) + print(prompt) + print('-'*20, 'Output', '-'*20) + print(output_str) diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2/README.md 
b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2/README.md new file mode 100644 index 00000000..78b02254 --- /dev/null +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2/README.md @@ -0,0 +1,127 @@ +# InternLM2 +In this directory, you will find examples on how you could apply BigDL-LLM INT4 optimizations on InternLM2 models on [Intel GPUs](../../../README.md). For illustration purposes, we utilize the [internlm/internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) as a reference InternLM2 model. + +## 0. Requirements +To run these examples with BigDL-LLM on Intel GPUs, we have some recommended requirements for your machine; please refer to [here](../../../README.md#requirements) for more information. + +## Example: Predict Tokens using `generate()` API +In the example [generate.py](./generate.py), we show a basic use case for an InternLM2 model to predict the next N tokens using `generate()` API, with BigDL-LLM INT4 optimizations on Intel GPUs. +### 1. Install +#### 1.1 Installation on Linux +We suggest using conda to manage environment: +```bash +conda create -n llm python=3.9 +conda activate llm +# below command will install intel_extension_for_pytorch==2.1.10+xpu as default +pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu +``` + +#### 1.2 Installation on Windows +We suggest using conda to manage environment: +```bash +conda create -n llm python=3.9 libuv +conda activate llm +# below command will install intel_extension_for_pytorch==2.1.10+xpu as default +pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu +``` + +### 2. 
Configure oneAPI environment variables +#### 2.1 Configurations for Linux +```bash +source /opt/intel/oneapi/setvars.sh +``` + +#### 2.2 Configurations for Windows +```cmd +call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" +``` +> Note: Please make sure you are using **CMD** (**Anaconda Prompt** if using conda) to run the command as PowerShell is not supported. +### 3. Runtime Configurations +For optimal performance, it is recommended to set several environment variables. Please check out the suggestions based on your device. +#### 3.1 Configurations for Linux +
+ +For Intel Arc™ A-Series Graphics and Intel Data Center GPU Flex Series + +```bash +export USE_XETLA=OFF +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +``` + +
+ +
+ +For Intel Data Center GPU Max Series + +```bash +export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export ENABLE_SDP_FUSION=1 +``` +> Note: Please note that `libtcmalloc.so` can be installed by `conda install -c conda-forge -y gperftools=2.10`. +
+ +#### 3.2 Configurations for Windows +
+ +For Intel iGPU + +```cmd +set SYCL_CACHE_PERSISTENT=1 +set BIGDL_LLM_XMX_DISABLED=1 +``` + +
+ +
+ +For Intel Arc™ A300-Series or Pro A60 + +```cmd +set SYCL_CACHE_PERSISTENT=1 +``` + +
+ +
+ +For other Intel dGPU Series + +There is no need to set further environment variables. + +
+ +> Note: The first time each model runs on Intel iGPU/Intel Arc™ A300-Series or Pro A60, it may take several minutes to compile. +### 4. Running examples + +``` +python ./generate.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH --prompt PROMPT --n-predict N_PREDICT +``` + +Arguments info: +- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the InternLM2 model (e.g. `internlm/internlm2-chat-7b`) to be downloaded, or the path to the huggingface checkpoint folder. It is default to be `'internlm/internlm2-chat-7b'`. +- `--prompt PROMPT`: argument defining the prompt to be inferred (with integrated prompt format for chat). It is default to be `'AI是什么?'`. +- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`. + +#### Sample Output +#### [internlm/internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) +```log +Inference time: xxxx s +-------------------- Prompt -------------------- +<|User|>:AI是什么? +<|Bot|>: +-------------------- Output -------------------- +<|User|>:AI是什么? +<|Bot|>:AI是人工智能的缩写,是计算机科学的一个分支,旨在使计算机能够像人类一样思考、学习和执行任务。AI技术包括机器学习、自然 +``` + +```log +Inference time: xxxx s +-------------------- Prompt -------------------- +<|User|>:What is AI? +<|Bot|>: +-------------------- Output -------------------- +<|User|>:What is AI? +<|Bot|>:AI is the ability of machines to perform tasks that would normally require human intelligence, such as perception, reasoning, learning, and decision-making. AI is made possible +``` diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2/generate.py new file mode 100644 index 00000000..fdbd312c --- /dev/null +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2/generate.py @@ -0,0 +1,88 @@ +# +# Copyright 2016 The BigDL Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import torch +import time +import argparse + +from transformers import AutoTokenizer +from bigdl.llm import optimize_model +import intel_extension_for_pytorch as ipex + +# you could tune the prompt based on your own model, +# here the prompt tuning refers to https://huggingface.co/internlm/internlm-chat-7b-8k/blob/main/modeling_internlm.py#L768 +INTERNLM_PROMPT_FORMAT = "<|User|>:{prompt}\n<|Bot|>:" + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for InternLM model') + parser.add_argument('--repo-id-or-model-path', type=str, default="internlm/internlm2-chat-7b", + help='The huggingface repo id for the InternLM model to be downloaded' + ', or the path to the huggingface checkpoint folder') + parser.add_argument('--prompt', type=str, default="AI是什么?", + help='Prompt to infer') + parser.add_argument('--n-predict', type=int, default=32, + help='Max tokens to predict') + + args = parser.parse_args() + model_path = args.repo_id_or_model_path + + # Load model in 4 bit, + # which convert the relevant layers in the model into INT4 format + # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function. + # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. 
+ from bigdl.llm.transformers import AutoModelForCausalLM + model = AutoModelForCausalLM.from_pretrained(model_path, + load_in_4bit=True, + trust_remote_code=True, + use_cache=True) + + + + + model = model.to('xpu') + + # Load tokenizer + tokenizer = AutoTokenizer.from_pretrained(model_path, + trust_remote_code=True) + + # Generate predicted tokens + with torch.inference_mode(): + prompt = INTERNLM_PROMPT_FORMAT.format(prompt=args.prompt) + input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') + # ipex model needs a warmup, then inference time can be accurate + output = model.generate(input_ids, + max_new_tokens=args.n_predict) + + + # start inference + st = time.time() + # if your selected model is capable of utilizing previous key/value attentions + # to enhance decoding speed, but has `"use_cache": false` in its model config, + # it is important to set `use_cache=True` explicitly in the `generate` function + # to obtain optimal performance with BigDL-LLM INT4 optimizations + output = model.generate(input_ids, + max_new_tokens=args.n_predict) + torch.xpu.synchronize() + + end = time.time() + output = output.cpu() + output_str = tokenizer.decode(output[0], skip_special_tokens=True) + output_str = output_str.split("<eoa>")[0] + print(f'Inference time: {end-st} s') + print('-'*20, 'Prompt', '-'*20) + print(prompt) + print('-'*20, 'Output', '-'*20) + print(output_str) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/internlm2/README.md b/python/llm/example/GPU/PyTorch-Models/Model/internlm2/README.md new file mode 100644 index 00000000..78b02254 --- /dev/null +++ b/python/llm/example/GPU/PyTorch-Models/Model/internlm2/README.md @@ -0,0 +1,127 @@ +# InternLM2 +In this directory, you will find examples on how you could apply BigDL-LLM INT4 optimizations on InternLM2 models on [Intel GPUs](../../../README.md). 
For illustration purposes, we utilize the [internlm/internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) as a reference InternLM2 model. + +## 0. Requirements +To run these examples with BigDL-LLM on Intel GPUs, we have some recommended requirements for your machine; please refer to [here](../../../README.md#requirements) for more information. + +## Example: Predict Tokens using `generate()` API +In the example [generate.py](./generate.py), we show a basic use case for an InternLM2 model to predict the next N tokens using `generate()` API, with BigDL-LLM INT4 optimizations on Intel GPUs. +### 1. Install +#### 1.1 Installation on Linux +We suggest using conda to manage environment: +```bash +conda create -n llm python=3.9 +conda activate llm +# below command will install intel_extension_for_pytorch==2.1.10+xpu as default +pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu +``` + +#### 1.2 Installation on Windows +We suggest using conda to manage environment: +```bash +conda create -n llm python=3.9 libuv +conda activate llm +# below command will install intel_extension_for_pytorch==2.1.10+xpu as default +pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu +``` + +### 2. Configure oneAPI environment variables +#### 2.1 Configurations for Linux +```bash +source /opt/intel/oneapi/setvars.sh +``` + +#### 2.2 Configurations for Windows +```cmd +call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" +``` +> Note: Please make sure you are using **CMD** (**Anaconda Prompt** if using conda) to run the command as PowerShell is not supported. +### 3. Runtime Configurations +For optimal performance, it is recommended to set several environment variables. Please check out the suggestions based on your device. +#### 3.1 Configurations for Linux +
+ +For Intel Arc™ A-Series Graphics and Intel Data Center GPU Flex Series + +```bash +export USE_XETLA=OFF +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +``` + +
+ +
+ +For Intel Data Center GPU Max Series + +```bash +export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export ENABLE_SDP_FUSION=1 +``` +> Note: Please note that `libtcmalloc.so` can be installed by `conda install -c conda-forge -y gperftools=2.10`. +
+ +#### 3.2 Configurations for Windows +
+ +For Intel iGPU + +```cmd +set SYCL_CACHE_PERSISTENT=1 +set BIGDL_LLM_XMX_DISABLED=1 +``` + +
+ +
+ +For Intel Arc™ A300-Series or Pro A60 + +```cmd +set SYCL_CACHE_PERSISTENT=1 +``` + +
+ +
+ +For other Intel dGPU Series + +There is no need to set further environment variables. + +
+ +> Note: The first time each model runs on Intel iGPU/Intel Arc™ A300-Series or Pro A60, it may take several minutes to compile. +### 4. Running examples + +``` +python ./generate.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH --prompt PROMPT --n-predict N_PREDICT +``` + +Arguments info: +- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the InternLM2 model (e.g. `internlm/internlm2-chat-7b`) to be downloaded, or the path to the huggingface checkpoint folder. It is default to be `'internlm/internlm2-chat-7b'`. +- `--prompt PROMPT`: argument defining the prompt to be inferred (with integrated prompt format for chat). It is default to be `'AI是什么?'`. +- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`. + +#### Sample Output +#### [internlm/internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) +```log +Inference time: xxxx s +-------------------- Prompt -------------------- +<|User|>:AI是什么? +<|Bot|>: +-------------------- Output -------------------- +<|User|>:AI是什么? +<|Bot|>:AI是人工智能的缩写,是计算机科学的一个分支,旨在使计算机能够像人类一样思考、学习和执行任务。AI技术包括机器学习、自然 +``` + +```log +Inference time: xxxx s +-------------------- Prompt -------------------- +<|User|>:What is AI? +<|Bot|>: +-------------------- Output -------------------- +<|User|>:What is AI? +<|Bot|>:AI is the ability of machines to perform tasks that would normally require human intelligence, such as perception, reasoning, learning, and decision-making. AI is made possible +``` diff --git a/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py new file mode 100644 index 00000000..570edfbe --- /dev/null +++ b/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py @@ -0,0 +1,80 @@ +# +# Copyright 2016 The BigDL Authors. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import torch
+import time
+import argparse
+
+from transformers import AutoTokenizer
+from bigdl.llm import optimize_model
+import intel_extension_for_pytorch as ipex
+
+# you could tune the prompt based on your own model,
+# here the prompt tuning refers to https://huggingface.co/internlm/internlm-chat-7b-8k/blob/main/modeling_internlm.py#L768
+INTERNLM_PROMPT_FORMAT = "<|User|>:{prompt}\n<|Bot|>:"
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for InternLM model')
+    parser.add_argument('--repo-id-or-model-path', type=str, default="internlm/internlm2-chat-7b",
+                        help='The huggingface repo id for the InternLM model to be downloaded'
+                             ', or the path to the huggingface checkpoint folder')
+    parser.add_argument('--prompt', type=str, default="AI是什么?",
+                        help='Prompt to infer')
+    parser.add_argument('--n-predict', type=int, default=32,
+                        help='Max tokens to predict')
+
+    args = parser.parse_args()
+    model_path = args.repo_id_or_model_path
+
+    # Load the model together with its custom modeling code, then apply the
+    # BigDL-LLM optimizations before moving it to the 'xpu' device
+    from transformers import AutoModelForCausalLM
+    model = AutoModelForCausalLM.from_pretrained(model_path,
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
+    model = optimize_model(model)
+
+    model = model.to('xpu')
+
+    # Load tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(model_path,
+                                              trust_remote_code=True)
+
+    # Generate predicted tokens
+    with torch.inference_mode():
+        prompt = INTERNLM_PROMPT_FORMAT.format(prompt=args.prompt)
+        input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
+        # ipex model needs a warmup, then inference time can be accurate
+        output = model.generate(input_ids,
+                                max_new_tokens=args.n_predict)
+
+        # start inference
+        st = time.time()
+
+        output = model.generate(input_ids,
+                                max_new_tokens=args.n_predict)
+        torch.xpu.synchronize()
+
+        end = time.time()
+        output = output.cpu()
+        output_str = tokenizer.decode(output[0], skip_special_tokens=True)
+        # NOTE(review): separator restored as `<eoa>` (assumed end-of-answer marker); split("") raises ValueError
+        output_str = output_str.split("<eoa>")[0]
+        print(f'Inference time: {end-st} s')
+        print('-'*20, 'Prompt', '-'*20)
+        print(prompt)
+        print('-'*20, 'Output', '-'*20)
+        print(output_str)
diff --git a/python/llm/src/bigdl/llm/transformers/convert.py b/python/llm/src/bigdl/llm/transformers/convert.py
index 0b30b579..de9127ed 100644
--- a/python/llm/src/bigdl/llm/transformers/convert.py
+++ b/python/llm/src/bigdl/llm/transformers/convert.py
@@ -829,14 +829,28 @@ def _optimize_post(model, lightweight_bmm=False):
         modeling_module_name = model.__class__.__module__
         module = importlib.import_module(modeling_module_name)
         from bigdl.llm.transformers.models.internlm import internlm_attention_forward
-        convert_forward(model,
-                        module.InternLMAttention,
-                        internlm_attention_forward
-                        )
-        convert_forward(model,
-                        module.InternLMRMSNorm,
-                        llama_rms_norm_forward
-                        )
+        from bigdl.llm.transformers.models.internlm import internlm2_attention_forward
+        # try the InternLM2 class names first; fall back to the InternLM ones when the module lacks them
+        try:
+            convert_forward(model,
+                            module.InternLM2Attention,
+                            internlm2_attention_forward
+                            )
+        except AttributeError:
+            convert_forward(model,
+                            module.InternLMAttention,
+                            internlm_attention_forward
+                            )
+        try:
+            convert_forward(model,
+                            module.InternLM2RMSNorm,
+                            llama_rms_norm_forward
+                            )
+        except AttributeError:
+            convert_forward(model,
+                            module.InternLMRMSNorm,
+                            llama_rms_norm_forward
+                            )
     elif model.config.model_type == "qwen":
         if hasattr(model.config, "visual"):
             # for Qwen-VL-Chat
diff --git a/python/llm/src/bigdl/llm/transformers/models/internlm.py
b/python/llm/src/bigdl/llm/transformers/models/internlm.py
index 53475ed9..e75e67d4 100644
--- a/python/llm/src/bigdl/llm/transformers/models/internlm.py
+++ b/python/llm/src/bigdl/llm/transformers/models/internlm.py
@@ -175,3 +175,113 @@ def internlm_attention_forward(
         attn_weights = None
 
     return attn_output, attn_weights, past_key_value
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat each key/value head `n_rep` times along the head dimension.
+
+    Same result as torch.repeat_interleave(hidden_states, dim=1, repeats=n_rep):
+    (batch, num_key_value_heads, seqlen, head_dim) becomes
+    (batch, num_key_value_heads * n_rep, seqlen, head_dim).
+    """
+    batch, kv_heads, seq_len, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    tiled = hidden_states[:, :, None, :, :].expand(batch, kv_heads, n_rep, seq_len, head_dim)
+    return tiled.reshape(batch, kv_heads * n_rep, seq_len, head_dim)
+
+
+def internlm2_attention_forward(
+    self,
+    hidden_states: torch.Tensor,
+    attention_mask: Optional[torch.Tensor]=None,
+    position_ids: Optional[torch.LongTensor]=None,
+    past_key_value: Optional[Tuple[torch.Tensor]]=None,
+    output_attentions: bool=False,
+    use_cache: bool=False,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    """
+    Attention forward for InternLM2, which projects q, k and v with one `self.wqkv` layer.
+
+    Returns `(attn_output, attn_weights, past_key_value)`. `attn_weights` is None
+    unless `output_attentions` is set; `past_key_value` is None unless `use_cache` is set.
+    """
+    bsz, q_len, _ = hidden_states.size()
+
+    projected = self.wqkv(hidden_states)
+    from einops import rearrange
+    n_groups = self.num_key_value_groups
+    # along the `gs` axis: `n_groups` query slots first, then one key slot and
+    # one value slot
+    fused_qkv = rearrange(projected, "b q (h gs d) -> b q h gs d",
+                          gs=2 + n_groups, d=self.head_dim)
+
+    query_states = rearrange(fused_qkv[..., :n_groups, :], "b q h gs d -> b q (h gs) d")
+    query_states = query_states.transpose(1, 2)
+    key_states = fused_qkv[..., -2, :].transpose(1, 2)
+    value_states = fused_qkv[..., -1, :].transpose(1, 2)
+
+    # total key/value length = new tokens + tokens already held in the cache
+    cached_len = 0 if past_key_value is None else past_key_value[0].shape[-2]
+    kv_seq_len = key_states.shape[-2] + cached_len
+
+    rope_on_xpu = (query_states.device.type == "xpu"
+                   and not (self.training and query_states.requires_grad))
+    if rope_on_xpu:
+        query_states, key_states = apply_rotary_pos_emb_no_cache_xpu(
+            query_states, key_states, position_ids, "internlm")
+    else:
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(
+            query_states, key_states, cos, sin, position_ids, "internlm")
+
+    if past_key_value is not None:
+        # reuse k, v, self_attention
+        key_states = torch.cat([past_key_value[0], key_states], dim=2)
+        value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+    if use_cache:
+        past_key_value = (key_states, value_states)
+    else:
+        past_key_value = None
+
+    # expand k/v so they line up with the query heads
+    key_states = repeat_kv(key_states, n_groups)
+    value_states = repeat_kv(value_states, n_groups)
+
+    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3))
+    attn_weights = attn_weights / math.sqrt(self.head_dim)
+
+    if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+        invalidInputError(
+            False,
+            f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, "
+            f"but is {attn_weights.size()}"
+        )
+
+    if attention_mask is not None:
+        if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+            invalidInputError(
+                False,
+                f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, "
+                f"but is {attention_mask.size()}"
+            )
+        attn_weights = attn_weights + attention_mask
+
+    # upcast attention to fp32
+    attn_probs = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32)
+    attn_weights = attn_probs.to(query_states.dtype)
+    attn_output = torch.matmul(attn_weights, value_states)
+
+    if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+        invalidInputError(
+            False,
+            f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, "
+            f"but is {attn_output.size()}"
+        )
+
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    attn_output = self.wo(attn_output.reshape(bsz, q_len, self.hidden_size))
+
+    return attn_output, (attn_weights if output_attentions else None), past_key_value