diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/README.md b/python/llm/example/GPU/Deepspeed-AutoTP/README.md
index 60000b4b..fe2804e5 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/README.md
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/README.md
@@ -2,7 +2,7 @@
 This example demonstrates how to run BigDL-LLM optimized low-bit model on multiple [Intel GPUs](../README.md) by leveraging DeepSpeed AutoTP.
-## 0. Requirements
+## Requirements
 To run this example with BigDL-LLM on Intel GPUs, we have some recommended requirements for your machine, please refer to [here](../README.md#recommended-requirements) for more information. For this particular example, you will need at least two GPUs on your machine.
 ## Example:
@@ -12,23 +12,29 @@ To run this example with BigDL-LLM on Intel GPUs, we have some recommended requi
 ```bash
 conda create -n llm python=3.9
 conda activate llm
-# below command will install intel_extension_for_pytorch==2.0.110+xpu as default
+# below command will install intel_extension_for_pytorch==2.1.10+xpu as default
 # you can install specific ipex/torch version for your need
-pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu
-pip install oneccl_bind_pt==2.0.100 -f https://developer.intel.com/ipex-whl-stable-xpu
-pip install git+https://github.com/microsoft/DeepSpeed.git@78c518e
+pip install --pre --upgrade bigdl-llm[xpu_2.1] -f https://developer.intel.com/ipex-whl-stable-xpu
+pip install oneccl_bind_pt==2.1.100 -f https://developer.intel.com/ipex-whl-stable-xpu
+# configure OneAPI environment variables
+source /opt/intel/oneapi/setvars.sh
+pip install git+https://github.com/microsoft/DeepSpeed.git@4fc181b0
 pip install git+https://github.com/intel/intel-extension-for-deepspeed.git@ec33277
 pip install mpi4py
+conda install -c conda-forge -y gperftools=2.10 # to enable tcmalloc
 ```
+> **Important**: IPEX 2.1.10+xpu requires Intel® oneAPI Base Toolkit version 2024.0. Please make sure you have installed the correct version.
-### 2. Configures OneAPI environment variables
-```bash
-source /opt/intel/oneapi/setvars.sh
-```
+### 2. Run tensor parallel inference on multiple GPUs
+Here, we provide example usage for different models and hardware. Please refer to the appropriate script based on your model and device:
-### 3. Run tensor parallel inference on multiple GPUs
-You many want to change some of the parameters in the script such as `NUM_GPUS`` to the number of GPUs you have on your machine.
+#### Llama2 series
+<details>
+<summary> Show LLaMA2-70B example </summary>
+Run LLaMA2-70B on four Intel Data Center GPU Max 1550
 ```
-bash run.sh
+bash run_llama2_70b_pvc_1550_4_card.sh
 ```
+</details>
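+
+For reference, `deepspeed_autotp.py` (launched by the script above) follows the usual DeepSpeed AutoTP + BigDL-LLM low-bit pattern: shard the FP16 model across the ranks, apply low-bit optimization to each shard, then move each shard to its own XPU device. The snippet below is only a condensed, illustrative sketch of that flow; details such as `mp_size`, `replace_method="auto"`, `low_bit="sym_int4"` and `low_cpu_mem_usage` are assumptions here and may differ from the actual script.
+
+```python
+import os
+import torch
+import deepspeed
+import intel_extension_for_pytorch as ipex  # registers the 'xpu' device with PyTorch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from bigdl.llm import optimize_model
+
+def env_int(keys, default):
+    # first non-negative value among the given env vars (torchrun or MPI style), else the default
+    for k in keys:
+        v = int(os.environ.get(k, -1))
+        if v >= 0:
+            return v
+    return default
+
+local_rank = env_int(["LOCAL_RANK", "PMI_RANK"], 0)
+world_size = env_int(["WORLD_SIZE", "PMI_SIZE"], 1)
+
+# load the FP16 model on CPU, then let DeepSpeed AutoTP shard it across the ranks
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-70b-chat-hf",
+                                             torch_dtype=torch.float16,
+                                             low_cpu_mem_usage=True)
+model = deepspeed.init_inference(model, mp_size=world_size,
+                                 dtype=torch.float16, replace_method="auto")
+
+# apply BigDL-LLM low-bit optimization to this rank's shard, then move it to its GPU
+model = optimize_model(model.module.to("cpu"), low_bit="sym_int4")
+model = model.to(f"xpu:{local_rank}")
+
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-70b-chat-hf")
+inputs = tokenizer("Once upon a time", return_tensors="pt").to(f"xpu:{local_rank}")
+output = model.generate(inputs.input_ids, max_new_tokens=32)
+print(f"[rank {local_rank}] {tokenizer.decode(output[0], skip_special_tokens=True)}")
+```
+
+Each launched rank runs this same code and ends up holding one tensor-parallel shard, so the 70B weights are split across the selected GPUs rather than replicated on every one of them.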
+
+> **Note**: If you want to use only some of the GPUs on your machine, please change `ZE_AFFINITY_MASK` and `NUM_GPUS` to your preferred values.
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
index 6b1309a7..7ccc4014 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
@@ -19,8 +19,16 @@ import torch
 import transformers
 import deepspeed
-local_rank = int(os.getenv("LOCAL_RANK", "0"))
-world_size = int(os.getenv("WORLD_SIZE", "1"))
+def get_int_from_env(env_keys, default):
+    """Returns the first non-negative env value found in the `env_keys` list, or the default."""
+    for e in env_keys:
+        val = int(os.environ.get(e, -1))
+        if val >= 0:
+            return val
+    return int(default)
+
+local_rank = get_int_from_env(["LOCAL_RANK","PMI_RANK"], "0")
+world_size = get_int_from_env(["WORLD_SIZE","PMI_SIZE"], "1")
 from bigdl.llm import optimize_model
@@ -35,7 +43,7 @@ from transformers import LlamaTokenizer, AutoTokenizer
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Llama2 model')
     parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-chat-hf",
-                        help='The huggingface repo id for the Llama2 (e.g. `meta-llama/Llama-2-7b-chat-hf` and `meta-llama/Llama-2-13b-chat-hf`) to be downloaded'
+                        help='The huggingface repo id for the Llama2 model (e.g. `meta-llama/Llama-2-7b-chat-hf`, `meta-llama/Llama-2-13b-chat-hf` and `meta-llama/Llama-2-70b-chat-hf`) to be downloaded'
                              ', or the path to the huggingface checkpoint folder')
     parser.add_argument('--prompt', type=str, default="Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun",
                         help='Prompt to infer')
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run.sh
deleted file mode 100644
index 9c3490a1..00000000
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-source bigdl-llm-init -t -g
-export MASTER_ADDR=127.0.0.1
-export CCL_ZE_IPC_EXCHANGE=sockets
-NUM_GPUS=4
-if [[ -n $OMP_NUM_THREADS ]]; then
-  export OMP_NUM_THREADS=$(($OMP_NUM_THREADS / $NUM_GPUS))
-else
-  export OMP_NUM_THREADS=$(($(nproc) / $NUM_GPUS))
-fi
-torchrun --standalone \
-  --nnodes=1 \
-  --nproc-per-node $NUM_GPUS \
-  deepspeed_autotp.py --repo-id-or-model-path "meta-llama/Llama-2-7b-hf"
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_4_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_4_card.sh
new file mode 100644
index 00000000..6c91ffd5
--- /dev/null
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_4_card.sh
@@ -0,0 +1,31 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+export ZE_AFFINITY_MASK="0,1,2,3,4,5,6,7" # specify which GPUs to use
+NUM_GPUS=8 # number of GPUs to use
+export MASTER_ADDR=127.0.0.1
+export FI_PROVIDER=tcp
+export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so # preload tcmalloc installed via gperftools
+
+basekit_root=/opt/intel/oneapi
+source $basekit_root/setvars.sh --force
+source $basekit_root/ccl/latest/env/vars.sh --force
+
+export OMP_NUM_THREADS=$((56/$NUM_GPUS)) # divide the CPU cores (assumes a 56-core machine) evenly across the ranks
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
+export TORCH_LLM_ALLREDUCE=1
+mpirun -np $NUM_GPUS --prepend-rank \
+  python deepspeed_autotp.py --repo-id-or-model-path 'meta-llama/Llama-2-70b-chat-hf'
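+
+# Illustrative variant (assumed values, see the README note): to run on only a subset
+# of the GPUs, adjust ZE_AFFINITY_MASK and NUM_GPUS at the top of this script, e.g.
+# for two Max 1550 cards (4 tiles):
+#   export ZE_AFFINITY_MASK="0,1,2,3"
+#   NUM_GPUS=4
+# OMP_NUM_THREADS is derived from NUM_GPUS above, so it adjusts automatically.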