Add deepspeed autotp example readme (#9289)
* Add deepspeed autotp example readme
* change word
This commit is contained in:
parent f053688cad
commit 8838707009

2 changed files with 38 additions and 3 deletions

python/llm/example/GPU/Deepspeed-AutoTP/README.md (Normal file, 34 additions)
@@ -0,0 +1,34 @@
# Run BigDL-LLM on Multiple Intel GPUs using DeepSpeed AutoTP

This example demonstrates how to run a BigDL-LLM optimized low-bit model on multiple [Intel GPUs](../README.md) by leveraging DeepSpeed AutoTP.

## 0. Requirements
To run this example with BigDL-LLM on Intel GPUs, we have some recommended requirements for your machine; please refer to [here](../README.md#recommended-requirements) for more information. For this particular example, you will need at least two GPUs on your machine.

## Example:

### 1. Install

```bash
conda create -n llm python=3.9
conda activate llm
# the command below installs intel_extension_for_pytorch==2.0.110+xpu by default
# you can install a specific ipex/torch version for your needs
pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu
pip install oneccl_bind_pt==2.0.100 -f https://developer.intel.com/ipex-whl-stable-xpu
pip install git+https://github.com/microsoft/DeepSpeed.git@78c518e
pip install git+https://github.com/intel/intel-extension-for-deepspeed.git@ec33277
pip install mpi4py
```

### 2. Configure OneAPI environment variables
```bash
source /opt/intel/oneapi/setvars.sh
```
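
Once the oneAPI environment is set, you can optionally confirm that all of your Intel GPUs are visible to PyTorch before launching the example. This is a small sanity check, not part of the original example; it only assumes the `bigdl-llm[xpu]` installation from step 1, which provides `torch` and `intel_extension_for_pytorch`.

```python
# Optional sanity check (not part of the original example):
# verify that the XPU runtime is available and count the visible Intel GPUs.
import torch
import intel_extension_for_pytorch as ipex  # registers the 'xpu' device with PyTorch

print(f"XPU available: {torch.xpu.is_available()}")
print(f"Visible XPU devices: {torch.xpu.device_count()}")
```

If the reported device count is lower than the `NUM_GPUS` value used in `run.sh`, re-check the driver and oneAPI setup before proceeding.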

### 3. Run tensor parallel inference on multiple GPUs
You may want to change some of the parameters in the script, such as `NUM_GPUS`, to match the number of GPUs on your machine. The script launches `deepspeed_autotp.py` with `torchrun`; a sketch of that flow follows the command below.

```bash
bash run.sh
```
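
For context, the `deepspeed_autotp.py` script that `run.sh` launches is not shown in this diff. The outline below is a minimal sketch of how a BigDL-LLM low-bit model is typically combined with DeepSpeed AutoTP; the `low_bit` level, prompt, and generation settings are illustrative assumptions, and the exact API calls in the actual script may differ.

```python
# Minimal sketch of tensor-parallel, low-bit inference with DeepSpeed AutoTP.
# Assumptions: launched by torchrun (LOCAL_RANK/WORLD_SIZE set), with bigdl-llm[xpu],
# oneccl_bind_pt, DeepSpeed and intel-extension-for-deepspeed installed as above.
import os
import torch
import deepspeed
import intel_extension_for_pytorch as ipex   # enables the 'xpu' device
import oneccl_bindings_for_pytorch           # CCL backend for multi-GPU communication
from transformers import AutoModelForCausalLM, AutoTokenizer
from bigdl.llm import optimize_model

local_rank = int(os.getenv("LOCAL_RANK", "0"))
world_size = int(os.getenv("WORLD_SIZE", "1"))
model_path = "meta-llama/Llama-2-7b-hf"

# Load the model on CPU in fp16 first.
model = AutoModelForCausalLM.from_pretrained(
    model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)

# Let DeepSpeed AutoTP shard the model across all ranks.
model = deepspeed.init_inference(
    model, mp_size=world_size, dtype=torch.float16, replace_method="auto")

# Apply BigDL-LLM low-bit optimization to this rank's shard, then move it to its GPU.
model = optimize_model(model.module.to("cpu"), low_bit="sym_int4")
model = model.to(f"xpu:{local_rank}")

tokenizer = AutoTokenizer.from_pretrained(model_path)
inputs = tokenizer("Once upon a time,", return_tensors="pt").to(f"xpu:{local_rank}")
with torch.inference_mode():
    output = model.generate(**inputs, max_new_tokens=32)
if local_rank == 0:
    print(tokenizer.decode(output[0], skip_special_tokens=True))
```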
python/llm/example/GPU/Deepspeed-AutoTP/run.sh (4 additions, 3 deletions)
@@ -1,12 +1,13 @@
 source bigdl-llm-init -t -g
 export MASTER_ADDR=127.0.0.1
 export CCL_ZE_IPC_EXCHANGE=sockets
+NUM_GPUS=4
 if [[ -n $OMP_NUM_THREADS ]]; then
-    export OMP_NUM_THREADS=$(($OMP_NUM_THREADS / 4))
+    export OMP_NUM_THREADS=$(($OMP_NUM_THREADS / $NUM_GPUS))
 else
-    export OMP_NUM_THREADS=$(($(nproc) / 4))
+    export OMP_NUM_THREADS=$(($(nproc) / $NUM_GPUS))
 fi
 torchrun --standalone \
          --nnodes=1 \
-         --nproc-per-node 4 \
+         --nproc-per-node $NUM_GPUS \
          deepspeed_autotp.py --repo-id-or-model-path "meta-llama/Llama-2-7b-hf"