Refine Pipeline Parallel FastAPI example (#11168)
This commit is contained in:
		
							parent
							
								
									9bfbf78bf4
								
							
						
					
					
						commit
						2299698b45
					
				
					 2 changed files with 4 additions and 1 deletion
				
			
		| 
						 | 
				
			
			@ -20,6 +20,8 @@ pip install oneccl_bind_pt==2.1.100 --extra-index-url https://pytorch-extension.
 | 
			
		|||
source /opt/intel/oneapi/setvars.sh
 | 
			
		||||
pip install mpi4py fastapi uvicorn
 | 
			
		||||
conda install -c conda-forge -y gperftools=2.10 # to enable tcmalloc
 | 
			
		||||
 | 
			
		||||
pip install transformers==4.31.0 # for llama2 models
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
### 2. Run pipeline parallel serving on multiple GPUs
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -8,4 +8,5 @@ export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
 | 
			
		|||
export TORCH_LLM_ALLREDUCE=0
 | 
			
		||||
 | 
			
		||||
export MODEL_PATH=YOUR_MODEL_PATH
 | 
			
		||||
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node 2 pipeline_serving.py --repo-id-or-model-path $MODEL_PATH --low-bit fp8
 | 
			
		||||
export NUM_GPUS=2
 | 
			
		||||
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS pipeline_serving.py --repo-id-or-model-path $MODEL_PATH --low-bit fp8
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue