Refine Pipeline Parallel FastAPI example (#11168)
This commit is contained in:
parent
9bfbf78bf4
commit
2299698b45
2 changed files with 4 additions and 1 deletion
|
|
@@ -20,6 +20,8 @@ pip install oneccl_bind_pt==2.1.100 --extra-index-url https://pytorch-extension.
 source /opt/intel/oneapi/setvars.sh
+pip install mpi4py fastapi uvicorn
 conda install -c conda-forge -y gperftools=2.10 # to enable tcmalloc
+
 pip install transformers==4.31.0 # for llama2 models
 ```

 ### 2. Run pipeline parallel serving on multiple GPUs
|
|
|||
|
|
@@ -8,4 +8,5 @@ export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
 export TORCH_LLM_ALLREDUCE=0

 export MODEL_PATH=YOUR_MODEL_PATH
-CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node 2 pipeline_serving.py --repo-id-or-model-path $MODEL_PATH --low-bit fp8
+export NUM_GPUS=2
+CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS pipeline_serving.py --repo-id-or-model-path $MODEL_PATH --low-bit fp8
|
||||
|
|
|
|||
Loading…
Reference in a new issue