LLM: add low bit option in deepspeed autotp example (#10382)

binbin Deng 2024-03-12 17:07:09 +08:00 committed by GitHub
parent df3bcc0e65
commit 5d7e044dbc
4 changed files with 8 additions and 5 deletions

README.md

@@ -37,7 +37,7 @@ We provide example usages on different models and different hardwares as following
 bash run_llama2_70b_pvc_1550_1_card.sh
 ```
-> **Note**: You could change `ZE_AFFINITY_MASK` and `NUM_GPUS` according to your requirements.
+> **Note**: You could change `ZE_AFFINITY_MASK` and `NUM_GPUS` according to your requirements. And you could also specify other low bit optimizations through `--low-bit`.
 - Run Vicuna-33B on two Intel Arc A770
@@ -45,7 +45,7 @@ bash run_llama2_70b_pvc_1550_1_card.sh
 bash run_vicuna_33b_arc_2_card.sh
 ```
-> **Note**: You could change `NUM_GPUS` to the number of GPUs you have on your machine.
+> **Note**: You could change `NUM_GPUS` to the number of GPUs you have on your machine. And you could also specify other low bit optimizations through `--low-bit`.
 ### 3. Sample Output
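
The notes above only say that other low-bit formats can be passed through `--low-bit`; the example script forwards whatever string it receives to `optimize_model`. Below is a minimal sketch of how the flag could be validated up front. The value list is an assumption for illustration only, not the authoritative set supported by bigdl-llm; check the bigdl-llm documentation for the actual options.

```python
# Hedged sketch: validate the --low-bit flag before loading the model.
import argparse

# Assumed subset of low-bit formats, for illustration only.
SUPPORTED_LOW_BIT = ['sym_int4', 'sym_int8']

parser = argparse.ArgumentParser(description='Validate the low-bit option')
parser.add_argument('--low-bit', type=str, default='sym_int4',
                    choices=SUPPORTED_LOW_BIT,
                    help='The quantization type the model will convert to.')
args = parser.parse_args()
print(f'Will convert the model to {args.low_bit}')
```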

deepspeed_autotp.py

@@ -55,9 +55,12 @@ if __name__ == '__main__':
                         help='Prompt to infer')
     parser.add_argument('--n-predict', type=int, default=32,
                         help='Max tokens to predict')
+    parser.add_argument('--low-bit', type=str, default='sym_int4',
+                        help='The quantization type the model will convert to.')
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
+    low_bit = args.low_bit

     # First use CPU as accelerator
     # Convert to deepspeed model and apply bigdl-llm optimization on CPU to decrease GPU memory usage
@@ -79,7 +82,7 @@ if __name__ == '__main__':
     # Use bigdl-llm `optimize_model` to convert the model into optimized low bit format
     # Convert the rest of the model into float16 to reduce allreduce traffic
-    model = optimize_model(model.module.to(f'cpu'), low_bit='sym_int4').to(torch.float16)
+    model = optimize_model(model.module.to(f'cpu'), low_bit=low_bit).to(torch.float16)

     # Next, use XPU as accelerator to speed up inference
     current_accel = XPU_Accelerator()
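
For readers who want to see the new flag in isolation, here is a minimal single-process sketch (no DeepSpeed, no XPU) of how the parsed `--low-bit` value feeds into bigdl-llm's `optimize_model`. The default model id is a placeholder, and the availability of `bigdl.llm.optimize_model` with a `low_bit` keyword is assumed from the diff above; adapt to your environment.

```python
# Minimal sketch (assumptions: bigdl-llm and transformers are installed, and
# bigdl.llm.optimize_model accepts a low_bit string as shown in the diff above).
import argparse
import torch
from transformers import AutoModelForCausalLM
from bigdl.llm import optimize_model

parser = argparse.ArgumentParser()
parser.add_argument('--repo-id-or-model-path', type=str,
                    default='meta-llama/Llama-2-7b-chat-hf')  # placeholder model id
parser.add_argument('--low-bit', type=str, default='sym_int4',
                    help='The quantization type the model will convert to.')
args = parser.parse_args()

# Load on CPU first, then convert to the requested low-bit format and cast the
# remaining weights to float16, mirroring the pattern in deepspeed_autotp.py.
model = AutoModelForCausalLM.from_pretrained(args.repo_id_or_model_path,
                                             torch_dtype=torch.float16,
                                             low_cpu_mem_usage=True)
model = optimize_model(model, low_bit=args.low_bit).to(torch.float16)
print(f'Model optimized with low_bit={args.low_bit}')
```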

run_llama2_70b_pvc_1550_1_card.sh

@@ -30,4 +30,4 @@ export OMP_NUM_THREADS=$((56/$NUM_GPUS))
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
 export TORCH_LLM_ALLREDUCE=1
 mpirun -np $NUM_GPUS --prepend-rank \
-  python deepspeed_autotp.py --repo-id-or-model-path 'meta-llama/Llama-2-70b-chat-hf'
+  python deepspeed_autotp.py --repo-id-or-model-path 'meta-llama/Llama-2-70b-chat-hf' --low-bit 'sym_int4'

run_vicuna_33b_arc_2_card.sh

@@ -30,4 +30,4 @@ export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
 export TORCH_LLM_ALLREDUCE=0 # Different from PVC
 mpirun -np $NUM_GPUS --prepend-rank \
-  python deepspeed_autotp.py --repo-id-or-model-path 'lmsys/vicuna-33b-v1.3'
+  python deepspeed_autotp.py --repo-id-or-model-path 'lmsys/vicuna-33b-v1.3' --low-bit 'sym_int4'