LLM: add low bit option in deepspeed autotp example (#10382)
commit 5d7e044dbc (parent df3bcc0e65)
4 changed files with 8 additions and 5 deletions
README.md

@@ -37,7 +37,7 @@ We provide example usages on different models and different hardwares as following
 bash run_llama2_70b_pvc_1550_1_card.sh
 ```

-> **Note**: You could change `ZE_AFFINITY_MASK` and `NUM_GPUS` according to your requirements.
+> **Note**: You could change `ZE_AFFINITY_MASK` and `NUM_GPUS` according to your requirements. You could also specify other low-bit optimizations through `--low-bit`.

 - Run Vicuna-33B on two Intel Arc A770
@@ -45,7 +45,7 @@ bash run_llama2_70b_pvc_1550_1_card.sh
 bash run_vicuna_33b_arc_2_card.sh
 ```

-> **Note**: You could change `NUM_GPUS` to the number of GPUs you have on your machine.
+> **Note**: You could change `NUM_GPUS` to the number of GPUs you have on your machine. You could also specify other low-bit optimizations through `--low-bit`.

 ### 3. Sample Output
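Neither note lists which values `--low-bit` accepts. As a rough guide only, a hypothetical argparse `choices` whitelist can spell the options out and reject typos early; the value list below is an assumption about common bigdl-llm quantization strings, not something taken from this commit, so verify it against your installed bigdl-llm version:

```python
# Hypothetical sketch, not part of this commit: whitelist --low-bit values.
# The choices list is an assumption; check what your bigdl-llm version accepts.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--low-bit', type=str, default='sym_int4',
                    choices=['sym_int4', 'asym_int4', 'sym_int5', 'asym_int5', 'sym_int8'],
                    help='The quantization type the model will convert to.')

print(parser.parse_args(['--low-bit', 'sym_int8']).low_bit)  # prints: sym_int8
```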
deepspeed_autotp.py

@@ -55,9 +55,12 @@ if __name__ == '__main__':
                         help='Prompt to infer')
     parser.add_argument('--n-predict', type=int, default=32,
                         help='Max tokens to predict')
+    parser.add_argument('--low-bit', type=str, default='sym_int4',
+                        help='The quantization type the model will convert to.')

     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
+    low_bit = args.low_bit

     # First use CPU as accelerator
     # Convert to deepspeed model and apply bigdl-llm optimization on CPU to decrease GPU memory usage
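Taken together, the two added lines thread the CLI value through to the quantization call in the next hunk. A minimal, self-contained sketch of that wiring, assuming bigdl-llm and transformers are installed and using 'facebook/opt-125m' purely as a small stand-in model id (the example itself targets Llama 2 and Vicuna):

```python
# Minimal sketch of the flag-to-quantization wiring; not the repo's script.
# Assumptions: bigdl-llm and transformers are installed, and the stand-in
# model id 'facebook/opt-125m' is used only to keep the download small.
import argparse

import torch
from bigdl.llm import optimize_model
from transformers import AutoModelForCausalLM

parser = argparse.ArgumentParser()
parser.add_argument('--repo-id-or-model-path', type=str, default='facebook/opt-125m')
parser.add_argument('--low-bit', type=str, default='sym_int4',
                    help='The quantization type the model will convert to.')
args = parser.parse_args()

# Load on CPU, quantize there, then cast the remaining float weights to
# float16, mirroring the order used by deepspeed_autotp.py above.
model = AutoModelForCausalLM.from_pretrained(args.repo_id_or_model_path)
model = optimize_model(model, low_bit=args.low_bit).to(torch.float16)
print(model)
```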
@@ -79,7 +82,7 @@ if __name__ == '__main__':

     # Use bigdl-llm `optimize_model` to convert the model into optimized low bit format
     # Convert the rest of the model into float16 to reduce allreduce traffic
-    model = optimize_model(model.module.to(f'cpu'), low_bit='sym_int4').to(torch.float16)
+    model = optimize_model(model.module.to(f'cpu'), low_bit=low_bit).to(torch.float16)

     # Next, use XPU as accelerator to speed up inference
     current_accel = XPU_Accelerator()
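The reason the script quantizes on CPU before moving anything to XPU: `optimize_model` rewrites the model's `nn.Linear` layers into low-bit equivalents, so the weights that later land on the GPU are already compressed. A toy illustration of that rewrite, assuming your bigdl-llm build supports generic PyTorch modules as its documentation describes:

```python
# Toy illustration, not the repo's code: print the module tree after
# optimize_model to see low-bit replacements where nn.Linear layers were.
import torch
from bigdl.llm import optimize_model

toy = torch.nn.Sequential(
    torch.nn.Linear(64, 64),
    torch.nn.ReLU(),
    torch.nn.Linear(64, 8),
)
quantized = optimize_model(toy, low_bit='sym_int4')
print(quantized)  # converted layer types appear in place of the Linear modules
```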
run_llama2_70b_pvc_1550_1_card.sh

@@ -30,4 +30,4 @@ export OMP_NUM_THREADS=$((56/$NUM_GPUS))
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
 export TORCH_LLM_ALLREDUCE=1
 mpirun -np $NUM_GPUS --prepend-rank \
-python deepspeed_autotp.py --repo-id-or-model-path 'meta-llama/Llama-2-70b-chat-hf'
+python deepspeed_autotp.py --repo-id-or-model-path 'meta-llama/Llama-2-70b-chat-hf' --low-bit 'sym_int4'
run_vicuna_33b_arc_2_card.sh

@@ -30,4 +30,4 @@ export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
 export TORCH_LLM_ALLREDUCE=0 # Different from PVC

 mpirun -np $NUM_GPUS --prepend-rank \
-python deepspeed_autotp.py --repo-id-or-model-path 'lmsys/vicuna-33b-v1.3'
+python deepspeed_autotp.py --repo-id-or-model-path 'lmsys/vicuna-33b-v1.3' --low-bit 'sym_int4'