diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/README.md b/python/llm/example/GPU/Deepspeed-AutoTP/README.md
index 94757822..346bc8fc 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/README.md
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/README.md
@@ -37,7 +37,7 @@ We provide example usages on different models and different hardwares as followi
 bash run_llama2_70b_pvc_1550_1_card.sh
 ```
 
-> **Note**: You could change `ZE_AFFINITY_MASK` and `NUM_GPUS` according to your requirements.
+> **Note**: You could change `ZE_AFFINITY_MASK` and `NUM_GPUS` according to your requirements. You could also specify other low-bit optimizations through `--low-bit`.
 
 - Run Vicuna-33B on two Intel Arc A770
 
@@ -45,7 +45,7 @@ bash run_llama2_70b_pvc_1550_1_card.sh
 bash run_vicuna_33b_arc_2_card.sh
 ```
 
-> **Note**: You could change `NUM_GPUS` to the number of GPUs you have on your machine.
+> **Note**: You could change `NUM_GPUS` to the number of GPUs you have on your machine. You could also specify other low-bit optimizations through `--low-bit`.
 
 ### 3. Sample Output
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
index f69c66e0..54994c0f 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
@@ -55,9 +55,12 @@ if __name__ == '__main__':
                         help='Prompt to infer')
     parser.add_argument('--n-predict', type=int, default=32,
                         help='Max tokens to predict')
+    parser.add_argument('--low-bit', type=str, default='sym_int4',
+                        help='The quantization type the model will be converted to.')
 
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
+    low_bit = args.low_bit
 
     # First use CPU as accelerator
     # Convert to deepspeed model and apply bigdl-llm optimization on CPU to decrease GPU memory usage
@@ -79,7 +82,7 @@ if __name__ == '__main__':
 
     # Use bigdl-llm `optimize_model` to convert the model into optimized low bit format
     # Convert the rest of the model into float16 to reduce allreduce traffic
-    model = optimize_model(model.module.to(f'cpu'), low_bit='sym_int4').to(torch.float16)
+    model = optimize_model(model.module.to(f'cpu'), low_bit=low_bit).to(torch.float16)
 
     # Next, use XPU as accelerator to speed up inference
     current_accel = XPU_Accelerator()
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
index 380e1a58..4e968541 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
@@ -30,4 +30,4 @@ export OMP_NUM_THREADS=$((56/$NUM_GPUS))
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
 export TORCH_LLM_ALLREDUCE=1
 mpirun -np $NUM_GPUS --prepend-rank \
-  python deepspeed_autotp.py --repo-id-or-model-path 'meta-llama/Llama-2-70b-chat-hf'
+  python deepspeed_autotp.py --repo-id-or-model-path 'meta-llama/Llama-2-70b-chat-hf' --low-bit 'sym_int4'
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh
index ca0697a6..39b060ae 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh
@@ -30,4 +30,4 @@ export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
 export TORCH_LLM_ALLREDUCE=0 # Different from PVC
 mpirun -np $NUM_GPUS --prepend-rank \
-  python deepspeed_autotp.py --repo-id-or-model-path 'lmsys/vicuna-33b-v1.3'
+  python deepspeed_autotp.py --repo-id-or-model-path 'lmsys/vicuna-33b-v1.3' --low-bit 'sym_int4'
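
A minimal usage sketch of the new flag (not part of the diff above): the run scripts now pass `--low-bit 'sym_int4'` explicitly, and any other quantization format accepted by bigdl-llm's `optimize_model` can be substituted on the command line. The `sym_int8` value below is an assumption used only for illustration; check the bigdl-llm documentation for the supported formats.

```bash
# Sketch: run the Vicuna-33B example on two Arc GPUs with a different low-bit format.
# Assumes the environment variables from run_vicuna_33b_arc_2_card.sh have already been exported,
# and that 'sym_int8' is among the low_bit values supported by optimize_model.
export NUM_GPUS=2
mpirun -np $NUM_GPUS --prepend-rank \
  python deepspeed_autotp.py --repo-id-or-model-path 'lmsys/vicuna-33b-v1.3' --low-bit 'sym_int8'
```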