LLM: add low bit option in deepspeed autotp example (#10382)
This commit is contained in:
parent df3bcc0e65
commit 5d7e044dbc
4 changed files with 8 additions and 5 deletions
@@ -37,7 +37,7 @@ We provide example usages on different models and different hardwares as following
 bash run_llama2_70b_pvc_1550_1_card.sh
 ```
 
-> **Note**: You could change `ZE_AFFINITY_MASK` and `NUM_GPUS` according to your requirements.
+> **Note**: You could change `ZE_AFFINITY_MASK` and `NUM_GPUS` according to your requirements. You could also specify other low-bit optimizations through `--low-bit`.
 
 - Run Vicuna-33B on two Intel Arc A770
 
@@ -45,7 +45,7 @@ bash run_llama2_70b_pvc_1550_1_card.sh
 bash run_vicuna_33b_arc_2_card.sh
 ```
 
-> **Note**: You could change `NUM_GPUS` to the number of GPUs you have on your machine.
+> **Note**: You could change `NUM_GPUS` to the number of GPUs you have on your machine. You could also specify other low-bit optimizations through `--low-bit`.
 
 ### 3. Sample Output
 
@@ -55,9 +55,12 @@ if __name__ == '__main__':
                         help='Prompt to infer')
     parser.add_argument('--n-predict', type=int, default=32,
                         help='Max tokens to predict')
+    parser.add_argument('--low-bit', type=str, default='sym_int4',
+                        help='The quantization type the model will convert to.')
 
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
+    low_bit = args.low_bit
 
     # First use CPU as accelerator
     # Convert to deepspeed model and apply bigdl-llm optimization on CPU to decrease GPU memory usage
@@ -79,7 +82,7 @@ if __name__ == '__main__':
 
     # Use bigdl-llm `optimize_model` to convert the model into optimized low bit format
     # Convert the rest of the model into float16 to reduce allreduce traffic
-    model = optimize_model(model.module.to(f'cpu'), low_bit='sym_int4').to(torch.float16)
+    model = optimize_model(model.module.to(f'cpu'), low_bit=low_bit).to(torch.float16)
 
     # Next, use XPU as accelerator to speed up inference
     current_accel = XPU_Accelerator()
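For context on how the new flag is meant to be used end to end, here is a minimal sketch of the pattern the diff introduces: parse `--low-bit` and forward it to bigdl-llm's `optimize_model`. It is not the real `deepspeed_autotp.py`; the DeepSpeed AutoTP initialization, the float16 cast, and the move to XPU are omitted, the default model path is borrowed from the run script below, and the smoke-test prompt is purely illustrative.

```python
# Minimal sketch (not the full deepspeed_autotp.py): expose bigdl-llm's
# quantization type as a --low-bit CLI flag and forward it to optimize_model.
# Assumes bigdl-llm and transformers are installed; the DeepSpeed AutoTP setup,
# the float16 cast, and the move to XPU from the real example are omitted.
import argparse

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from bigdl.llm import optimize_model

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Low-bit option sketch')
    parser.add_argument('--repo-id-or-model-path', type=str,
                        default='meta-llama/Llama-2-70b-chat-hf',
                        help='Hugging Face repo id or local model path')
    parser.add_argument('--low-bit', type=str, default='sym_int4',
                        help='The quantization type the model will convert to.')
    args = parser.parse_args()

    # Load on CPU first, then let bigdl-llm convert the weights to the
    # requested low-bit format (e.g. the 'sym_int4' default).
    model = AutoModelForCausalLM.from_pretrained(args.repo_id_or_model_path)
    model = optimize_model(model, low_bit=args.low_bit)

    # Quick smoke test of the converted model on CPU.
    tokenizer = AutoTokenizer.from_pretrained(args.repo_id_or_model_path)
    inputs = tokenizer('What is AI?', return_tensors='pt')
    with torch.inference_mode():
        output = model.generate(**inputs, max_new_tokens=32)
    print(tokenizer.decode(output[0], skip_special_tokens=True))
```

The updated run scripts below then simply pass the chosen value on the command line, e.g. `--low-bit 'sym_int4'`.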
@@ -30,4 +30,4 @@ export OMP_NUM_THREADS=$((56/$NUM_GPUS))
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
 export TORCH_LLM_ALLREDUCE=1
 mpirun -np $NUM_GPUS --prepend-rank \
-    python deepspeed_autotp.py --repo-id-or-model-path 'meta-llama/Llama-2-70b-chat-hf'
+    python deepspeed_autotp.py --repo-id-or-model-path 'meta-llama/Llama-2-70b-chat-hf' --low-bit 'sym_int4'
@@ -30,4 +30,4 @@ export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
 export TORCH_LLM_ALLREDUCE=0 # Different from PVC
 
 mpirun -np $NUM_GPUS --prepend-rank \
-    python deepspeed_autotp.py --repo-id-or-model-path 'lmsys/vicuna-33b-v1.3'
+    python deepspeed_autotp.py --repo-id-or-model-path 'lmsys/vicuna-33b-v1.3' --low-bit 'sym_int4'