Add precision option in PP inference examples (#11440)
parent e9e8f9b4d4
commit 508c364a79

13 changed files with 31 additions and 27 deletions

@@ -52,6 +52,8 @@ pip install oneccl_bind_pt==2.1.100 --extra-index-url https://pytorch-extension.
 
 For optimal performance, it is recommended to set several environment variables. We provide example usages as following:
 
+> Note: INT4 optimization is applied to the model by default. You could specify other low bit optimizations (such as 'fp8' and 'fp6') through `--low-bit`.
+
 </details>
 
 <details>
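As the added note says, `sym_int4` (INT4) stays the default and other precisions are opted in through `--low-bit`. A minimal sketch of such a launch, reusing the torchrun command and the chatglm3-6b model path from the run scripts changed below; `'fp8'` is one of the values the note names:

```bash
# Sketch: pipeline-parallel inference at FP8 instead of the default sym_int4.
# Launcher and flags mirror the run scripts in this commit; only the
# --low-bit value differs.
NUM_GPUS=2  # number of used GPU
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
  generate.py --repo-id-or-model-path 'THUDM/chatglm3-6b' --gpu-num $NUM_GPUS --low-bit 'fp8'
```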

@@ -34,16 +34,18 @@ if __name__ == '__main__':
                         help='Prompt to infer')
     parser.add_argument('--n-predict', type=int, default=32,
                         help='Max tokens to predict')
+    parser.add_argument('--low-bit', type=str, default='sym_int4', help='The quantization type the model will convert to.')
     parser.add_argument('--gpu-num', type=int, default=2, help='GPU number to use')
 
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
+    low_bit = args.low_bit
 
     # Load model in 4 bit,
     # which convert the relevant layers in the model into INT4 format
     try:
         model = AutoModelForCausalLM.from_pretrained(model_path,
-                                                     load_in_4bit=True,
+                                                     load_in_low_bit=low_bit,
                                                      optimize_model=True,
                                                      trust_remote_code=True,
                                                      use_cache=True,

@@ -51,7 +53,7 @@ if __name__ == '__main__':
                                                      pipeline_parallel_stages=args.gpu_num)
     except:
         model = AutoModel.from_pretrained(model_path,
-                                          load_in_4bit=True,
+                                          load_in_low_bit=low_bit,
                                           optimize_model=True,
                                           trust_remote_code=True,
                                           use_cache=True,
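Since `--low-bit` defaults to `'sym_int4'`, launch commands that omit the flag keep the previous INT4 behavior; swapping `load_in_4bit=True` for `load_in_low_bit=low_bit` only generalizes that path. A hedged sketch of both cases, borrowing the Llama-2 invocation from the run scripts below:

```bash
# Omitting --low-bit is equivalent to --low-bit 'sym_int4' (the argparse default).
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node 2 \
  generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num 2

# Selecting another precision explicitly, e.g. fp6 (one of the values the README note mentions):
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node 2 \
  generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num 2 --low-bit 'fp6'
```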

@@ -29,8 +29,8 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Baichuan2-7B-Chat
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'baichuan-inc/Baichuan2-7B-Chat' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'baichuan-inc/Baichuan2-7B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Baichuan2-13B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'baichuan-inc/Baichuan2-13B-Chat' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'baichuan-inc/Baichuan2-13B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -28,4 +28,4 @@ export TORCH_LLM_ALLREDUCE=0
 NUM_GPUS=2 # number of used GPU
 # To run chatglm3-6b
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'THUDM/chatglm3-6b' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'THUDM/chatglm3-6b' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -30,12 +30,12 @@ NUM_GPUS=2 # number of used GPU
 
 # To run CodeLlama-7b-Instruct-hf
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'codellama/CodeLlama-7b-Instruct-hf' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'codellama/CodeLlama-7b-Instruct-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run CodeLlama-13b-Instruct-hf
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'codellama/CodeLlama-13b-Instruct-hf' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'codellama/CodeLlama-13b-Instruct-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run CodeLlama-34b-Instruct-hf
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'codellama/CodeLlama-34b-Instruct-hf' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'codellama/CodeLlama-34b-Instruct-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -30,12 +30,12 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Llama-2-7b-chat-hf
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Llama-2-13b-chat-hf
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'meta-llama/Llama-2-13b-chat-hf' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'meta-llama/Llama-2-13b-chat-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Meta-Llama-3-8B-Instruct
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'meta-llama/Meta-Llama-3-8B-Instruct' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'meta-llama/Meta-Llama-3-8B-Instruct' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -30,8 +30,8 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Mistral-7B-v0.1
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'mistralai/Mistral-7B-v0.1' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'mistralai/Mistral-7B-v0.1' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run Mixtral-8x7B-Instruct-v0.1
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'mistralai/Mixtral-8x7B-Instruct-v0.1' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'mistralai/Mixtral-8x7B-Instruct-v0.1' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -30,8 +30,8 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Phi-3-medium-4k-instruct
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'microsoft/Phi-3-medium-4k-instruct' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'microsoft/Phi-3-medium-4k-instruct' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Phi-3-mini-4k-instruct
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'microsoft/Phi-3-mini-4k-instruct' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'microsoft/Phi-3-mini-4k-instruct' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -29,20 +29,20 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Qwen1.5-7B-Chat
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-7B-Chat' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-7B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Qwen1.5-14B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-14B-Chat' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-14B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Qwen1.5-32B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-32B-Chat' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-32B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Qwen1.5-MoE-A2.7B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-MoE-A2.7B-Chat' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-MoE-A2.7B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run CodeQwen1.5-7B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'Qwen/CodeQwen1.5-7B-Chat' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'Qwen/CodeQwen1.5-7B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -29,4 +29,4 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Qwen2-7B-Instruct
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'Qwen/Qwen2-7B-Instruct' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'Qwen/Qwen2-7B-Instruct' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -30,4 +30,4 @@ NUM_GPUS=2 # number of used GPU
 
 # To run SOLAR-10.7B-Instruct-v1.0
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'upstage/SOLAR-10.7B-Instruct-v1.0' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'upstage/SOLAR-10.7B-Instruct-v1.0' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -30,12 +30,12 @@ NUM_GPUS=2 # number of used GPU
 
 # To run vicuna-7b-v1.3
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'lmsys/vicuna-7b-v1.3' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'lmsys/vicuna-7b-v1.3' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run vicuna-13b-v1.3
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'lmsys/vicuna-13b-v1.3' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'lmsys/vicuna-13b-v1.3' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run vicuna-33b-v1.3
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'lmsys/vicuna-33b-v1.3' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'lmsys/vicuna-33b-v1.3' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -30,8 +30,8 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Yi-6B-Chat
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path '01-ai/Yi-6B-Chat' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path '01-ai/Yi-6B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run Yi-34B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path '01-ai/Yi-34B-Chat' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path '01-ai/Yi-34B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'