Add precision option in PP inference examples (#11440)

binbin Deng 2024-06-27 09:24:27 +08:00 committed by GitHub
parent e9e8f9b4d4
commit 508c364a79
13 changed files with 31 additions and 27 deletions

@@ -52,6 +52,8 @@ pip install oneccl_bind_pt==2.1.100 --extra-index-url https://pytorch-extension.
 For optimal performance, it is recommended to set several environment variables. We provide example usages as following:
+> Note: INT4 optimization is applied to the model by default. You could specify other low bit optimizations (such as 'fp8' and 'fp6') through `--low-bit`.
 </details>
 <details>
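The added note is the only user-facing description of the new flag, so a short illustration may help: the string passed to `--low-bit` is handed to the loader unchanged. A minimal sketch, assuming the `ipex_llm.transformers` loader these examples import (the model id and stage count below are placeholders, not part of this diff):

from ipex_llm.transformers import AutoModelForCausalLM

# 'sym_int4' (the default) reproduces the previous load_in_4bit=True behaviour;
# 'fp8' and 'fp6' are the alternatives mentioned in the note above.
model = AutoModelForCausalLM.from_pretrained(
    'meta-llama/Llama-2-7b-chat-hf',   # placeholder model id
    load_in_low_bit='fp8',             # value normally supplied via --low-bit
    optimize_model=True,
    trust_remote_code=True,
    use_cache=True,
    pipeline_parallel_stages=2)        # 2 GPUs, matching NUM_GPUS=2 in the run scripts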

@@ -34,16 +34,18 @@ if __name__ == '__main__':
                         help='Prompt to infer')
     parser.add_argument('--n-predict', type=int, default=32,
                         help='Max tokens to predict')
+    parser.add_argument('--low-bit', type=str, default='sym_int4', help='The quantization type the model will convert to.')
     parser.add_argument('--gpu-num', type=int, default=2, help='GPU number to use')
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
+    low_bit = args.low_bit
     # Load model in 4 bit,
     # which convert the relevant layers in the model into INT4 format
     try:
         model = AutoModelForCausalLM.from_pretrained(model_path,
-                                                     load_in_4bit=True,
+                                                     load_in_low_bit=low_bit,
                                                      optimize_model=True,
                                                      trust_remote_code=True,
                                                      use_cache=True,
@@ -51,7 +53,7 @@ if __name__ == '__main__':
                                                      pipeline_parallel_stages=args.gpu_num)
     except:
         model = AutoModel.from_pretrained(model_path,
-                                          load_in_4bit=True,
+                                          load_in_low_bit=low_bit,
                                           optimize_model=True,
                                           trust_remote_code=True,
                                           use_cache=True,
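Condensed, the change to each example's generate.py is the argument flow below: parse `--low-bit` once and forward it to `from_pretrained`, falling back to `AutoModel` when `AutoModelForCausalLM` cannot load the architecture. A self-contained sketch of that pattern (the import path is assumed from the existing examples; prompt handling and generation are omitted):

import argparse

from ipex_llm.transformers import AutoModel, AutoModelForCausalLM

parser = argparse.ArgumentParser()
parser.add_argument('--repo-id-or-model-path', type=str, required=True)
parser.add_argument('--low-bit', type=str, default='sym_int4',
                    help='The quantization type the model will convert to.')
parser.add_argument('--gpu-num', type=int, default=2, help='GPU number to use')
args = parser.parse_args()

# Both loaders take the same keyword set, so build it once;
# load_in_low_bit replaces the old hard-coded load_in_4bit=True.
load_kwargs = dict(load_in_low_bit=args.low_bit,
                   optimize_model=True,
                   trust_remote_code=True,
                   use_cache=True,
                   pipeline_parallel_stages=args.gpu_num)
try:
    model = AutoModelForCausalLM.from_pretrained(args.repo_id_or_model_path, **load_kwargs)
except Exception:
    # Mirrors the bare except in the example: some architectures load only via AutoModel.
    model = AutoModel.from_pretrained(args.repo_id_or_model_path, **load_kwargs)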

@@ -29,8 +29,8 @@ NUM_GPUS=2 # number of used GPU
 # To run Baichuan2-7B-Chat
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'baichuan-inc/Baichuan2-7B-Chat' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'baichuan-inc/Baichuan2-7B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 # # To run Baichuan2-13B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'baichuan-inc/Baichuan2-13B-Chat' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'baichuan-inc/Baichuan2-13B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -28,4 +28,4 @@ export TORCH_LLM_ALLREDUCE=0
 NUM_GPUS=2 # number of used GPU
 # To run chatglm3-6b
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'THUDM/chatglm3-6b' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'THUDM/chatglm3-6b' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -30,12 +30,12 @@ NUM_GPUS=2 # number of used GPU
 # To run CodeLlama-7b-Instruct-hf
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'codellama/CodeLlama-7b-Instruct-hf' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'codellama/CodeLlama-7b-Instruct-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 # To run CodeLlama-13b-Instruct-hf
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'codellama/CodeLlama-13b-Instruct-hf' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'codellama/CodeLlama-13b-Instruct-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 # To run CodeLlama-34b-Instruct-hf
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'codellama/CodeLlama-34b-Instruct-hf' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'codellama/CodeLlama-34b-Instruct-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -30,12 +30,12 @@ NUM_GPUS=2 # number of used GPU
 # To run Llama-2-7b-chat-hf
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 # # To run Llama-2-13b-chat-hf
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'meta-llama/Llama-2-13b-chat-hf' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'meta-llama/Llama-2-13b-chat-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 # # To run Meta-Llama-3-8B-Instruct
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'meta-llama/Meta-Llama-3-8B-Instruct' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'meta-llama/Meta-Llama-3-8B-Instruct' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -30,8 +30,8 @@ NUM_GPUS=2 # number of used GPU
 # To run Mistral-7B-v0.1
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'mistralai/Mistral-7B-v0.1' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'mistralai/Mistral-7B-v0.1' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 # To run Mixtral-8x7B-Instruct-v0.1
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'mistralai/Mixtral-8x7B-Instruct-v0.1' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'mistralai/Mixtral-8x7B-Instruct-v0.1' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -30,8 +30,8 @@ NUM_GPUS=2 # number of used GPU
 # To run Phi-3-medium-4k-instruct
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'microsoft/Phi-3-medium-4k-instruct' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'microsoft/Phi-3-medium-4k-instruct' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 # # To run Phi-3-mini-4k-instruct
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'microsoft/Phi-3-mini-4k-instruct' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'microsoft/Phi-3-mini-4k-instruct' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -29,20 +29,20 @@ NUM_GPUS=2 # number of used GPU
 # To run Qwen1.5-7B-Chat
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-7B-Chat' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-7B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 # # To run Qwen1.5-14B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-14B-Chat' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-14B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 # # To run Qwen1.5-32B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-32B-Chat' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-32B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 # # To run Qwen1.5-MoE-A2.7B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-MoE-A2.7B-Chat' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-MoE-A2.7B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 # # To run CodeQwen1.5-7B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'Qwen/CodeQwen1.5-7B-Chat' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'Qwen/CodeQwen1.5-7B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -29,4 +29,4 @@ NUM_GPUS=2 # number of used GPU
 # To run Qwen2-7B-Instruct
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'Qwen/Qwen2-7B-Instruct' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'Qwen/Qwen2-7B-Instruct' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -30,4 +30,4 @@ NUM_GPUS=2 # number of used GPU
 # To run SOLAR-10.7B-Instruct-v1.0
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'upstage/SOLAR-10.7B-Instruct-v1.0' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'upstage/SOLAR-10.7B-Instruct-v1.0' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -30,12 +30,12 @@ NUM_GPUS=2 # number of used GPU
 # To run vicuna-7b-v1.3
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path 'lmsys/vicuna-7b-v1.3' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path 'lmsys/vicuna-7b-v1.3' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 # To run vicuna-13b-v1.3
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'lmsys/vicuna-13b-v1.3' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'lmsys/vicuna-13b-v1.3' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 # To run vicuna-33b-v1.3
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'lmsys/vicuna-33b-v1.3' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'lmsys/vicuna-33b-v1.3' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -30,8 +30,8 @@ NUM_GPUS=2 # number of used GPU
 # To run Yi-6B-Chat
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-generate.py --repo-id-or-model-path '01-ai/Yi-6B-Chat' --gpu-num $NUM_GPUS
+generate.py --repo-id-or-model-path '01-ai/Yi-6B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 # To run Yi-34B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path '01-ai/Yi-34B-Chat' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path '01-ai/Yi-34B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'