Add precision option in PP inference examples (#11440)
parent e9e8f9b4d4
commit 508c364a79

13 changed files with 31 additions and 27 deletions

@@ -52,6 +52,8 @@ pip install oneccl_bind_pt==2.1.100 --extra-index-url https://pytorch-extension.
 
 For optimal performance, it is recommended to set several environment variables. We provide example usages as following:
 
+> Note: INT4 optimization is applied to the model by default. You could specify other low bit optimizations (such as 'fp8' and 'fp6') through `--low-bit`.
+
 </details>
 
 <details>
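As the added note says, `sym_int4` (INT4) stays the default and other precisions are opted in through `--low-bit`. A minimal sketch of such a launch, reusing the torchrun command and the chatglm3-6b model path from the run scripts changed below; `'fp8'` is one of the values the note names:

```bash
# Sketch: pipeline-parallel inference at FP8 instead of the default sym_int4.
# Launcher and flags mirror the run scripts in this commit; only the
# --low-bit value differs.
NUM_GPUS=2  # number of used GPU
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
  generate.py --repo-id-or-model-path 'THUDM/chatglm3-6b' --gpu-num $NUM_GPUS --low-bit 'fp8'
```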

@@ -34,16 +34,18 @@ if __name__ == '__main__':
                         help='Prompt to infer')
     parser.add_argument('--n-predict', type=int, default=32,
                         help='Max tokens to predict')
+    parser.add_argument('--low-bit', type=str, default='sym_int4', help='The quantization type the model will convert to.')
     parser.add_argument('--gpu-num', type=int, default=2, help='GPU number to use')
 
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
+    low_bit = args.low_bit
 
     # Load model in 4 bit,
     # which convert the relevant layers in the model into INT4 format
     try:
         model = AutoModelForCausalLM.from_pretrained(model_path,
-                                                     load_in_4bit=True,
+                                                     load_in_low_bit=low_bit,
                                                      optimize_model=True,
                                                      trust_remote_code=True,
                                                      use_cache=True,

@@ -51,7 +53,7 @@ if __name__ == '__main__':
                                                      pipeline_parallel_stages=args.gpu_num)
     except:
         model = AutoModel.from_pretrained(model_path,
-                                          load_in_4bit=True,
+                                          load_in_low_bit=low_bit,
                                           optimize_model=True,
                                           trust_remote_code=True,
                                           use_cache=True,
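Since `--low-bit` defaults to `'sym_int4'`, launch commands that omit the flag keep the previous INT4 behavior; swapping `load_in_4bit=True` for `load_in_low_bit=low_bit` only generalizes that path. A hedged sketch of both cases, borrowing the Llama-2 invocation from the run scripts below:

```bash
# Omitting --low-bit is equivalent to --low-bit 'sym_int4' (the argparse default).
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node 2 \
  generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num 2

# Selecting another precision explicitly, e.g. fp6 (one of the values the README note mentions):
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node 2 \
  generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num 2 --low-bit 'fp6'
```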

@@ -29,8 +29,8 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Baichuan2-7B-Chat
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'baichuan-inc/Baichuan2-7B-Chat' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'baichuan-inc/Baichuan2-7B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Baichuan2-13B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'baichuan-inc/Baichuan2-13B-Chat' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'baichuan-inc/Baichuan2-13B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -28,4 +28,4 @@ export TORCH_LLM_ALLREDUCE=0
 NUM_GPUS=2 # number of used GPU
 # To run chatglm3-6b
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'THUDM/chatglm3-6b' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'THUDM/chatglm3-6b' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -30,12 +30,12 @@ NUM_GPUS=2 # number of used GPU
 
 # To run CodeLlama-7b-Instruct-hf
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'codellama/CodeLlama-7b-Instruct-hf' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'codellama/CodeLlama-7b-Instruct-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run CodeLlama-13b-Instruct-hf
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'codellama/CodeLlama-13b-Instruct-hf' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'codellama/CodeLlama-13b-Instruct-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run CodeLlama-34b-Instruct-hf
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'codellama/CodeLlama-34b-Instruct-hf' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'codellama/CodeLlama-34b-Instruct-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -30,12 +30,12 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Llama-2-7b-chat-hf
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Llama-2-13b-chat-hf
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'meta-llama/Llama-2-13b-chat-hf' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'meta-llama/Llama-2-13b-chat-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Meta-Llama-3-8B-Instruct
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'meta-llama/Meta-Llama-3-8B-Instruct' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'meta-llama/Meta-Llama-3-8B-Instruct' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -30,8 +30,8 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Mistral-7B-v0.1
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'mistralai/Mistral-7B-v0.1' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'mistralai/Mistral-7B-v0.1' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run Mixtral-8x7B-Instruct-v0.1
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'mistralai/Mixtral-8x7B-Instruct-v0.1' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'mistralai/Mixtral-8x7B-Instruct-v0.1' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -30,8 +30,8 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Phi-3-medium-4k-instruct
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'microsoft/Phi-3-medium-4k-instruct' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'microsoft/Phi-3-medium-4k-instruct' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Phi-3-mini-4k-instruct
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'microsoft/Phi-3-mini-4k-instruct' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'microsoft/Phi-3-mini-4k-instruct' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -29,20 +29,20 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Qwen1.5-7B-Chat
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-7B-Chat' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-7B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Qwen1.5-14B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-14B-Chat' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-14B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Qwen1.5-32B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-32B-Chat' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-32B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Qwen1.5-MoE-A2.7B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-MoE-A2.7B-Chat' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-MoE-A2.7B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run CodeQwen1.5-7B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'Qwen/CodeQwen1.5-7B-Chat' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'Qwen/CodeQwen1.5-7B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -29,4 +29,4 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Qwen2-7B-Instruct
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'Qwen/Qwen2-7B-Instruct' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'Qwen/Qwen2-7B-Instruct' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -30,4 +30,4 @@ NUM_GPUS=2 # number of used GPU
 
 # To run SOLAR-10.7B-Instruct-v1.0
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'upstage/SOLAR-10.7B-Instruct-v1.0' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'upstage/SOLAR-10.7B-Instruct-v1.0' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -30,12 +30,12 @@ NUM_GPUS=2 # number of used GPU
 
 # To run vicuna-7b-v1.3
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'lmsys/vicuna-7b-v1.3' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'lmsys/vicuna-7b-v1.3' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run vicuna-13b-v1.3
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'lmsys/vicuna-13b-v1.3' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'lmsys/vicuna-13b-v1.3' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run vicuna-33b-v1.3
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'lmsys/vicuna-33b-v1.3' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'lmsys/vicuna-33b-v1.3' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

@@ -30,8 +30,8 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Yi-6B-Chat
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path '01-ai/Yi-6B-Chat' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path '01-ai/Yi-6B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run Yi-34B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path '01-ai/Yi-34B-Chat' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path '01-ai/Yi-34B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'