Add batch_size in all_in_one (#9999)

Add batch_size to all test APIs in all_in_one, except run_native_int4
Ziteng Zhang, 2024-01-25 17:43:49 +08:00, committed by GitHub
parent 093e6f8f73
commit 8b08ad408b
3 changed files with 53 additions and 31 deletions


@@ -32,6 +32,7 @@ warm_up: 1
num_trials: 3
num_beams: 1 # default to greedy search
low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
+batch_size: 1 # default to 1
in_out_pairs:
- '32-32'
- '1024-128'


@@ -8,6 +8,7 @@ warm_up: 1
num_trials: 3
num_beams: 1 # default to greedy search
low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
+batch_size: 1 # default to 1
in_out_pairs:
- '32-32'
- '1024-128'
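Both config files now document batch_size with a default of 1. A minimal sketch of how the new key could be consumed by the benchmark driver (the config-loading code is not part of this diff, so the yaml.safe_load call and variable names below are assumptions for illustration only):

import yaml

# Illustrative loader; the real all-in-one driver may read config.yaml differently.
with open('config.yaml') as f:
    conf = yaml.safe_load(f)

# Fall back to the documented default of 1 when the key is absent.
batch_size = conf.get('batch_size', 1)
print(f"benchmarking with batch_size={batch_size}")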


@@ -62,31 +62,31 @@ def run_model_in_thread(model, in_out, tokenizer, result, warm_up, num_beams, in
result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
actual_in_len, actual_out_len, model.peak_memory])
-def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1, num_trials=3, num_beams=1, low_bit='sym_int4', cpu_embedding=False):
+def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1, num_trials=3, num_beams=1, low_bit='sym_int4', cpu_embedding=False, batch_size=1):
# TODO: make a parameter
result= {}
if test_api == 'transformer_int4':
-result = run_transformer_int4(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit)
+result = run_transformer_int4(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size)
elif test_api == 'native_int4':
run_native_int4(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials)
elif test_api == 'optimize_model':
-result = run_optimize_model(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit)
+result = run_optimize_model(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size)
elif test_api == 'transformer_int4_gpu':
-result = run_transformer_int4_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit)
+result = run_transformer_int4_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size)
elif test_api == 'optimize_model_gpu':
-result = run_optimize_model_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit)
+result = run_optimize_model_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size)
elif test_api == 'pytorch_autocast_bf16':
-result = run_pytorch_autocast_bf16(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams)
+result = run_pytorch_autocast_bf16(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, batch_size)
elif test_api == 'ipex_fp16_gpu':
-result = run_ipex_fp16_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams)
+result = run_ipex_fp16_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, batch_size)
elif test_api == "bigdl_fp16_gpu":
-result = result = run_bigdl_fp16_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams)
+result = result = run_bigdl_fp16_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, batch_size)
elif test_api == 'deepspeed_transformer_int4_cpu':
-result = run_deepspeed_transformer_int4_cpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit)
+result = run_deepspeed_transformer_int4_cpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size)
elif test_api == 'transformer_int4_gpu_win':
-result = run_transformer_int4_gpu_win(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, cpu_embedding)
+result = run_transformer_int4_gpu_win(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, cpu_embedding, batch_size)
elif test_api == 'transformer_autocast_bf16':
-result = run_transformer_autocast_bf16(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams)
+result = run_transformer_autocast_bf16(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, batch_size)
for in_out_pair in in_out_pairs:
if result and result[in_out_pair]:
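With the dispatcher above, every test API except native_int4 now receives batch_size. A hedged usage sketch, assuming the benchmark's run.py is importable from the all-in-one directory; the model id is a placeholder, not part of this change:

from run import run_model  # assumption: run.py is on the import path

run_model(repo_id='meta-llama/Llama-2-7b-chat-hf',  # placeholder model id
          test_api='transformer_int4',
          in_out_pairs=['32-32', '1024-128'],
          warm_up=1,
          num_trials=3,
          num_beams=1,
          low_bit='sym_int4',
          batch_size=2)  # new argument: run two identical prompts per trial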
@@ -159,7 +159,8 @@ def run_transformer_int4(repo_id,
warm_up,
num_trials,
num_beams,
-low_bit):
+low_bit,
+batch_size):
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, LlamaTokenizer
@@ -201,7 +202,8 @@ def run_transformer_int4(repo_id,
input_ids = tokenizer.encode(input_str, return_tensors="pt")
input_ids = input_ids[:, :in_len]
true_str = tokenizer.batch_decode(input_ids)[0]
-input_ids = tokenizer.encode(true_str, return_tensors="pt")
+input_list = [true_str] * batch_size
+input_ids = tokenizer(input_list, return_tensors="pt").input_ids
actual_in_len = input_ids.shape[1]
result[in_out] = []
for i in range(num_trials + warm_up):
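The per-backend change is the same everywhere: the truncated prompt is decoded back to text, replicated batch_size times, and re-tokenized as a batch. A small self-contained sketch of that pattern (the gpt2 tokenizer and the prompt string are stand-ins for illustration):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('gpt2')  # stand-in tokenizer
true_str = 'Once upon a time, there existed a little girl who liked to have adventures.'
batch_size = 4

# Identical prompts tokenize to identical lengths, so no padding is required.
input_list = [true_str] * batch_size
input_ids = tokenizer(input_list, return_tensors='pt').input_ids

# Shape is [batch_size, actual_in_len]; the benchmark passes the whole batch to model.generate().
print(input_ids.shape)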
@@ -223,7 +225,8 @@ def run_pytorch_autocast_bf16(repo_id,
in_out_pairs,
warm_up,
num_trials,
-num_beams):
+num_beams,
+batch_size):
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, LlamaTokenizer
model_path = get_model_path(repo_id, local_model_hub)
@@ -263,7 +266,8 @@ def run_pytorch_autocast_bf16(repo_id,
input_ids = tokenizer.encode(input_str, return_tensors="pt")
input_ids = input_ids[:, :in_len]
true_str = tokenizer.batch_decode(input_ids)[0]
-input_ids = tokenizer.encode(true_str, return_tensors="pt")
+input_list = [true_str] * batch_size
+input_ids = tokenizer(input_list, return_tensors="pt").input_ids
actual_in_len = input_ids.shape[1]
result[in_out] = []
print("input tokens: {}".format(input_ids.shape[1]))
@@ -287,7 +291,8 @@ def run_optimize_model(repo_id,
warm_up,
num_trials,
num_beams,
-low_bit):
+low_bit,
+batch_size):
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer
from bigdl.llm import optimize_model
@@ -331,7 +336,8 @@ def run_optimize_model(repo_id,
input_ids = tokenizer.encode(input_str, return_tensors="pt")
input_ids = input_ids[:, :in_len]
true_str = tokenizer.batch_decode(input_ids)[0]
-input_ids = tokenizer.encode(true_str, return_tensors="pt")
+input_list = [true_str] * batch_size
+input_ids = tokenizer(input_list, return_tensors="pt").input_ids
actual_in_len = input_ids.shape[1]
result[in_out] = []
for i in range(num_trials + warm_up):
@@ -355,7 +361,8 @@ def run_transformer_int4_gpu(repo_id,
warm_up,
num_trials,
num_beams,
-low_bit):
+low_bit,
+batch_size):
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
import intel_extension_for_pytorch as ipex
@@ -418,7 +425,8 @@ def run_transformer_int4_gpu(repo_id,
input_ids = tokenizer.encode(input_str, return_tensors="pt")
input_ids = input_ids[:, :in_len]
true_str = tokenizer.batch_decode(input_ids)[0]
-input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+input_list = [true_str] * batch_size
+input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
actual_in_len = input_ids.shape[1]
result[in_out] = []
thread = threading.Thread(target=run_model_in_thread, args=(model, in_out, tokenizer, result, warm_up, num_beams, input_ids, out_len, actual_in_len, num_trials))
@@ -438,7 +446,8 @@ def run_optimize_model_gpu(repo_id,
warm_up,
num_trials,
num_beams,
-low_bit):
+low_bit,
+batch_size):
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
from bigdl.llm import optimize_model
import intel_extension_for_pytorch as ipex
@@ -490,7 +499,8 @@ def run_optimize_model_gpu(repo_id,
input_ids = tokenizer.encode(input_str, return_tensors="pt")
input_ids = input_ids[:, :in_len]
true_str = tokenizer.batch_decode(input_ids)[0]
-input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+input_list = [true_str] * batch_size
+input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
actual_in_len = input_ids.shape[1]
result[in_out] = []
for i in range(num_trials + warm_up):
@@ -517,7 +527,8 @@ def run_ipex_fp16_gpu(repo_id,
in_out_pairs,
warm_up,
num_trials,
-num_beams):
+num_beams,
+batch_size):
from transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
import intel_extension_for_pytorch as ipex
@@ -562,7 +573,8 @@ def run_ipex_fp16_gpu(repo_id,
input_ids = tokenizer.encode(input_str, return_tensors="pt")
input_ids = input_ids[:, :in_len]
true_str = tokenizer.batch_decode(input_ids)[0]
-input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+input_list = [true_str] * batch_size
+input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
actual_in_len = input_ids.shape[1]
result[in_out] = []
for i in range(num_trials + warm_up):
@@ -589,7 +601,8 @@ def run_bigdl_fp16_gpu(repo_id,
in_out_pairs,
warm_up,
num_trials,
-num_beams):
+num_beams,
+batch_size):
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
import intel_extension_for_pytorch as ipex
@@ -637,7 +650,8 @@ def run_bigdl_fp16_gpu(repo_id,
input_ids = tokenizer.encode(input_str, return_tensors="pt")
input_ids = input_ids[:, :in_len]
true_str = tokenizer.batch_decode(input_ids)[0]
-input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+input_list = [true_str] * batch_size
+input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
actual_in_len = input_ids.shape[1]
result[in_out] = []
for i in range(num_trials + warm_up):
@@ -664,7 +678,8 @@ def run_deepspeed_transformer_int4_cpu(repo_id,
warm_up,
num_trials,
num_beams,
-low_bit):
+low_bit,
+batch_size):
from transformers import AutoModelForCausalLM, LlamaTokenizer, AutoTokenizer
import deepspeed
from bigdl.llm import optimize_model
@@ -726,7 +741,8 @@ def run_deepspeed_transformer_int4_cpu(repo_id,
input_ids = tokenizer.encode(input_str, return_tensors="pt")
input_ids = input_ids[:, :in_len]
true_str = tokenizer.batch_decode(input_ids)[0]
-input_ids = tokenizer.encode(true_str, return_tensors="pt")
+input_list = [true_str] * batch_size
+input_ids = tokenizer(input_list, return_tensors="pt").input_ids
actual_in_len = input_ids.shape[1]
result[in_out] = []
for i in range(num_trials + warm_up):
@@ -753,7 +769,8 @@ def run_transformer_int4_gpu_win(repo_id,
num_trials,
num_beams,
low_bit,
-cpu_embedding):
+cpu_embedding,
+batch_size):
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
import intel_extension_for_pytorch as ipex
@@ -811,7 +828,8 @@ def run_transformer_int4_gpu_win(repo_id,
input_ids = tokenizer.encode(input_str, return_tensors="pt")
input_ids = input_ids[:, :in_len]
true_str = tokenizer.batch_decode(input_ids)[0]
-input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+input_list = [true_str] * batch_size
+input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
actual_in_len = input_ids.shape[1]
result[in_out] = []
for i in range(num_trials + warm_up):
@@ -844,7 +862,8 @@ def run_transformer_autocast_bf16( repo_id,
in_out_pairs,
warm_up,
num_trials,
-num_beams):
+num_beams,
+batch_size):
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, LlamaTokenizer
@@ -887,7 +906,8 @@ def run_transformer_autocast_bf16( repo_id,
input_ids = tokenizer.encode(input_str, return_tensors="pt")
input_ids = input_ids[:, :in_len]
true_str = tokenizer.batch_decode(input_ids)[0]
-input_ids = tokenizer.encode(true_str, return_tensors="pt")
+input_list = [true_str] * batch_size
+input_ids = tokenizer(input_list, return_tensors="pt").input_ids
actual_in_len = input_ids.shape[1]
result[in_out] = []
for i in range(num_trials + warm_up):