Add batch_size in all_in_one (#9999)
Add batch_size in all_in_one, except run_native_int4
parent 093e6f8f73
commit 8b08ad408b
3 changed files with 53 additions and 31 deletions
@@ -32,6 +32,7 @@ warm_up: 1
 num_trials: 3
 num_beams: 1 # default to greedy search
 low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
+batch_size: 1 # default to 1
 in_out_pairs:
   - '32-32'
   - '1024-128'
@@ -8,6 +8,7 @@ warm_up: 1
 num_trials: 3
 num_beams: 1 # default to greedy search
 low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
+batch_size: 1 # default to 1
 in_out_pairs:
   - '32-32'
   - '1024-128'
@@ -62,31 +62,31 @@ def run_model_in_thread(model, in_out, tokenizer, result, warm_up, num_beams, in
     result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
                            actual_in_len, actual_out_len, model.peak_memory])
 
-def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1, num_trials=3, num_beams=1, low_bit='sym_int4', cpu_embedding=False):
+def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1, num_trials=3, num_beams=1, low_bit='sym_int4', cpu_embedding=False, batch_size=1):
     # TODO: make a parameter
     result= {}
     if test_api == 'transformer_int4':
-        result = run_transformer_int4(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit)
+        result = run_transformer_int4(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size)
     elif test_api == 'native_int4':
        run_native_int4(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials)
     elif test_api == 'optimize_model':
-        result = run_optimize_model(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit)
+        result = run_optimize_model(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size)
     elif test_api == 'transformer_int4_gpu':
-        result = run_transformer_int4_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit)
+        result = run_transformer_int4_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size)
     elif test_api == 'optimize_model_gpu':
-        result = run_optimize_model_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit)
+        result = run_optimize_model_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size)
     elif test_api == 'pytorch_autocast_bf16':
-        result = run_pytorch_autocast_bf16(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams)
+        result = run_pytorch_autocast_bf16(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, batch_size)
     elif test_api == 'ipex_fp16_gpu':
-        result = run_ipex_fp16_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams)
+        result = run_ipex_fp16_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, batch_size)
     elif test_api == "bigdl_fp16_gpu":
-        result = result = run_bigdl_fp16_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams)
+        result = result = run_bigdl_fp16_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, batch_size)
     elif test_api == 'deepspeed_transformer_int4_cpu':
-        result = run_deepspeed_transformer_int4_cpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit)
+        result = run_deepspeed_transformer_int4_cpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size)
     elif test_api == 'transformer_int4_gpu_win':
-        result = run_transformer_int4_gpu_win(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, cpu_embedding)
+        result = run_transformer_int4_gpu_win(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, cpu_embedding, batch_size)
     elif test_api == 'transformer_autocast_bf16':
-        result = run_transformer_autocast_bf16(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams)
+        result = run_transformer_autocast_bf16(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, batch_size)
 
     for in_out_pair in in_out_pairs:
         if result and result[in_out_pair]:
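
run_model simply forwards the new batch_size argument to whichever backend the test_api string selects; run_native_int4 is the one path left untouched, as the commit message notes. The sketch below is a hypothetical driver, not part of this commit: it assumes the YAML shown above is loaded with PyYAML, that this file sits next to run.py, and that the config keys map onto run_model's keyword arguments.

    # Hypothetical driver: load the benchmark config and forward batch_size
    # to run_model. Only run_model's signature comes from this commit.
    import yaml
    from run import run_model  # assumes run.py is importable from the current directory

    def run_from_config(config_path, repo_id, test_api):
        with open(config_path) as f:
            conf = yaml.safe_load(f)
        return run_model(repo_id,
                         test_api,
                         conf['in_out_pairs'],
                         warm_up=conf.get('warm_up', 1),
                         num_trials=conf.get('num_trials', 3),
                         num_beams=conf.get('num_beams', 1),
                         low_bit=conf.get('low_bit', 'sym_int4'),
                         batch_size=conf.get('batch_size', 1))  # new knob added by this commit
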
@@ -159,7 +159,8 @@ def run_transformer_int4(repo_id,
                          warm_up,
                          num_trials,
                          num_beams,
-                         low_bit):
+                         low_bit,
+                         batch_size):
     from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
     from transformers import AutoTokenizer, LlamaTokenizer
 
@@ -201,7 +202,8 @@ def run_transformer_int4(repo_id,
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
-            input_ids = tokenizer.encode(true_str, return_tensors="pt")
+            input_list = [true_str] * batch_size
+            input_ids = tokenizer(input_list, return_tensors="pt").input_ids
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
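
The same three-line change recurs in every remaining benchmark function: the prompt is truncated to in_len tokens, decoded back to a string, replicated batch_size times, and tokenized as a single batch, so input_ids gains a leading batch dimension while actual_in_len is still read from shape[1]. A standalone sketch of that pattern follows; the 'gpt2' checkpoint and the prompt string are placeholders, not values used by the benchmark.

    # Minimal illustration of the batched prompt preparation used above.
    from transformers import AutoModelForCausalLM, AutoTokenizer

    def prepare_batch(tokenizer, prompt, in_len, batch_size):
        ids = tokenizer.encode(prompt, return_tensors="pt")[:, :in_len]   # truncate to in_len tokens
        true_str = tokenizer.batch_decode(ids)[0]                         # round-trip the truncated prompt
        input_list = [true_str] * batch_size                              # replicate for the whole batch
        return tokenizer(input_list, return_tensors="pt").input_ids       # shape: (batch_size, actual_in_len)

    tokenizer = AutoTokenizer.from_pretrained("gpt2")    # placeholder model
    model = AutoModelForCausalLM.from_pretrained("gpt2")
    input_ids = prepare_batch(tokenizer, "The quick brown fox " * 32, in_len=32, batch_size=4)
    output = model.generate(input_ids, do_sample=False, max_new_tokens=32)
    print(output.shape)  # (4, actual_in_len + up to 32 generated tokens)
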
@@ -223,7 +225,8 @@ def run_pytorch_autocast_bf16(repo_id,
                               in_out_pairs,
                               warm_up,
                               num_trials,
-                              num_beams):
+                              num_beams,
+                              batch_size):
     from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, LlamaTokenizer
 
     model_path = get_model_path(repo_id, local_model_hub)
@@ -263,7 +266,8 @@ def run_pytorch_autocast_bf16(repo_id,
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
-            input_ids = tokenizer.encode(true_str, return_tensors="pt")
+            input_list = [true_str] * batch_size
+            input_ids = tokenizer(input_list, return_tensors="pt").input_ids
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
             print("input tokens: {}".format(input_ids.shape[1]))
@@ -287,7 +291,8 @@ def run_optimize_model(repo_id,
                        warm_up,
                        num_trials,
                        num_beams,
-                       low_bit):
+                       low_bit,
+                       batch_size):
     from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer
     from bigdl.llm import optimize_model
 
@@ -331,7 +336,8 @@ def run_optimize_model(repo_id,
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
-            input_ids = tokenizer.encode(true_str, return_tensors="pt")
+            input_list = [true_str] * batch_size
+            input_ids = tokenizer(input_list, return_tensors="pt").input_ids
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
@@ -355,7 +361,8 @@ def run_transformer_int4_gpu(repo_id,
                              warm_up,
                              num_trials,
                              num_beams,
-                             low_bit):
+                             low_bit,
+                             batch_size):
     from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
     from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
     import intel_extension_for_pytorch as ipex
@@ -418,7 +425,8 @@ def run_transformer_int4_gpu(repo_id,
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
-            input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+            input_list = [true_str] * batch_size
+            input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
             thread = threading.Thread(target=run_model_in_thread, args=(model, in_out, tokenizer, result, warm_up, num_beams, input_ids, out_len, actual_in_len, num_trials))
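
On the GPU paths the only extra step is moving the batched tensor to the XPU device before generation (run_transformer_int4_gpu additionally runs generation in a worker thread). The following is a hedged sketch of that flavour, assuming intel_extension_for_pytorch is installed and that model and tokenizer were created elsewhere, e.g. a bigdl-llm INT4 model already moved to 'xpu'; none of these names come from the diff itself.

    # Sketch of the XPU variant of the batching pattern; 'model' and 'tokenizer'
    # are assumed to exist already, with the model resident on the 'xpu' device.
    import torch
    import intel_extension_for_pytorch as ipex  # noqa: F401  (registers the 'xpu' device)

    def generate_batch_xpu(model, tokenizer, true_str, batch_size, out_len, num_beams=1):
        input_ids = tokenizer([true_str] * batch_size,
                              return_tensors="pt").input_ids.to('xpu')
        with torch.inference_mode():
            output = model.generate(input_ids, do_sample=False,
                                    max_new_tokens=out_len, num_beams=num_beams)
        torch.xpu.synchronize()  # flush queued kernels so wall-clock timing is meaningful
        return output.cpu()
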
@@ -438,7 +446,8 @@ def run_optimize_model_gpu(repo_id,
                            warm_up,
                            num_trials,
                            num_beams,
-                           low_bit):
+                           low_bit,
+                           batch_size):
     from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
     from bigdl.llm import optimize_model
     import intel_extension_for_pytorch as ipex
@@ -490,7 +499,8 @@ def run_optimize_model_gpu(repo_id,
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
-            input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+            input_list = [true_str] * batch_size
+            input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
@@ -517,7 +527,8 @@ def run_ipex_fp16_gpu(repo_id,
                       in_out_pairs,
                       warm_up,
                       num_trials,
-                      num_beams):
+                      num_beams,
+                      batch_size):
     from transformers import AutoModel, AutoModelForCausalLM
     from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
     import intel_extension_for_pytorch as ipex
@@ -562,7 +573,8 @@ def run_ipex_fp16_gpu(repo_id,
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
-            input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+            input_list = [true_str] * batch_size
+            input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
@@ -589,7 +601,8 @@ def run_bigdl_fp16_gpu(repo_id,
                        in_out_pairs,
                        warm_up,
                        num_trials,
-                       num_beams):
+                       num_beams,
+                       batch_size):
     from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
     from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
     import intel_extension_for_pytorch as ipex
@@ -637,7 +650,8 @@ def run_bigdl_fp16_gpu(repo_id,
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
-            input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+            input_list = [true_str] * batch_size
+            input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
@@ -664,7 +678,8 @@ def run_deepspeed_transformer_int4_cpu(repo_id,
                                        warm_up,
                                        num_trials,
                                        num_beams,
-                                       low_bit):
+                                       low_bit,
+                                       batch_size):
     from transformers import AutoModelForCausalLM, LlamaTokenizer, AutoTokenizer
     import deepspeed
     from bigdl.llm import optimize_model
@@ -726,7 +741,8 @@ def run_deepspeed_transformer_int4_cpu(repo_id,
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
-            input_ids = tokenizer.encode(true_str, return_tensors="pt")
+            input_list = [true_str] * batch_size
+            input_ids = tokenizer(input_list, return_tensors="pt").input_ids
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
@@ -753,7 +769,8 @@ def run_transformer_int4_gpu_win(repo_id,
                                  num_trials,
                                  num_beams,
                                  low_bit,
-                                 cpu_embedding):
+                                 cpu_embedding,
+                                 batch_size):
     from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
     from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
     import intel_extension_for_pytorch as ipex
@@ -811,7 +828,8 @@ def run_transformer_int4_gpu_win(repo_id,
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
-            input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+            input_list = [true_str] * batch_size
+            input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
@@ -844,7 +862,8 @@ def run_transformer_autocast_bf16( repo_id,
                                    in_out_pairs,
                                    warm_up,
                                    num_trials,
-                                   num_beams):
+                                   num_beams,
+                                   batch_size):
     from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
     from transformers import AutoTokenizer, LlamaTokenizer
 
@@ -887,7 +906,8 @@ def run_transformer_autocast_bf16( repo_id,
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
-            input_ids = tokenizer.encode(true_str, return_tensors="pt")
+            input_list = [true_str] * batch_size
+            input_ids = tokenizer(input_list, return_tensors="pt").input_ids
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
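
Because the whole batch is generated in a single call, the timings that run_model_in_thread records now cover batch_size sequences at once, so throughput roughly scales with the batch size. The helper below is only a back-of-the-envelope reading of those numbers; it assumes first_cost is the first-token latency and rest_cost_mean the mean per-step latency, both in seconds, which the diff itself does not state.

    # Rough throughput estimate from the recorded timings; the units and the
    # assumption that every sequence emits out_len tokens are hypotheses.
    def estimate_throughput(first_cost, rest_cost_mean, out_len, batch_size):
        total_time = first_cost + rest_cost_mean * (out_len - 1)   # seconds for one trial
        generated_tokens = out_len * batch_size
        return generated_tokens / total_time                       # tokens per second

    print(estimate_throughput(0.35, 0.045, out_len=128, batch_size=4))
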