From 8b08ad408bcda7c929dbe097cbbba11e0be9e573 Mon Sep 17 00:00:00 2001
From: Ziteng Zhang <87107332+Jasonzzt@users.noreply.github.com>
Date: Thu, 25 Jan 2024 17:43:49 +0800
Subject: [PATCH] Add batch_size in all_in_one (#9999)

Add batch_size in all_in_one, except run_native_int4
---
 python/llm/dev/benchmark/all-in-one/README.md |  1 +
 .../llm/dev/benchmark/all-in-one/config.yaml  |  1 +
 python/llm/dev/benchmark/all-in-one/run.py    | 82 ++++++++++++-------
 3 files changed, 53 insertions(+), 31 deletions(-)

diff --git a/python/llm/dev/benchmark/all-in-one/README.md b/python/llm/dev/benchmark/all-in-one/README.md
index aa78ea7f..2c136321 100644
--- a/python/llm/dev/benchmark/all-in-one/README.md
+++ b/python/llm/dev/benchmark/all-in-one/README.md
@@ -32,6 +32,7 @@ warm_up: 1
 num_trials: 3
 num_beams: 1 # default to greedy search
 low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
+batch_size: 1 # default to 1
 in_out_pairs:
   - '32-32'
   - '1024-128'
diff --git a/python/llm/dev/benchmark/all-in-one/config.yaml b/python/llm/dev/benchmark/all-in-one/config.yaml
index 580f1d42..deb1b501 100644
--- a/python/llm/dev/benchmark/all-in-one/config.yaml
+++ b/python/llm/dev/benchmark/all-in-one/config.yaml
@@ -8,6 +8,7 @@ warm_up: 1
 num_trials: 3
 num_beams: 1 # default to greedy search
 low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
+batch_size: 1 # default to 1
 in_out_pairs:
   - '32-32'
   - '1024-128'
diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index c1cc2a76..bc662f4a 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -62,31 +62,31 @@ def run_model_in_thread(model, in_out, tokenizer, result, warm_up, num_beams, in
                 result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time, actual_in_len, actual_out_len, model.peak_memory])


-def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1, num_trials=3, num_beams=1, low_bit='sym_int4', cpu_embedding=False):
+def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1, num_trials=3, num_beams=1, low_bit='sym_int4', cpu_embedding=False, batch_size=1):
     # TODO: make a parameter
     result= {}
     if test_api == 'transformer_int4':
-        result = run_transformer_int4(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit)
+        result = run_transformer_int4(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size)
     elif test_api == 'native_int4':
         run_native_int4(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials)
     elif test_api == 'optimize_model':
-        result = run_optimize_model(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit)
+        result = run_optimize_model(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size)
     elif test_api == 'transformer_int4_gpu':
-        result = run_transformer_int4_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit)
+        result = run_transformer_int4_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size)
     elif test_api == 'optimize_model_gpu':
-        result = run_optimize_model_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit)
+        result = run_optimize_model_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size)
     elif test_api == 'pytorch_autocast_bf16':
-        result = run_pytorch_autocast_bf16(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams)
+        result = run_pytorch_autocast_bf16(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, batch_size)
     elif test_api == 'ipex_fp16_gpu':
-        result = run_ipex_fp16_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams)
+        result = run_ipex_fp16_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, batch_size)
     elif test_api == "bigdl_fp16_gpu":
-        result = result = run_bigdl_fp16_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams)
+        result = run_bigdl_fp16_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, batch_size)
     elif test_api == 'deepspeed_transformer_int4_cpu':
-        result = run_deepspeed_transformer_int4_cpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit)
+        result = run_deepspeed_transformer_int4_cpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size)
     elif test_api == 'transformer_int4_gpu_win':
-        result = run_transformer_int4_gpu_win(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, cpu_embedding)
+        result = run_transformer_int4_gpu_win(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, cpu_embedding, batch_size)
     elif test_api == 'transformer_autocast_bf16':
-        result = run_transformer_autocast_bf16(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams)
+        result = run_transformer_autocast_bf16(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, batch_size)

     for in_out_pair in in_out_pairs:
         if result and result[in_out_pair]:
@@ -159,7 +159,8 @@ def run_transformer_int4(repo_id,
                          warm_up,
                          num_trials,
                          num_beams,
-                         low_bit):
+                         low_bit,
+                         batch_size):
     from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
     from transformers import AutoTokenizer, LlamaTokenizer

@@ -201,7 +202,8 @@ def run_transformer_int4(repo_id,
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
-            input_ids = tokenizer.encode(true_str, return_tensors="pt")
+            input_list = [true_str] * batch_size
+            input_ids = tokenizer(input_list, return_tensors="pt").input_ids
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
@@ -223,7 +225,8 @@ def run_pytorch_autocast_bf16(repo_id,
                               in_out_pairs,
                               warm_up,
                               num_trials,
-                              num_beams):
+                              num_beams,
+                              batch_size):
     from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, LlamaTokenizer

     model_path = get_model_path(repo_id, local_model_hub)
@@ -263,7 +266,8 @@ def run_pytorch_autocast_bf16(repo_id,
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
-            input_ids = tokenizer.encode(true_str, return_tensors="pt")
+            input_list = [true_str] * batch_size
+            input_ids = tokenizer(input_list, return_tensors="pt").input_ids
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
             print("input tokens: {}".format(input_ids.shape[1]))
@@ -287,7 +291,8 @@ def run_optimize_model(repo_id,
                        warm_up,
                        num_trials,
                        num_beams,
-                       low_bit):
+                       low_bit,
+                       batch_size):
     from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer
     from bigdl.llm import optimize_model

@@ -331,7 +336,8 @@ def run_optimize_model(repo_id,
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
-            input_ids = tokenizer.encode(true_str, return_tensors="pt")
+            input_list = [true_str] * batch_size
+            input_ids = tokenizer(input_list, return_tensors="pt").input_ids
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
@@ -355,7 +361,8 @@ def run_transformer_int4_gpu(repo_id,
                              warm_up,
                              num_trials,
                              num_beams,
-                             low_bit):
+                             low_bit,
+                             batch_size):
     from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
     from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
     import intel_extension_for_pytorch as ipex
@@ -418,7 +425,8 @@ def run_transformer_int4_gpu(repo_id,
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
-            input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+            input_list = [true_str] * batch_size
+            input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
             thread = threading.Thread(target=run_model_in_thread, args=(model, in_out, tokenizer, result, warm_up, num_beams, input_ids, out_len, actual_in_len, num_trials))
@@ -438,7 +446,8 @@ def run_optimize_model_gpu(repo_id,
                            warm_up,
                            num_trials,
                            num_beams,
-                           low_bit):
+                           low_bit,
+                           batch_size):
     from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
     from bigdl.llm import optimize_model
     import intel_extension_for_pytorch as ipex
@@ -490,7 +499,8 @@ def run_optimize_model_gpu(repo_id,
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
-            input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+            input_list = [true_str] * batch_size
+            input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
@@ -517,7 +527,8 @@ def run_ipex_fp16_gpu(repo_id,
                       in_out_pairs,
                       warm_up,
                       num_trials,
-                      num_beams):
+                      num_beams,
+                      batch_size):
     from transformers import AutoModel, AutoModelForCausalLM
     from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
     import intel_extension_for_pytorch as ipex
@@ -562,7 +573,8 @@ def run_ipex_fp16_gpu(repo_id,
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
-            input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+            input_list = [true_str] * batch_size
+            input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
@@ -589,7 +601,8 @@ def run_bigdl_fp16_gpu(repo_id,
                        in_out_pairs,
                        warm_up,
                        num_trials,
-                       num_beams):
+                       num_beams,
+                       batch_size):
     from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
     from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
     import intel_extension_for_pytorch as ipex
@@ -637,7 +650,8 @@ def run_bigdl_fp16_gpu(repo_id,
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
-            input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+            input_list = [true_str] * batch_size
+            input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
@@ -664,7 +678,8 @@ def run_deepspeed_transformer_int4_cpu(repo_id,
                                        warm_up,
                                        num_trials,
                                        num_beams,
-                                       low_bit):
+                                       low_bit,
+                                       batch_size):
     from transformers import AutoModelForCausalLM, LlamaTokenizer, AutoTokenizer
     import deepspeed
     from bigdl.llm import optimize_model
@@ -726,7 +741,8 @@ def run_deepspeed_transformer_int4_cpu(repo_id,
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
-            input_ids = tokenizer.encode(true_str, return_tensors="pt")
+            input_list = [true_str] * batch_size
+            input_ids = tokenizer(input_list, return_tensors="pt").input_ids
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
@@ -753,7 +769,8 @@ def run_transformer_int4_gpu_win(repo_id,
                                  num_trials,
                                  num_beams,
                                  low_bit,
-                                 cpu_embedding):
+                                 cpu_embedding,
+                                 batch_size):
     from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
     from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
     import intel_extension_for_pytorch as ipex
@@ -811,7 +828,8 @@ def run_transformer_int4_gpu_win(repo_id,
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
-            input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+            input_list = [true_str] * batch_size
+            input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
@@ -844,7 +862,8 @@ def run_transformer_autocast_bf16( repo_id,
                                    in_out_pairs,
                                    warm_up,
                                    num_trials,
-                                   num_beams):
+                                   num_beams,
+                                   batch_size):
     from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
     from transformers import AutoTokenizer, LlamaTokenizer

@@ -887,7 +906,8 @@ def run_transformer_autocast_bf16( repo_id,
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
-            input_ids = tokenizer.encode(true_str, return_tensors="pt")
+            input_list = [true_str] * batch_size
+            input_ids = tokenizer(input_list, return_tensors="pt").input_ids
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
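
Note: every `run_*` API above applies the same batching pattern: the prompt is truncated to at most `in_len` tokens, decoded back to a string, repeated `batch_size` times, and re-tokenized, so `input_ids` ends up with shape `(batch_size, actual_in_len)` and no padding is needed because every row is identical. The snippet below is a minimal, illustrative sketch of that pattern outside the benchmark harness; the model id, prompt string, and generation settings are placeholders, not part of this patch.

```python
# Minimal sketch of the batching pattern added in this patch (illustrative only).
# Assumes a bigdl-llm sym_int4 model; the repo id and prompt are placeholders.
from bigdl.llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_path = "meta-llama/Llama-2-7b-chat-hf"   # placeholder model id
batch_size = 4
in_len, out_len = 32, 32

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int4',
                                             trust_remote_code=True, use_cache=True)

# Placeholder prompt; the benchmark reads its prompts from files instead.
input_str = "Once upon a time, there existed a little girl who liked to have adventures."

# Truncate to at most in_len tokens, then decode back to text so the batched
# re-tokenization below sees the same token budget in every row.
input_ids = tokenizer.encode(input_str, return_tensors="pt")[:, :in_len]
true_str = tokenizer.batch_decode(input_ids)[0]

# The batch is simply the same prompt repeated batch_size times, so all rows
# have identical length and input_ids has shape (batch_size, actual_in_len).
input_list = [true_str] * batch_size
input_ids = tokenizer(input_list, return_tensors="pt").input_ids

output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len, num_beams=1)
print(tokenizer.batch_decode(output_ids, skip_special_tokens=True))
```

With the default `batch_size: 1` in config.yaml the benchmark behaves exactly as before this patch; larger values only widen the first dimension of `input_ids`.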