From 7897eb4b51f6858016aa6bfa3a7aaf7c8acf0994 Mon Sep 17 00:00:00 2001
From: binbin Deng <108676127+plusbang@users.noreply.github.com>
Date: Thu, 7 Sep 2023 18:08:17 +0800
Subject: [PATCH] LLM: add benchmark scripts on GPU (#8916)

---
 python/llm/dev/benchmark/all-in-one/README.md |  22 ++-
 .../llm/dev/benchmark/all-in-one/config.yaml  |   3 +
 .../llm/dev/benchmark/all-in-one/run-arc.sh   |   5 +
 python/llm/dev/benchmark/all-in-one/run.py    | 176 +++++++++++++++++-
 4 files changed, 199 insertions(+), 7 deletions(-)
 create mode 100644 python/llm/dev/benchmark/all-in-one/run-arc.sh

diff --git a/python/llm/dev/benchmark/all-in-one/README.md b/python/llm/dev/benchmark/all-in-one/README.md
index 97d12896..56bfc730 100644
--- a/python/llm/dev/benchmark/all-in-one/README.md
+++ b/python/llm/dev/benchmark/all-in-one/README.md
@@ -6,12 +6,26 @@ Before running, make sure to have [bigdl-llm](../../../README.md) installed.
 ## Config
 Config YAML file has following format
 ```yaml
-model_name: model_path
-# following is an example, with model name llama2
-llama2: /path/to/llama2
+repo_id:
+  - 'THUDM/chatglm-6b'
+  - 'THUDM/chatglm2-6b'
+  - 'meta-llama/Llama-2-7b-chat-hf'
+local_model_hub: 'path to your local model hub'
+warm_up: 1
+num_trials: 3
+in_out_pairs:
+  - '32-32'
+  - '1024-128'
+test_api:
+  - "transformer_int4"
+  - "native_int4"
+  - "optimize_model"
+  # - "transformer_int4_gpu" # on arc
+  # - "optimize_model_gpu" # on arc
 ```
 
 ## Run
 run `python run.py`, this will output results to `results.csv`.
 
-For SPR performance, run `bash run-spr.sh`.
\ No newline at end of file
+For SPR performance, run `bash run-spr.sh`.
+For ARC performance, run `bash run-arc.sh`.
diff --git a/python/llm/dev/benchmark/all-in-one/config.yaml b/python/llm/dev/benchmark/all-in-one/config.yaml
index e4863154..39662ad6 100644
--- a/python/llm/dev/benchmark/all-in-one/config.yaml
+++ b/python/llm/dev/benchmark/all-in-one/config.yaml
@@ -11,3 +11,6 @@ in_out_pairs:
 test_api:
   - "transformer_int4"
   - "native_int4"
+  - "optimize_model"
+  # - "transformer_int4_gpu" # on arc
+  # - "optimize_model_gpu" # on arc
\ No newline at end of file
diff --git a/python/llm/dev/benchmark/all-in-one/run-arc.sh b/python/llm/dev/benchmark/all-in-one/run-arc.sh
new file mode 100644
index 00000000..c6133d78
--- /dev/null
+++ b/python/llm/dev/benchmark/all-in-one/run-arc.sh
@@ -0,0 +1,5 @@
+source /opt/intel/oneapi/setvars.sh
+export USE_XETLA=OFF
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+
+python run.py # make sure the config YAML file is set up (e.g. a GPU test_api is selected)
diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index 138adc48..0383f0a2 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -19,8 +19,6 @@
 
 import torch
 import time
-from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
-from transformers import AutoTokenizer
 import numpy as np
 
 from datetime import date
@@ -41,6 +39,12 @@ def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1,
         result = run_transformer_int4(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials)
     elif test_api == 'native_int4':
         run_native_int4(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials)
+    elif test_api == 'optimize_model':
+        result = run_optimize_model(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials)
+    elif test_api == 'transformer_int4_gpu':
+        result = run_transformer_int4_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials)
+    elif test_api == 'optimize_model_gpu':
+        result = run_optimize_model_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials)
 
     for in_out_pair in in_out_pairs:
         results.append([repo_id,
@@ -101,6 +105,9 @@ def run_transformer_int4(repo_id,
                          in_out_pairs,
                          warm_up,
                          num_trials):
+    from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+    from transformers import AutoTokenizer
+
     model_path = get_model_path(repo_id, local_model_hub)
     # Load model in 4 bit,
     # which convert the relevant layers in the model into INT4 format
@@ -142,6 +149,169 @@ def run_transformer_int4(repo_id,
     return result
 
 
+def run_optimize_model(repo_id,
+                       local_model_hub,
+                       in_out_pairs,
+                       warm_up,
+                       num_trials):
+    from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
+    from bigdl.llm import optimize_model
+
+    model_path = get_model_path(repo_id, local_model_hub)
+    # Load model in 4 bit,
+    # which converts the relevant layers in the model into INT4 format
+    st = time.perf_counter()
+    if repo_id in ['THUDM/chatglm-6b', 'THUDM/chatglm2-6b']:
+        model = AutoModel.from_pretrained(model_path, torch_dtype='auto', low_cpu_mem_usage=True, trust_remote_code=True)
+        model = optimize_model(model)
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    else:
+        model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype='auto', low_cpu_mem_usage=True)
+        model = optimize_model(model)
+        tokenizer = AutoTokenizer.from_pretrained(model_path)
+    end = time.perf_counter()
+    print(">> loading of model costs {}s".format(end - st))
+
+    model = BenchmarkWrapper(model)
+
+    result = {}
+    with torch.inference_mode():
+        for in_out in in_out_pairs:
+            in_out_len = in_out.split("-")
+            in_len = int(in_out_len[0])
+            out_len = int(in_out_len[1])
+            input_str = open(f"prompt/{in_len}.txt", 'r').read()
+            # As different tokenizers have different encodings,
+            # slice the input_ids to ensure the prompt length is the required length.
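+            # (The sliced ids are decoded back to text and re-encoded below, so the
+            #  benchmarked prompt matches what this tokenizer actually produces for that text.)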
+            input_ids = tokenizer.encode(input_str, return_tensors="pt")
+            input_ids = input_ids[:, :in_len]
+            true_str = tokenizer.batch_decode(input_ids)[0]
+            input_ids = tokenizer.encode(true_str, return_tensors="pt")
+            result[in_out] = []
+            for i in range(num_trials + warm_up):
+                st = time.perf_counter()
+                output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len)
+                end = time.perf_counter()
+                print("model generate cost: " + str(end - st))
+                output = tokenizer.batch_decode(output_ids)
+                print(output[0])
+                if i >= warm_up:
+                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time])
+    return result
+
+
+def run_transformer_int4_gpu(repo_id,
+                             local_model_hub,
+                             in_out_pairs,
+                             warm_up,
+                             num_trials):
+    from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+    from transformers import AutoTokenizer
+    import intel_extension_for_pytorch as ipex
+    if local_model_hub:
+        repo_model_name = repo_id.split("/")[1]
+        model_path = local_model_hub + "/" + repo_model_name
+    else:
+        model_path = repo_id
+    # Load model in 4 bit,
+    # which converts the relevant layers in the model into INT4 format
+    st = time.perf_counter()
+    if repo_id in ['THUDM/chatglm-6b', 'THUDM/chatglm2-6b']:
+        model = AutoModel.from_pretrained(model_path, load_in_4bit=True, optimize_model=True, trust_remote_code=True)
+        model = model.to('xpu')
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    else:
+        model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_4bit=True)
+        model = model.to('xpu')
+        tokenizer = AutoTokenizer.from_pretrained(model_path)
+    end = time.perf_counter()
+    print(">> loading of model costs {}s".format(end - st))
+
+    model = BenchmarkWrapper(model)
+
+    result = {}
+    with torch.inference_mode():
+        for in_out in in_out_pairs:
+            in_out_len = in_out.split("-")
+            in_len = int(in_out_len[0])
+            out_len = int(in_out_len[1])
+            input_str = open(f"prompt/{in_len}.txt", 'r').read()
+            # As different tokenizers have different encodings,
+            # slice the input_ids to ensure the prompt length is the required length.
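+            # (Note: unlike run_optimize_model above, the GPU path uses the sliced ids
+            #  directly on the XPU device without decoding and re-encoding the prompt.)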
+            input_ids = tokenizer.encode(input_str, return_tensors="pt").to('xpu')
+            input_ids = input_ids[:, :in_len]
+            result[in_out] = []
+            for i in range(num_trials + warm_up):
+                st = time.perf_counter()
+                output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len)
+                torch.xpu.synchronize()
+                end = time.perf_counter()
+                output_ids = output_ids.cpu()
+                print("model generate cost: " + str(end - st))
+                output = tokenizer.batch_decode(output_ids)
+                print(output[0])
+                if i >= warm_up:
+                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time])
+    return result
+
+
+def run_optimize_model_gpu(repo_id,
+                           local_model_hub,
+                           in_out_pairs,
+                           warm_up,
+                           num_trials):
+    from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
+    from bigdl.llm import optimize_model
+    import intel_extension_for_pytorch as ipex
+    if local_model_hub:
+        repo_model_name = repo_id.split("/")[1]
+        model_path = local_model_hub + "/" + repo_model_name
+    else:
+        model_path = repo_id
+    # Load model in 4 bit,
+    # which converts the relevant layers in the model into INT4 format
+    st = time.perf_counter()
+    if repo_id in ['THUDM/chatglm-6b', 'THUDM/chatglm2-6b']:
+        model = AutoModel.from_pretrained(model_path, torch_dtype='auto', low_cpu_mem_usage=True, trust_remote_code=True)
+        model = optimize_model(model)
+        model = model.to('xpu')
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    else:
+        model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype='auto', low_cpu_mem_usage=True)
+        model = optimize_model(model)
+        model = model.to('xpu')
+        tokenizer = AutoTokenizer.from_pretrained(model_path)
+    end = time.perf_counter()
+    print(">> loading of model costs {}s".format(end - st))
+
+    model = BenchmarkWrapper(model)
+
+    result = {}
+    with torch.inference_mode():
+        for in_out in in_out_pairs:
+            in_out_len = in_out.split("-")
+            in_len = int(in_out_len[0])
+            out_len = int(in_out_len[1])
+            input_str = open(f"prompt/{in_len}.txt", 'r').read()
+            # As different tokenizers have different encodings,
+            # slice the input_ids to ensure the prompt length is the required length.
+            input_ids = tokenizer.encode(input_str, return_tensors="pt").to('xpu')
+            input_ids = input_ids[:, :in_len]
+            result[in_out] = []
+            for i in range(num_trials + warm_up):
+                st = time.perf_counter()
+                output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len)
+                torch.xpu.synchronize()
+                end = time.perf_counter()
+                output_ids = output_ids.cpu()
+                print("model generate cost: " + str(end - st))
+                output = tokenizer.batch_decode(output_ids)
+                print(output[0])
+                if i >= warm_up:
+                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time])
+    return result
+
+
 if __name__ == '__main__':
     from omegaconf import OmegaConf
     conf = OmegaConf.load(f'{current_dir}/config.yaml')
@@ -153,4 +323,4 @@ if __name__ == '__main__':
             run_model(model, api, conf['in_out_pairs'], conf['local_model_hub'], conf['warm_up'], conf['num_trials'])
         df = pd.DataFrame(results, columns=['model', '1st token avg latency (s)', '2+ avg latency (s/token)', 'encoder time (s)', 'input/output tokens'])
         df.to_csv(f'{current_dir}/{api}-results-{today}.csv')
-        result = []
\ No newline at end of file
+        results = []
\ No newline at end of file
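As a usage note, the CSV written by `df.to_csv(...)` above can be summarized with a few lines of pandas. The sketch below is illustrative only, not part of the patch: it assumes pandas is installed and that a results file following the `<api>-results-<date>.csv` pattern exists; the file name `transformer_int4_gpu-results-2023-09-07.csv` is a hypothetical example, while the column names come from the `pd.DataFrame(...)` call in `run.py`.

```python
# Minimal sketch (assumption: a results CSV produced by run.py exists locally).
import pandas as pd

# Hypothetical file name following the <api>-results-<date>.csv pattern.
df = pd.read_csv("transformer_int4_gpu-results-2023-09-07.csv", index_col=0)

# Average first-token and rest-token latencies per model across all input/output pairs.
summary = df.groupby("model")[["1st token avg latency (s)", "2+ avg latency (s/token)"]].mean()
print(summary)
```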