From 37bb0cbf8f760c7d6f0beac4d2f8325beb5c7608 Mon Sep 17 00:00:00 2001
From: Xin Qiu
Date: Tue, 19 Sep 2023 14:22:28 +0800
Subject: [PATCH] Speed up gpt-j in gpubenchmark (#9000)

* Speedup gpt-j in gpubenchmark

* meet code review
---
 python/llm/dev/benchmark/all-in-one/run.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index f3008059..fd35ceea 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -276,7 +276,7 @@ def run_transformer_int4_gpu(repo_id,
                              warm_up,
                              num_trials):
     from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
-    from transformers import AutoTokenizer
+    from transformers import AutoTokenizer, GPTJForCausalLM
     import intel_extension_for_pytorch as ipex
     model_path = get_model_path(repo_id, local_model_hub)
     # Load model in 4 bit,
@@ -286,14 +286,18 @@ def run_transformer_int4_gpu(repo_id,
         model = AutoModel.from_pretrained(model_path, load_in_4bit=True, optimize_model=True,
                                           trust_remote_code=True, use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.to('xpu')
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_4bit=True,
                                                      trust_remote_code=True, use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.to('xpu')
+        if isinstance(model, GPTJForCausalLM):
+            # For gpt-j model family, this optimization can provide a better performance.
+            model = ipex.optimize(model.eval(), inplace=True)
     end = time.perf_counter()
     print(">> loading of model costs {}s".format(end - st))
-    model = model.to('xpu')
 
     model = BenchmarkWrapper(model)
 
     result = {}
@@ -330,7 +334,7 @@ def run_optimize_model_gpu(repo_id,
                            in_out_pairs,
                            warm_up,
                            num_trials):
-    from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
+    from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM
     from bigdl.llm import optimize_model
     import intel_extension_for_pytorch as ipex
     model_path = get_model_path(repo_id, local_model_hub)
@@ -342,15 +346,19 @@ def run_optimize_model_gpu(repo_id,
                                           trust_remote_code=True, use_cache=True)
         model = optimize_model(model)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.to('xpu')
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype='auto', low_cpu_mem_usage=True,
                                                      trust_remote_code=True, use_cache=True)
         model = optimize_model(model)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.to('xpu')
+        if isinstance(model, GPTJForCausalLM):
+            # For gpt-j model family, this optimization can provide a better performance.
+            model = ipex.optimize(model.eval(), inplace=True)
     end = time.perf_counter()
     print(">> loading of model costs {}s".format(end - st))
-    model = model.to('xpu')
 
     model = BenchmarkWrapper(model)
 
     result = {}
@@ -393,4 +401,4 @@ if __name__ == '__main__':
             run_model(model, api, conf['in_out_pairs'], conf['local_model_hub'], conf['warm_up'], conf['num_trials'])
         df = pd.DataFrame(results, columns=['model', '1st token avg latency (s)', '2+ avg latency (s/token)', 'encoder time (s)', 'input/output tokens'])
         df.to_csv(f'{current_dir}/{api}-results-{today}.csv')
-        results = []
\ No newline at end of file
+        results = []
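
Not part of the patch itself: below is a minimal standalone sketch of the GPT-J loading path this change introduces, pulled together from the hunks above. model_path is a placeholder for a local GPT-J checkpoint, and the snippet assumes bigdl-llm and intel_extension_for_pytorch are installed with XPU support. It also reflects the design choice the diff makes: model.to('xpu') now happens inside each branch so that ipex.optimize is applied to the device-resident model before benchmarking.

# Illustrative sketch only (not part of run.py); paths are placeholders.
import time

import intel_extension_for_pytorch as ipex
from bigdl.llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM

model_path = "/path/to/gpt-j-6b"  # hypothetical local checkpoint location

st = time.perf_counter()
# Load with 4-bit weights and BigDL-LLM optimizations enabled, as in the benchmark script.
model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_4bit=True,
                                             trust_remote_code=True, use_cache=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Move the model to the Intel GPU before the extra optimization pass, as the patch does.
model = model.to('xpu')
if isinstance(model, GPTJForCausalLM):
    # For the gpt-j model family, this IPEX optimization can provide better performance.
    model = ipex.optimize(model.eval(), inplace=True)
print(">> loading of model costs {}s".format(time.perf_counter() - st))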