From 64ee1d7689f1da651a25fce6febbe19ce1cf27d2 Mon Sep 17 00:00:00 2001
From: Xin Qiu
Date: Fri, 15 Sep 2023 15:10:04 +0800
Subject: [PATCH] update run_transformer_int4_gpu (#8983)

* xpuperf

* update run.py

* clean up

* update

* update

* meet code review
---
 python/llm/dev/benchmark/all-in-one/run.py | 41 +++++++++++-----------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index 6539fea1..51f33569 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -58,7 +58,10 @@ def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1,
 def get_model_path(repo_id, local_model_hub):
     if local_model_hub:
         repo_model_name = repo_id.split("/")[1]
-        return local_model_hub + os.path.sep + repo_model_name
+        local_model_path = local_model_hub + os.path.sep + repo_model_name
+        invalidInputError(os.path.isdir(local_model_path),
+                          local_model_path + " not exists!, Please check your models' folder.")
+        return local_model_path
     else:
         return repo_id
 
@@ -275,25 +278,20 @@ def run_transformer_int4_gpu(repo_id,
     from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
     from transformers import AutoTokenizer
     import intel_extension_for_pytorch as ipex
-    if local_model_hub:
-        repo_model_name = repo_id.split("/")[1]
-        model_path = local_model_hub + "/" + repo_model_name
-    else:
-        model_path = repo_id
+    model_path = get_model_path(repo_id, local_model_hub)
     # Load model in 4 bit,
     # which convert the relevant layers in the model into INT4 format
     st = time.perf_counter()
     if repo_id in ['THUDM/chatglm-6b', 'THUDM/chatglm2-6b']:
         model = AutoModel.from_pretrained(model_path, load_in_4bit=True, optimize_model=True,
                                           trust_remote_code=True)
-        model = model.to('xpu')
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
-        model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_4bit=True)
-        model = model.to('xpu')
-        tokenizer = AutoTokenizer.from_pretrained(model_path)
+        model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_4bit=True, trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     end = time.perf_counter()
     print(">> loading of model costs {}s".format(end - st))
+    model = model.to('xpu')
     model = BenchmarkWrapper(model)
 
     result = {}
@@ -305,8 +303,10 @@ def run_transformer_int4_gpu(repo_id,
             input_str = open(f"prompt/{in_len}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
-            input_ids = tokenizer.encode(input_str, return_tensors="pt").to('xpu')
+            input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
+            true_str = tokenizer.batch_decode(input_ids)[0]
+            input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
             result[in_out] = []
             for i in range(num_trials + warm_up):
                 st = time.perf_counter()
@@ -319,6 +319,7 @@ def run_transformer_int4_gpu(repo_id,
                 print(output[0])
             if i >= warm_up:
                 result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time])
+    torch.xpu.empty_cache()
     return result
 
 
@@ -330,27 +331,22 @@ def run_optimize_model_gpu(repo_id,
     from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
     from bigdl.llm import optimize_model
     import intel_extension_for_pytorch as ipex
-    if local_model_hub:
-        repo_model_name = repo_id.split("/")[1]
-        model_path = local_model_hub + "/" + repo_model_name
-    else:
-        model_path = repo_id
+    model_path = get_model_path(repo_id, local_model_hub)
     # Load model in 4 bit,
     # which convert the relevant layers in the model into INT4 format
     st = time.perf_counter()
     if repo_id in ['THUDM/chatglm-6b', 'THUDM/chatglm2-6b']:
         model = AutoModel.from_pretrained(model_path, torch_dtype='auto', low_cpu_mem_usage=True,
                                           trust_remote_code=True)
         model = optimize_model(model)
-        model = model.to('xpu')
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
-        model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype='auto', low_cpu_mem_usage=True)
+        model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype='auto', low_cpu_mem_usage=True, trust_remote_code=True)
         model = optimize_model(model)
-        model = model.to('xpu')
-        tokenizer = AutoTokenizer.from_pretrained(model_path)
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     end = time.perf_counter()
     print(">> loading of model costs {}s".format(end - st))
+    model = model.to('xpu')
     model = BenchmarkWrapper(model)
 
     result = {}
@@ -362,8 +358,10 @@ def run_optimize_model_gpu(repo_id,
             input_str = open(f"prompt/{in_len}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
-            input_ids = tokenizer.encode(input_str, return_tensors="pt").to('xpu')
+            input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
+            true_str = tokenizer.batch_decode(input_ids)[0]
+            input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
             result[in_out] = []
             for i in range(num_trials + warm_up):
                 st = time.perf_counter()
@@ -376,6 +374,7 @@ def run_optimize_model_gpu(repo_id,
                 print(output[0])
             if i >= warm_up:
                 result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time])
+    torch.xpu.empty_cache()
     return result
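
Note: the sketches below illustrate the behaviour introduced by this patch; they are not part of the diff. The first mirrors the validation added to get_model_path(). The patch reports the failure through BigDL's invalidInputError helper; this standalone sketch substitutes a plain ValueError so it runs without bigdl-llm installed, and the error wording is adjusted for the sketch.

import os

def get_model_path(repo_id, local_model_hub):
    # Resolve a repo id such as "THUDM/chatglm2-6b" against a local model hub folder.
    if local_model_hub:
        repo_model_name = repo_id.split("/")[1]
        local_model_path = local_model_hub + os.path.sep + repo_model_name
        # Fail early with a clear message if the local checkpoint folder is missing
        # (the patch uses BigDL's invalidInputError here; ValueError is a stand-in).
        if not os.path.isdir(local_model_path):
            raise ValueError(local_model_path + " does not exist, please check your models' folder.")
        return local_model_path
    else:
        # No local hub configured: fall back to downloading by repo id.
        return repo_id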
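The core of the patch is a restructured run_transformer_int4_gpu(): the model is loaded in INT4 with trust_remote_code=True, the single model.to('xpu') call now happens after the load time is printed, and torch.xpu.empty_cache() releases device memory once a configuration finishes. A minimal sketch of that flow, assuming bigdl-llm and intel_extension_for_pytorch are installed, an XPU device is available, and using a placeholder model path:

import time
import torch
import intel_extension_for_pytorch as ipex  # noqa: F401  (registers the 'xpu' device)
from bigdl.llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_path = "meta-llama/Llama-2-7b-chat-hf"  # placeholder repo id or local path

st = time.perf_counter()
# Load the checkpoint with the relevant layers converted to INT4.
model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True,
                                             load_in_4bit=True, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
end = time.perf_counter()
print(">> loading of model costs {}s".format(end - st))

# The patch moves the model to the XPU once, after the load is timed.
model = model.to('xpu')

# ... wrap the model (the script uses BenchmarkWrapper) and run generation here ...

# Release cached XPU memory between benchmark configurations.
torch.xpu.empty_cache()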
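Both benchmark functions now also build the prompt in two passes: encode on the CPU, truncate to the requested token count, decode the truncated ids back to text, and re-encode that "true" string before moving it to the XPU, so the benchmarked prompt corresponds to an actual decodable string even though tokenizers encode the raw prompt file differently. A self-contained sketch of that step, using an arbitrary Hugging Face tokenizer ("gpt2") and a dummy prompt as assumptions:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative tokenizer
input_str = "some long benchmark prompt " * 256     # stand-in for prompt/{in_len}.txt
in_len = 32                                          # requested prompt length in tokens

# Encode on the CPU first, then truncate to the requested prompt length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
input_ids = input_ids[:, :in_len]

# Decode the truncated ids back to text and re-encode, so the final prompt is a
# real string (its re-encoded length may differ slightly from in_len); only then
# would the tensor be moved to the XPU device.
true_str = tokenizer.batch_decode(input_ids)[0]
input_ids = tokenizer.encode(true_str, return_tensors="pt")
# input_ids = input_ids.to('xpu')  # requires intel_extension_for_pytorch and an XPU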