diff --git a/python/llm/dev/benchmark/all-in-one/README.md b/python/llm/dev/benchmark/all-in-one/README.md
index 24b22c7f..32d2f49b 100644
--- a/python/llm/dev/benchmark/all-in-one/README.md
+++ b/python/llm/dev/benchmark/all-in-one/README.md
@@ -20,6 +20,7 @@ test_api:
   - "transformer_int4"
   - "native_int4"
   - "optimize_model"
+  - "pytorch_autocast_bf16"
   # - "transformer_int4_gpu" # on arc
   # - "optimize_model_gpu" # on arc
 ```
diff --git a/python/llm/dev/benchmark/all-in-one/config.yaml b/python/llm/dev/benchmark/all-in-one/config.yaml
index 39662ad6..b89132ba 100644
--- a/python/llm/dev/benchmark/all-in-one/config.yaml
+++ b/python/llm/dev/benchmark/all-in-one/config.yaml
@@ -12,5 +12,6 @@ test_api:
   - "transformer_int4"
   - "native_int4"
   - "optimize_model"
+  - "pytorch_autocast_bf16"
   # - "transformer_int4_gpu" # on arc
   # - "optimize_model_gpu" # on arc
\ No newline at end of file
diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index a985b233..6539fea1 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -45,6 +45,8 @@ def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1,
         result = run_transformer_int4_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials)
     elif test_api == 'optimize_model_gpu':
         result = run_optimize_model_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials)
+    elif test_api == 'pytorch_autocast_bf16':
+        result = run_pytorch_autocast_bf16(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials)

     for in_out_pair in in_out_pairs:
         results.append([repo_id,
@@ -106,7 +108,7 @@ def run_transformer_int4(repo_id,
                          warm_up,
                          num_trials):
     from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
-    from transformers import AutoTokenizer
+    from transformers import AutoTokenizer, LlamaTokenizer

     model_path = get_model_path(repo_id, local_model_hub)
     # Load model in 4 bit,
@@ -115,6 +117,19 @@ def run_transformer_int4(repo_id,
     if repo_id in ['THUDM/chatglm-6b', 'THUDM/chatglm2-6b']:
         model = AutoModel.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True, torch_dtype='auto')
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    elif repo_id in ['meta-llama/Llama-2-70b-chat-hf']:
+        # Can be removed when issue https://github.com/analytics-zoo/nano/issues/563 is resolved.
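+        # Workaround: optimize_model=False skips BigDL-LLM's extra optimizations until the issue above is fixed.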
+        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True,
+                                                     trust_remote_code=True, optimize_model=False)
+        # Need to use LlamaTokenizer; see https://github.com/intel-analytics/BigDL/issues/8944
+        tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    elif repo_id in ['meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-13b-chat-hf',
+                     'meta-llama/Llama-2-70b-chat-hf','decapoda-research/llama-7b-hf',
+                     'decapoda-research/llama-65b-hf','lmsys/vicuna-7b-v1.5',
+                     'lmsys/vicuna-13b-v1.3','project-baize/merged-baize-30b']:
+        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True)
+        tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
@@ -139,7 +154,7 @@ def run_transformer_int4(repo_id,
             result[in_out] = []
             for i in range(num_trials + warm_up):
                 st = time.perf_counter()
-                output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len)
+                output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len, use_cache=True)
                 end = time.perf_counter()
                 print("model generate cost: " + str(end - st))
                 output = tokenizer.batch_decode(output_ids)
@@ -148,6 +163,60 @@ def run_transformer_int4(repo_id,
                 result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time])
     return result

+def run_pytorch_autocast_bf16(repo_id,
+                              local_model_hub,
+                              in_out_pairs,
+                              warm_up,
+                              num_trials):
+    from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, LlamaTokenizer
+
+    model_path = get_model_path(repo_id, local_model_hub)
+    st = time.perf_counter()
+    if repo_id in ['THUDM/chatglm-6b', 'THUDM/chatglm2-6b']:
+        # TODO: verify that the ChatGLM family runs correctly in bf16.
+        model = AutoModel.from_pretrained(model_path, trust_remote_code=True).float()
+        # model = AutoModel.from_pretrained(model_path, trust_remote_code=True).bfloat16()
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    elif repo_id in ['meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-13b-chat-hf',
+                     'meta-llama/Llama-2-70b-chat-hf','decapoda-research/llama-7b-hf',
+                     'decapoda-research/llama-65b-hf','lmsys/vicuna-7b-v1.5',
+                     'lmsys/vicuna-13b-v1.3','project-baize/merged-baize-30b']:
+        model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
+        # Need to use LlamaTokenizer; see https://github.com/intel-analytics/BigDL/issues/8944
+        tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    else:
+        model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    end = time.perf_counter()
+    print(">> loading of model costs {}s".format(end - st))
+
+    model = BenchmarkWrapper(model)
+    result = {}
+    with torch.inference_mode(), torch.autocast("cpu"):  # CPU autocast runs eligible ops in bfloat16 by default
+        for in_out in in_out_pairs:
+            in_out_len = in_out.split("-")
+            in_len = int(in_out_len[0])
+            out_len = int(in_out_len[1])
+            input_str = open(f"prompt/{in_len}.txt", 'r').read()
+            # As different tokenizers have different encodings,
+            # slice input_ids to ensure the prompt is exactly the required length.
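+            # (Encode, truncate, then decode and re-encode so the measured prompt maps to a real string.)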
+            input_ids = tokenizer.encode(input_str, return_tensors="pt")
+            input_ids = input_ids[:, :in_len]
+            true_str = tokenizer.batch_decode(input_ids)[0]
+            input_ids = tokenizer.encode(true_str, return_tensors="pt")
+            result[in_out] = []
+            print("input tokens: {}".format(input_ids.shape[1]))
+            for i in range(num_trials + warm_up):
+                st = time.perf_counter()
+                output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len, use_cache=True)
+                end = time.perf_counter()
+                print("model generate cost: " + str(end - st))
+                output = tokenizer.batch_decode(output_ids)
+                print(output[0])
+                if i >= warm_up:
+                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time])
+    return result

 def run_optimize_model(repo_id,
                        local_model_hub,