From 1c8d5da362af450b8abcfa12d5032f5b28691fe3 Mon Sep 17 00:00:00 2001
From: Ruonan Wang <105281011+rnwang04@users.noreply.github.com>
Date: Wed, 11 Oct 2023 13:39:39 +0800
Subject: [PATCH] LLM: fix llama tokenizer for all-in-one benchmark (#9129)

* fix tokenizer for gpu benchmark

* fix ipex fp16

* meet code review

* fix
---
 python/llm/dev/benchmark/all-in-one/run.py | 54 ++++++++++++++--------
 1 file changed, 34 insertions(+), 20 deletions(-)

diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index 088c21ff..597563c5 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -30,6 +30,11 @@ sys.path.append(benchmark_util_path)
 from benchmark_util import BenchmarkWrapper
 from bigdl.llm.utils.common.log4Error import invalidInputError
 
+LLAMA_IDS = ['meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-13b-chat-hf',
+             'meta-llama/Llama-2-70b-chat-hf','decapoda-research/llama-7b-hf',
+             'decapoda-research/llama-65b-hf','lmsys/vicuna-7b-v1.5',
+             'lmsys/vicuna-13b-v1.3','project-baize/merged-baize-30b']
+
 results = []
 
 
@@ -122,16 +127,7 @@ def run_transformer_int4(repo_id,
     if repo_id in ['THUDM/chatglm-6b', 'THUDM/chatglm2-6b']:
         model = AutoModel.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True, torch_dtype='auto')
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    elif repo_id in ['meta-llama/Llama-2-70b-chat-hf']:
-        # Can be removed when issue https://github.com/analytics-zoo/nano/issues/563 is resolved.
-        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True,
-                                                     trust_remote_code=True, optimize_model=False)
-        # Need to use LlamaTokenizer, reason please refer to issue: https://github.com/intel-analytics/BigDL/issues/8944
-        tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    elif repo_id in ['meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-13b-chat-hf',
-                     'meta-llama/Llama-2-70b-chat-hf','decapoda-research/llama-7b-hf',
-                     'decapoda-research/llama-65b-hf','lmsys/vicuna-7b-v1.5',
-                     'lmsys/vicuna-13b-v1.3','project-baize/merged-baize-30b']:
+    elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
@@ -179,10 +175,7 @@ def run_pytorch_autocast_bf16(repo_id,
     if repo_id in ['THUDM/chatglm-6b', 'THUDM/chatglm2-6b']:
         # TODO: need verify chatglm family run bf16.
         invalidInputError(False, "Currently pytorch do not support bfloat16 on cpu for chatglm models.")
-    elif repo_id in ['meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-13b-chat-hf',
-                     'meta-llama/Llama-2-70b-chat-hf','decapoda-research/llama-7b-hf',
-                     'decapoda-research/llama-65b-hf','lmsys/vicuna-7b-v1.5',
-                     'lmsys/vicuna-13b-v1.3','project-baize/merged-baize-30b']:
+    elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16)
         # Need to use LlamaTokenizer, reason please refer to issue: https://github.com/intel-analytics/BigDL/issues/8944
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
@@ -224,7 +217,7 @@ def run_optimize_model(repo_id,
                        in_out_pairs,
                        warm_up,
                        num_trials):
-    from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
+    from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer
     from bigdl.llm import optimize_model
 
     model_path = get_model_path(repo_id, local_model_hub)
@@ -235,6 +228,11 @@ def run_optimize_model(repo_id,
         model = AutoModel.from_pretrained(model_path, torch_dtype='auto', low_cpu_mem_usage=True, trust_remote_code=True)
         model = optimize_model(model)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    elif repo_id in LLAMA_IDS:
+        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True,
+                                                     use_cache=True, low_cpu_mem_usage=True)
+        model = optimize_model(model)
+        tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype='auto', low_cpu_mem_usage=True)
         model = optimize_model(model)
@@ -276,17 +274,22 @@ def run_transformer_int4_gpu(repo_id,
                              warm_up,
                              num_trials):
     from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
-    from transformers import AutoTokenizer, GPTJForCausalLM
+    from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
     import intel_extension_for_pytorch as ipex
     model_path = get_model_path(repo_id, local_model_hub)
     # Load model in 4 bit,
     # which convert the relevant layers in the model into INT4 format
     st = time.perf_counter()
     if repo_id in ['THUDM/chatglm-6b', 'THUDM/chatglm2-6b']:
-        model = AutoModel.from_pretrained(model_path, load_in_4bit=True, optimize_model=True, trust_remote_code=True,
-                                          use_cache=True)
+        model = AutoModel.from_pretrained(model_path, load_in_4bit=True, optimize_model=True,
+                                          trust_remote_code=True, use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         model = model.to('xpu')
+    elif repo_id in LLAMA_IDS:
+        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True,
+                                                     use_cache=True)
+        tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.to('xpu')
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_4bit=True,
                                                      trust_remote_code=True, use_cache=True)
@@ -334,7 +337,7 @@ def run_optimize_model_gpu(repo_id,
                            in_out_pairs,
                            warm_up,
                            num_trials):
-    from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM
+    from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
     from bigdl.llm import optimize_model
     import intel_extension_for_pytorch as ipex
     model_path = get_model_path(repo_id, local_model_hub)
@@ -347,6 +350,12 @@ def run_optimize_model_gpu(repo_id,
         model = optimize_model(model)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         model = model.to('xpu')
+    elif repo_id in LLAMA_IDS:
+        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True,
+                                                     use_cache=True, low_cpu_mem_usage=True)
+        model = optimize_model(model)
+        tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.to('xpu')
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype='auto', low_cpu_mem_usage=True,
                                                      trust_remote_code=True, use_cache=True)
@@ -396,7 +405,7 @@ def run_ipex_fp16_gpu(repo_id,
                       warm_up,
                       num_trials):
     from transformers import AutoModel, AutoModelForCausalLM
-    from transformers import AutoTokenizer, GPTJForCausalLM
+    from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
     import intel_extension_for_pytorch as ipex
     model_path = get_model_path(repo_id, local_model_hub)
     st = time.perf_counter()
@@ -404,6 +413,11 @@ def run_ipex_fp16_gpu(repo_id,
         model = AutoModel.from_pretrained(model_path, trust_remote_code=True, use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         model = model.half().to('xpu')
+    elif repo_id in LLAMA_IDS:
+        model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True,
+                                                     use_cache=True)
+        tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.half().to('xpu')
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
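The patch repeats one pattern in every benchmark path it touches: llama-family checkpoints are loaded with LlamaTokenizer instead of AutoTokenizer (see https://github.com/intel-analytics/BigDL/issues/8944), and the repo ids that need this now live in a single LLAMA_IDS list rather than being duplicated per function. Below is a minimal standalone sketch of that selection logic for reference; it is not part of run.py, and the helper name load_benchmark_tokenizer is hypothetical, used only for illustration.

    # Sketch of the tokenizer-selection pattern applied by the patch.
    # LLAMA_IDS is copied from the patch; the helper name is illustrative only.
    from transformers import AutoTokenizer, LlamaTokenizer

    LLAMA_IDS = ['meta-llama/Llama-2-7b-chat-hf', 'meta-llama/Llama-2-13b-chat-hf',
                 'meta-llama/Llama-2-70b-chat-hf', 'decapoda-research/llama-7b-hf',
                 'decapoda-research/llama-65b-hf', 'lmsys/vicuna-7b-v1.5',
                 'lmsys/vicuna-13b-v1.3', 'project-baize/merged-baize-30b']

    def load_benchmark_tokenizer(repo_id, model_path):
        """Return the tokenizer the benchmark should use for a given repo id."""
        if repo_id in LLAMA_IDS:
            # Llama-family models need LlamaTokenizer explicitly,
            # see https://github.com/intel-analytics/BigDL/issues/8944
            return LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
        # Other models (e.g. the chatglm family) keep using AutoTokenizer.
        return AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

Centralizing the ids in LLAMA_IDS also means a new llama-based checkpoint only has to be added in one place for all of the benchmark APIs modified here to pick it up.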