LLM: fix llama tokenizer for all-in-one benchmark (#9129)

* fix tokenizer for gpu benchmark

* fix ipex fp16

* meet code review

* fix
Ruonan Wang 2023-10-11 13:39:39 +08:00 committed by GitHub
parent 2ad67a18b1
commit 1c8d5da362


@@ -30,6 +30,11 @@ sys.path.append(benchmark_util_path)
 from benchmark_util import BenchmarkWrapper
 from bigdl.llm.utils.common.log4Error import invalidInputError
+LLAMA_IDS = ['meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-13b-chat-hf',
+             'meta-llama/Llama-2-70b-chat-hf','decapoda-research/llama-7b-hf',
+             'decapoda-research/llama-65b-hf','lmsys/vicuna-7b-v1.5',
+             'lmsys/vicuna-13b-v1.3','project-baize/merged-baize-30b']
 results = []
@@ -122,16 +127,7 @@ def run_transformer_int4(repo_id,
     if repo_id in ['THUDM/chatglm-6b', 'THUDM/chatglm2-6b']:
         model = AutoModel.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True, torch_dtype='auto')
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    elif repo_id in ['meta-llama/Llama-2-70b-chat-hf']:
-        # Can be removed when issue https://github.com/analytics-zoo/nano/issues/563 is resolved.
-        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True,
-                                                     trust_remote_code=True, optimize_model=False)
-        # Need to use LlamaTokenizer, reason please refer to issue: https://github.com/intel-analytics/BigDL/issues/8944
-        tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    elif repo_id in ['meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-13b-chat-hf',
-                     'meta-llama/Llama-2-70b-chat-hf','decapoda-research/llama-7b-hf',
-                     'decapoda-research/llama-65b-hf','lmsys/vicuna-7b-v1.5',
-                     'lmsys/vicuna-13b-v1.3','project-baize/merged-baize-30b']:
+    elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
@@ -179,10 +175,7 @@ def run_pytorch_autocast_bf16(repo_id,
     if repo_id in ['THUDM/chatglm-6b', 'THUDM/chatglm2-6b']:
         # TODO: need verify chatglm family run bf16.
         invalidInputError(False, "Currently pytorch do not support bfloat16 on cpu for chatglm models.")
-    elif repo_id in ['meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-13b-chat-hf',
-                     'meta-llama/Llama-2-70b-chat-hf','decapoda-research/llama-7b-hf',
-                     'decapoda-research/llama-65b-hf','lmsys/vicuna-7b-v1.5',
-                     'lmsys/vicuna-13b-v1.3','project-baize/merged-baize-30b']:
+    elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16)
         # Need to use LlamaTokenizer, reason please refer to issue: https://github.com/intel-analytics/BigDL/issues/8944
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
@@ -224,7 +217,7 @@ def run_optimize_model(repo_id,
                        in_out_pairs,
                        warm_up,
                        num_trials):
-    from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
+    from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer
     from bigdl.llm import optimize_model
     model_path = get_model_path(repo_id, local_model_hub)
@@ -235,6 +228,11 @@ def run_optimize_model(repo_id,
         model = AutoModel.from_pretrained(model_path, torch_dtype='auto', low_cpu_mem_usage=True, trust_remote_code=True)
         model = optimize_model(model)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    elif repo_id in LLAMA_IDS:
+        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True,
+                                                     use_cache=True, low_cpu_mem_usage=True)
+        model = optimize_model(model)
+        tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype='auto', low_cpu_mem_usage=True)
         model = optimize_model(model)
@@ -276,17 +274,22 @@ def run_transformer_int4_gpu(repo_id,
                              warm_up,
                              num_trials):
     from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
-    from transformers import AutoTokenizer, GPTJForCausalLM
+    from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
     import intel_extension_for_pytorch as ipex
     model_path = get_model_path(repo_id, local_model_hub)
     # Load model in 4 bit,
     # which convert the relevant layers in the model into INT4 format
     st = time.perf_counter()
     if repo_id in ['THUDM/chatglm-6b', 'THUDM/chatglm2-6b']:
-        model = AutoModel.from_pretrained(model_path, load_in_4bit=True, optimize_model=True, trust_remote_code=True,
-                                          use_cache=True)
+        model = AutoModel.from_pretrained(model_path, load_in_4bit=True, optimize_model=True,
+                                          trust_remote_code=True, use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         model = model.to('xpu')
+    elif repo_id in LLAMA_IDS:
+        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True,
+                                                     use_cache=True)
+        tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.to('xpu')
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_4bit=True,
                                                      trust_remote_code=True, use_cache=True)
@@ -334,7 +337,7 @@ def run_optimize_model_gpu(repo_id,
                            in_out_pairs,
                            warm_up,
                            num_trials):
-    from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM
+    from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
     from bigdl.llm import optimize_model
     import intel_extension_for_pytorch as ipex
     model_path = get_model_path(repo_id, local_model_hub)
@@ -347,6 +350,12 @@ def run_optimize_model_gpu(repo_id,
         model = optimize_model(model)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         model = model.to('xpu')
+    elif repo_id in LLAMA_IDS:
+        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True,
+                                                     use_cache=True, low_cpu_mem_usage=True)
+        model = optimize_model(model)
+        tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.to('xpu')
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype='auto', low_cpu_mem_usage=True,
                                                      trust_remote_code=True, use_cache=True)
@@ -396,7 +405,7 @@ def run_ipex_fp16_gpu(repo_id,
                       warm_up,
                       num_trials):
     from transformers import AutoModel, AutoModelForCausalLM
-    from transformers import AutoTokenizer, GPTJForCausalLM
+    from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
     import intel_extension_for_pytorch as ipex
     model_path = get_model_path(repo_id, local_model_hub)
     st = time.perf_counter()
@@ -404,6 +413,11 @@ def run_ipex_fp16_gpu(repo_id,
         model = AutoModel.from_pretrained(model_path, trust_remote_code=True, use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         model = model.half().to('xpu')
+    elif repo_id in LLAMA_IDS:
+        model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True,
+                                                     use_cache=True)
+        tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.half().to('xpu')
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)