From 1c8d5da362af450b8abcfa12d5032f5b28691fe3 Mon Sep 17 00:00:00 2001
From: Ruonan Wang <105281011+rnwang04@users.noreply.github.com>
Date: Wed, 11 Oct 2023 13:39:39 +0800
Subject: [PATCH] LLM: fix llama tokenizer for all-in-one benchmark (#9129)

* fix tokenizer for gpu benchmark

* fix ipex fp16

* meet code review

* fix
---
 python/llm/dev/benchmark/all-in-one/run.py | 54 ++++++++++++++--------
 1 file changed, 34 insertions(+), 20 deletions(-)

diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index 088c21ff..597563c5 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -30,6 +30,11 @@ sys.path.append(benchmark_util_path)
 from benchmark_util import BenchmarkWrapper
 from bigdl.llm.utils.common.log4Error import invalidInputError
 
+LLAMA_IDS = ['meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-13b-chat-hf',
+             'meta-llama/Llama-2-70b-chat-hf','decapoda-research/llama-7b-hf',
+             'decapoda-research/llama-65b-hf','lmsys/vicuna-7b-v1.5',
+             'lmsys/vicuna-13b-v1.3','project-baize/merged-baize-30b']
+
 results = []
 
 
@@ -122,16 +127,7 @@ def run_transformer_int4(repo_id,
     if repo_id in ['THUDM/chatglm-6b', 'THUDM/chatglm2-6b']:
         model = AutoModel.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True, torch_dtype='auto')
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    elif repo_id in ['meta-llama/Llama-2-70b-chat-hf']:
-        # Can be removed when issue https://github.com/analytics-zoo/nano/issues/563 is resolved.
-        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True,
-                                                     trust_remote_code=True, optimize_model=False)
-        # Need to use LlamaTokenizer, reason please refer to issue: https://github.com/intel-analytics/BigDL/issues/8944
-        tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    elif repo_id in ['meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-13b-chat-hf',
-                     'meta-llama/Llama-2-70b-chat-hf','decapoda-research/llama-7b-hf',
-                     'decapoda-research/llama-65b-hf','lmsys/vicuna-7b-v1.5',
-                     'lmsys/vicuna-13b-v1.3','project-baize/merged-baize-30b']:
+    elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
@@ -179,10 +175,7 @@ def run_pytorch_autocast_bf16(repo_id,
     if repo_id in ['THUDM/chatglm-6b', 'THUDM/chatglm2-6b']:
         # TODO: need verify chatglm family run bf16.
         invalidInputError(False, "Currently pytorch do not support bfloat16 on cpu for chatglm models.")
-    elif repo_id in ['meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-13b-chat-hf',
-                     'meta-llama/Llama-2-70b-chat-hf','decapoda-research/llama-7b-hf',
-                     'decapoda-research/llama-65b-hf','lmsys/vicuna-7b-v1.5',
-                     'lmsys/vicuna-13b-v1.3','project-baize/merged-baize-30b']:
+    elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16)
         # Need to use LlamaTokenizer, reason please refer to issue: https://github.com/intel-analytics/BigDL/issues/8944
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
@@ -224,7 +217,7 @@ def run_optimize_model(repo_id,
                        in_out_pairs,
                        warm_up,
                        num_trials):
-    from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
+    from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer
     from bigdl.llm import optimize_model
 
     model_path = get_model_path(repo_id, local_model_hub)
@@ -235,6 +228,11 @@ def run_optimize_model(repo_id,
         model = AutoModel.from_pretrained(model_path, torch_dtype='auto', low_cpu_mem_usage=True, trust_remote_code=True)
         model = optimize_model(model)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    elif repo_id in LLAMA_IDS:
+        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True,
+                                                     use_cache=True, low_cpu_mem_usage=True)
+        model = optimize_model(model)
+        tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype='auto', low_cpu_mem_usage=True)
         model = optimize_model(model)
@@ -276,17 +274,22 @@ def run_transformer_int4_gpu(repo_id,
                              warm_up,
                              num_trials):
     from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
-    from transformers import AutoTokenizer, GPTJForCausalLM
+    from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
     import intel_extension_for_pytorch as ipex
     model_path = get_model_path(repo_id, local_model_hub)
     # Load model in 4 bit,
     # which convert the relevant layers in the model into INT4 format
     st = time.perf_counter()
     if repo_id in ['THUDM/chatglm-6b', 'THUDM/chatglm2-6b']:
-        model = AutoModel.from_pretrained(model_path, load_in_4bit=True, optimize_model=True, trust_remote_code=True,
-                                          use_cache=True)
+        model = AutoModel.from_pretrained(model_path, load_in_4bit=True, optimize_model=True,
+                                          trust_remote_code=True, use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         model = model.to('xpu')
+    elif repo_id in LLAMA_IDS:
+        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True,
+                                                     use_cache=True)
+        tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.to('xpu')
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_4bit=True,
                                                      trust_remote_code=True, use_cache=True)
@@ -334,7 +337,7 @@ def run_optimize_model_gpu(repo_id,
                            in_out_pairs,
                            warm_up,
                            num_trials):
-    from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM
+    from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
     from bigdl.llm import optimize_model
     import intel_extension_for_pytorch as ipex
     model_path = get_model_path(repo_id, local_model_hub)
@@ -347,6 +350,12 @@ def run_optimize_model_gpu(repo_id,
         model = optimize_model(model)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         model = model.to('xpu')
+    elif repo_id in LLAMA_IDS:
+        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True,
+                                                     use_cache=True, low_cpu_mem_usage=True)
+        model = optimize_model(model)
+        tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.to('xpu')
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype='auto', low_cpu_mem_usage=True,
                                                      trust_remote_code=True, use_cache=True)
@@ -396,7 +405,7 @@ def run_ipex_fp16_gpu(repo_id,
                       warm_up,
                       num_trials):
     from transformers import AutoModel, AutoModelForCausalLM
-    from transformers import AutoTokenizer, GPTJForCausalLM
+    from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
     import intel_extension_for_pytorch as ipex
     model_path = get_model_path(repo_id, local_model_hub)
     st = time.perf_counter()
@@ -404,6 +413,11 @@ def run_ipex_fp16_gpu(repo_id,
         model = AutoModel.from_pretrained(model_path, trust_remote_code=True, use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         model = model.half().to('xpu')
+    elif repo_id in LLAMA_IDS:
+        model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True,
+                                                     use_cache=True)
+        tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.half().to('xpu')
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
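The patch repeats one pattern in every benchmark path it touches: llama-family checkpoints are loaded with LlamaTokenizer instead of AutoTokenizer (see https://github.com/intel-analytics/BigDL/issues/8944), and the repo ids that need this now live in a single LLAMA_IDS list rather than being duplicated per function. Below is a minimal standalone sketch of that selection logic for reference; it is not part of run.py, and the helper name load_benchmark_tokenizer is hypothetical, used only for illustration.

    # Sketch of the tokenizer-selection pattern applied by the patch.
    # LLAMA_IDS is copied from the patch; the helper name is illustrative only.
    from transformers import AutoTokenizer, LlamaTokenizer

    LLAMA_IDS = ['meta-llama/Llama-2-7b-chat-hf', 'meta-llama/Llama-2-13b-chat-hf',
                 'meta-llama/Llama-2-70b-chat-hf', 'decapoda-research/llama-7b-hf',
                 'decapoda-research/llama-65b-hf', 'lmsys/vicuna-7b-v1.5',
                 'lmsys/vicuna-13b-v1.3', 'project-baize/merged-baize-30b']

    def load_benchmark_tokenizer(repo_id, model_path):
        """Return the tokenizer the benchmark should use for a given repo id."""
        if repo_id in LLAMA_IDS:
            # Llama-family models need LlamaTokenizer explicitly,
            # see https://github.com/intel-analytics/BigDL/issues/8944
            return LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
        # Other models (e.g. the chatglm family) keep using AutoTokenizer.
        return AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

Centralizing the ids in LLAMA_IDS also means a new llama-based checkpoint only has to be added in one place for all of the benchmark APIs modified here to pick it up.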