From f3fefdc9cee768ce330d5a340f0d491e12a63f08 Mon Sep 17 00:00:00 2001
From: Jiao Wang
Date: Mon, 18 Mar 2024 23:30:28 -0700
Subject: [PATCH] fix pad_token_id issue (#10425)

---
 .../CPU/HF-Transformers-AutoModels/Model/phi-2/generate.py    | 1 +
 python/llm/example/CPU/PyTorch-Models/Model/phi-2/generate.py | 1 +
 .../GPU/HF-Transformers-AutoModels/Model/phi-2/generate.py    | 1 +
 python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py | 3 ++-
 4 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-2/generate.py
index e33ec220..53a95623 100644
--- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-2/generate.py
+++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-2/generate.py
@@ -60,6 +60,7 @@ if __name__ == '__main__':
         # it is important to set `use_cache=True` explicitly in the `generate` function
         # to obtain optimal performance with BigDL-LLM INT4 optimizations
+        model.generation_config.pad_token_id = model.generation_config.eos_token_id
         # Note that phi-2 uses GenerationConfig to enable 'use_cache'
         output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict, generation_config = generation_config)
diff --git a/python/llm/example/CPU/PyTorch-Models/Model/phi-2/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/phi-2/generate.py
index b66a3b2b..a4f54355 100644
--- a/python/llm/example/CPU/PyTorch-Models/Model/phi-2/generate.py
+++ b/python/llm/example/CPU/PyTorch-Models/Model/phi-2/generate.py
@@ -52,6 +52,7 @@ if __name__ == '__main__':
     with torch.inference_mode():
         prompt = PHI_2_V1_PROMPT_FORMAT.format(prompt=args.prompt)
         input_ids = tokenizer.encode(prompt, return_tensors="pt")
+        model.generation_config.pad_token_id = model.generation_config.eos_token_id
         st = time.time()
         output = model.generate(input_ids, max_new_tokens=args.n_predict, generation_config = generation_config)
         end = time.time()
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-2/generate.py
index 79c2fbfd..d5aa3a74 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-2/generate.py
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-2/generate.py
@@ -59,6 +59,7 @@ if __name__ == '__main__':
         prompt = PHI2_PROMPT_FORMAT.format(prompt=args.prompt)
         input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
+        model.generation_config.pad_token_id = model.generation_config.eos_token_id
         # ipex model needs a warmup, then inference time can be accurate
         output = model.generate(input_ids,
                                 max_new_tokens=args.n_predict,
diff --git a/python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py
index 242f815d..a91b64e7 100644
--- a/python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py
+++ b/python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py
@@ -55,7 +55,8 @@ if __name__ == '__main__':
     with torch.inference_mode():
         prompt = PHI_2_V1_PROMPT_FORMAT.format(prompt=args.prompt)
         input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
-
+
+        model.generation_config.pad_token_id = model.generation_config.eos_token_id
         # ipex model needs a warmup, then inference time can be accurate
         output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict, generation_config = generation_config) # start inference
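
The change above repeats one line in each phi-2 example. For reference, here is a minimal sketch of that fix outside the patch context, assuming only the standard Hugging Face transformers API (no BigDL-LLM optimizations); the checkpoint name and prompt are illustrative placeholders, not taken from the patch.

# Minimal sketch of the pad_token_id fix, assuming the plain transformers API.
# "microsoft/phi-2" and the prompt are placeholders for illustration only.
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2")
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")

# phi-2 defines no pad token, so generate() would otherwise warn and fall back
# to eos_token_id on every call; setting it once on the generation config makes
# the padding behaviour explicit and silences the warning.
model.generation_config.pad_token_id = model.generation_config.eos_token_id

input_ids = tokenizer.encode("Question: What is AI? Answer:", return_tensors="pt")
output = model.generate(input_ids, do_sample=False, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))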