From 7082844f3f0bc7061d96e59bfafa4e81c9d379a4 Mon Sep 17 00:00:00 2001
From: "Jin, Qiao" <89779290+JinBridger@users.noreply.github.com>
Date: Tue, 3 Dec 2024 16:30:55 +0800
Subject: [PATCH] Fix NPU LLM example save/load tokenizer (#12485)

---
 .../LLM/Pipeline-Models/baichuan2.py                          | 4 +++-
 .../HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py  | 4 +++-
 .../HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py  | 4 +++-
 .../HF-Transformers-AutoModels/LLM/Pipeline-Models/minicpm.py | 4 +++-
 .../HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py    | 4 +++-
 .../example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py   | 4 +++-
 .../example/NPU/HF-Transformers-AutoModels/LLM/generate.py    | 4 +++-
 .../llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py  | 4 +++-
 .../llm/example/NPU/HF-Transformers-AutoModels/LLM/llama3.py  | 4 +++-
 .../llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py | 4 +++-
 python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py | 4 +++-
 11 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/baichuan2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/baichuan2.py
index f3e3ddbc..70efab14 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/baichuan2.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/baichuan2.py
@@ -79,6 +79,8 @@ if __name__ == "__main__":
                                                      transpose_value_cache=not args.disable_transpose_value_cache,
                                                      trust_remote_code=True,
                                                      save_directory=args.save_directory)
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        tokenizer.save_pretrained(args.save_directory)
     else:
         model = AutoModelForCausalLM.load_low_bit(
             args.save_directory,
@@ -90,8 +92,8 @@ if __name__ == "__main__":
             transpose_value_cache=not args.disable_transpose_value_cache,
             trust_remote_code=True
         )
+        tokenizer = AutoTokenizer.from_pretrained(args.save_directory, trust_remote_code=True)
 
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
     if args.disable_streaming:
         streamer = None
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py
index cb640bc7..d11b1891 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py
@@ -78,6 +78,8 @@ if __name__ == "__main__":
                                                      attn_implementation="eager",
                                                      transpose_value_cache=not args.disable_transpose_value_cache,
                                                      save_directory=args.save_directory)
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        tokenizer.save_pretrained(args.save_directory)
     else:
         model = AutoModelForCausalLM.load_low_bit(
             args.save_directory,
@@ -88,8 +90,8 @@ if __name__ == "__main__":
             pipeline=True,
             transpose_value_cache=not args.disable_transpose_value_cache,
         )
+        tokenizer = AutoTokenizer.from_pretrained(args.save_directory, trust_remote_code=True)
 
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
     if args.disable_streaming:
         streamer = None
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py
index ac3433b9..baf92337 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py
@@ -84,6 +84,8 @@ if __name__ == "__main__":
                                                      attn_implementation="eager",
                                                      transpose_value_cache=not args.disable_transpose_value_cache,
                                                      save_directory=args.save_directory)
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        tokenizer.save_pretrained(args.save_directory)
     else:
         model = AutoModelForCausalLM.load_low_bit(
             args.save_directory,
@@ -94,8 +96,8 @@ if __name__ == "__main__":
             pipeline=True,
             transpose_value_cache=not args.disable_transpose_value_cache,
         )
+        tokenizer = AutoTokenizer.from_pretrained(args.save_directory, trust_remote_code=True)
 
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
     if args.disable_streaming:
         streamer = None
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/minicpm.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/minicpm.py
index df5bd756..fe2868c2 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/minicpm.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/minicpm.py
@@ -66,6 +66,8 @@ if __name__ == "__main__":
                                                      transpose_value_cache=not args.disable_transpose_value_cache,
                                                      trust_remote_code=True,
                                                      save_directory=args.save_directory)
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        tokenizer.save_pretrained(args.save_directory)
     else:
         model = AutoModelForCausalLM.load_low_bit(
             args.save_directory,
@@ -77,8 +79,8 @@ if __name__ == "__main__":
             transpose_value_cache=not args.disable_transpose_value_cache,
             trust_remote_code=True
         )
+        tokenizer = AutoTokenizer.from_pretrained(args.save_directory, trust_remote_code=True)
 
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
     if args.disable_streaming:
         streamer = None
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py
index ef5ded70..e1f4be49 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py
@@ -70,6 +70,8 @@ if __name__ == "__main__":
                                                      mixed_precision=True,
                                                      trust_remote_code=True,
                                                      save_directory=args.save_directory)
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        tokenizer.save_pretrained(args.save_directory)
     else:
         model = AutoModelForCausalLM.load_low_bit(
             args.save_directory,
@@ -79,8 +81,8 @@ if __name__ == "__main__":
             max_prompt_len=args.max_prompt_len,
             pipeline=True,
             transpose_value_cache=not args.disable_transpose_value_cache)
+        tokenizer = AutoTokenizer.from_pretrained(args.save_directory, trust_remote_code=True)
 
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
     if args.disable_streaming:
         streamer = None
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py
index 05c47076..cdf26af1 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py
@@ -79,6 +79,8 @@ if __name__ == "__main__":
             transpose_value_cache=not args.disable_transpose_value_cache,
             save_directory=args.save_directory
         )
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        tokenizer.save_pretrained(args.save_directory)
     else:
         model = AutoModelForCausalLM.load_low_bit(
             args.save_directory,
@@ -90,8 +92,8 @@ if __name__ == "__main__":
             transpose_value_cache=not args.disable_transpose_value_cache,
             trust_remote_code=True,
         )
+        tokenizer = AutoTokenizer.from_pretrained(args.save_directory, trust_remote_code=True)
 
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
     DEFAULT_SYSTEM_PROMPT = """\
 """
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/generate.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/generate.py
index 41a14e1a..d3abd13a 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/generate.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/generate.py
@@ -43,7 +43,6 @@ if __name__ == '__main__':
     args = parser.parse_args()
 
     model_path = args.repo_id_or_model_path
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
     if not args.lowbit_path or not os.path.exists(args.lowbit_path):
         model = AutoModelForCausalLM.from_pretrained(
@@ -52,6 +51,8 @@ if __name__ == '__main__':
             load_in_low_bit=args.load_in_low_bit,
             attn_implementation="eager"
         )
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        tokenizer.save_pretrained(args.lowbit_path)
     else:
         model = AutoModelForCausalLM.load_low_bit(
             args.lowbit_path,
@@ -59,6 +60,7 @@ if __name__ == '__main__':
             bigdl_transformers_low_bit=args.load_in_low_bit,
             attn_implementation="eager"
         )
+        tokenizer = AutoTokenizer.from_pretrained(args.lowbit_path, trust_remote_code=True)
 
     print(model)
 
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py
index 83fe6d89..d981f39f 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py
@@ -79,6 +79,8 @@ if __name__ == "__main__":
             transpose_value_cache=not args.disable_transpose_value_cache,
             save_directory=args.save_directory
         )
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        tokenizer.save_pretrained(args.save_directory)
     else:
         model = AutoModelForCausalLM.load_low_bit(
             args.save_directory,
@@ -89,8 +91,8 @@ if __name__ == "__main__":
             max_prompt_len=args.max_prompt_len,
             transpose_value_cache=not args.disable_transpose_value_cache,
         )
+        tokenizer = AutoTokenizer.from_pretrained(args.save_directory, trust_remote_code=True)
 
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
     DEFAULT_SYSTEM_PROMPT = """\
 """
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama3.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama3.py
index 85cca7fd..35ee4902 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama3.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama3.py
@@ -80,6 +80,8 @@ if __name__ == "__main__":
             transpose_value_cache=not args.disable_transpose_value_cache,
             save_directory=args.save_directory
         )
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        tokenizer.save_pretrained(args.save_directory)
     else:
         model = AutoModelForCausalLM.load_low_bit(
             args.save_directory,
@@ -90,8 +92,8 @@ if __name__ == "__main__":
             max_prompt_len=args.max_prompt_len,
             transpose_value_cache=not args.disable_transpose_value_cache,
         )
+        tokenizer = AutoTokenizer.from_pretrained(args.save_directory, trust_remote_code=True)
 
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
     DEFAULT_SYSTEM_PROMPT = """\
 """
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py
index 5ec0bf72..b177042c 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py
@@ -65,6 +65,8 @@ if __name__ == "__main__":
             transpose_value_cache=not args.disable_transpose_value_cache,
             save_directory=args.save_directory
         )
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        tokenizer.save_pretrained(args.save_directory)
     else:
         model = AutoModelForCausalLM.load_low_bit(
             args.save_directory,
@@ -76,7 +78,7 @@ if __name__ == "__main__":
             transpose_value_cache=not args.disable_transpose_value_cache,
             trust_remote_code=True,
         )
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(args.save_directory, trust_remote_code=True)
 
     print("-" * 80)
     print("done")
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py
index 9f03c908..caf6d1b3 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py
@@ -71,6 +71,8 @@ if __name__ == "__main__":
             quantization_group_size=args.quantization_group_size,
             save_directory=args.save_directory
         )
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        tokenizer.save_pretrained(args.save_directory)
     else:
         model = AutoModelForCausalLM.load_low_bit(
             args.save_directory,
@@ -81,8 +83,8 @@ if __name__ == "__main__":
             max_prompt_len=args.max_prompt_len,
             transpose_value_cache=not args.disable_transpose_value_cache,
         )
+        tokenizer = AutoTokenizer.from_pretrained(args.save_directory, trust_remote_code=True)
 
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
     print("-" * 80)
     print("done")