From 3fe2ea308145e6a801a8dc473aab2777aa69cf64 Mon Sep 17 00:00:00 2001 From: Ruonan Wang Date: Mon, 28 Oct 2024 16:05:49 +0800 Subject: [PATCH] [NPU] Reuse prefill of acc lib for pipeline (#12279) * first commit * update example * fix style * update example * embedding as const * fix generate * code refactor * meet code review * fix style * change max_output_len to max_context_len * fix all-in-one * fix example * add check for new tokens --- python/llm/dev/benchmark/all-in-one/run.py | 16 +- .../LLM/Pipeline-Models/llama2.py | 10 +- .../LLM/Pipeline-Models/llama3.py | 10 +- .../LLM/baichuan2.py | 6 +- .../HF-Transformers-AutoModels/LLM/llama.py | 6 +- .../HF-Transformers-AutoModels/LLM/minicpm.py | 6 +- .../HF-Transformers-AutoModels/LLM/qwen.py | 6 +- .../Multimodal/minicpm-llama3-v2.5.py | 4 +- .../Multimodal/minicpm_v_2_6.py | 4 +- .../src/ipex_llm/transformers/npu_model.py | 21 +- .../transformers/npu_models/convert_mp.py | 90 +++++---- .../npu_pipeline_model/convert_pipeline.py | 188 +++++++++++------- .../transformers/npu_pipeline_model/llama.py | 3 +- 13 files changed, 224 insertions(+), 146 deletions(-) diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py index e9e4e740..e56c8752 100644 --- a/python/llm/dev/benchmark/all-in-one/run.py +++ b/python/llm/dev/benchmark/all-in-one/run.py @@ -617,23 +617,23 @@ def transformers_int4_npu_win(repo_id, model_path = get_model_path(repo_id, local_model_hub) in_out_len = in_out_pairs[0].split("-") - max_output_len = max(int(in_out_len[0]) + int(in_out_len[1]), 1024) + max_context_len = max(int(in_out_len[0]) + int(in_out_len[1]), 1024) # Load model in 4 bit, # which convert the relevant layers in the model into INT4 format st = time.perf_counter() if repo_id in CHATGLM_IDS: model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, - optimize_model=optimize_model, max_output_len=max_output_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache, + optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache, torch_dtype=torch.float16, attn_implementation="eager").eval() tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) elif repo_id in LLAMA_IDS: model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, torch_dtype=torch.float16, - optimize_model=optimize_model, max_output_len=max_output_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache, + optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache, use_cache=True, attn_implementation="eager").eval() tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True) else: model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, torch_dtype=torch.float16, - optimize_model=optimize_model, max_output_len=max_output_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache, + optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache, use_cache=True, attn_implementation="eager").eval() tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) end = time.perf_counter() @@ -690,23 +690,23 @@ def 
run_transformer_int4_loadlowbit_npu_win(repo_id, model_path = get_model_path(repo_id, local_model_hub) in_out_len = in_out_pairs[0].split("-") - max_output_len = max(int(in_out_len[0]) + int(in_out_len[1]), 1024) + max_context_len = max(int(in_out_len[0]) + int(in_out_len[1]), 1024) # Load model in 4 bit, # which convert the relevant layers in the model into INT4 format st = time.perf_counter() if repo_id in CHATGLM_IDS: model = AutoModel.load_low_bit(model_path+'-npu-'+low_bit, trust_remote_code=True, - optimize_model=optimize_model, max_output_len=max_output_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache, + optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache, torch_dtype=torch.float16, attn_implementation="eager").eval() tokenizer = AutoTokenizer.from_pretrained(model_path+'-npu-'+low_bit, trust_remote_code=True) elif repo_id in LLAMA_IDS: model = AutoModelForCausalLM.load_low_bit(model_path+'-npu-'+low_bit, trust_remote_code=True, torch_dtype=torch.float16, - optimize_model=optimize_model, max_output_len=max_output_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache, + optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache, use_cache=True, attn_implementation="eager").eval() tokenizer = LlamaTokenizer.from_pretrained(model_path+'-npu-'+low_bit, trust_remote_code=True) else: model = AutoModelForCausalLM.load_low_bit(model_path+'-npu-'+low_bit, trust_remote_code=True, torch_dtype=torch.float16, - optimize_model=optimize_model, max_output_len=max_output_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache, + optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache, use_cache=True, attn_implementation="eager").eval() tokenizer = AutoTokenizer.from_pretrained(model_path+'-npu-'+low_bit, trust_remote_code=True) end = time.perf_counter() diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py index 08bbb55e..35d7826a 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama2.py @@ -51,7 +51,9 @@ if __name__ == "__main__": parser.add_argument('--prompt', type=str, default="What is AI?", help='Prompt to infer') parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") - parser.add_argument("--max-output-len", type=int, default=1024) + parser.add_argument("--max-context-len", type=int, default=1024) + parser.add_argument("--max-prompt-len", type=int, default=960) + parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) args = parser.parse_args() model_path = args.repo_id_or_model_path @@ -59,9 +61,11 @@ if __name__ == "__main__": model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, pipeline=True, - max_output_len=args.max_output_len, + max_context_len=args.max_context_len, + max_prompt_len=args.max_prompt_len, torch_dtype=torch.float16, - attn_implementation="eager") + attn_implementation="eager", + transpose_value_cache=not args.disable_transpose_value_cache) tokenizer = AutoTokenizer.from_pretrained(model_path, 
trust_remote_code=True) diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py index 801efa10..a3a8bf41 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama3.py @@ -57,7 +57,9 @@ if __name__ == "__main__": parser.add_argument('--prompt', type=str, default="What is AI?", help='Prompt to infer') parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") - parser.add_argument("--max-output-len", type=int, default=1024) + parser.add_argument("--max-context-len", type=int, default=1024) + parser.add_argument("--max-prompt-len", type=int, default=960) + parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) args = parser.parse_args() model_path = args.repo_id_or_model_path @@ -66,8 +68,10 @@ if __name__ == "__main__": torch_dtype=torch.float16, optimize_model=True, pipeline=True, - max_output_len=args.max_output_len, - attn_implementation="eager") + max_context_len=args.max_context_len, + max_prompt_len=args.max_prompt_len, + attn_implementation="eager", + transpose_value_cache=not args.disable_transpose_value_cache) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py index d9af25df..1d528357 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py @@ -59,7 +59,7 @@ if __name__ == "__main__": parser.add_argument('--prompt', type=str, default="What is AI?", help='Prompt to infer') parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") - parser.add_argument("--max-output-len", type=int, default=1024) + parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=512) parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) parser.add_argument("--intra-pp", type=int, default=2) @@ -76,7 +76,7 @@ if __name__ == "__main__": attn_implementation="eager", load_in_low_bit="sym_int4", optimize_model=True, - max_output_len=args.max_output_len, + max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, intra_pp=args.intra_pp, inter_pp=args.inter_pp, @@ -88,7 +88,7 @@ if __name__ == "__main__": attn_implementation="eager", torch_dtype=torch.bfloat16, optimize_model=True, - max_output_len=args.max_output_len, + max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, intra_pp=args.intra_pp, inter_pp=args.inter_pp, diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama.py index 19138da5..97aed851 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama.py @@ -59,7 +59,7 @@ if __name__ == "__main__": parser.add_argument('--prompt', type=str, default="What is AI?", help='Prompt to infer') parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") - parser.add_argument("--max-output-len", type=int, default=1024) + parser.add_argument("--max-context-len", type=int, default=1024) 
parser.add_argument("--max-prompt-len", type=int, default=512) parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) parser.add_argument("--intra-pp", type=int, default=2) @@ -76,7 +76,7 @@ if __name__ == "__main__": attn_implementation="eager", load_in_low_bit="sym_int4", optimize_model=True, - max_output_len=args.max_output_len, + max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, intra_pp=args.intra_pp, inter_pp=args.inter_pp, @@ -88,7 +88,7 @@ if __name__ == "__main__": attn_implementation="eager", torch_dtype=torch.float16, optimize_model=True, - max_output_len=args.max_output_len, + max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, intra_pp=args.intra_pp, inter_pp=args.inter_pp, diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py index 79786167..8ac322e2 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py @@ -46,7 +46,7 @@ if __name__ == "__main__": parser.add_argument('--prompt', type=str, default="What is AI?", help='Prompt to infer') parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") - parser.add_argument("--max-output-len", type=int, default=1024) + parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=512) parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) parser.add_argument("--intra-pp", type=int, default=2) @@ -62,7 +62,7 @@ if __name__ == "__main__": attn_implementation="eager", load_in_low_bit="sym_int4", optimize_model=True, - max_output_len=args.max_output_len, + max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, intra_pp=args.intra_pp, inter_pp=args.inter_pp, @@ -74,7 +74,7 @@ if __name__ == "__main__": attn_implementation="eager", torch_dtype=torch.float16, optimize_model=True, - max_output_len=args.max_output_len, + max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, intra_pp=args.intra_pp, inter_pp=args.inter_pp, diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py index dd1e958c..ce69a52a 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py @@ -46,7 +46,7 @@ if __name__ == "__main__": parser.add_argument('--prompt', type=str, default="AI是什么?", help='Prompt to infer') parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") - parser.add_argument("--max-output-len", type=int, default=1024) + parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=512) parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) parser.add_argument("--intra-pp", type=int, default=None) @@ -64,7 +64,7 @@ if __name__ == "__main__": attn_implementation="eager", load_in_low_bit="sym_int4", optimize_model=True, - max_output_len=args.max_output_len, + max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, intra_pp=args.intra_pp, inter_pp=args.inter_pp, @@ -77,7 +77,7 @@ if __name__ == "__main__": attn_implementation="eager", torch_dtype=torch.float16, optimize_model=True, - max_output_len=args.max_output_len, + 
max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, intra_pp=args.intra_pp, inter_pp=args.inter_pp, diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm-llama3-v2.5.py b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm-llama3-v2.5.py index 86b417b2..42d4fdd7 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm-llama3-v2.5.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm-llama3-v2.5.py @@ -45,7 +45,7 @@ if __name__ == "__main__": parser.add_argument('--prompt', type=str, default="What is in the image?", help='Prompt to infer') parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") - parser.add_argument("--max-output-len", type=int, default=1024) + parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=512) parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) parser.add_argument("--intra-pp", type=int, default=2) @@ -61,7 +61,7 @@ if __name__ == "__main__": attn_implementation="eager", load_in_low_bit="sym_int4", optimize_model=True, - max_output_len=args.max_output_len, + max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, intra_pp=args.intra_pp, inter_pp=args.inter_pp, diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py index 259b8c12..24ee9053 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py @@ -36,7 +36,7 @@ if __name__ == '__main__': parser.add_argument('--prompt', type=str, default="What is in this image?", help='Prompt to infer') parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") - parser.add_argument("--max-output-len", type=int, default=1024) + parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=512) parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) parser.add_argument("--intra-pp", type=int, default=None) @@ -52,7 +52,7 @@ if __name__ == '__main__': attn_implementation="eager", load_in_low_bit="sym_int4", optimize_model=True, - max_output_len=args.max_output_len, + max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, intra_pp=args.intra_pp, inter_pp=args.inter_pp, diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index aca3fb25..1a855bb0 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -124,8 +124,8 @@ class _BaseAutoModelClass: ignore_argument(kwargs, "pipeline_parallel_stages") optimize_model = kwargs.pop("optimize_model", False) pipeline = kwargs.pop("pipeline", False) - max_output_len = kwargs.pop("max_output_len", 1024) - max_output_len = max_output_len - 1 + max_context_len = kwargs.pop("max_context_len", 1024) + max_context_len = max_context_len - 1 max_prompt_len = kwargs.pop("max_prompt_len", 512) inter_pp = kwargs.pop("inter_pp", None) intra_pp = kwargs.pop("intra_pp", None) @@ -169,10 +169,10 @@ class _BaseAutoModelClass: if optimize_model: invalidInputError( - max_prompt_len < max_output_len, + max_prompt_len < max_context_len, ( 
f"max_prompt_len ({max_prompt_len}) should be less" - " than max_output_len ({max_output_len})" + " than max_context_len ({max_context_len})" ), ) optimize_kwargs = { @@ -182,7 +182,7 @@ class _BaseAutoModelClass: "quantization_group_size": quantization_group_size, "modules_to_not_convert": modules_to_not_convert, "pipeline": pipeline, - "max_output_len": max_output_len, + "max_context_len": max_context_len, "max_prompt_len": max_prompt_len, "inter_pp": inter_pp, "intra_pp": intra_pp, @@ -219,7 +219,7 @@ class _BaseAutoModelClass: quantization_group_size = kwargs.pop("quantization_group_size", 0) modules_to_not_convert = kwargs.pop("modules_to_not_convert", []) pipeline = kwargs.pop("pipeline", False) - max_output_len = kwargs.pop("max_output_len", 1024) + max_context_len = kwargs.pop("max_context_len", 1024) max_prompt_len = kwargs.pop("max_prompt_len", 512) inter_pp = kwargs.pop("inter_pp", None) intra_pp = kwargs.pop("intra_pp", None) @@ -246,7 +246,7 @@ class _BaseAutoModelClass: if not pipeline: optimize_llm( llm, - max_output_len=max_output_len, + max_context_len=max_context_len, max_prompt_len=max_prompt_len, inter_pp=inter_pp, intra_pp=intra_pp, @@ -258,7 +258,8 @@ class _BaseAutoModelClass: from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \ import convert_llm convert_llm(llm, - kv_len=max_output_len, + kv_len=max_context_len, + max_prompt_len=max_prompt_len, transpose_value_cache=transpose_value_cache) return model @@ -598,7 +599,7 @@ class FunAsrAutoModel(_BaseAutoModelClass): model = kwargs.pop("model") qtype = kwargs.pop("qtype", "sym_int8") modules_to_not_convert = kwargs.pop("modules_to_not_convert", []) - max_output_len = kwargs.pop("max_output_len", 1024) + max_context_len = kwargs.pop("max_context_len", 1024) max_prompt_len = kwargs.pop("max_prompt_len", 512) inter_pp = kwargs.pop("inter_pp", None) intra_pp = kwargs.pop("intra_pp", None) @@ -618,7 +619,7 @@ class FunAsrAutoModel(_BaseAutoModelClass): optimize_funasr( model, - max_output_len=max_output_len, + max_context_len=max_context_len, max_prompt_len=max_prompt_len, inter_pp=inter_pp, intra_pp=intra_pp, diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py index e2c37294..cb4e9432 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py @@ -154,9 +154,47 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision, model.lm_head = new_linear +def convert_llama( + model: torch.nn.Module, + max_output_len=1024, + max_prompt_len=1024, + decoder=False, + inter_pp=None, + intra_pp=None, + transpose_value_cache=True, +): + from ipex_llm.transformers.npu_models.llama_mp import gen_llama_fused_model_forward + from ipex_llm.transformers.npu_models.llama_mp import DecodeRunner, PrefillRunner + from transformers.models.llama.modeling_llama import LlamaModel + + if decoder: + decode_runner = DecodeRunner( + model, + max_seq_len=max_output_len, + inter_pp=inter_pp, + intra_pp=intra_pp, + transpose_value_cache=transpose_value_cache, + ) + else: + decode_runner = None + prefill_runner = PrefillRunner( + model, + max_output_len=max_output_len, + max_prompt_len=max_prompt_len, + transpose_value_cache=transpose_value_cache, + ) + llama_model_forward = gen_llama_fused_model_forward( + prefill_runner=prefill_runner, decode_runner=decode_runner + ) + convert_forward(model, LlamaModel, llama_model_forward) + from 
transformers.models.llama.modeling_llama import LlamaForCausalLM + from ipex_llm.transformers.npu_models.llama_mp import llama2_casullm_forward + convert_forward(model, LlamaForCausalLM, llama2_casullm_forward) + + def optimize_llm( model: torch.nn.Module, - max_output_len=1024, + max_context_len=1024, max_prompt_len=1024, inter_pp=None, intra_pp=None, @@ -168,31 +206,13 @@ def optimize_llm( intra_pp = 2 if inter_pp is None: inter_pp = 2 if group_size == 0 else 8 - - from ipex_llm.transformers.npu_models.llama_mp import gen_llama_fused_model_forward - from ipex_llm.transformers.npu_models.llama_mp import DecodeRunner, PrefillRunner - from transformers.models.llama.modeling_llama import LlamaModel - - decode_runner = DecodeRunner( - model, - max_seq_len=max_output_len, - inter_pp=inter_pp, - intra_pp=intra_pp, - transpose_value_cache=transpose_value_cache, - ) - prefill_runner = PrefillRunner( - model, - max_output_len=max_output_len, - max_prompt_len=max_prompt_len, - transpose_value_cache=transpose_value_cache, - ) - llama_model_forward = gen_llama_fused_model_forward( - prefill_runner=prefill_runner, decode_runner=decode_runner - ) - convert_forward(model, LlamaModel, llama_model_forward) - from transformers.models.llama.modeling_llama import LlamaForCausalLM - from ipex_llm.transformers.npu_models.llama_mp import llama2_casullm_forward - convert_forward(model, LlamaForCausalLM, llama2_casullm_forward) + convert_llama(model, + max_output_len=max_context_len, + max_prompt_len=max_prompt_len, + inter_pp=inter_pp, + intra_pp=intra_pp, + decoder=True, + transpose_value_cache=transpose_value_cache) elif model.config.model_type == "qwen2" and model.config.num_hidden_layers == 28: # for qwen2-1.5B and qwen2-7B if intra_pp is None: @@ -212,14 +232,14 @@ def optimize_llm( decode_runner = DecodeRunner( model, - max_seq_len=max_output_len, + max_seq_len=max_context_len, inter_pp=inter_pp, intra_pp=intra_pp, transpose_value_cache=transpose_value_cache, ) prefill_runner = PrefillRunner( model, - max_output_len=max_output_len, + max_output_len=max_context_len, max_prompt_len=max_prompt_len, transpose_value_cache=transpose_value_cache, ) @@ -252,14 +272,14 @@ def optimize_llm( decode_runner = DecodeRunner( model, - max_seq_len=max_output_len, + max_seq_len=max_context_len, inter_pp=inter_pp, intra_pp=intra_pp, transpose_value_cache=transpose_cache, ) prefill_runner = PrefillRunner( model, - max_output_len=max_output_len, + max_output_len=max_context_len, max_prompt_len=max_prompt_len, transpose_value_cache=transpose_cache, ) @@ -281,14 +301,14 @@ def optimize_llm( from ipex_llm.transformers.npu_models.baichuan_mp import DecodeRunner, PrefillRunner decode_runner = DecodeRunner( model, - max_seq_len=max_output_len, + max_seq_len=max_context_len, inter_pp=inter_pp, intra_pp=intra_pp, transpose_value_cache=transpose_value_cache, ) prefill_runner = PrefillRunner( model, - max_output_len=max_output_len, + max_output_len=max_context_len, max_prompt_len=max_prompt_len, transpose_value_cache=transpose_value_cache, ) @@ -305,7 +325,7 @@ def optimize_llm( def optimize_funasr( model: torch.nn.Module, - max_output_len=1024, + max_context_len=1024, max_prompt_len=1024, inter_pp=None, intra_pp=None, @@ -320,7 +340,7 @@ def optimize_funasr( from ipex_llm.transformers.npu_models.paraformer_mp import PrefillRunner, DecodeRunner prefill_runner = PrefillRunner( model, - max_output_len=max_output_len, + max_output_len=max_context_len, max_prompt_len=max_prompt_len, transpose_value_cache=transpose_value_cache, ) @@ -329,7 
+349,7 @@ def optimize_funasr( ) decode_runner = DecodeRunner( model, - max_seq_len=max_output_len, + max_seq_len=max_context_len, inter_pp=inter_pp, intra_pp=intra_pp, transpose_value_cache=transpose_value_cache, diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py index 883616c8..34407b93 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py @@ -61,87 +61,129 @@ def generate( new_tokens = new_generate_kwargs['max_new_tokens'] invalidInputError(input_length + new_tokens <= self.kv_len + 1, - "Input plus output tokens should not exceed max_output_len.") + "Input plus output tokens should not exceed max_context_len.") + # TODO: may optimize this part later + invalidInputError(new_tokens < 1024, + f"Generated tokens ({new_tokens}) exceed named pipeline limitation.") - # start generate_serve by Thread - thread = threading.Thread(target=generate_serve, - args=(self.kv_len, self.num_head, - self.head_dim, self.num_layers, - self.transpose_value_cache, - new_tokens - 1)) - thread.start() - - in_pipe_path = "\\\\.\\pipe\\llminputpipe" - out_pipe_path = "\\\\.\\pipe\\llmoutputpipe" - - while True: - try: - input_pipe = open(in_pipe_path, "wb") - except: - print('Waiting for input pipe') - time.sleep(1) - else: - break - - while True: - try: - output_pipe = open(out_pipe_path, "rb") - except: - print('Waiting for output pipe') - time.sleep(1) - else: - break - - bdata = b'' - for i in range(0, input_length): - d = int(numpy_input[i]) - bdata = bdata + d.to_bytes(4, sys.byteorder) - - if "eos_token_id" not in new_generate_kwargs: - eos = 0xffffffff - else: - eos = new_generate_kwargs["eos_token_id"] - - bdata = bdata + eos.to_bytes(4, sys.byteorder) - - time_start = time.perf_counter() - - input_pipe.write(bytearray(bdata)) - input_pipe.flush() - - buffersize = 4 output_tokens = [] - while True: - data = output_pipe.read(buffersize) - if len(data) == 0: - break - token = int.from_bytes(data, sys.byteorder) - idx += 1 - if time_t1 is None: - time_t1 = time.perf_counter() + + with tempfile.TemporaryDirectory() as temp_dir: + # run prefill with PrefillRunner + output = self(input_ids=inputs, + attention_mask=torch.ones(1, inputs.shape[1]).int()) + logits = output.logits + input_id = torch.argmax(logits[:, -1, :], dim=1) + input_id.to(torch.int32).numpy().tofile(os.path.join(temp_dir, "input_id.bin")) + position = np.int64(inputs.shape[1]) + position.tofile(os.path.join(temp_dir, "position.bin")) + past_key_values = output.past_key_values + key_cache = past_key_values.key_cache + value_cache = past_key_values.value_cache + for layer in range(self.num_layers): + key_ = key_cache[layer] + val_ = value_cache[layer] + new_size = ( + key_.size(0), + key_.size(1), + self.kv_len, + key_.size(3), + ) + key = key_.as_strided(new_size, key_.stride(), storage_offset=0) + if not self.transpose_value_cache: + val = val_.as_strided(new_size, val_.stride(), storage_offset=0) + else: + new_size = ( + val_.size(0), + val_.size(1), + val_.size(3), + self.kv_len, + ) + val_cache = val_.transpose(-1, -2) + val = val_cache.as_strided(new_size, val_cache.stride(), storage_offset=0) + key.to(torch.float16).numpy().tofile(os.path.join(temp_dir, f"key_cache_{layer}.bin")) + val.to(torch.float16).numpy().tofile(os.path.join(temp_dir, f"value_cache_{layer}.bin")) + + token = 
input_id.to(torch.int32).item() output_tokens.append(torch.tensor([token])) if streamer is not None: streamer.put(torch.tensor([token])) - if token == eos: - break - output = torch.stack(output_tokens, dim=1) - output = torch.cat((inputs, output), dim=1) - if streamer is not None: - streamer.end() + if "eos_token_id" not in new_generate_kwargs: + eos = 0xffffffff + else: + eos = new_generate_kwargs["eos_token_id"] + + time_t1 = time.perf_counter() + idx += 1 + + # start generate_serve by Thread + thread = threading.Thread(target=generate_serve, + args=(self.kv_len, self.num_head, + self.head_dim, self.num_layers, + self.transpose_value_cache, + new_tokens - 2)) + thread.start() + + in_pipe_path = "\\\\.\\pipe\\llminputpipe" + out_pipe_path = "\\\\.\\pipe\\llmoutputpipe" + + while True: + try: + input_pipe = open(in_pipe_path, "wb") + except: + print('Waiting for input pipe') + time.sleep(1) + else: + break + + while True: + try: + output_pipe = open(out_pipe_path, "rb") + except: + print('Waiting for output pipe') + time.sleep(1) + else: + break + + time_start = time.perf_counter() + + bdata = str.encode(str(temp_dir)) + invalidInputError(len(bdata) <= 2000, + f"Leng of input directory is too long ({len(bdata)}), " + "which may cause read error.") + input_pipe.write(bdata) + input_pipe.flush() + + buffersize = 4 + while True: + data = output_pipe.read(buffersize) + if len(data) == 0: + break + token = int.from_bytes(data, sys.byteorder) + idx += 1 + output_tokens.append(torch.tensor([token])) + if streamer is not None: + streamer.put(torch.tensor([token])) + if token == eos: + break + + output = torch.stack(output_tokens, dim=1) + output = torch.cat((inputs, output), dim=1) + if streamer is not None: + streamer.end() thread.join() time_end = time.perf_counter() if do_print: - print(f" Start the thread and connect the pipe time: {(time_start - time_start_all):.2f} s") + print(f" Start the thread and connect the pipe time: {(time_start - time_t1):.2f} s") print(f" Number of input tokens: {input_length}") print(f" Generated tokens: {idx}") - print(f" First token generation time: {(time_t1 - time_start):.2f} s") - print(f" Generation average latency: {(time_end - time_t1)*1000 /(idx - 1):.2f} ms, " - f"({(idx - 1)/(time_end - time_t1):.2f} token/s)") - print(f" Generation time: {(time_end - time_start):.2f} s\n") - + print(f" First token generation time: {(time_t1 - time_start_all):.2f} s") + print(f" Generation average latency: {(time_end - time_start) * 1000 /(idx - 1):.2f} ms, " + f"({(idx - 1)/(time_end - time_start):.2f} token/s)") + print(f" Generation time: {(time_end - time_start_all - (time_start - time_t1)):.2f} s\n") return output @@ -182,8 +224,15 @@ def update_names_of_IR_and_export_blob(model, model_name, dir): def convert_llm(model: torch.nn.Module, kv_len: int, + max_prompt_len: int, transpose_value_cache: bool): if model.config.model_type == "llama": + from ipex_llm.transformers.npu_models.convert_mp import convert_llama + convert_llama(model, + max_output_len=kv_len, + max_prompt_len=max_prompt_len, + decoder=False, + transpose_value_cache=transpose_value_cache) from .llama import LowBitLlamaLMHead, LlamaEmbedding with tempfile.TemporaryDirectory() as temp_dir: # generate lm_head blob @@ -231,13 +280,12 @@ def convert_llm(model: torch.nn.Module, new_embedding = LlamaEmbedding( vocab_size=model.config.vocab_size, embedding_dim=model.config.hidden_size, + embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(), padding_idx=model.config.pad_token_id, 
dtype=np.float16, ) first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding", temp_dir) - bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin") - embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file) # generate decoder layer blob from ipex_llm.transformers.npu_models.llama_mp import LowBitLlamaMultiDecoderlayer diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/llama.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/llama.py index 56462fb5..9ad6acc1 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/llama.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/llama.py @@ -80,6 +80,7 @@ class LlamaEmbedding(NNFactory): self, vocab_size, embedding_dim, + embedding_weight, padding_idx, dtype, # fp16 device: str = "NPU", @@ -91,7 +92,7 @@ class LlamaEmbedding(NNFactory): self.dtype = dtype # define input - weight = self.parameter((vocab_size, embedding_dim)) + weight = self.constant(embedding_weight) input = self.parameter((1, 1), dtype=np.int32) if padding_idx == -1:
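
For context on the argument rename in this patch, a minimal loading sketch for the pipeline examples after the change. The import path follows the example scripts touched in the diff, the checkpoint path is a placeholder, and the values mirror the new defaults in llama2.py/llama3.py (max_context_len=1024, max_prompt_len=960); treat this as a sketch, not the canonical example.

    import torch
    from transformers import AutoTokenizer
    # Assumed import path, matching the NPU example scripts changed in this patch.
    from ipex_llm.transformers.npu_model import AutoModelForCausalLM

    model_path = "meta-llama/Llama-2-7b-chat-hf"  # placeholder checkpoint

    # max_output_len is renamed to max_context_len (total prompt + generated tokens);
    # max_prompt_len bounds the prefill length and must stay below max_context_len;
    # transpose_value_cache is now exposed via --disable-transpose-value-cache.
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        optimize_model=True,
        pipeline=True,
        max_context_len=1024,
        max_prompt_len=960,
        attn_implementation="eager",
        transpose_value_cache=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)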
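A usage sketch of the reworked pipeline generate() path, assuming a model and tokenizer loaded as in the previous sketch; the prompt is a placeholder, and do_print is the flag the patched generate() checks before printing its timing summary. Two constraints from the diff apply: input plus generated tokens must fit within max_context_len (kv_len is max_context_len - 1), and max_new_tokens must stay below 1024 (the current named-pipe decode limit).

    prompt = "What is AI?"  # placeholder prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    with torch.inference_mode():
        # Prefill now runs in-process through the reused PrefillRunner path; the first
        # token, the position and the per-layer KV caches are dumped to a temporary
        # directory, and generate_serve decodes the remaining tokens over named pipes.
        output = model.generate(input_ids,
                                max_new_tokens=32,
                                do_print=True)
    print(tokenizer.decode(output[0], skip_special_tokens=False))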
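To make the "embedding as const" change concrete: LlamaEmbedding now takes the detached fp16 embedding weight and folds it into the exported IR via constant(), so convert_pipeline no longer writes a separate model_embedding_input_0.bin. A schematic sketch of the new constructor usage with a stand-in embedding table (Llama-2-7B style shapes); in convert_llm the table is the model's own embed_tokens and padding_idx comes from model.config.pad_token_id.

    import numpy as np
    import torch
    from ipex_llm.transformers.npu_pipeline_model.llama import LlamaEmbedding

    # Stand-in embedding table; convert_llm uses the model's real embedding layer.
    embedding_layer = torch.nn.Embedding(num_embeddings=32000, embedding_dim=4096)

    # The fp16 weight is baked into the blob at build time instead of being declared
    # as a graph parameter and bound from a runtime weight file.
    new_embedding = LlamaEmbedding(
        vocab_size=32000,
        embedding_dim=4096,
        embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
        padding_idx=-1,  # illustrative; convert_llm passes model.config.pad_token_id
        dtype=np.float16,
    )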