[NPU] Reuse prefill of acc lib for pipeline (#12279)
* first commit
* update example
* fix style
* update example
* embedding as const
* fix generate
* code refactor
* meet code review
* fix style
* change max_output_len to max_context_len
* fix all-in-one
* fix example
* add check for new tokens
parent 42a528ded9
commit 3fe2ea3081

13 changed files with 224 additions and 146 deletions
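The user-visible change is the renamed loading argument: `max_output_len` becomes `max_context_len` across the NPU `from_pretrained`/`load_low_bit` entry points and the example scripts below. A minimal usage sketch against the updated arguments (model path and sizes are placeholders, and the import path assumes the NPU wrappers in `ipex_llm.transformers.npu_model`):

import torch
from transformers import AutoTokenizer
from ipex_llm.transformers.npu_model import AutoModelForCausalLM  # assumed NPU wrapper import path

model_path = "path/to/your/llama2-model"  # placeholder

# max_context_len bounds prompt + generated tokens; max_prompt_len bounds the prompt alone.
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_low_bit="sym_int4",
                                             optimize_model=True,
                                             pipeline=True,
                                             max_context_len=1024,
                                             max_prompt_len=512,
                                             transpose_value_cache=True,
                                             torch_dtype=torch.float16,
                                             attn_implementation="eager",
                                             trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)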
@@ -617,23 +617,23 @@ def transformers_int4_npu_win(repo_id,
 
     model_path = get_model_path(repo_id, local_model_hub)
     in_out_len = in_out_pairs[0].split("-")
-    max_output_len = max(int(in_out_len[0]) + int(in_out_len[1]), 1024)
+    max_context_len = max(int(in_out_len[0]) + int(in_out_len[1]), 1024)
     # Load model in 4 bit,
     # which convert the relevant layers in the model into INT4 format
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True,
-                                          optimize_model=optimize_model, max_output_len=max_output_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
+                                          optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
                                           torch_dtype=torch.float16, attn_implementation="eager").eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, torch_dtype=torch.float16,
-                                                     optimize_model=optimize_model, max_output_len=max_output_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
+                                                     optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
                                                      use_cache=True, attn_implementation="eager").eval()
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, torch_dtype=torch.float16,
-                                                     optimize_model=optimize_model, max_output_len=max_output_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
+                                                     optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
                                                      use_cache=True, attn_implementation="eager").eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     end = time.perf_counter()

@@ -690,23 +690,23 @@ def run_transformer_int4_loadlowbit_npu_win(repo_id,
 
     model_path = get_model_path(repo_id, local_model_hub)
     in_out_len = in_out_pairs[0].split("-")
-    max_output_len = max(int(in_out_len[0]) + int(in_out_len[1]), 1024)
+    max_context_len = max(int(in_out_len[0]) + int(in_out_len[1]), 1024)
     # Load model in 4 bit,
     # which convert the relevant layers in the model into INT4 format
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.load_low_bit(model_path+'-npu-'+low_bit, trust_remote_code=True,
-                                        optimize_model=optimize_model, max_output_len=max_output_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
+                                        optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
                                         torch_dtype=torch.float16, attn_implementation="eager").eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path+'-npu-'+low_bit, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.load_low_bit(model_path+'-npu-'+low_bit, trust_remote_code=True, torch_dtype=torch.float16,
-                                                    optimize_model=optimize_model, max_output_len=max_output_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
+                                                    optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
                                                     use_cache=True, attn_implementation="eager").eval()
         tokenizer = LlamaTokenizer.from_pretrained(model_path+'-npu-'+low_bit, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.load_low_bit(model_path+'-npu-'+low_bit, trust_remote_code=True, torch_dtype=torch.float16,
-                                                     optimize_model=optimize_model, max_output_len=max_output_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
+                                                     optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
                                                      use_cache=True, attn_implementation="eager").eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path+'-npu-'+low_bit, trust_remote_code=True)
     end = time.perf_counter()

@@ -51,7 +51,9 @@ if __name__ == "__main__":
     parser.add_argument('--prompt', type=str, default="What is AI?",
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
-    parser.add_argument("--max-output-len", type=int, default=1024)
+    parser.add_argument("--max-context-len", type=int, default=1024)
+    parser.add_argument("--max-prompt-len", type=int, default=960)
+    parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
 
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path

@@ -59,9 +61,11 @@ if __name__ == "__main__":
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  optimize_model=True,
                                                  pipeline=True,
-                                                 max_output_len=args.max_output_len,
+                                                 max_context_len=args.max_context_len,
+                                                 max_prompt_len=args.max_prompt_len,
                                                  torch_dtype=torch.float16,
-                                                 attn_implementation="eager")
+                                                 attn_implementation="eager",
+                                                 transpose_value_cache=not args.disable_transpose_value_cache)
 
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 

@@ -57,7 +57,9 @@ if __name__ == "__main__":
     parser.add_argument('--prompt', type=str, default="What is AI?",
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
-    parser.add_argument("--max-output-len", type=int, default=1024)
+    parser.add_argument("--max-context-len", type=int, default=1024)
+    parser.add_argument("--max-prompt-len", type=int, default=960)
+    parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
 
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path

@@ -66,8 +68,10 @@ if __name__ == "__main__":
                                                  torch_dtype=torch.float16,
                                                  optimize_model=True,
                                                  pipeline=True,
-                                                 max_output_len=args.max_output_len,
-                                                 attn_implementation="eager")
+                                                 max_context_len=args.max_context_len,
+                                                 max_prompt_len=args.max_prompt_len,
+                                                 attn_implementation="eager",
+                                                 transpose_value_cache=not args.disable_transpose_value_cache)
 
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 

@@ -59,7 +59,7 @@ if __name__ == "__main__":
     parser.add_argument('--prompt', type=str, default="What is AI?",
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
-    parser.add_argument("--max-output-len", type=int, default=1024)
+    parser.add_argument("--max-context-len", type=int, default=1024)
     parser.add_argument("--max-prompt-len", type=int, default=512)
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
     parser.add_argument("--intra-pp", type=int, default=2)

@@ -76,7 +76,7 @@ if __name__ == "__main__":
             attn_implementation="eager",
             load_in_low_bit="sym_int4",
             optimize_model=True,
-            max_output_len=args.max_output_len,
+            max_context_len=args.max_context_len,
             max_prompt_len=args.max_prompt_len,
             intra_pp=args.intra_pp,
             inter_pp=args.inter_pp,

@@ -88,7 +88,7 @@ if __name__ == "__main__":
             attn_implementation="eager",
             torch_dtype=torch.bfloat16,
             optimize_model=True,
-            max_output_len=args.max_output_len,
+            max_context_len=args.max_context_len,
             max_prompt_len=args.max_prompt_len,
             intra_pp=args.intra_pp,
             inter_pp=args.inter_pp,

@@ -59,7 +59,7 @@ if __name__ == "__main__":
     parser.add_argument('--prompt', type=str, default="What is AI?",
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
-    parser.add_argument("--max-output-len", type=int, default=1024)
+    parser.add_argument("--max-context-len", type=int, default=1024)
     parser.add_argument("--max-prompt-len", type=int, default=512)
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
     parser.add_argument("--intra-pp", type=int, default=2)

@@ -76,7 +76,7 @@ if __name__ == "__main__":
             attn_implementation="eager",
             load_in_low_bit="sym_int4",
             optimize_model=True,
-            max_output_len=args.max_output_len,
+            max_context_len=args.max_context_len,
             max_prompt_len=args.max_prompt_len,
             intra_pp=args.intra_pp,
             inter_pp=args.inter_pp,

@@ -88,7 +88,7 @@ if __name__ == "__main__":
             attn_implementation="eager",
             torch_dtype=torch.float16,
             optimize_model=True,
-            max_output_len=args.max_output_len,
+            max_context_len=args.max_context_len,
             max_prompt_len=args.max_prompt_len,
             intra_pp=args.intra_pp,
             inter_pp=args.inter_pp,

@@ -46,7 +46,7 @@ if __name__ == "__main__":
     parser.add_argument('--prompt', type=str, default="What is AI?",
                         help='Prompt to infer')
    parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
-    parser.add_argument("--max-output-len", type=int, default=1024)
+    parser.add_argument("--max-context-len", type=int, default=1024)
     parser.add_argument("--max-prompt-len", type=int, default=512)
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
     parser.add_argument("--intra-pp", type=int, default=2)

@@ -62,7 +62,7 @@ if __name__ == "__main__":
             attn_implementation="eager",
             load_in_low_bit="sym_int4",
             optimize_model=True,
-            max_output_len=args.max_output_len,
+            max_context_len=args.max_context_len,
             max_prompt_len=args.max_prompt_len,
             intra_pp=args.intra_pp,
             inter_pp=args.inter_pp,

@@ -74,7 +74,7 @@ if __name__ == "__main__":
             attn_implementation="eager",
             torch_dtype=torch.float16,
             optimize_model=True,
-            max_output_len=args.max_output_len,
+            max_context_len=args.max_context_len,
             max_prompt_len=args.max_prompt_len,
             intra_pp=args.intra_pp,
             inter_pp=args.inter_pp,

@@ -46,7 +46,7 @@ if __name__ == "__main__":
     parser.add_argument('--prompt', type=str, default="AI是什么?",
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
-    parser.add_argument("--max-output-len", type=int, default=1024)
+    parser.add_argument("--max-context-len", type=int, default=1024)
     parser.add_argument("--max-prompt-len", type=int, default=512)
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
     parser.add_argument("--intra-pp", type=int, default=None)

@@ -64,7 +64,7 @@ if __name__ == "__main__":
             attn_implementation="eager",
             load_in_low_bit="sym_int4",
             optimize_model=True,
-            max_output_len=args.max_output_len,
+            max_context_len=args.max_context_len,
             max_prompt_len=args.max_prompt_len,
             intra_pp=args.intra_pp,
             inter_pp=args.inter_pp,

@@ -77,7 +77,7 @@ if __name__ == "__main__":
             attn_implementation="eager",
             torch_dtype=torch.float16,
             optimize_model=True,
-            max_output_len=args.max_output_len,
+            max_context_len=args.max_context_len,
             max_prompt_len=args.max_prompt_len,
             intra_pp=args.intra_pp,
             inter_pp=args.inter_pp,

@@ -45,7 +45,7 @@ if __name__ == "__main__":
     parser.add_argument('--prompt', type=str, default="What is in the image?",
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
-    parser.add_argument("--max-output-len", type=int, default=1024)
+    parser.add_argument("--max-context-len", type=int, default=1024)
     parser.add_argument("--max-prompt-len", type=int, default=512)
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
     parser.add_argument("--intra-pp", type=int, default=2)

@@ -61,7 +61,7 @@ if __name__ == "__main__":
         attn_implementation="eager",
         load_in_low_bit="sym_int4",
         optimize_model=True,
-        max_output_len=args.max_output_len,
+        max_context_len=args.max_context_len,
         max_prompt_len=args.max_prompt_len,
         intra_pp=args.intra_pp,
         inter_pp=args.inter_pp,

@@ -36,7 +36,7 @@ if __name__ == '__main__':
     parser.add_argument('--prompt', type=str, default="What is in this image?",
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
-    parser.add_argument("--max-output-len", type=int, default=1024)
+    parser.add_argument("--max-context-len", type=int, default=1024)
     parser.add_argument("--max-prompt-len", type=int, default=512)
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
     parser.add_argument("--intra-pp", type=int, default=None)

@@ -52,7 +52,7 @@ if __name__ == '__main__':
                                       attn_implementation="eager",
                                       load_in_low_bit="sym_int4",
                                       optimize_model=True,
-                                      max_output_len=args.max_output_len,
+                                      max_context_len=args.max_context_len,
                                       max_prompt_len=args.max_prompt_len,
                                       intra_pp=args.intra_pp,
                                       inter_pp=args.inter_pp,

@@ -124,8 +124,8 @@ class _BaseAutoModelClass:
         ignore_argument(kwargs, "pipeline_parallel_stages")
         optimize_model = kwargs.pop("optimize_model", False)
         pipeline = kwargs.pop("pipeline", False)
-        max_output_len = kwargs.pop("max_output_len", 1024)
-        max_output_len = max_output_len - 1
+        max_context_len = kwargs.pop("max_context_len", 1024)
+        max_context_len = max_context_len - 1
         max_prompt_len = kwargs.pop("max_prompt_len", 512)
         inter_pp = kwargs.pop("inter_pp", None)
         intra_pp = kwargs.pop("intra_pp", None)

@@ -169,10 +169,10 @@ class _BaseAutoModelClass:
 
         if optimize_model:
             invalidInputError(
-                max_prompt_len < max_output_len,
+                max_prompt_len < max_context_len,
                 (
                     f"max_prompt_len ({max_prompt_len}) should be less"
-                    " than max_output_len ({max_output_len})"
+                    " than max_context_len ({max_context_len})"
                 ),
             )
             optimize_kwargs = {

@@ -182,7 +182,7 @@ class _BaseAutoModelClass:
                 "quantization_group_size": quantization_group_size,
                 "modules_to_not_convert": modules_to_not_convert,
                 "pipeline": pipeline,
-                "max_output_len": max_output_len,
+                "max_context_len": max_context_len,
                 "max_prompt_len": max_prompt_len,
                 "inter_pp": inter_pp,
                 "intra_pp": intra_pp,

@@ -219,7 +219,7 @@ class _BaseAutoModelClass:
         quantization_group_size = kwargs.pop("quantization_group_size", 0)
         modules_to_not_convert = kwargs.pop("modules_to_not_convert", [])
         pipeline = kwargs.pop("pipeline", False)
-        max_output_len = kwargs.pop("max_output_len", 1024)
+        max_context_len = kwargs.pop("max_context_len", 1024)
         max_prompt_len = kwargs.pop("max_prompt_len", 512)
         inter_pp = kwargs.pop("inter_pp", None)
         intra_pp = kwargs.pop("intra_pp", None)

@@ -246,7 +246,7 @@ class _BaseAutoModelClass:
         if not pipeline:
             optimize_llm(
                 llm,
-                max_output_len=max_output_len,
+                max_context_len=max_context_len,
                 max_prompt_len=max_prompt_len,
                 inter_pp=inter_pp,
                 intra_pp=intra_pp,

@@ -258,7 +258,8 @@ class _BaseAutoModelClass:
             from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
                 import convert_llm
             convert_llm(llm,
-                        kv_len=max_output_len,
+                        kv_len=max_context_len,
                         max_prompt_len=max_prompt_len,
                         transpose_value_cache=transpose_value_cache)
+
         return model

@@ -598,7 +599,7 @@ class FunAsrAutoModel(_BaseAutoModelClass):
         model = kwargs.pop("model")
         qtype = kwargs.pop("qtype", "sym_int8")
         modules_to_not_convert = kwargs.pop("modules_to_not_convert", [])
-        max_output_len = kwargs.pop("max_output_len", 1024)
+        max_context_len = kwargs.pop("max_context_len", 1024)
         max_prompt_len = kwargs.pop("max_prompt_len", 512)
         inter_pp = kwargs.pop("inter_pp", None)
         intra_pp = kwargs.pop("intra_pp", None)

@@ -618,7 +619,7 @@ class FunAsrAutoModel(_BaseAutoModelClass):
 
         optimize_funasr(
             model,
-            max_output_len=max_output_len,
+            max_context_len=max_context_len,
             max_prompt_len=max_prompt_len,
             inter_pp=inter_pp,
             intra_pp=intra_pp,
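
As a reading note (not part of the diff): the class above keeps `max_context_len - 1` as the pipeline KV-cache length, and the pipeline `generate()` further down enforces `input_length + new_tokens <= self.kv_len + 1`. A tiny self-contained sketch of that token budget, with made-up numbers:

# Hypothetical illustration of the budget implied by the diff; the numbers are examples only.
max_context_len = 1024          # user-facing argument
kv_len = max_context_len - 1    # what the model object stores (self.kv_len)

def check_budget(input_length, max_new_tokens):
    # mirrors: invalidInputError(input_length + new_tokens <= self.kv_len + 1, ...)
    if input_length + max_new_tokens > kv_len + 1:
        raise ValueError("Input plus output tokens should not exceed max_context_len.")

check_budget(input_length=960, max_new_tokens=64)    # 960 + 64 == 1024, accepted
# check_budget(input_length=960, max_new_tokens=65)  # 1025 > 1024, would raise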
@@ -154,25 +154,20 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision,
         model.lm_head = new_linear
 
 
-def optimize_llm(
+def convert_llama(
         model: torch.nn.Module,
         max_output_len=1024,
         max_prompt_len=1024,
+        decoder=False,
         inter_pp=None,
         intra_pp=None,
         transpose_value_cache=True,
-    group_size=0
 ):
-    if model.config.model_type == "llama":
-        if intra_pp is None:
-            intra_pp = 2
-        if inter_pp is None:
-            inter_pp = 2 if group_size == 0 else 8
-
     from ipex_llm.transformers.npu_models.llama_mp import gen_llama_fused_model_forward
     from ipex_llm.transformers.npu_models.llama_mp import DecodeRunner, PrefillRunner
     from transformers.models.llama.modeling_llama import LlamaModel
 
+    if decoder:
         decode_runner = DecodeRunner(
             model,
             max_seq_len=max_output_len,

@@ -180,6 +175,8 @@ def optimize_llm(
             intra_pp=intra_pp,
             transpose_value_cache=transpose_value_cache,
         )
+    else:
+        decode_runner = None
     prefill_runner = PrefillRunner(
         model,
         max_output_len=max_output_len,

@@ -193,6 +190,29 @@ def optimize_llm(
     from transformers.models.llama.modeling_llama import LlamaForCausalLM
     from ipex_llm.transformers.npu_models.llama_mp import llama2_casullm_forward
     convert_forward(model, LlamaForCausalLM, llama2_casullm_forward)
+
+
+def optimize_llm(
+    model: torch.nn.Module,
+    max_context_len=1024,
+    max_prompt_len=1024,
+    inter_pp=None,
+    intra_pp=None,
+    transpose_value_cache=True,
+    group_size=0
+):
+    if model.config.model_type == "llama":
+        if intra_pp is None:
+            intra_pp = 2
+        if inter_pp is None:
+            inter_pp = 2 if group_size == 0 else 8
+        convert_llama(model,
+                      max_output_len=max_context_len,
+                      max_prompt_len=max_prompt_len,
+                      inter_pp=inter_pp,
+                      intra_pp=intra_pp,
+                      decoder=True,
+                      transpose_value_cache=transpose_value_cache)
     elif model.config.model_type == "qwen2" and model.config.num_hidden_layers == 28:
         # for qwen2-1.5B and qwen2-7B
         if intra_pp is None:

@@ -212,14 +232,14 @@ def optimize_llm(
 
         decode_runner = DecodeRunner(
             model,
-            max_seq_len=max_output_len,
+            max_seq_len=max_context_len,
             inter_pp=inter_pp,
             intra_pp=intra_pp,
             transpose_value_cache=transpose_value_cache,
         )
         prefill_runner = PrefillRunner(
             model,
-            max_output_len=max_output_len,
+            max_output_len=max_context_len,
             max_prompt_len=max_prompt_len,
             transpose_value_cache=transpose_value_cache,
         )

@@ -252,14 +272,14 @@ def optimize_llm(
 
         decode_runner = DecodeRunner(
             model,
-            max_seq_len=max_output_len,
+            max_seq_len=max_context_len,
             inter_pp=inter_pp,
             intra_pp=intra_pp,
             transpose_value_cache=transpose_cache,
         )
         prefill_runner = PrefillRunner(
             model,
-            max_output_len=max_output_len,
+            max_output_len=max_context_len,
             max_prompt_len=max_prompt_len,
             transpose_value_cache=transpose_cache,
         )

@@ -281,14 +301,14 @@ def optimize_llm(
         from ipex_llm.transformers.npu_models.baichuan_mp import DecodeRunner, PrefillRunner
         decode_runner = DecodeRunner(
             model,
-            max_seq_len=max_output_len,
+            max_seq_len=max_context_len,
             inter_pp=inter_pp,
             intra_pp=intra_pp,
             transpose_value_cache=transpose_value_cache,
         )
         prefill_runner = PrefillRunner(
             model,
-            max_output_len=max_output_len,
+            max_output_len=max_context_len,
             max_prompt_len=max_prompt_len,
             transpose_value_cache=transpose_value_cache,
         )

@@ -305,7 +325,7 @@ def optimize_llm(
 
 def optimize_funasr(
     model: torch.nn.Module,
-    max_output_len=1024,
+    max_context_len=1024,
     max_prompt_len=1024,
     inter_pp=None,
     intra_pp=None,

@@ -320,7 +340,7 @@ def optimize_funasr(
     from ipex_llm.transformers.npu_models.paraformer_mp import PrefillRunner, DecodeRunner
     prefill_runner = PrefillRunner(
         model,
-        max_output_len=max_output_len,
+        max_output_len=max_context_len,
         max_prompt_len=max_prompt_len,
         transpose_value_cache=transpose_value_cache,
     )

@@ -329,7 +349,7 @@ def optimize_funasr(
     )
     decode_runner = DecodeRunner(
         model,
-        max_seq_len=max_output_len,
+        max_seq_len=max_context_len,
         inter_pp=inter_pp,
         intra_pp=intra_pp,
         transpose_value_cache=transpose_value_cache,
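
The refactor above pulls the Llama conversion out of `optimize_llm` into `convert_llama`, so the eager path builds both runners (`decoder=True`) while the pipeline path reuses only the prefill part (`decoder=False`). Below is a self-contained sketch of that dispatch pattern, using stand-in classes rather than the real `llama_mp` runners:

# Stand-ins for DecodeRunner/PrefillRunner from ipex_llm.transformers.npu_models.llama_mp.
class PrefillRunner:
    def __init__(self, max_output_len, max_prompt_len, transpose_value_cache=True):
        self.max_output_len = max_output_len
        self.max_prompt_len = max_prompt_len
        self.transpose_value_cache = transpose_value_cache

class DecodeRunner:
    def __init__(self, max_seq_len, inter_pp=None, intra_pp=None, transpose_value_cache=True):
        self.max_seq_len = max_seq_len

def convert_llama_sketch(max_output_len, max_prompt_len, decoder=False,
                         inter_pp=None, intra_pp=None, transpose_value_cache=True):
    # decoder=True: eager/acc-lib path needs both decode and prefill runners.
    # decoder=False: the C++ pipeline owns decoding, so only the prefill runner is built.
    decode_runner = (DecodeRunner(max_output_len, inter_pp, intra_pp, transpose_value_cache)
                     if decoder else None)
    prefill_runner = PrefillRunner(max_output_len, max_prompt_len, transpose_value_cache)
    return decode_runner, prefill_runner

assert convert_llama_sketch(1024, 512, decoder=False)[0] is None      # pipeline: prefill only
assert convert_llama_sketch(1024, 512, decoder=True)[0] is not None   # eager: both runners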
@@ -61,14 +61,68 @@ def generate(
 
     new_tokens = new_generate_kwargs['max_new_tokens']
     invalidInputError(input_length + new_tokens <= self.kv_len + 1,
-                      "Input plus output tokens should not exceed max_output_len.")
+                      "Input plus output tokens should not exceed max_context_len.")
+    # TODO: may optimize this part later
+    invalidInputError(new_tokens < 1024,
+                      f"Generated tokens ({new_tokens}) exceed named pipeline limitation.")
+
+    output_tokens = []
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        # run prefill with PrefillRunner
+        output = self(input_ids=inputs,
+                      attention_mask=torch.ones(1, inputs.shape[1]).int())
+        logits = output.logits
+        input_id = torch.argmax(logits[:, -1, :], dim=1)
+        input_id.to(torch.int32).numpy().tofile(os.path.join(temp_dir, "input_id.bin"))
+        position = np.int64(inputs.shape[1])
+        position.tofile(os.path.join(temp_dir, "position.bin"))
+        past_key_values = output.past_key_values
+        key_cache = past_key_values.key_cache
+        value_cache = past_key_values.value_cache
+        for layer in range(self.num_layers):
+            key_ = key_cache[layer]
+            val_ = value_cache[layer]
+            new_size = (
+                key_.size(0),
+                key_.size(1),
+                self.kv_len,
+                key_.size(3),
+            )
+            key = key_.as_strided(new_size, key_.stride(), storage_offset=0)
+            if not self.transpose_value_cache:
+                val = val_.as_strided(new_size, val_.stride(), storage_offset=0)
+            else:
+                new_size = (
+                    val_.size(0),
+                    val_.size(1),
+                    val_.size(3),
+                    self.kv_len,
+                )
+                val_cache = val_.transpose(-1, -2)
+                val = val_cache.as_strided(new_size, val_cache.stride(), storage_offset=0)
+            key.to(torch.float16).numpy().tofile(os.path.join(temp_dir, f"key_cache_{layer}.bin"))
+            val.to(torch.float16).numpy().tofile(os.path.join(temp_dir, f"value_cache_{layer}.bin"))
+
+        token = input_id.to(torch.int32).item()
+        output_tokens.append(torch.tensor([token]))
+        if streamer is not None:
+            streamer.put(torch.tensor([token]))
+
+        if "eos_token_id" not in new_generate_kwargs:
+            eos = 0xffffffff
+        else:
+            eos = new_generate_kwargs["eos_token_id"]
+
+        time_t1 = time.perf_counter()
+        idx += 1
 
         # start generate_serve by Thread
         thread = threading.Thread(target=generate_serve,
                                   args=(self.kv_len, self.num_head,
                                         self.head_dim, self.num_layers,
                                         self.transpose_value_cache,
-                                    new_tokens - 1))
+                                        new_tokens - 2))
         thread.start()
 
         in_pipe_path = "\\\\.\\pipe\\llminputpipe"

@@ -92,33 +146,22 @@ def generate(
             else:
                 break
 
-    bdata = b''
-    for i in range(0, input_length):
-        d = int(numpy_input[i])
-        bdata = bdata + d.to_bytes(4, sys.byteorder)
-
-    if "eos_token_id" not in new_generate_kwargs:
-        eos = 0xffffffff
-    else:
-        eos = new_generate_kwargs["eos_token_id"]
-
-    bdata = bdata + eos.to_bytes(4, sys.byteorder)
-
         time_start = time.perf_counter()
 
-    input_pipe.write(bytearray(bdata))
+        bdata = str.encode(str(temp_dir))
+        invalidInputError(len(bdata) <= 2000,
+                          f"Leng of input directory is too long ({len(bdata)}), "
+                          "which may cause read error.")
+        input_pipe.write(bdata)
         input_pipe.flush()
 
         buffersize = 4
-    output_tokens = []
         while True:
             data = output_pipe.read(buffersize)
             if len(data) == 0:
                 break
             token = int.from_bytes(data, sys.byteorder)
             idx += 1
-        if time_t1 is None:
-            time_t1 = time.perf_counter()
             output_tokens.append(torch.tensor([token]))
             if streamer is not None:
                 streamer.put(torch.tensor([token]))

@@ -134,14 +177,13 @@ def generate(
     time_end = time.perf_counter()
 
     if do_print:
-        print(f" Start the thread and connect the pipe time: {(time_start - time_start_all):.2f} s")
+        print(f" Start the thread and connect the pipe time: {(time_start - time_t1):.2f} s")
         print(f" Number of input tokens: {input_length}")
         print(f" Generated tokens: {idx}")
-        print(f" First token generation time: {(time_t1 - time_start):.2f} s")
-        print(f" Generation average latency: {(time_end - time_t1)*1000 /(idx - 1):.2f} ms, "
-              f"({(idx - 1)/(time_end - time_t1):.2f} token/s)")
-        print(f" Generation time: {(time_end - time_start):.2f} s\n")
-
+        print(f" First token generation time: {(time_t1 - time_start_all):.2f} s")
+        print(f" Generation average latency: {(time_end - time_start) * 1000 /(idx - 1):.2f} ms, "
+              f"({(idx - 1)/(time_end - time_start):.2f} token/s)")
+        print(f" Generation time: {(time_end - time_start_all - (time_start - time_t1)):.2f} s\n")
     return output
 
 

@@ -182,8 +224,15 @@ def update_names_of_IR_and_export_blob(model, model_name, dir):
 
 def convert_llm(model: torch.nn.Module,
                 kv_len: int,
                 max_prompt_len: int,
                 transpose_value_cache: bool):
     if model.config.model_type == "llama":
+        from ipex_llm.transformers.npu_models.convert_mp import convert_llama
+        convert_llama(model,
+                      max_output_len=kv_len,
+                      max_prompt_len=max_prompt_len,
+                      decoder=False,
+                      transpose_value_cache=transpose_value_cache)
         from .llama import LowBitLlamaLMHead, LlamaEmbedding
         with tempfile.TemporaryDirectory() as temp_dir:
             # generate lm_head blob

@@ -231,13 +280,12 @@ def convert_llm(model: torch.nn.Module,
             new_embedding = LlamaEmbedding(
                 vocab_size=model.config.vocab_size,
                 embedding_dim=model.config.hidden_size,
+                embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
                 padding_idx=model.config.pad_token_id,
                 dtype=np.float16,
             )
             first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
                                                                  temp_dir)
-            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
-            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
 
             # generate decoder layer blob
             from ipex_llm.transformers.npu_models.llama_mp import LowBitLlamaMultiDecoderlayer
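
The prefill reuse in `generate()` above works by running the Python-side prefill once, then handing the first token, the position, and the first `kv_len` entries of every key/value cache to the C++ pipeline as raw float16 files in a temporary directory. A self-contained sketch of the zero-copy prefix view used for that hand-off (shapes are invented; only `torch` is required):

import os
import tempfile
import torch

# Hypothetical cache layout: (batch, num_kv_heads, allocated_len, head_dim).
batch, num_kv_heads, allocated_len, head_dim = 1, 8, 1536, 128
kv_len = 1023  # e.g. max_context_len - 1, as stored by the model wrapper above

key_ = torch.randn(batch, num_kv_heads, allocated_len, head_dim)

# Shrink the sequence dimension to kv_len while keeping the original strides,
# mirroring key_.as_strided(new_size, key_.stride(), storage_offset=0) in the diff.
new_size = (key_.size(0), key_.size(1), kv_len, key_.size(3))
key = key_.as_strided(new_size, key_.stride(), storage_offset=0)

with tempfile.TemporaryDirectory() as temp_dir:
    # Dump as raw float16, the per-layer format the pipeline reads back.
    path = os.path.join(temp_dir, "key_cache_0.bin")
    key.to(torch.float16).numpy().tofile(path)
    print(os.path.getsize(path))  # batch * num_kv_heads * kv_len * head_dim * 2 bytes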
@@ -80,6 +80,7 @@ class LlamaEmbedding(NNFactory):
         self,
         vocab_size,
         embedding_dim,
+        embedding_weight,
         padding_idx,
         dtype,  # fp16
         device: str = "NPU",

@@ -91,7 +92,7 @@ class LlamaEmbedding(NNFactory):
         self.dtype = dtype
 
         # define input
-        weight = self.parameter((vocab_size, embedding_dim))
+        weight = self.constant(embedding_weight)
         input = self.parameter((1, 1), dtype=np.int32)
 
         if padding_idx == -1:
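
This last change, together with the `convert_llm` hunk above, is the "embedding as const" item from the commit message: the embedding table is baked into the exported blob as a constant instead of being fed at runtime, so the separate `model_embedding_input_0.bin` dump is no longer written. A small numpy-only sketch of what the embedding blob computes, with the weight held as a fixed constant (sizes are illustrative):

import numpy as np

vocab_size, embedding_dim = 32000, 4096                                     # illustrative sizes
embedding_weight = np.zeros((vocab_size, embedding_dim), dtype=np.float16)  # baked-in constant

token_ids = np.array([[42]], dtype=np.int32)      # matches the (1, 1) int32 input above
embeddings = embedding_weight[token_ids[0]]       # embedding lookup is a row gather
print(embeddings.shape)                           # (1, 4096)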