[NPU] optimize qwen2 prefill performance for C++ (#12451)

parent 8331875f34
commit f8c2bb2943

2 changed files with 10 additions and 1 deletion
First changed file (the example's model-conversion script; path not shown in the source):

@@ -24,6 +24,8 @@ from transformers.utils import logging
 from packaging import version
 import os
 import shutil
+import time
+

 logger = logging.get_logger(__name__)

@@ -55,6 +57,7 @@ if __name__ == "__main__":
     model_path = args.repo_id_or_model_path
     save_dir = args.save_directory

+    t0 = time.perf_counter()
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  optimize_model=True,
                                                  pipeline=True,
@@ -69,6 +72,7 @@ if __name__ == "__main__":
                                                  trust_remote_code=True,
                                                  convert_model=True,
                                                  save_directory=save_dir)
+    t1 = time.perf_counter()

     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

@@ -81,5 +85,6 @@ if __name__ == "__main__":
         tokenizer.save_pretrained(save_dir)

     print("-" * 80)
+    print(f"Convert model cost {t1 - t0}s.")
     print(f"finish save model to {save_dir}")
     print("success shut down")
Second changed file (defines convert_qwen_layer; path not shown in the source):
@@ -135,9 +135,12 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     if mode == "decode":
         input_len = 1
         decoder_name = f"decoder_layer_{layer_idx}"
+        npu_dpu_groups = None
     else:
         input_len = kv_len
         decoder_name = "decoder_layer_prefill"
+        npu_dpu_groups = 6
+
     single_decoder = LowBitQwenMultiDecoderlayer(
         [1, input_len, num_heads * head_dim],
         input_layernorm_weights=None,
@@ -162,7 +165,8 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     )
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                         decoder_name,
-                                                        temp_dir, True, False)
+                                                        temp_dir, True, False,
+                                                        npu_dpu_groups=npu_dpu_groups)

     # 0, 1, 2 are input_embed/attention_mask/position_id
     if mode == "decode":
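This change threads a mode-dependent npu_dpu_groups value through to the blob export, so only the prefill graph is exported with 6 DPU groups. A minimal sketch of that control flow, with export_blob() as a hypothetical stand-in for update_names_of_IR_and_export_blob():

def export_blob(decoder_name, npu_dpu_groups=None):
    """Hypothetical stand-in for the real blob-export helper."""
    groups = "default" if npu_dpu_groups is None else npu_dpu_groups
    print(f"exporting {decoder_name} with npu_dpu_groups={groups}")

def convert_layer(mode, layer_idx=0, kv_len=1024):
    if mode == "decode":
        decoder_name = f"decoder_layer_{layer_idx}"
        npu_dpu_groups = None  # decode: single-token input, keep the default
    else:
        decoder_name = "decoder_layer_prefill"
        npu_dpu_groups = 6     # prefill: full-prompt input, pin to 6 DPU groups
    export_blob(decoder_name, npu_dpu_groups=npu_dpu_groups)

convert_layer("prefill")  # exporting decoder_layer_prefill with npu_dpu_groups=6
convert_layer("decode")   # exporting decoder_layer_0 with npu_dpu_groups=default

Passing None on the decode path keeps whatever default the export helper uses, so the single-token decode graph is unaffected; per the commit title, pinning the prefill graph to 6 DPU groups is what improves prefill performance.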