[NPU] optimize qwen2 prefill performance for C++ (#12451)

This commit is contained in:
Ruonan Wang 2024-11-26 18:46:18 -08:00 committed by GitHub
parent 8331875f34
commit f8c2bb2943
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 10 additions and 1 deletions

View file

@@ -24,6 +24,8 @@ from transformers.utils import logging
from packaging import version
import os
import shutil
import time
logger = logging.get_logger(__name__)
@@ -55,6 +57,7 @@ if __name__ == "__main__":
model_path = args.repo_id_or_model_path
save_dir = args.save_directory
t0 = time.perf_counter()
model = AutoModelForCausalLM.from_pretrained(model_path,
optimize_model=True,
pipeline=True,
@@ -69,6 +72,7 @@ if __name__ == "__main__":
trust_remote_code=True,
convert_model=True,
save_directory=save_dir)
t1 = time.perf_counter()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
@@ -81,5 +85,6 @@ if __name__ == "__main__":
tokenizer.save_pretrained(save_dir)
print("-" * 80)
print(f"Convert model cost {t1 - t0}s.")
print(f"finish save model to {save_dir}")
print("success shut down")

View file

@@ -135,9 +135,12 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
if mode == "decode":
input_len = 1
decoder_name = f"decoder_layer_{layer_idx}"
npu_dpu_groups = None
else:
input_len = kv_len
decoder_name = "decoder_layer_prefill"
npu_dpu_groups = 6
single_decoder = LowBitQwenMultiDecoderlayer(
[1, input_len, num_heads * head_dim],
input_layernorm_weights=None,
@@ -162,7 +165,8 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
)
rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
decoder_name,
temp_dir, True, False,
npu_dpu_groups=npu_dpu_groups)
# 0, 1, 2 are input_embed/attention_mask/position_id
if mode == "decode":