[NPU] optimize qwen2 prefill performance for C++ (#12451)
This commit is contained in:
parent
8331875f34
commit
f8c2bb2943
2 changed files with 10 additions and 1 deletion
|
|
@@ -24,6 +24,8 @@ from transformers.utils import logging
|
||||||
from packaging import version
|
from packaging import version
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
@@ -55,6 +57,7 @@ if __name__ == "__main__":
|
||||||
model_path = args.repo_id_or_model_path
|
model_path = args.repo_id_or_model_path
|
||||||
save_dir = args.save_directory
|
save_dir = args.save_directory
|
||||||
|
|
||||||
|
t0 = time.perf_counter()
|
||||||
model = AutoModelForCausalLM.from_pretrained(model_path,
|
model = AutoModelForCausalLM.from_pretrained(model_path,
|
||||||
optimize_model=True,
|
optimize_model=True,
|
||||||
pipeline=True,
|
pipeline=True,
|
||||||
|
|
@@ -69,6 +72,7 @@ if __name__ == "__main__":
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
convert_model=True,
|
convert_model=True,
|
||||||
save_directory=save_dir)
|
save_directory=save_dir)
|
||||||
|
t1 = time.perf_counter()
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
||||||
|
|
||||||
|
|
@@ -81,5 +85,6 @@ if __name__ == "__main__":
|
||||||
tokenizer.save_pretrained(save_dir)
|
tokenizer.save_pretrained(save_dir)
|
||||||
|
|
||||||
print("-" * 80)
|
print("-" * 80)
|
||||||
|
print(f"Convert model cost {t1 - t0}s.")
|
||||||
print(f"finish save model to {save_dir}")
|
print(f"finish save model to {save_dir}")
|
||||||
print("success shut down")
|
print("success shut down")
|
||||||
|
|
|
||||||
|
|
@@ -135,9 +135,12 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
|
||||||
if mode == "decode":
|
if mode == "decode":
|
||||||
input_len = 1
|
input_len = 1
|
||||||
decoder_name = f"decoder_layer_{layer_idx}"
|
decoder_name = f"decoder_layer_{layer_idx}"
|
||||||
|
npu_dpu_groups = None
|
||||||
else:
|
else:
|
||||||
input_len = kv_len
|
input_len = kv_len
|
||||||
decoder_name = "decoder_layer_prefill"
|
decoder_name = "decoder_layer_prefill"
|
||||||
|
npu_dpu_groups = 6
|
||||||
|
|
||||||
single_decoder = LowBitQwenMultiDecoderlayer(
|
single_decoder = LowBitQwenMultiDecoderlayer(
|
||||||
[1, input_len, num_heads * head_dim],
|
[1, input_len, num_heads * head_dim],
|
||||||
input_layernorm_weights=None,
|
input_layernorm_weights=None,
|
||||||
|
|
@@ -162,7 +165,8 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
|
||||||
)
|
)
|
||||||
rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
|
rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
|
||||||
decoder_name,
|
decoder_name,
|
||||||
temp_dir, True, False)
|
temp_dir, True, False,
|
||||||
|
npu_dpu_groups=npu_dpu_groups)
|
||||||
|
|
||||||
# 0, 1, 2 are input_embed/attention_mask/position_id
|
# 0, 1, 2 are input_embed/attention_mask/position_id
|
||||||
if mode == "decode":
|
if mode == "decode":
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue