[NPU] Reuse prefill of acc lib for pipeline (#12279)
* first commit
* update example
* fix style
* update example
* embedding as const
* fix generate
* code refactor
* meet code review
* fix style
* change max_output_len to max_context_len
* fix all-in-one
* fix example
* add check for new tokens
parent 42a528ded9
commit 3fe2ea3081

13 changed files with 224 additions and 146 deletions
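The user-visible API change is the rename of the loading argument max_output_len to max_context_len (internally it becomes the kv_len handed to convert_llm). A minimal usage sketch of the new argument names, assuming the NPU AutoModelForCausalLM wrapper from ipex_llm.transformers.npu_model; the model path and values below are placeholders, not taken from this diff:

import torch
from transformers import AutoTokenizer
from ipex_llm.transformers.npu_model import AutoModelForCausalLM  # assumed import path

# Placeholder checkpoint; any NPU-supported Llama-style model is loaded the same way.
model_path = "meta-llama/Llama-2-7b-chat-hf"

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    optimize_model=True,
    pipeline=True,             # use the NPU pipeline backend
    max_context_len=1024,      # renamed from max_output_len in this PR
    max_prompt_len=512,        # the loader requires max_prompt_len < max_context_len
    transpose_value_cache=True,
    attn_implementation="eager",
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)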
@@ -617,23 +617,23 @@ def transformers_int4_npu_win(repo_id,
model_path = get_model_path(repo_id, local_model_hub)
in_out_len = in_out_pairs[0].split("-")
max_output_len = max(int(in_out_len[0]) + int(in_out_len[1]), 1024)
max_context_len = max(int(in_out_len[0]) + int(in_out_len[1]), 1024)
# Load model in 4 bit,
# which convert the relevant layers in the model into INT4 format
st = time.perf_counter()
if repo_id in CHATGLM_IDS:
model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True,
optimize_model=optimize_model, max_output_len=max_output_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
torch_dtype=torch.float16, attn_implementation="eager").eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
elif repo_id in LLAMA_IDS:
model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, torch_dtype=torch.float16,
optimize_model=optimize_model, max_output_len=max_output_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
use_cache=True, attn_implementation="eager").eval()
tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
else:
model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, torch_dtype=torch.float16,
optimize_model=optimize_model, max_output_len=max_output_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
use_cache=True, attn_implementation="eager").eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
end = time.perf_counter()

@@ -690,23 +690,23 @@ def run_transformer_int4_loadlowbit_npu_win(repo_id,
model_path = get_model_path(repo_id, local_model_hub)
in_out_len = in_out_pairs[0].split("-")
max_output_len = max(int(in_out_len[0]) + int(in_out_len[1]), 1024)
max_context_len = max(int(in_out_len[0]) + int(in_out_len[1]), 1024)
# Load model in 4 bit,
# which convert the relevant layers in the model into INT4 format
st = time.perf_counter()
if repo_id in CHATGLM_IDS:
model = AutoModel.load_low_bit(model_path+'-npu-'+low_bit, trust_remote_code=True,
optimize_model=optimize_model, max_output_len=max_output_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
torch_dtype=torch.float16, attn_implementation="eager").eval()
tokenizer = AutoTokenizer.from_pretrained(model_path+'-npu-'+low_bit, trust_remote_code=True)
elif repo_id in LLAMA_IDS:
model = AutoModelForCausalLM.load_low_bit(model_path+'-npu-'+low_bit, trust_remote_code=True, torch_dtype=torch.float16,
optimize_model=optimize_model, max_output_len=max_output_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
use_cache=True, attn_implementation="eager").eval()
tokenizer = LlamaTokenizer.from_pretrained(model_path+'-npu-'+low_bit, trust_remote_code=True)
else:
model = AutoModelForCausalLM.load_low_bit(model_path+'-npu-'+low_bit, trust_remote_code=True, torch_dtype=torch.float16,
optimize_model=optimize_model, max_output_len=max_output_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
use_cache=True, attn_implementation="eager").eval()
tokenizer = AutoTokenizer.from_pretrained(model_path+'-npu-'+low_bit, trust_remote_code=True)
end = time.perf_counter()
@@ -51,7 +51,9 @@ if __name__ == "__main__":
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-output-len", type=int, default=1024)
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=960)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)

args = parser.parse_args()
model_path = args.repo_id_or_model_path

@@ -59,9 +61,11 @@ if __name__ == "__main__":
model = AutoModelForCausalLM.from_pretrained(model_path,
optimize_model=True,
pipeline=True,
max_output_len=args.max_output_len,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
torch_dtype=torch.float16,
attn_implementation="eager")
attn_implementation="eager",
transpose_value_cache=not args.disable_transpose_value_cache)

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

@@ -57,7 +57,9 @@ if __name__ == "__main__":
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-output-len", type=int, default=1024)
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=960)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)

args = parser.parse_args()
model_path = args.repo_id_or_model_path

@@ -66,8 +68,10 @@ if __name__ == "__main__":
torch_dtype=torch.float16,
optimize_model=True,
pipeline=True,
max_output_len=args.max_output_len,
attn_implementation="eager")
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
attn_implementation="eager",
transpose_value_cache=not args.disable_transpose_value_cache)

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
@@ -59,7 +59,7 @@ if __name__ == "__main__":
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-output-len", type=int, default=1024)
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=512)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--intra-pp", type=int, default=2)

@@ -76,7 +76,7 @@ if __name__ == "__main__":
attn_implementation="eager",
load_in_low_bit="sym_int4",
optimize_model=True,
max_output_len=args.max_output_len,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
intra_pp=args.intra_pp,
inter_pp=args.inter_pp,

@@ -88,7 +88,7 @@ if __name__ == "__main__":
attn_implementation="eager",
torch_dtype=torch.bfloat16,
optimize_model=True,
max_output_len=args.max_output_len,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
intra_pp=args.intra_pp,
inter_pp=args.inter_pp,

@@ -59,7 +59,7 @@ if __name__ == "__main__":
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-output-len", type=int, default=1024)
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=512)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--intra-pp", type=int, default=2)

@@ -76,7 +76,7 @@ if __name__ == "__main__":
attn_implementation="eager",
load_in_low_bit="sym_int4",
optimize_model=True,
max_output_len=args.max_output_len,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
intra_pp=args.intra_pp,
inter_pp=args.inter_pp,

@@ -88,7 +88,7 @@ if __name__ == "__main__":
attn_implementation="eager",
torch_dtype=torch.float16,
optimize_model=True,
max_output_len=args.max_output_len,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
intra_pp=args.intra_pp,
inter_pp=args.inter_pp,

@@ -46,7 +46,7 @@ if __name__ == "__main__":
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-output-len", type=int, default=1024)
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=512)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--intra-pp", type=int, default=2)

@@ -62,7 +62,7 @@ if __name__ == "__main__":
attn_implementation="eager",
load_in_low_bit="sym_int4",
optimize_model=True,
max_output_len=args.max_output_len,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
intra_pp=args.intra_pp,
inter_pp=args.inter_pp,

@@ -74,7 +74,7 @@ if __name__ == "__main__":
attn_implementation="eager",
torch_dtype=torch.float16,
optimize_model=True,
max_output_len=args.max_output_len,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
intra_pp=args.intra_pp,
inter_pp=args.inter_pp,

@@ -46,7 +46,7 @@ if __name__ == "__main__":
parser.add_argument('--prompt', type=str, default="AI是什么?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-output-len", type=int, default=1024)
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=512)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--intra-pp", type=int, default=None)

@@ -64,7 +64,7 @@ if __name__ == "__main__":
attn_implementation="eager",
load_in_low_bit="sym_int4",
optimize_model=True,
max_output_len=args.max_output_len,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
intra_pp=args.intra_pp,
inter_pp=args.inter_pp,

@@ -77,7 +77,7 @@ if __name__ == "__main__":
attn_implementation="eager",
torch_dtype=torch.float16,
optimize_model=True,
max_output_len=args.max_output_len,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
intra_pp=args.intra_pp,
inter_pp=args.inter_pp,

@@ -45,7 +45,7 @@ if __name__ == "__main__":
parser.add_argument('--prompt', type=str, default="What is in the image?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-output-len", type=int, default=1024)
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=512)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--intra-pp", type=int, default=2)

@@ -61,7 +61,7 @@ if __name__ == "__main__":
attn_implementation="eager",
load_in_low_bit="sym_int4",
optimize_model=True,
max_output_len=args.max_output_len,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
intra_pp=args.intra_pp,
inter_pp=args.inter_pp,

@@ -36,7 +36,7 @@ if __name__ == '__main__':
parser.add_argument('--prompt', type=str, default="What is in this image?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-output-len", type=int, default=1024)
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=512)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--intra-pp", type=int, default=None)

@@ -52,7 +52,7 @@ if __name__ == '__main__':
attn_implementation="eager",
load_in_low_bit="sym_int4",
optimize_model=True,
max_output_len=args.max_output_len,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
intra_pp=args.intra_pp,
inter_pp=args.inter_pp,
@@ -124,8 +124,8 @@ class _BaseAutoModelClass:
ignore_argument(kwargs, "pipeline_parallel_stages")
optimize_model = kwargs.pop("optimize_model", False)
pipeline = kwargs.pop("pipeline", False)
max_output_len = kwargs.pop("max_output_len", 1024)
max_output_len = max_output_len - 1
max_context_len = kwargs.pop("max_context_len", 1024)
max_context_len = max_context_len - 1
max_prompt_len = kwargs.pop("max_prompt_len", 512)
inter_pp = kwargs.pop("inter_pp", None)
intra_pp = kwargs.pop("intra_pp", None)

@@ -169,10 +169,10 @@ class _BaseAutoModelClass:
if optimize_model:
invalidInputError(
max_prompt_len < max_output_len,
max_prompt_len < max_context_len,
(
f"max_prompt_len ({max_prompt_len}) should be less"
" than max_output_len ({max_output_len})"
" than max_context_len ({max_context_len})"
),
)
optimize_kwargs = {

@@ -182,7 +182,7 @@ class _BaseAutoModelClass:
"quantization_group_size": quantization_group_size,
"modules_to_not_convert": modules_to_not_convert,
"pipeline": pipeline,
"max_output_len": max_output_len,
"max_context_len": max_context_len,
"max_prompt_len": max_prompt_len,
"inter_pp": inter_pp,
"intra_pp": intra_pp,

@@ -219,7 +219,7 @@ class _BaseAutoModelClass:
quantization_group_size = kwargs.pop("quantization_group_size", 0)
modules_to_not_convert = kwargs.pop("modules_to_not_convert", [])
pipeline = kwargs.pop("pipeline", False)
max_output_len = kwargs.pop("max_output_len", 1024)
max_context_len = kwargs.pop("max_context_len", 1024)
max_prompt_len = kwargs.pop("max_prompt_len", 512)
inter_pp = kwargs.pop("inter_pp", None)
intra_pp = kwargs.pop("intra_pp", None)

@@ -246,7 +246,7 @@ class _BaseAutoModelClass:
if not pipeline:
optimize_llm(
llm,
max_output_len=max_output_len,
max_context_len=max_context_len,
max_prompt_len=max_prompt_len,
inter_pp=inter_pp,
intra_pp=intra_pp,

@@ -258,7 +258,8 @@ class _BaseAutoModelClass:
from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
import convert_llm
convert_llm(llm,
kv_len=max_output_len,
kv_len=max_context_len,
max_prompt_len=max_prompt_len,
transpose_value_cache=transpose_value_cache)

return model

@@ -598,7 +599,7 @@ class FunAsrAutoModel(_BaseAutoModelClass):
model = kwargs.pop("model")
qtype = kwargs.pop("qtype", "sym_int8")
modules_to_not_convert = kwargs.pop("modules_to_not_convert", [])
max_output_len = kwargs.pop("max_output_len", 1024)
max_context_len = kwargs.pop("max_context_len", 1024)
max_prompt_len = kwargs.pop("max_prompt_len", 512)
inter_pp = kwargs.pop("inter_pp", None)
intra_pp = kwargs.pop("intra_pp", None)

@@ -618,7 +619,7 @@ class FunAsrAutoModel(_BaseAutoModelClass):
optimize_funasr(
model,
max_output_len=max_output_len,
max_context_len=max_context_len,
max_prompt_len=max_prompt_len,
inter_pp=inter_pp,
intra_pp=intra_pp,
@@ -154,9 +154,47 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision,
model.lm_head = new_linear


def convert_llama(
model: torch.nn.Module,
max_output_len=1024,
max_prompt_len=1024,
decoder=False,
inter_pp=None,
intra_pp=None,
transpose_value_cache=True,
):
from ipex_llm.transformers.npu_models.llama_mp import gen_llama_fused_model_forward
from ipex_llm.transformers.npu_models.llama_mp import DecodeRunner, PrefillRunner
from transformers.models.llama.modeling_llama import LlamaModel

if decoder:
decode_runner = DecodeRunner(
model,
max_seq_len=max_output_len,
inter_pp=inter_pp,
intra_pp=intra_pp,
transpose_value_cache=transpose_value_cache,
)
else:
decode_runner = None
prefill_runner = PrefillRunner(
model,
max_output_len=max_output_len,
max_prompt_len=max_prompt_len,
transpose_value_cache=transpose_value_cache,
)
llama_model_forward = gen_llama_fused_model_forward(
prefill_runner=prefill_runner, decode_runner=decode_runner
)
convert_forward(model, LlamaModel, llama_model_forward)
from transformers.models.llama.modeling_llama import LlamaForCausalLM
from ipex_llm.transformers.npu_models.llama_mp import llama2_casullm_forward
convert_forward(model, LlamaForCausalLM, llama2_casullm_forward)


def optimize_llm(
model: torch.nn.Module,
max_output_len=1024,
max_context_len=1024,
max_prompt_len=1024,
inter_pp=None,
intra_pp=None,

@@ -168,31 +206,13 @@ def optimize_llm(
intra_pp = 2
if inter_pp is None:
inter_pp = 2 if group_size == 0 else 8

from ipex_llm.transformers.npu_models.llama_mp import gen_llama_fused_model_forward
from ipex_llm.transformers.npu_models.llama_mp import DecodeRunner, PrefillRunner
from transformers.models.llama.modeling_llama import LlamaModel

decode_runner = DecodeRunner(
model,
max_seq_len=max_output_len,
inter_pp=inter_pp,
intra_pp=intra_pp,
transpose_value_cache=transpose_value_cache,
)
prefill_runner = PrefillRunner(
model,
max_output_len=max_output_len,
max_prompt_len=max_prompt_len,
transpose_value_cache=transpose_value_cache,
)
llama_model_forward = gen_llama_fused_model_forward(
prefill_runner=prefill_runner, decode_runner=decode_runner
)
convert_forward(model, LlamaModel, llama_model_forward)
from transformers.models.llama.modeling_llama import LlamaForCausalLM
from ipex_llm.transformers.npu_models.llama_mp import llama2_casullm_forward
convert_forward(model, LlamaForCausalLM, llama2_casullm_forward)
convert_llama(model,
max_output_len=max_context_len,
max_prompt_len=max_prompt_len,
inter_pp=inter_pp,
intra_pp=intra_pp,
decoder=True,
transpose_value_cache=transpose_value_cache)
elif model.config.model_type == "qwen2" and model.config.num_hidden_layers == 28:
# for qwen2-1.5B and qwen2-7B
if intra_pp is None:

@@ -212,14 +232,14 @@ def optimize_llm(
decode_runner = DecodeRunner(
model,
max_seq_len=max_output_len,
max_seq_len=max_context_len,
inter_pp=inter_pp,
intra_pp=intra_pp,
transpose_value_cache=transpose_value_cache,
)
prefill_runner = PrefillRunner(
model,
max_output_len=max_output_len,
max_output_len=max_context_len,
max_prompt_len=max_prompt_len,
transpose_value_cache=transpose_value_cache,
)

@@ -252,14 +272,14 @@ def optimize_llm(
decode_runner = DecodeRunner(
model,
max_seq_len=max_output_len,
max_seq_len=max_context_len,
inter_pp=inter_pp,
intra_pp=intra_pp,
transpose_value_cache=transpose_cache,
)
prefill_runner = PrefillRunner(
model,
max_output_len=max_output_len,
max_output_len=max_context_len,
max_prompt_len=max_prompt_len,
transpose_value_cache=transpose_cache,
)

@@ -281,14 +301,14 @@ def optimize_llm(
from ipex_llm.transformers.npu_models.baichuan_mp import DecodeRunner, PrefillRunner
decode_runner = DecodeRunner(
model,
max_seq_len=max_output_len,
max_seq_len=max_context_len,
inter_pp=inter_pp,
intra_pp=intra_pp,
transpose_value_cache=transpose_value_cache,
)
prefill_runner = PrefillRunner(
model,
max_output_len=max_output_len,
max_output_len=max_context_len,
max_prompt_len=max_prompt_len,
transpose_value_cache=transpose_value_cache,
)

@@ -305,7 +325,7 @@ def optimize_llm(
def optimize_funasr(
model: torch.nn.Module,
max_output_len=1024,
max_context_len=1024,
max_prompt_len=1024,
inter_pp=None,
intra_pp=None,

@@ -320,7 +340,7 @@ def optimize_funasr(
from ipex_llm.transformers.npu_models.paraformer_mp import PrefillRunner, DecodeRunner
prefill_runner = PrefillRunner(
model,
max_output_len=max_output_len,
max_output_len=max_context_len,
max_prompt_len=max_prompt_len,
transpose_value_cache=transpose_value_cache,
)

@@ -329,7 +349,7 @@ def optimize_funasr(
)
decode_runner = DecodeRunner(
model,
max_seq_len=max_output_len,
max_seq_len=max_context_len,
inter_pp=inter_pp,
intra_pp=intra_pp,
transpose_value_cache=transpose_value_cache,
@@ -61,87 +61,129 @@ def generate(
new_tokens = new_generate_kwargs['max_new_tokens']
invalidInputError(input_length + new_tokens <= self.kv_len + 1,
"Input plus output tokens should not exceed max_output_len.")
"Input plus output tokens should not exceed max_context_len.")
# TODO: may optimize this part later
invalidInputError(new_tokens < 1024,
f"Generated tokens ({new_tokens}) exceed named pipeline limitation.")

# start generate_serve by Thread
thread = threading.Thread(target=generate_serve,
args=(self.kv_len, self.num_head,
self.head_dim, self.num_layers,
self.transpose_value_cache,
new_tokens - 1))
thread.start()

in_pipe_path = "\\\\.\\pipe\\llminputpipe"
out_pipe_path = "\\\\.\\pipe\\llmoutputpipe"

while True:
try:
input_pipe = open(in_pipe_path, "wb")
except:
print('Waiting for input pipe')
time.sleep(1)
else:
break

while True:
try:
output_pipe = open(out_pipe_path, "rb")
except:
print('Waiting for output pipe')
time.sleep(1)
else:
break

bdata = b''
for i in range(0, input_length):
d = int(numpy_input[i])
bdata = bdata + d.to_bytes(4, sys.byteorder)

if "eos_token_id" not in new_generate_kwargs:
eos = 0xffffffff
else:
eos = new_generate_kwargs["eos_token_id"]

bdata = bdata + eos.to_bytes(4, sys.byteorder)

time_start = time.perf_counter()

input_pipe.write(bytearray(bdata))
input_pipe.flush()

buffersize = 4
output_tokens = []
while True:
data = output_pipe.read(buffersize)
if len(data) == 0:
break
token = int.from_bytes(data, sys.byteorder)
idx += 1
if time_t1 is None:
time_t1 = time.perf_counter()

with tempfile.TemporaryDirectory() as temp_dir:
# run prefill with PrefillRunner
output = self(input_ids=inputs,
attention_mask=torch.ones(1, inputs.shape[1]).int())
logits = output.logits
input_id = torch.argmax(logits[:, -1, :], dim=1)
input_id.to(torch.int32).numpy().tofile(os.path.join(temp_dir, "input_id.bin"))
position = np.int64(inputs.shape[1])
position.tofile(os.path.join(temp_dir, "position.bin"))
past_key_values = output.past_key_values
key_cache = past_key_values.key_cache
value_cache = past_key_values.value_cache
for layer in range(self.num_layers):
key_ = key_cache[layer]
val_ = value_cache[layer]
new_size = (
key_.size(0),
key_.size(1),
self.kv_len,
key_.size(3),
)
key = key_.as_strided(new_size, key_.stride(), storage_offset=0)
if not self.transpose_value_cache:
val = val_.as_strided(new_size, val_.stride(), storage_offset=0)
else:
new_size = (
val_.size(0),
val_.size(1),
val_.size(3),
self.kv_len,
)
val_cache = val_.transpose(-1, -2)
val = val_cache.as_strided(new_size, val_cache.stride(), storage_offset=0)
key.to(torch.float16).numpy().tofile(os.path.join(temp_dir, f"key_cache_{layer}.bin"))
val.to(torch.float16).numpy().tofile(os.path.join(temp_dir, f"value_cache_{layer}.bin"))

token = input_id.to(torch.int32).item()
output_tokens.append(torch.tensor([token]))
if streamer is not None:
streamer.put(torch.tensor([token]))
if token == eos:
break

output = torch.stack(output_tokens, dim=1)
output = torch.cat((inputs, output), dim=1)
if streamer is not None:
streamer.end()
if "eos_token_id" not in new_generate_kwargs:
eos = 0xffffffff
else:
eos = new_generate_kwargs["eos_token_id"]

time_t1 = time.perf_counter()
idx += 1

# start generate_serve by Thread
thread = threading.Thread(target=generate_serve,
args=(self.kv_len, self.num_head,
self.head_dim, self.num_layers,
self.transpose_value_cache,
new_tokens - 2))
thread.start()

in_pipe_path = "\\\\.\\pipe\\llminputpipe"
out_pipe_path = "\\\\.\\pipe\\llmoutputpipe"

while True:
try:
input_pipe = open(in_pipe_path, "wb")
except:
print('Waiting for input pipe')
time.sleep(1)
else:
break

while True:
try:
output_pipe = open(out_pipe_path, "rb")
except:
print('Waiting for output pipe')
time.sleep(1)
else:
break

time_start = time.perf_counter()

bdata = str.encode(str(temp_dir))
invalidInputError(len(bdata) <= 2000,
f"Leng of input directory is too long ({len(bdata)}), "
"which may cause read error.")
input_pipe.write(bdata)
input_pipe.flush()

buffersize = 4
while True:
data = output_pipe.read(buffersize)
if len(data) == 0:
break
token = int.from_bytes(data, sys.byteorder)
idx += 1
output_tokens.append(torch.tensor([token]))
if streamer is not None:
streamer.put(torch.tensor([token]))
if token == eos:
break

output = torch.stack(output_tokens, dim=1)
output = torch.cat((inputs, output), dim=1)
if streamer is not None:
streamer.end()

thread.join()
time_end = time.perf_counter()

if do_print:
print(f" Start the thread and connect the pipe time: {(time_start - time_start_all):.2f} s")
print(f" Start the thread and connect the pipe time: {(time_start - time_t1):.2f} s")
print(f" Number of input tokens: {input_length}")
print(f" Generated tokens: {idx}")
print(f" First token generation time: {(time_t1 - time_start):.2f} s")
print(f" Generation average latency: {(time_end - time_t1)*1000 /(idx - 1):.2f} ms, "
f"({(idx - 1)/(time_end - time_t1):.2f} token/s)")
print(f" Generation time: {(time_end - time_start):.2f} s\n")
print(f" First token generation time: {(time_t1 - time_start_all):.2f} s")
print(f" Generation average latency: {(time_end - time_start) * 1000 /(idx - 1):.2f} ms, "
f"({(idx - 1)/(time_end - time_start):.2f} token/s)")
print(f" Generation time: {(time_end - time_start_all - (time_start - time_t1)):.2f} s\n")
return output
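The substance of the generate() change above: prefill now runs in Python through the acceleration library's PrefillRunner, the first token, its position, and the padded KV cache are written into a temporary directory, and only that directory path is sent through the named pipe, so generate_serve decodes the remaining new_tokens - 2 tokens from the saved cache. A sketch of the dump step pulled out as a standalone helper, for illustration only; the function name and parameters are hypothetical, and the real logic is inline in generate() and reads kv_len, num_layers, and transpose_value_cache from self:

import os

import numpy as np
import torch


def dump_prefill_state(output, kv_len, num_layers, transpose_value_cache, prompt_len, temp_dir):
    """Illustrative helper: write prefill results in the layout the pipeline decoder reads."""
    # First decoded token and the position it was generated at.
    input_id = torch.argmax(output.logits[:, -1, :], dim=1)
    input_id.to(torch.int32).numpy().tofile(os.path.join(temp_dir, "input_id.bin"))
    np.int64(prompt_len).tofile(os.path.join(temp_dir, "position.bin"))

    # Re-view each layer's KV cache with kv_len slots so the decoder can append in place.
    for layer in range(num_layers):
        key_ = output.past_key_values.key_cache[layer]
        val_ = output.past_key_values.value_cache[layer]
        key_size = (key_.size(0), key_.size(1), kv_len, key_.size(3))
        key = key_.as_strided(key_size, key_.stride(), storage_offset=0)
        if transpose_value_cache:
            val_t = val_.transpose(-1, -2)
            val_size = (val_.size(0), val_.size(1), val_.size(3), kv_len)
            val = val_t.as_strided(val_size, val_t.stride(), storage_offset=0)
        else:
            val = val_.as_strided(key_size, val_.stride(), storage_offset=0)
        key.to(torch.float16).numpy().tofile(os.path.join(temp_dir, f"key_cache_{layer}.bin"))
        val.to(torch.float16).numpy().tofile(os.path.join(temp_dir, f"value_cache_{layer}.bin"))
    return input_id.to(torch.int32).item()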
@@ -182,8 +224,15 @@ def update_names_of_IR_and_export_blob(model, model_name, dir):
def convert_llm(model: torch.nn.Module,
kv_len: int,
max_prompt_len: int,
transpose_value_cache: bool):
if model.config.model_type == "llama":
from ipex_llm.transformers.npu_models.convert_mp import convert_llama
convert_llama(model,
max_output_len=kv_len,
max_prompt_len=max_prompt_len,
decoder=False,
transpose_value_cache=transpose_value_cache)
from .llama import LowBitLlamaLMHead, LlamaEmbedding
with tempfile.TemporaryDirectory() as temp_dir:
# generate lm_head blob

@@ -231,13 +280,12 @@ def convert_llm(model: torch.nn.Module,
new_embedding = LlamaEmbedding(
vocab_size=model.config.vocab_size,
embedding_dim=model.config.hidden_size,
embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
padding_idx=model.config.pad_token_id,
dtype=np.float16,
)
first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
temp_dir)
bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)

# generate decoder layer blob
from ipex_llm.transformers.npu_models.llama_mp import LowBitLlamaMultiDecoderlayer

@@ -80,6 +80,7 @@ class LlamaEmbedding(NNFactory):
self,
vocab_size,
embedding_dim,
embedding_weight,
padding_idx,
dtype,  # fp16
device: str = "NPU",

@@ -91,7 +92,7 @@ class LlamaEmbedding(NNFactory):
self.dtype = dtype

# define input
weight = self.parameter((vocab_size, embedding_dim))
weight = self.constant(embedding_weight)
input = self.parameter((1, 1), dtype=np.int32)

if padding_idx == -1: