[NPU] Reuse prefill of acc lib for pipeline (#12279)

* first commit

* update example

* fix style

* update example

* embedding as const

* fix generate

* code refactor

* meet code review

* fix style

* change max_output_len to max_context_len

* fix all-in-one

* fix example

* add check for new tokens
Ruonan Wang 2024-10-28 16:05:49 +08:00 committed by GitHub
parent 42a528ded9
commit 3fe2ea3081
13 changed files with 224 additions and 146 deletions
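
The user-facing change in this commit is that the NPU loading path now takes max_context_len, the total budget for prompt plus generated tokens, in place of max_output_len, and that pipeline=True reuses the acceleration library's prefill. A minimal loading sketch, assuming ipex-llm with NPU support is installed and using the ipex_llm.transformers.npu_model entry point that the examples below rely on; the checkpoint path is a placeholder:

import torch
from transformers import AutoTokenizer
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

model_path = "path/to/your/llama2-checkpoint"  # placeholder
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    optimize_model=True,
    pipeline=True,                  # hand decoding off to the native pipeline (generate_serve)
    max_context_len=1024,           # renamed from max_output_len: prompt + generated tokens
    max_prompt_len=512,
    torch_dtype=torch.float16,
    attn_implementation="eager",
    transpose_value_cache=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)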

View file

@ -617,23 +617,23 @@ def transformers_int4_npu_win(repo_id,
model_path = get_model_path(repo_id, local_model_hub)
in_out_len = in_out_pairs[0].split("-")
max_output_len = max(int(in_out_len[0]) + int(in_out_len[1]), 1024)
max_context_len = max(int(in_out_len[0]) + int(in_out_len[1]), 1024)
# Load model in 4 bit,
# which converts the relevant layers in the model into INT4 format
st = time.perf_counter()
if repo_id in CHATGLM_IDS:
model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True,
optimize_model=optimize_model, max_output_len=max_output_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
torch_dtype=torch.float16, attn_implementation="eager").eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
elif repo_id in LLAMA_IDS:
model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, torch_dtype=torch.float16,
optimize_model=optimize_model, max_output_len=max_output_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
use_cache=True, attn_implementation="eager").eval()
tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
else:
model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, torch_dtype=torch.float16,
optimize_model=optimize_model, max_output_len=max_output_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
use_cache=True, attn_implementation="eager").eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
end = time.perf_counter()
@ -690,23 +690,23 @@ def run_transformer_int4_loadlowbit_npu_win(repo_id,
model_path = get_model_path(repo_id, local_model_hub)
in_out_len = in_out_pairs[0].split("-")
max_output_len = max(int(in_out_len[0]) + int(in_out_len[1]), 1024)
max_context_len = max(int(in_out_len[0]) + int(in_out_len[1]), 1024)
# Load model in 4 bit,
# which converts the relevant layers in the model into INT4 format
st = time.perf_counter()
if repo_id in CHATGLM_IDS:
model = AutoModel.load_low_bit(model_path+'-npu-'+low_bit, trust_remote_code=True,
optimize_model=optimize_model, max_output_len=max_output_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
torch_dtype=torch.float16, attn_implementation="eager").eval()
tokenizer = AutoTokenizer.from_pretrained(model_path+'-npu-'+low_bit, trust_remote_code=True)
elif repo_id in LLAMA_IDS:
model = AutoModelForCausalLM.load_low_bit(model_path+'-npu-'+low_bit, trust_remote_code=True, torch_dtype=torch.float16,
optimize_model=optimize_model, max_output_len=max_output_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
use_cache=True, attn_implementation="eager").eval()
tokenizer = LlamaTokenizer.from_pretrained(model_path+'-npu-'+low_bit, trust_remote_code=True)
else:
model = AutoModelForCausalLM.load_low_bit(model_path+'-npu-'+low_bit, trust_remote_code=True, torch_dtype=torch.float16,
optimize_model=optimize_model, max_output_len=max_output_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=transpose_value_cache,
use_cache=True, attn_implementation="eager").eval()
tokenizer = AutoTokenizer.from_pretrained(model_path+'-npu-'+low_bit, trust_remote_code=True)
end = time.perf_counter()
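
For reference, the benchmark derives this budget from the in-out pair string, for example "1024-128" meaning 1024 prompt tokens and 128 generated tokens, with 1024 as the floor; a standalone sketch with a placeholder pair:

in_out_pairs = ["1024-128"]  # placeholder: prompt tokens - generated tokens
in_out_len = in_out_pairs[0].split("-")
max_context_len = max(int(in_out_len[0]) + int(in_out_len[1]), 1024)
assert max_context_len == 1152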

View file

@ -51,7 +51,9 @@ if __name__ == "__main__":
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-output-len", type=int, default=1024)
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=960)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
args = parser.parse_args()
model_path = args.repo_id_or_model_path
@ -59,9 +61,11 @@ if __name__ == "__main__":
model = AutoModelForCausalLM.from_pretrained(model_path,
optimize_model=True,
pipeline=True,
max_output_len=args.max_output_len,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
torch_dtype=torch.float16,
attn_implementation="eager")
attn_implementation="eager",
transpose_value_cache=not args.disable_transpose_value_cache)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

View file

@ -57,7 +57,9 @@ if __name__ == "__main__":
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-output-len", type=int, default=1024)
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=960)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
args = parser.parse_args()
model_path = args.repo_id_or_model_path
@ -66,8 +68,10 @@ if __name__ == "__main__":
torch_dtype=torch.float16,
optimize_model=True,
pipeline=True,
max_output_len=args.max_output_len,
attn_implementation="eager")
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
attn_implementation="eager",
transpose_value_cache=not args.disable_transpose_value_cache)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
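
Both pipeline examples also gain a --disable-transpose-value-cache switch whose negation is forwarded to from_pretrained. A standalone sketch of that wiring, with flag names copied from the diff and defaults used only for illustration:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=960)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
args = parser.parse_args([])  # empty argv here, so the defaults above apply

load_kwargs = dict(
    max_context_len=args.max_context_len,
    max_prompt_len=args.max_prompt_len,
    transpose_value_cache=not args.disable_transpose_value_cache,  # flag disables, kwarg enables
)
print(load_kwargs)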

View file

@ -59,7 +59,7 @@ if __name__ == "__main__":
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-output-len", type=int, default=1024)
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=512)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--intra-pp", type=int, default=2)
@ -76,7 +76,7 @@ if __name__ == "__main__":
attn_implementation="eager",
load_in_low_bit="sym_int4",
optimize_model=True,
max_output_len=args.max_output_len,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
intra_pp=args.intra_pp,
inter_pp=args.inter_pp,
@ -88,7 +88,7 @@ if __name__ == "__main__":
attn_implementation="eager",
torch_dtype=torch.bfloat16,
optimize_model=True,
max_output_len=args.max_output_len,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
intra_pp=args.intra_pp,
inter_pp=args.inter_pp,

View file

@ -59,7 +59,7 @@ if __name__ == "__main__":
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-output-len", type=int, default=1024)
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=512)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--intra-pp", type=int, default=2)
@ -76,7 +76,7 @@ if __name__ == "__main__":
attn_implementation="eager",
load_in_low_bit="sym_int4",
optimize_model=True,
max_output_len=args.max_output_len,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
intra_pp=args.intra_pp,
inter_pp=args.inter_pp,
@ -88,7 +88,7 @@ if __name__ == "__main__":
attn_implementation="eager",
torch_dtype=torch.float16,
optimize_model=True,
max_output_len=args.max_output_len,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
intra_pp=args.intra_pp,
inter_pp=args.inter_pp,

View file

@ -46,7 +46,7 @@ if __name__ == "__main__":
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-output-len", type=int, default=1024)
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=512)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--intra-pp", type=int, default=2)
@ -62,7 +62,7 @@ if __name__ == "__main__":
attn_implementation="eager",
load_in_low_bit="sym_int4",
optimize_model=True,
max_output_len=args.max_output_len,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
intra_pp=args.intra_pp,
inter_pp=args.inter_pp,
@ -74,7 +74,7 @@ if __name__ == "__main__":
attn_implementation="eager",
torch_dtype=torch.float16,
optimize_model=True,
max_output_len=args.max_output_len,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
intra_pp=args.intra_pp,
inter_pp=args.inter_pp,

View file

@ -46,7 +46,7 @@ if __name__ == "__main__":
parser.add_argument('--prompt', type=str, default="AI是什么?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-output-len", type=int, default=1024)
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=512)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--intra-pp", type=int, default=None)
@ -64,7 +64,7 @@ if __name__ == "__main__":
attn_implementation="eager",
load_in_low_bit="sym_int4",
optimize_model=True,
max_output_len=args.max_output_len,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
intra_pp=args.intra_pp,
inter_pp=args.inter_pp,
@ -77,7 +77,7 @@ if __name__ == "__main__":
attn_implementation="eager",
torch_dtype=torch.float16,
optimize_model=True,
max_output_len=args.max_output_len,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
intra_pp=args.intra_pp,
inter_pp=args.inter_pp,

View file

@ -45,7 +45,7 @@ if __name__ == "__main__":
parser.add_argument('--prompt', type=str, default="What is in the image?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-output-len", type=int, default=1024)
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=512)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--intra-pp", type=int, default=2)
@ -61,7 +61,7 @@ if __name__ == "__main__":
attn_implementation="eager",
load_in_low_bit="sym_int4",
optimize_model=True,
max_output_len=args.max_output_len,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
intra_pp=args.intra_pp,
inter_pp=args.inter_pp,

View file

@ -36,7 +36,7 @@ if __name__ == '__main__':
parser.add_argument('--prompt', type=str, default="What is in this image?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-output-len", type=int, default=1024)
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=512)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--intra-pp", type=int, default=None)
@ -52,7 +52,7 @@ if __name__ == '__main__':
attn_implementation="eager",
load_in_low_bit="sym_int4",
optimize_model=True,
max_output_len=args.max_output_len,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
intra_pp=args.intra_pp,
inter_pp=args.inter_pp,

View file

@ -124,8 +124,8 @@ class _BaseAutoModelClass:
ignore_argument(kwargs, "pipeline_parallel_stages")
optimize_model = kwargs.pop("optimize_model", False)
pipeline = kwargs.pop("pipeline", False)
max_output_len = kwargs.pop("max_output_len", 1024)
max_output_len = max_output_len - 1
max_context_len = kwargs.pop("max_context_len", 1024)
max_context_len = max_context_len - 1
max_prompt_len = kwargs.pop("max_prompt_len", 512)
inter_pp = kwargs.pop("inter_pp", None)
intra_pp = kwargs.pop("intra_pp", None)
@ -169,10 +169,10 @@ class _BaseAutoModelClass:
if optimize_model:
invalidInputError(
max_prompt_len < max_output_len,
max_prompt_len < max_context_len,
(
f"max_prompt_len ({max_prompt_len}) should be less"
" than max_output_len ({max_output_len})"
" than max_context_len ({max_context_len})"
),
)
optimize_kwargs = {
@ -182,7 +182,7 @@ class _BaseAutoModelClass:
"quantization_group_size": quantization_group_size,
"modules_to_not_convert": modules_to_not_convert,
"pipeline": pipeline,
"max_output_len": max_output_len,
"max_context_len": max_context_len,
"max_prompt_len": max_prompt_len,
"inter_pp": inter_pp,
"intra_pp": intra_pp,
@ -219,7 +219,7 @@ class _BaseAutoModelClass:
quantization_group_size = kwargs.pop("quantization_group_size", 0)
modules_to_not_convert = kwargs.pop("modules_to_not_convert", [])
pipeline = kwargs.pop("pipeline", False)
max_output_len = kwargs.pop("max_output_len", 1024)
max_context_len = kwargs.pop("max_context_len", 1024)
max_prompt_len = kwargs.pop("max_prompt_len", 512)
inter_pp = kwargs.pop("inter_pp", None)
intra_pp = kwargs.pop("intra_pp", None)
@ -246,7 +246,7 @@ class _BaseAutoModelClass:
if not pipeline:
optimize_llm(
llm,
max_output_len=max_output_len,
max_context_len=max_context_len,
max_prompt_len=max_prompt_len,
inter_pp=inter_pp,
intra_pp=intra_pp,
@ -258,7 +258,8 @@ class _BaseAutoModelClass:
from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
import convert_llm
convert_llm(llm,
kv_len=max_output_len,
kv_len=max_context_len,
max_prompt_len=max_prompt_len,
transpose_value_cache=transpose_value_cache)
return model
@ -598,7 +599,7 @@ class FunAsrAutoModel(_BaseAutoModelClass):
model = kwargs.pop("model")
qtype = kwargs.pop("qtype", "sym_int8")
modules_to_not_convert = kwargs.pop("modules_to_not_convert", [])
max_output_len = kwargs.pop("max_output_len", 1024)
max_context_len = kwargs.pop("max_context_len", 1024)
max_prompt_len = kwargs.pop("max_prompt_len", 512)
inter_pp = kwargs.pop("inter_pp", None)
intra_pp = kwargs.pop("intra_pp", None)
@ -618,7 +619,7 @@ class FunAsrAutoModel(_BaseAutoModelClass):
optimize_funasr(
model,
max_output_len=max_output_len,
max_context_len=max_context_len,
max_prompt_len=max_prompt_len,
inter_pp=inter_pp,
intra_pp=intra_pp,
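
With pipeline=True, from_pretrained now also forwards max_prompt_len and transpose_value_cache into convert_llm and reserves one KV slot by decrementing max_context_len. A condensed sketch of that branch; the wrapper function is hypothetical, while the convert_llm call mirrors the diff:

def convert_for_pipeline(llm, max_context_len=1024, max_prompt_len=512,
                         transpose_value_cache=True):
    # hypothetical wrapper around the pipeline branch shown above
    from ipex_llm.transformers.npu_pipeline_model.convert_pipeline import convert_llm
    convert_llm(llm,
                kv_len=max_context_len - 1,             # one slot reserved, as in the diff
                max_prompt_len=max_prompt_len,          # newly forwarded in this commit
                transpose_value_cache=transpose_value_cache)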

View file

@ -154,9 +154,47 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision,
model.lm_head = new_linear
def convert_llama(
model: torch.nn.Module,
max_output_len=1024,
max_prompt_len=1024,
decoder=False,
inter_pp=None,
intra_pp=None,
transpose_value_cache=True,
):
from ipex_llm.transformers.npu_models.llama_mp import gen_llama_fused_model_forward
from ipex_llm.transformers.npu_models.llama_mp import DecodeRunner, PrefillRunner
from transformers.models.llama.modeling_llama import LlamaModel
if decoder:
decode_runner = DecodeRunner(
model,
max_seq_len=max_output_len,
inter_pp=inter_pp,
intra_pp=intra_pp,
transpose_value_cache=transpose_value_cache,
)
else:
decode_runner = None
prefill_runner = PrefillRunner(
model,
max_output_len=max_output_len,
max_prompt_len=max_prompt_len,
transpose_value_cache=transpose_value_cache,
)
llama_model_forward = gen_llama_fused_model_forward(
prefill_runner=prefill_runner, decode_runner=decode_runner
)
convert_forward(model, LlamaModel, llama_model_forward)
from transformers.models.llama.modeling_llama import LlamaForCausalLM
from ipex_llm.transformers.npu_models.llama_mp import llama2_casullm_forward
convert_forward(model, LlamaForCausalLM, llama2_casullm_forward)
def optimize_llm(
model: torch.nn.Module,
max_output_len=1024,
max_context_len=1024,
max_prompt_len=1024,
inter_pp=None,
intra_pp=None,
@ -168,31 +206,13 @@ def optimize_llm(
intra_pp = 2
if inter_pp is None:
inter_pp = 2 if group_size == 0 else 8
from ipex_llm.transformers.npu_models.llama_mp import gen_llama_fused_model_forward
from ipex_llm.transformers.npu_models.llama_mp import DecodeRunner, PrefillRunner
from transformers.models.llama.modeling_llama import LlamaModel
decode_runner = DecodeRunner(
model,
max_seq_len=max_output_len,
inter_pp=inter_pp,
intra_pp=intra_pp,
transpose_value_cache=transpose_value_cache,
)
prefill_runner = PrefillRunner(
model,
max_output_len=max_output_len,
max_prompt_len=max_prompt_len,
transpose_value_cache=transpose_value_cache,
)
llama_model_forward = gen_llama_fused_model_forward(
prefill_runner=prefill_runner, decode_runner=decode_runner
)
convert_forward(model, LlamaModel, llama_model_forward)
from transformers.models.llama.modeling_llama import LlamaForCausalLM
from ipex_llm.transformers.npu_models.llama_mp import llama2_casullm_forward
convert_forward(model, LlamaForCausalLM, llama2_casullm_forward)
convert_llama(model,
max_output_len=max_context_len,
max_prompt_len=max_prompt_len,
inter_pp=inter_pp,
intra_pp=intra_pp,
decoder=True,
transpose_value_cache=transpose_value_cache)
elif model.config.model_type == "qwen2" and model.config.num_hidden_layers == 28:
# for qwen2-1.5B and qwen2-7B
if intra_pp is None:
@ -212,14 +232,14 @@ def optimize_llm(
decode_runner = DecodeRunner(
model,
max_seq_len=max_output_len,
max_seq_len=max_context_len,
inter_pp=inter_pp,
intra_pp=intra_pp,
transpose_value_cache=transpose_value_cache,
)
prefill_runner = PrefillRunner(
model,
max_output_len=max_output_len,
max_output_len=max_context_len,
max_prompt_len=max_prompt_len,
transpose_value_cache=transpose_value_cache,
)
@ -252,14 +272,14 @@ def optimize_llm(
decode_runner = DecodeRunner(
model,
max_seq_len=max_output_len,
max_seq_len=max_context_len,
inter_pp=inter_pp,
intra_pp=intra_pp,
transpose_value_cache=transpose_cache,
)
prefill_runner = PrefillRunner(
model,
max_output_len=max_output_len,
max_output_len=max_context_len,
max_prompt_len=max_prompt_len,
transpose_value_cache=transpose_cache,
)
@ -281,14 +301,14 @@ def optimize_llm(
from ipex_llm.transformers.npu_models.baichuan_mp import DecodeRunner, PrefillRunner
decode_runner = DecodeRunner(
model,
max_seq_len=max_output_len,
max_seq_len=max_context_len,
inter_pp=inter_pp,
intra_pp=intra_pp,
transpose_value_cache=transpose_value_cache,
)
prefill_runner = PrefillRunner(
model,
max_output_len=max_output_len,
max_output_len=max_context_len,
max_prompt_len=max_prompt_len,
transpose_value_cache=transpose_value_cache,
)
@ -305,7 +325,7 @@ def optimize_llm(
def optimize_funasr(
model: torch.nn.Module,
max_output_len=1024,
max_context_len=1024,
max_prompt_len=1024,
inter_pp=None,
intra_pp=None,
@ -320,7 +340,7 @@ def optimize_funasr(
from ipex_llm.transformers.npu_models.paraformer_mp import PrefillRunner, DecodeRunner
prefill_runner = PrefillRunner(
model,
max_output_len=max_output_len,
max_output_len=max_context_len,
max_prompt_len=max_prompt_len,
transpose_value_cache=transpose_value_cache,
)
@ -329,7 +349,7 @@ def optimize_funasr(
)
decode_runner = DecodeRunner(
model,
max_seq_len=max_output_len,
max_seq_len=max_context_len,
inter_pp=inter_pp,
intra_pp=intra_pp,
transpose_value_cache=transpose_value_cache,
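
The Llama branch is factored into a reusable convert_llama helper whose decoder flag controls whether a DecodeRunner is built at all: optimize_llm keeps both runners, while the pipeline path in the next file asks for prefill only. A sketch of the two call sites; the wrapper names are hypothetical and the convert_llama keywords come from the diff:

from ipex_llm.transformers.npu_models.convert_mp import convert_llama

def llama_for_acc_lib(model, max_context_len, max_prompt_len, inter_pp, intra_pp,
                      transpose_value_cache):
    # acceleration-library path: prefill + decode runners
    convert_llama(model, max_output_len=max_context_len, max_prompt_len=max_prompt_len,
                  inter_pp=inter_pp, intra_pp=intra_pp,
                  decoder=True, transpose_value_cache=transpose_value_cache)

def llama_for_pipeline(model, kv_len, max_prompt_len, transpose_value_cache):
    # pipeline path: prefill runner only; decoding is left to generate_serve
    convert_llama(model, max_output_len=kv_len, max_prompt_len=max_prompt_len,
                  decoder=False, transpose_value_cache=transpose_value_cache)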

View file

@ -61,87 +61,129 @@ def generate(
new_tokens = new_generate_kwargs['max_new_tokens']
invalidInputError(input_length + new_tokens <= self.kv_len + 1,
"Input plus output tokens should not exceed max_output_len.")
"Input plus output tokens should not exceed max_context_len.")
# TODO: may optimize this part later
invalidInputError(new_tokens < 1024,
f"Generated tokens ({new_tokens}) exceed named pipeline limitation.")
# start generate_serve by Thread
thread = threading.Thread(target=generate_serve,
args=(self.kv_len, self.num_head,
self.head_dim, self.num_layers,
self.transpose_value_cache,
new_tokens - 1))
thread.start()
in_pipe_path = "\\\\.\\pipe\\llminputpipe"
out_pipe_path = "\\\\.\\pipe\\llmoutputpipe"
while True:
try:
input_pipe = open(in_pipe_path, "wb")
except:
print('Waiting for input pipe')
time.sleep(1)
else:
break
while True:
try:
output_pipe = open(out_pipe_path, "rb")
except:
print('Waiting for output pipe')
time.sleep(1)
else:
break
bdata = b''
for i in range(0, input_length):
d = int(numpy_input[i])
bdata = bdata + d.to_bytes(4, sys.byteorder)
if "eos_token_id" not in new_generate_kwargs:
eos = 0xffffffff
else:
eos = new_generate_kwargs["eos_token_id"]
bdata = bdata + eos.to_bytes(4, sys.byteorder)
time_start = time.perf_counter()
input_pipe.write(bytearray(bdata))
input_pipe.flush()
buffersize = 4
output_tokens = []
while True:
data = output_pipe.read(buffersize)
if len(data) == 0:
break
token = int.from_bytes(data, sys.byteorder)
idx += 1
if time_t1 is None:
time_t1 = time.perf_counter()
with tempfile.TemporaryDirectory() as temp_dir:
# run prefill with PrefillRunner
output = self(input_ids=inputs,
attention_mask=torch.ones(1, inputs.shape[1]).int())
logits = output.logits
input_id = torch.argmax(logits[:, -1, :], dim=1)
input_id.to(torch.int32).numpy().tofile(os.path.join(temp_dir, "input_id.bin"))
position = np.int64(inputs.shape[1])
position.tofile(os.path.join(temp_dir, "position.bin"))
past_key_values = output.past_key_values
key_cache = past_key_values.key_cache
value_cache = past_key_values.value_cache
for layer in range(self.num_layers):
key_ = key_cache[layer]
val_ = value_cache[layer]
new_size = (
key_.size(0),
key_.size(1),
self.kv_len,
key_.size(3),
)
key = key_.as_strided(new_size, key_.stride(), storage_offset=0)
if not self.transpose_value_cache:
val = val_.as_strided(new_size, val_.stride(), storage_offset=0)
else:
new_size = (
val_.size(0),
val_.size(1),
val_.size(3),
self.kv_len,
)
val_cache = val_.transpose(-1, -2)
val = val_cache.as_strided(new_size, val_cache.stride(), storage_offset=0)
key.to(torch.float16).numpy().tofile(os.path.join(temp_dir, f"key_cache_{layer}.bin"))
val.to(torch.float16).numpy().tofile(os.path.join(temp_dir, f"value_cache_{layer}.bin"))
token = input_id.to(torch.int32).item()
output_tokens.append(torch.tensor([token]))
if streamer is not None:
streamer.put(torch.tensor([token]))
if token == eos:
break
output = torch.stack(output_tokens, dim=1)
output = torch.cat((inputs, output), dim=1)
if streamer is not None:
streamer.end()
if "eos_token_id" not in new_generate_kwargs:
eos = 0xffffffff
else:
eos = new_generate_kwargs["eos_token_id"]
time_t1 = time.perf_counter()
idx += 1
# start generate_serve by Thread
thread = threading.Thread(target=generate_serve,
args=(self.kv_len, self.num_head,
self.head_dim, self.num_layers,
self.transpose_value_cache,
new_tokens - 2))
thread.start()
in_pipe_path = "\\\\.\\pipe\\llminputpipe"
out_pipe_path = "\\\\.\\pipe\\llmoutputpipe"
while True:
try:
input_pipe = open(in_pipe_path, "wb")
except:
print('Waiting for input pipe')
time.sleep(1)
else:
break
while True:
try:
output_pipe = open(out_pipe_path, "rb")
except:
print('Waiting for output pipe')
time.sleep(1)
else:
break
time_start = time.perf_counter()
bdata = str.encode(str(temp_dir))
invalidInputError(len(bdata) <= 2000,
f"Leng of input directory is too long ({len(bdata)}), "
"which may cause read error.")
input_pipe.write(bdata)
input_pipe.flush()
buffersize = 4
while True:
data = output_pipe.read(buffersize)
if len(data) == 0:
break
token = int.from_bytes(data, sys.byteorder)
idx += 1
output_tokens.append(torch.tensor([token]))
if streamer is not None:
streamer.put(torch.tensor([token]))
if token == eos:
break
output = torch.stack(output_tokens, dim=1)
output = torch.cat((inputs, output), dim=1)
if streamer is not None:
streamer.end()
thread.join()
time_end = time.perf_counter()
if do_print:
print(f" Start the thread and connect the pipe time: {(time_start - time_start_all):.2f} s")
print(f" Start the thread and connect the pipe time: {(time_start - time_t1):.2f} s")
print(f" Number of input tokens: {input_length}")
print(f" Generated tokens: {idx}")
print(f" First token generation time: {(time_t1 - time_start):.2f} s")
print(f" Generation average latency: {(time_end - time_t1)*1000 /(idx - 1):.2f} ms, "
f"({(idx - 1)/(time_end - time_t1):.2f} token/s)")
print(f" Generation time: {(time_end - time_start):.2f} s\n")
print(f" First token generation time: {(time_t1 - time_start_all):.2f} s")
print(f" Generation average latency: {(time_end - time_start) * 1000 /(idx - 1):.2f} ms, "
f"({(idx - 1)/(time_end - time_start):.2f} token/s)")
print(f" Generation time: {(time_end - time_start_all - (time_start - time_t1)):.2f} s\n")
return output
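
In short, the rewritten generate runs prefill in Python through the reused PrefillRunner forward, writes the first sampled token, the position, and every layer's key/value cache into a temporary directory as .bin files, and only then sends that directory path over the named pipe to generate_serve, which decodes at most new_tokens - 2 further tokens. A sketch of the per-layer KV dump; the helper name is hypothetical and the strided views mirror the code above, which relies on the cache buffers being preallocated to kv_len:

import os
import torch

def dump_kv_layer(key_, val_, kv_len, layer, temp_dir, transpose_value_cache=True):
    # view the preallocated key cache at its full kv_len capacity: (batch, heads, kv_len, head_dim)
    key = key_.as_strided((key_.size(0), key_.size(1), kv_len, key_.size(3)),
                          key_.stride(), storage_offset=0)
    if transpose_value_cache:
        # the value cache is kept transposed, so the stretched axis is the last one
        v = val_.transpose(-1, -2)
        val = v.as_strided((v.size(0), v.size(1), v.size(2), kv_len),
                           v.stride(), storage_offset=0)
    else:
        val = val_.as_strided((val_.size(0), val_.size(1), kv_len, val_.size(3)),
                              val_.stride(), storage_offset=0)
    key.to(torch.float16).numpy().tofile(os.path.join(temp_dir, f"key_cache_{layer}.bin"))
    val.to(torch.float16).numpy().tofile(os.path.join(temp_dir, f"value_cache_{layer}.bin"))
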
@ -182,8 +224,15 @@ def update_names_of_IR_and_export_blob(model, model_name, dir):
def convert_llm(model: torch.nn.Module,
kv_len: int,
max_prompt_len: int,
transpose_value_cache: bool):
if model.config.model_type == "llama":
from ipex_llm.transformers.npu_models.convert_mp import convert_llama
convert_llama(model,
max_output_len=kv_len,
max_prompt_len=max_prompt_len,
decoder=False,
transpose_value_cache=transpose_value_cache)
from .llama import LowBitLlamaLMHead, LlamaEmbedding
with tempfile.TemporaryDirectory() as temp_dir:
# generate lm_head blob
@ -231,13 +280,12 @@ def convert_llm(model: torch.nn.Module,
new_embedding = LlamaEmbedding(
vocab_size=model.config.vocab_size,
embedding_dim=model.config.hidden_size,
embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
padding_idx=model.config.pad_token_id,
dtype=np.float16,
)
first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
temp_dir)
bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
# generate decoder layer blob
from ipex_llm.transformers.npu_models.llama_mp import LowBitLlamaMultiDecoderlayer
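
On the export side, convert_llm first reuses convert_llama with decoder=False and then bakes the embedding weights into the embedding blob as a constant instead of dumping them to model_embedding_input_0.bin. A condensed sketch of the embedding-blob construction; the helper and the embed_tokens lookup are assumptions, the LlamaEmbedding arguments come from the diff:

import numpy as np
import torch

def build_embedding_graph(model):
    # hypothetical helper condensing the embedding step above; convert_llm then exports
    # the graph via update_names_of_IR_and_export_blob(new_embedding, "embedding", temp_dir)
    from ipex_llm.transformers.npu_pipeline_model.llama import LlamaEmbedding
    embedding_layer = model.model.embed_tokens  # assumption: standard Llama module layout
    return LlamaEmbedding(
        vocab_size=model.config.vocab_size,
        embedding_dim=model.config.hidden_size,
        # new in this commit: the weights live inside the blob as a constant,
        # so no separate model_embedding_input_0.bin is written
        embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
        padding_idx=model.config.pad_token_id,
        dtype=np.float16,
    )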

View file

@ -80,6 +80,7 @@ class LlamaEmbedding(NNFactory):
self,
vocab_size,
embedding_dim,
embedding_weight,
padding_idx,
dtype, # fp16
device: str = "NPU",
@ -91,7 +92,7 @@ class LlamaEmbedding(NNFactory):
self.dtype = dtype
# define input
weight = self.parameter((vocab_size, embedding_dim))
weight = self.constant(embedding_weight)
input = self.parameter((1, 1), dtype=np.int32)
if padding_idx == -1:
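
Inside the LlamaEmbedding graph this amounts to swapping a runtime parameter for a baked-in constant, leaving the token-id input as the only remaining parameter. A tiny sketch of the distinction; factory stands in for the NNFactory-based builder used above:

import numpy as np

def define_embedding_io(factory, embedding_weight):
    # before: weight = factory.parameter((vocab_size, embedding_dim)), fed from a .bin at runtime
    # after:  the weight is a compile-time constant carried inside the exported blob
    weight = factory.constant(embedding_weight)
    token_id = factory.parameter((1, 1), dtype=np.int32)  # runtime input
    return weight, token_id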