[NPU] Support streaming in Python (cpp backend) (#12488)

* Support streaming in NPU Python (cpp backend)

* Small fix
This commit is contained in:
Yuwen Hu 2024-12-03 17:17:26 +08:00 committed by GitHub
parent 7082844f3f
commit 4ac66db034
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@@ -314,8 +314,14 @@ def generate(
new_generate_kwargs[var] = value
if isinstance(inputs[0], torch.Tensor):
if streamer is not None:
# input ids
streamer.put(inputs[0])
input_list = inputs[0].flatten().tolist()
else:
if streamer is not None:
# input ids
streamer.put(torch.Tensor(inputs[0]))
input_list = inputs[0]
input_length = len(input_list)
@@ -335,6 +341,9 @@ def generate(
from .npu_llm_cpp import run_decode, run_prefill, reset
token = run_prefill(self.model_ptr, input_list, self.vocab_size)
if streamer is not None:
# 1st tokens
streamer.put(torch.tensor([token]))
idx = 1
time_t2 = time.perf_counter()
output_tokens.append(torch.tensor([token]))
@@ -342,12 +351,18 @@ def generate(
if token == eos:
    break
token = run_decode(self.model_ptr, token, self.vocab_size)
if streamer is not None:
# rest tokens
streamer.put(torch.tensor([token]))
idx += 1
output_tokens.append(torch.tensor([token]))
output = torch.stack(output_tokens, dim=1)
output = torch.cat((inputs, output), dim=1)
time_t3 = time.perf_counter()
if streamer is not None:
streamer.end()
reset(self.model_ptr)
self.first_cost = time_t2 - time_t1  # seconds
self.rest_cost_mean = (time_t3 - time_t2) / (idx - 1)  # seconds