LLM: using separate threads to do inference (#9727)

* using separate threads to do inference

* resolve some comments

* resolve some comments

* revert llm_performance_tests.yml file
WeiguangHan 2023-12-21 17:56:43 +08:00 committed by GitHub
parent 426660b88e
commit 474c099559
2 changed files with 38 additions and 38 deletions
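
The change below pulls the per-trial generation loop out of run_transformer_int4_gpu into a run_model_in_thread helper and runs it on a worker thread, which the main loop starts and immediately joins for each input/output pair; timings are collected through a shared result dict. As a minimal, self-contained sketch of that pattern (for orientation only), fake_generate and the trial parameters here are hypothetical stand-ins; the real benchmark calls the actual model on an XPU device:

import threading
import time

def fake_generate(prompt_len, out_len):
    # Hypothetical stand-in for model.generate(); the real benchmark runs the
    # model on an XPU device and synchronizes before reading the clock.
    time.sleep(0.01)
    return prompt_len + out_len

def run_in_thread(result, key, warm_up, num_trials, prompt_len, out_len):
    # Worker: run warm-up plus measured trials and append one record per
    # measured trial to the shared result dict (only this thread writes to it).
    for i in range(num_trials + warm_up):
        st = time.perf_counter()
        total_len = fake_generate(prompt_len, out_len)
        end = time.perf_counter()
        if i >= warm_up:
            result[key].append([end - st, total_len - prompt_len])

result = {}
for key, (prompt_len, out_len) in {"32-32": (32, 32), "1024-128": (1024, 128)}.items():
    result[key] = []
    t = threading.Thread(target=run_in_thread,
                         args=(result, key, 1, 3, prompt_len, out_len))
    t.start()
    t.join()  # pairs still run one at a time; the thread only hosts the loop
print(result)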

Changed file 1 of 2:

@@ -20,6 +20,7 @@ import torch
 import time
 import gc
 import traceback
+import threading
 import numpy as np
 from datetime import date
@@ -44,6 +45,21 @@ LLAVA_IDS = ['liuhaotian/llava-v1.5-7b']
 results = []
 excludes = []

+def run_model_in_thread(model, in_out, tokenizer, result, warm_up, num_beams, input_ids, out_len, actual_in_len, num_trials):
+    for i in range(num_trials + warm_up):
+        st = time.perf_counter()
+        output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
+                                    num_beams=num_beams)
+        torch.xpu.synchronize()
+        end = time.perf_counter()
+        output_ids = output_ids.cpu()
+        print("model generate cost: " + str(end - st))
+        output = tokenizer.batch_decode(output_ids)
+        print(output[0])
+        actual_out_len = output_ids.shape[1] - actual_in_len
+        if i >= warm_up:
+            result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
+                                   actual_in_len, actual_out_len])

 def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1, num_trials=3, num_beams=1, low_bit='sym_int4', cpu_embedding=False):
     # TODO: make a parameter
@@ -368,42 +384,27 @@ def run_transformer_int4_gpu(repo_id,
     result = {}
     with torch.inference_mode():
         for in_out in in_out_pairs:
-            try:
-                in_out_len = in_out.split("-")
-                in_len = int(in_out_len[0])
-                out_len = int(in_out_len[1])
-                # As different tokenizer has different encodings,
-                # in_len.txt maybe shorter than we need,
-                # use much longer context to make sure input length
-                test_length = min(in_len*2, 8192)
-                while test_length not in [32, 256, 1024, 2048, 8192]:
-                    test_length = test_length * 2
-                input_str = open(f"prompt/{test_length}.txt", 'r').read()
-                # As different tokenizer has different encodings,
-                # slice the input_ids to ensure the prompt length is required length.
-                input_ids = tokenizer.encode(input_str, return_tensors="pt")
-                input_ids = input_ids[:, :in_len]
-                true_str = tokenizer.batch_decode(input_ids)[0]
-                input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
-                actual_in_len = input_ids.shape[1]
-                result[in_out] = []
-                for i in range(num_trials + warm_up):
-                    st = time.perf_counter()
-                    output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
-                                                num_beams=num_beams)
-                    torch.xpu.synchronize()
-                    end = time.perf_counter()
-                    output_ids = output_ids.cpu()
-                    print("model generate cost: " + str(end - st))
-                    output = tokenizer.batch_decode(output_ids)
-                    print(output[0])
-                    actual_out_len = output_ids.shape[1] - actual_in_len
-                    if i >= warm_up:
-                        result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
-                                               actual_in_len, actual_out_len])
-            except RuntimeError:
-                traceback.print_exc()
-                pass
+            in_out_len = in_out.split("-")
+            in_len = int(in_out_len[0])
+            out_len = int(in_out_len[1])
+            # As different tokenizer has different encodings,
+            # in_len.txt maybe shorter than we need,
+            # use much longer context to make sure input length
+            test_length = min(in_len*2, 8192)
+            while test_length not in [32, 256, 1024, 2048, 8192]:
+                test_length = test_length * 2
+            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            # As different tokenizer has different encodings,
+            # slice the input_ids to ensure the prompt length is required length.
+            input_ids = tokenizer.encode(input_str, return_tensors="pt")
+            input_ids = input_ids[:, :in_len]
+            true_str = tokenizer.batch_decode(input_ids)[0]
+            input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+            actual_in_len = input_ids.shape[1]
+            result[in_out] = []
+            thread = threading.Thread(target=run_model_in_thread, args=(model, in_out, tokenizer, result, warm_up, num_beams, input_ids, out_len, actual_in_len, num_trials))
+            thread.start()
+            thread.join()
     del model
     torch.xpu.empty_cache()
     return result
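
For reference, the prompt preparation that stays in the hunk above (unchanged apart from indentation) reads as a small standalone routine: pick the shortest bundled prompt file that is long enough, truncate the token ids to the requested length, then decode and re-encode so the prompt the model receives has a known length (the caller then reads input_ids.shape[1] as the actual prompt length, as the diff does). A sketch under those assumptions, with a hypothetical prepare_input_ids name, a Hugging Face-style tokenizer, the benchmark's prompt/<N>.txt files, and the .to('xpu') transfer omitted:

def prepare_input_ids(tokenizer, in_len):
    # Choose a bundled prompt file long enough that truncating to in_len tokens is possible.
    test_length = min(in_len * 2, 8192)
    while test_length not in [32, 256, 1024, 2048, 8192]:
        test_length = test_length * 2
    with open(f"prompt/{test_length}.txt", "r") as f:
        input_str = f.read()
    # Encode, truncate to in_len tokens, then round-trip through decode/encode
    # so the measured prompt matches what the model will actually see.
    input_ids = tokenizer.encode(input_str, return_tensors="pt")[:, :in_len]
    true_str = tokenizer.batch_decode(input_ids)[0]
    return tokenizer.encode(true_str, return_tensors="pt")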

Changed file 2 of 2:

@@ -5,7 +5,7 @@ repo_id:
   - 'tiiuae/falcon-7b-instruct-with-patch'
   - 'mosaicml/mpt-7b-chat'
   - 'redpajama/gptneox-7b-redpajama-bf16'
-  # - 'bigcode/starcoder-15.5b'
+  - 'bigcode/starcoder-15.5b'
   - 'databricks/dolly-v1-6b'
   - 'databricks/dolly-v2-7b'
   - 'databricks/dolly-v2-12b'
@@ -30,4 +30,3 @@ cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu w
 exclude:
   - 'fnlp/moss-moon-003-sft:1024'
   - 'fnlp/moss-moon-003-sft:2048'
-  - 'bigscience/bloomz-7b1:2048'