LLM: add more models and skip runtime error (#9349)

* add more models and skip runtime error
* upgrade transformers
* temporarily removed Mistral-7B-v0.1
* temporarily disable the upload of arc perf result

parent fae6db3ddc
commit 84ab614aab

3 changed files with 43 additions and 34 deletions
.github/workflows/llm_performance_tests.yml

@@ -108,6 +108,7 @@ jobs:
           python -m pip install --upgrade einops
           python -m pip install --upgrade transformers_stream_generator
           python -m pip install --upgrade tiktoken
+          python -m pip install transformers==4.34.0

       - name: Download llm binary
         uses: ./.github/actions/llm/download-llm-binary

@@ -134,7 +135,6 @@ jobs:
           export http_proxy=${HTTP_PROXY}
           export https_proxy=${HTTPS_PROXY}
           python run.py
-          curl -T ./*.csv ${LLM_FTP_URL}/llm/ggml-actions/perf/
           cp ./*.csv /mnt/disk1/nightly_perf_gpu/
           cd ../../../test/benchmark
           python csv_to_html.py -f /mnt/disk1/nightly_perf_gpu/

@@ -59,7 +59,7 @@ def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1,
         result = run_deepspeed_transformer_int4_cpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit)

     for in_out_pair in in_out_pairs:
-        if result:
+        if result and result[in_out_pair]:
             results.append([repo_id,
                             round(np.mean(result[in_out_pair], axis=0)[0]*1000.0, 2),
                             round(np.mean(result[in_out_pair], axis=0)[1]*1000.0, 2),
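Note on the guard above: after this commit a model that hits a RuntimeError in run_transformer_int4_gpu (next hunk) leaves its result[in_out_pair] list empty, and summarising an empty list with np.mean warns and then fails when the result is indexed. The extra check simply skips pairs that produced no measurements. A minimal sketch of the failure mode and the guard, using made-up timing values:

    import numpy as np

    # Made-up timing rows in the shape the benchmark collects:
    # [first_token_cost, rest_token_cost_mean, encoder_time, actual_in_len, actual_out_len]
    result = {
        "32-32": [[0.210, 0.035, 0.0, 32, 32],
                  [0.205, 0.034, 0.0, 32, 32]],
        "1024-128": [],   # this pair hit a RuntimeError and was skipped
    }

    rows = []
    for in_out_pair in result:
        if result and result[in_out_pair]:   # the new guard: only summarise pairs that have data
            mean = np.mean(result[in_out_pair], axis=0)
            rows.append(["some/model", round(float(mean[0]) * 1000.0, 2), round(float(mean[1]) * 1000.0, 2)])

    print(rows)   # [['some/model', 207.5, 34.5]] -- the empty "1024-128" pair is dropped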

@@ -357,38 +357,41 @@ def run_transformer_int4_gpu(repo_id,
     result = {}
     with torch.inference_mode():
         for in_out in in_out_pairs:
-            in_out_len = in_out.split("-")
-            in_len = int(in_out_len[0])
-            out_len = int(in_out_len[1])
-            # As different tokenizer has different encodings,
-            # in_len.txt maybe shorter than we need,
-            # use much longer context to make sure input length
-            test_length = min(in_len*2, 8192)
-            while test_length not in [32, 256, 1024, 2048, 8192]:
-                test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
-            # As different tokenizer has different encodings,
-            # slice the input_ids to ensure the prompt length is required length.
-            input_ids = tokenizer.encode(input_str, return_tensors="pt")
-            input_ids = input_ids[:, :in_len]
-            true_str = tokenizer.batch_decode(input_ids)[0]
-            input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
-            actual_in_len = input_ids.shape[1]
-            result[in_out] = []
-            for i in range(num_trials + warm_up):
-                st = time.perf_counter()
-                output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
-                                            num_beams=num_beams)
-                torch.xpu.synchronize()
-                end = time.perf_counter()
-                output_ids = output_ids.cpu()
-                print("model generate cost: " + str(end - st))
-                output = tokenizer.batch_decode(output_ids)
-                print(output[0])
-                actual_out_len = output_ids.shape[1] - actual_in_len
-                if i >= warm_up:
-                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
-                                           actual_in_len, actual_out_len])
+            try:
+                in_out_len = in_out.split("-")
+                in_len = int(in_out_len[0])
+                out_len = int(in_out_len[1])
+                # As different tokenizer has different encodings,
+                # in_len.txt maybe shorter than we need,
+                # use much longer context to make sure input length
+                test_length = min(in_len*2, 8192)
+                while test_length not in [32, 256, 1024, 2048, 8192]:
+                    test_length = test_length * 2
+                input_str = open(f"prompt/{test_length}.txt", 'r').read()
+                # As different tokenizer has different encodings,
+                # slice the input_ids to ensure the prompt length is required length.
+                input_ids = tokenizer.encode(input_str, return_tensors="pt")
+                input_ids = input_ids[:, :in_len]
+                true_str = tokenizer.batch_decode(input_ids)[0]
+                input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+                actual_in_len = input_ids.shape[1]
+                result[in_out] = []
+                for i in range(num_trials + warm_up):
+                    st = time.perf_counter()
+                    output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
+                                                num_beams=num_beams)
+                    torch.xpu.synchronize()
+                    end = time.perf_counter()
+                    output_ids = output_ids.cpu()
+                    print("model generate cost: " + str(end - st))
+                    output = tokenizer.batch_decode(output_ids)
+                    print(output[0])
+                    actual_out_len = output_ids.shape[1] - actual_in_len
+                    if i >= warm_up:
+                        result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
+                                               actual_in_len, actual_out_len])
+            except RuntimeError:
+                pass

     torch.xpu.empty_cache()
     return result
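The comment block in this hunk ("different tokenizer has different encodings ...") describes the prompt-length normalisation the benchmark relies on: read a prompt file that is longer than needed, truncate at the token level to in_len, decode back to text, and re-encode that text as the actual input; the re-encoded length is recorded as actual_in_len because the round trip is not always exact. A minimal standalone sketch of the idea, using an arbitrary public tokenizer rather than one of the models in this test (the checkpoint name is only an example):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")   # illustrative checkpoint only
    in_len = 32

    # Over-long source text, standing in for prompt/{test_length}.txt.
    long_text = "The quick brown fox jumps over the lazy dog. " * 50

    # Truncate at the token level, then decode so the prompt becomes a real string of ~in_len tokens.
    input_ids = tokenizer.encode(long_text, return_tensors="pt")[:, :in_len]
    true_str = tokenizer.batch_decode(input_ids)[0]

    # Re-encode the decoded text; this is what would be fed to model.generate().
    input_ids = tokenizer.encode(true_str, return_tensors="pt")
    actual_in_len = input_ids.shape[1]
    print(actual_in_len)   # close to in_len, but not guaranteed to be exactly 32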

@@ -4,13 +4,19 @@ repo_id:
   - 'THUDM/chatglm2-6b'
   - 'tiiuae/falcon-7b-instruct-with-patch'
   - 'mosaicml/mpt-7b-chat'
+  # - 'bigscience/bloomz-7b1' # temporarily removed
   - 'redpajama/gptneox-7b-redpajama-bf16'
+  - 'bigcode/starcoder-15.5b'
   - 'databricks/dolly-v1-6b'
   - 'databricks/dolly-v2-7b'
   - 'databricks/dolly-v2-12b'
   - 'internlm/internlm-chat-7b-8k'
+  - 'baichuan-inc/Baichuan-13B-Chat'
+  - 'fnlp/moss-moon-003-sft'
   - 'Qwen/Qwen-7B-Chat-10-12'
   - 'BAAI/AquilaChat-7B'
+  - 'baichuan-inc/Baichuan2-7B-Chat'
+  # - 'mistralai/Mistral-7B-v0.1' # temporarily removed
 local_model_hub: '/mnt/disk1/models'
 warm_up: 1
 num_trials: 3
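The hunk above grows the repo_id list of the performance-test config; commenting an entry out (as done for bloomz-7b1 and Mistral-7B-v0.1) removes it from the parsed list, since YAML comments are dropped by the parser. A minimal sketch of how such a config is typically read, assuming PyYAML and a hypothetical config.yaml path (the real file name is not shown in this view):

    import yaml

    # Hypothetical path; substitute the actual config file used by the benchmark.
    with open("config.yaml") as f:
        conf = yaml.safe_load(f)

    # Commented-out repo ids never appear here, so they are effectively disabled.
    for repo_id in conf["repo_id"]:
        print("will benchmark:", repo_id)

    print("model hub:", conf["local_model_hub"])
    print("warm_up:", conf["warm_up"], "num_trials:", conf["num_trials"])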