diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml
index 0492a637..036ce2e6 100644
--- a/.github/workflows/llm_performance_tests.yml
+++ b/.github/workflows/llm_performance_tests.yml
@@ -108,6 +108,7 @@ jobs:
           python -m pip install --upgrade einops
           python -m pip install --upgrade transformers_stream_generator
           python -m pip install --upgrade tiktoken
+          python -m pip install transformers==4.34.0
 
       - name: Download llm binary
         uses: ./.github/actions/llm/download-llm-binary
@@ -134,7 +135,6 @@ jobs:
           export http_proxy=${HTTP_PROXY}
           export https_proxy=${HTTPS_PROXY}
           python run.py
-          curl -T ./*.csv ${LLM_FTP_URL}/llm/ggml-actions/perf/
           cp ./*.csv /mnt/disk1/nightly_perf_gpu/
           cd ../../../test/benchmark
           python csv_to_html.py -f /mnt/disk1/nightly_perf_gpu/
diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index f7fe119d..09954e76 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -59,7 +59,7 @@ def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1,
         result = run_deepspeed_transformer_int4_cpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit)
 
     for in_out_pair in in_out_pairs:
-        if result:
+        if result and result[in_out_pair]:
            results.append([repo_id,
                            round(np.mean(result[in_out_pair], axis=0)[0]*1000.0, 2),
                            round(np.mean(result[in_out_pair], axis=0)[1]*1000.0, 2),
@@ -357,38 +357,41 @@ def run_transformer_int4_gpu(repo_id,
     result = {}
     with torch.inference_mode():
         for in_out in in_out_pairs:
-            in_out_len = in_out.split("-")
-            in_len = int(in_out_len[0])
-            out_len = int(in_out_len[1])
-            # As different tokenizer has different encodings,
-            # in_len.txt maybe shorter than we need,
-            # use much longer context to make sure input length
-            test_length = min(in_len*2, 8192)
-            while test_length not in [32, 256, 1024, 2048, 8192]:
-                test_length = test_length * 2
-            input_str = open(f"prompt/{test_length}.txt", 'r').read()
-            # As different tokenizer has different encodings,
-            # slice the input_ids to ensure the prompt length is required length.
-            input_ids = tokenizer.encode(input_str, return_tensors="pt")
-            input_ids = input_ids[:, :in_len]
-            true_str = tokenizer.batch_decode(input_ids)[0]
-            input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
-            actual_in_len = input_ids.shape[1]
-            result[in_out] = []
-            for i in range(num_trials + warm_up):
-                st = time.perf_counter()
-                output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
-                                            num_beams=num_beams)
-                torch.xpu.synchronize()
-                end = time.perf_counter()
-                output_ids = output_ids.cpu()
-                print("model generate cost: " + str(end - st))
-                output = tokenizer.batch_decode(output_ids)
-                print(output[0])
-                actual_out_len = output_ids.shape[1] - actual_in_len
-                if i >= warm_up:
-                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
-                                           actual_in_len, actual_out_len])
+            try:
+                in_out_len = in_out.split("-")
+                in_len = int(in_out_len[0])
+                out_len = int(in_out_len[1])
+                # As different tokenizer has different encodings,
+                # in_len.txt maybe shorter than we need,
+                # use much longer context to make sure input length
+                test_length = min(in_len*2, 8192)
+                while test_length not in [32, 256, 1024, 2048, 8192]:
+                    test_length = test_length * 2
+                input_str = open(f"prompt/{test_length}.txt", 'r').read()
+                # As different tokenizer has different encodings,
+                # slice the input_ids to ensure the prompt length is required length.
+                input_ids = tokenizer.encode(input_str, return_tensors="pt")
+                input_ids = input_ids[:, :in_len]
+                true_str = tokenizer.batch_decode(input_ids)[0]
+                input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+                actual_in_len = input_ids.shape[1]
+                result[in_out] = []
+                for i in range(num_trials + warm_up):
+                    st = time.perf_counter()
+                    output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
+                                                num_beams=num_beams)
+                    torch.xpu.synchronize()
+                    end = time.perf_counter()
+                    output_ids = output_ids.cpu()
+                    print("model generate cost: " + str(end - st))
+                    output = tokenizer.batch_decode(output_ids)
+                    print(output[0])
+                    actual_out_len = output_ids.shape[1] - actual_in_len
+                    if i >= warm_up:
+                        result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
+                                               actual_in_len, actual_out_len])
+            except RuntimeError:
+                pass
     torch.xpu.empty_cache()
     return result
 
diff --git a/python/llm/test/benchmark/arc-perf-test.yaml b/python/llm/test/benchmark/arc-perf-test.yaml
index 2b6bbffb..031db953 100644
--- a/python/llm/test/benchmark/arc-perf-test.yaml
+++ b/python/llm/test/benchmark/arc-perf-test.yaml
@@ -4,13 +4,19 @@ repo_id:
   - 'THUDM/chatglm2-6b'
   - 'tiiuae/falcon-7b-instruct-with-patch'
   - 'mosaicml/mpt-7b-chat'
+  # - 'bigscience/bloomz-7b1' # temporarily removed
   - 'redpajama/gptneox-7b-redpajama-bf16'
+  - 'bigcode/starcoder-15.5b'
   - 'databricks/dolly-v1-6b'
   - 'databricks/dolly-v2-7b'
   - 'databricks/dolly-v2-12b'
   - 'internlm/internlm-chat-7b-8k'
+  - 'baichuan-inc/Baichuan-13B-Chat'
+  - 'fnlp/moss-moon-003-sft'
   - 'Qwen/Qwen-7B-Chat-10-12'
   - 'BAAI/AquilaChat-7B'
+  - 'baichuan-inc/Baichuan2-7B-Chat'
+  # - 'mistralai/Mistral-7B-v0.1' # temporarily removed
 local_model_hub: '/mnt/disk1/models'
 warm_up: 1
 num_trials: 3