From 4d7d5d4c598c36ac9c6a7307109a3b65c47cb93a Mon Sep 17 00:00:00 2001
From: "Chen, Zhentao"
Date: Fri, 1 Dec 2023 14:01:14 +0800
Subject: [PATCH] Add 3 leaderboard tasks (#9566)

* update leaderboard map

* download model and dataset without overwriting

* fix the drop task

* run on all available devices
---
 .github/workflows/llm-harness-evaluation.yml  | 14 ++++++-------
 python/llm/dev/benchmark/harness/bigdl_llm.py | 12 ++++++++---
 .../harness/harness_to_leaderboard.py         | 20 +++++++++++++++++--
 3 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml
index 24adf36a..2fcc0e73 100644
--- a/.github/workflows/llm-harness-evaluation.yml
+++ b/.github/workflows/llm-harness-evaluation.yml
@@ -30,13 +30,13 @@ jobs:
         #   python-version: "3.9"
         #   model_name: "stablelm-3b-4e1t"
         #   task: "arc"
-        #   precision: "sym_int4" #options: sym_int4, fp4, nf4, mixed_4bit, fp8
+        #   precision: "sym_int4" #options: sym_int4, fp4, mixed_fp4, sym_int8, fp8, mixed_fp8
         python-version: ["3.9"]
         model_name: [stablelm-3b-4e1t]
-        task: ["truthfulqa"]
-        precision: [sym_int4] #options: sym_int4, fp4, nf4, mixed_4bit, fp8
+        task: [winogrande, drop, gsm8k] # truthfulqa, arc, hellaswag, mmlu, winogrande, drop, gsm8k
+        precision: [sym_int4] #options: sym_int4, fp4, mixed_fp4, sym_int8, fp8, mixed_fp8
 
-    runs-on: [self-hosted, llm, accuracy, temp-arc01]
+    runs-on: [self-hosted, llm, accuracy]
     env:
       ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
       ORIGIN_DIR: /mnt/disk1/models
@@ -78,11 +78,9 @@ jobs:
           MODEL_PATH=${ORIGIN_DIR}/${{ matrix.model_name }}/
           if [ ! -d $HARNESS_HF_HOME ]; then
             mkdir -p $HARNESS_HF_HOME
-            wget -r -nH -l inf --no-verbose --cut-dirs=2 ${LLM_FTP_URL}/llm/LeaderBoard_Datasets/ -P $HARNESS_HF_HOME/
-          fi
-          if [ ! -d $MODEL_PATH ]; then
-            wget -r -nH --no-verbose --cut-dirs=1 ${LLM_FTP_URL}/llm/${{ matrix.model_name }} -P ${ORIGIN_DIR}
           fi
+          wget -r -nH -nc -l inf --no-verbose --cut-dirs=2 ${LLM_FTP_URL}/llm/LeaderBoard_Datasets/ -P $HARNESS_HF_HOME/
+          wget -r -nH -nc --no-verbose --cut-dirs=1 ${LLM_FTP_URL}/llm/${{ matrix.model_name }} -P ${ORIGIN_DIR}
 
       - name: Upgrade packages
         shell: bash
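Note on the download step above: dropping the `if [ ! -d $MODEL_PATH ]` guard in favor of wget's `-nc` (no-clobber) flag makes the fetch idempotent per file rather than per directory, so a partially populated directory no longer causes the remaining files to be skipped. A minimal Python sketch of the same fetch-if-missing idea (`fetch_if_missing` is a hypothetical helper for illustration; the workflow itself shells out to wget):

    import os
    import urllib.request

    def fetch_if_missing(url: str, dest: str) -> str:
        # Mirror wget -nc: leave an existing file untouched, download otherwise.
        if not os.path.exists(dest):
            os.makedirs(os.path.dirname(dest) or ".", exist_ok=True)
            urllib.request.urlretrieve(url, dest)
        return dest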
diff --git a/python/llm/dev/benchmark/harness/bigdl_llm.py b/python/llm/dev/benchmark/harness/bigdl_llm.py
index 39dd0058..c9a27b79 100644
--- a/python/llm/dev/benchmark/harness/bigdl_llm.py
+++ b/python/llm/dev/benchmark/harness/bigdl_llm.py
@@ -78,7 +78,7 @@ class BigDLLM(BaseLM):
     @property
     def eot_token_id(self):
         # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
-        return self.model.token_eos()
+        return self.tokenizer.eos_token_id
 
     @property
     def max_length(self):
@@ -103,7 +103,7 @@ class BigDLLM(BaseLM):
         return input_ids
 
     def tok_decode(self, tokens):
-        return self.tokenizer.decode(output[0], skip_special_tokens=True)
+        return self.tokenizer.decode(tokens, skip_special_tokens=True)
 
     def _model_call(self, inps):
         """
@@ -119,4 +119,10 @@ class BigDLLM(BaseLM):
         return res
 
     def _model_generate(self, context, max_length, eos_token_id):
-        return self.model(context, max_tokens=max_length, stop=["Q:", "\n"], echo=True)
+        generation_kwargs = {"do_sample": False, "max_length": max_length}
+        if eos_token_id is not None:
+            generation_kwargs["eos_token_id"] = eos_token_id
+            generation_kwargs[
+                "pad_token_id"
+            ] = eos_token_id  # setting eos_token_id as pad token
+        return self.model.generate(context, **generation_kwargs)
\ No newline at end of file
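Note on `bigdl_llm.py`: the two tokenizer fixes (`tokenizer.eos_token_id` instead of the llama.cpp-style `model.token_eos()`, and decoding the `tokens` argument instead of the undefined `output[0]`) align the wrapper with the Hugging Face tokenizer API, and `_model_generate` now calls `model.generate` with greedy decoding, reusing the EOS id as the pad id (which is also what transformers falls back to, with a warning, when no pad token is configured). A self-contained sketch of the same generation kwargs against a stock Hugging Face model, with `gpt2` standing in for the BigDL-wrapped model:

    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    # generate() consumes the input-id tensor directly.
    context = tokenizer("Q: What is the capital of France? A:", return_tensors="pt").input_ids

    generation_kwargs = {"do_sample": False, "max_length": 32}
    if tokenizer.eos_token_id is not None:
        generation_kwargs["eos_token_id"] = tokenizer.eos_token_id
        generation_kwargs["pad_token_id"] = tokenizer.eos_token_id  # eos doubles as pad

    output = model.generate(context, **generation_kwargs)
    print(tokenizer.decode(output[0], skip_special_tokens=True))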
diff --git a/python/llm/dev/benchmark/harness/harness_to_leaderboard.py b/python/llm/dev/benchmark/harness/harness_to_leaderboard.py
index ce8b2620..428f4822 100644
--- a/python/llm/dev/benchmark/harness/harness_to_leaderboard.py
+++ b/python/llm/dev/benchmark/harness/harness_to_leaderboard.py
@@ -20,7 +20,10 @@ task_map = dict(
     hellaswag="hellaswag",
     arc="arc_challenge",
     truthfulqa="truthfulqa_mc",
-    mmlu="hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions"
+    mmlu="hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions",
+    winogrande='winogrande',
+    gsm8k='gsm8k',
+    drop='drop'
 )
 
 
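Note on `task_map`: the values are lm-eval-harness task names, and `mmlu` maps to a comma-joined string of all 57 `hendrycksTest-*` subtasks so that a single leaderboard name can fan out to the full suite. A sketch of how such an entry might be consumed, assuming the harness driver splits the value on commas (`expand_task` is a hypothetical helper, not code from this patch):

    def expand_task(leaderboard_task, task_map):
        # "winogrande" -> ["winogrande"]; "mmlu" -> 57 hendrycksTest-* names
        return task_map[leaderboard_task].split(",")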
@@ -28,10 +31,23 @@ task_to_n_few_shots = dict(
     hellaswag=10,
     arc=25,
     truthfulqa=0,
-    mmlu=5
+    mmlu=5,
+    winogrande=5,
+    gsm8k=5,
+    drop=3
 )
 
 
+task_to_metric = dict(
+    hellaswag='acc_norm',
+    arc='acc_norm',
+    truthfulqa='mc2',
+    mmlu='acc',
+    winogrande='acc',
+    gsm8k='acc',
+    drop='f1'
+)
+
 def parse_precision(precision, model="bigdl-llm"):
     result = match(r"([a-zA-Z_]*)(\d+)", precision)
     datatype = result.group(1)
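Note on `parse_precision`: the function relies on `re.match` to split a precision label into an alphabetic datatype prefix and a trailing bit width. A short usage example with the regex shown above:

    from re import match

    result = match(r"([a-zA-Z_]*)(\d+)", "sym_int4")
    print(result.group(1), result.group(2))  # -> sym_int 4
    # "fp8" splits into "fp" and "8" the same way.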