Add 3 leaderboard tasks (#9566)
* update leaderboard map
* download model and dataset without overwriting
* fix task drop
* run on all available devices
parent 74fd7077a2
commit 4d7d5d4c59

3 changed files with 33 additions and 13 deletions
.github/workflows/llm-harness-evaluation.yml (14 changes)
@@ -30,13 +30,13 @@ jobs:
         # python-version: "3.9"
         # model_name: "stablelm-3b-4e1t"
         # task: "arc"
-        # precision: "sym_int4" #options: sym_int4, fp4, nf4, mixed_4bit, fp8
+        # precision: "sym_int4" #options: sym_int4, fp4, mixed_fp4, sym_int8, fp8, mixed_fp8
         python-version: ["3.9"]
         model_name: [stablelm-3b-4e1t]
-        task: ["truthfulqa"]
-        precision: [sym_int4] #options: sym_int4, fp4, nf4, mixed_4bit, fp8
+        task: [winogrande, drop, gsm8k] # truthfulqa, arc, hellaswag, mmlu, winogrande, drop, gsm8k
+        precision: [sym_int4] #options: sym_int4, fp4, mixed_fp4, sym_int8, fp8, mixed_fp8

-    runs-on: [self-hosted, llm, accuracy, temp-arc01]
+    runs-on: [self-hosted, llm, accuracy]
     env:
       ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
       ORIGIN_DIR: /mnt/disk1/models
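The matrix change above is what adds the three new leaderboard tasks, and dropping the temp-arc01 label from runs-on lets the job land on any available accuracy runner ("run on all available devices" in the commit message). A quick sketch of how the matrix fans out, assuming GitHub Actions' usual cross-product semantics (the variable names below are mine, not from the workflow):

import itertools

python_versions = ["3.9"]
model_names = ["stablelm-3b-4e1t"]
tasks = ["winogrande", "drop", "gsm8k"]
precisions = ["sym_int4"]

# One CI job is spawned per combination of matrix values.
for combo in itertools.product(python_versions, model_names, tasks, precisions):
    print(combo)  # 1 x 1 x 3 x 1 = 3 jobs, one per new task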
@@ -78,11 +78,9 @@
           MODEL_PATH=${ORIGIN_DIR}/${{ matrix.model_name }}/
           if [ ! -d $HARNESS_HF_HOME ]; then
             mkdir -p $HARNESS_HF_HOME
-            wget -r -nH -l inf --no-verbose --cut-dirs=2 ${LLM_FTP_URL}/llm/LeaderBoard_Datasets/ -P $HARNESS_HF_HOME/
-          fi
-          if [ ! -d $MODEL_PATH ]; then
-            wget -r -nH --no-verbose --cut-dirs=1 ${LLM_FTP_URL}/llm/${{ matrix.model_name }} -P ${ORIGIN_DIR}
           fi
+          wget -r -nH -nc -l inf --no-verbose --cut-dirs=2 ${LLM_FTP_URL}/llm/LeaderBoard_Datasets/ -P $HARNESS_HF_HOME/
+          wget -r -nH -nc --no-verbose --cut-dirs=1 ${LLM_FTP_URL}/llm/${{ matrix.model_name }} -P ${ORIGIN_DIR}

       - name: Upgrade packages
         shell: bash
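The rewritten download step drops the "if [ ! -d ... ]" guards and instead passes wget's -nc (no-clobber) flag, so the dataset and model fetches always run but skip any file that already exists rather than overwriting it ("download model and dataset without overwriting"). The same idea in Python, as a rough illustration only; the helper and its arguments are hypothetical, not part of this commit:

import os
import urllib.request

def fetch_once(url: str, dest: str) -> None:
    # Mirrors wget -nc: skip the download when the file is already present.
    if os.path.exists(dest):
        return
    os.makedirs(os.path.dirname(dest) or ".", exist_ok=True)
    urllib.request.urlretrieve(url, dest)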
@@ -78,7 +78,7 @@ class BigDLLM(BaseLM):
     @property
     def eot_token_id(self):
         # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
-        return self.model.token_eos()
+        return self.tokenizer.eos_token_id

     @property
     def max_length(self):
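model.token_eos() is a llama-cpp-python-style call; with a Hugging Face model the end-of-text id lives on the tokenizer, which is what the fix reads instead. A minimal check, assuming a standard transformers tokenizer (the gpt2 checkpoint is only an example):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
print(tokenizer.eos_token_id)  # 50256 for GPT-2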
@@ -103,7 +103,7 @@ class BigDLLM(BaseLM):
         return input_ids

     def tok_decode(self, tokens):
-        return self.tokenizer.decode(output[0], skip_special_tokens=True)
+        return self.tokenizer.decode(tokens, skip_special_tokens=True)

     def _model_call(self, inps):
         """
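The old tok_decode referenced an undefined name (output), so it raised a NameError in any task that decodes generations; the fix decodes the tokens argument it actually receives. A round-trip sketch under the same transformers-tokenizer assumption as above:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
ids = tokenizer.encode("The quick brown fox")
print(tokenizer.decode(ids, skip_special_tokens=True))  # "The quick brown fox"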
@@ -119,4 +119,10 @@ class BigDLLM(BaseLM):
         return res

     def _model_generate(self, context, max_length, eos_token_id):
-        return self.model(context, max_tokens=max_length, stop=["Q:", "\n"], echo=True)
+        generation_kwargs = {"do_sample": False, "max_length": max_length}
+        if eos_token_id is not None:
+            generation_kwargs["eos_token_id"] = eos_token_id
+            generation_kwargs[
+                "pad_token_id"
+            ] = eos_token_id  # setting eos_token_id as pad token
+        return self.model.generate(context, **generation_kwargs)
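_model_generate previously issued a llama-cpp-style completion call; the new body does greedy Hugging Face generation, stopping at max_length and reusing the EOS id as the pad id when one is supplied (decoder-only checkpoints often ship without a pad token). A self-contained sketch of the equivalent call, assuming a transformers causal LM (gpt2 is only a stand-in):

from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

context = tok("Q: What is 2 + 2?\nA:", return_tensors="pt").input_ids
out = model.generate(
    context,
    do_sample=False,                # greedy decoding, as in the diff
    max_length=32,
    eos_token_id=tok.eos_token_id,
    pad_token_id=tok.eos_token_id,  # EOS doubles as the pad token
)
print(tok.decode(out[0], skip_special_tokens=True))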
@@ -20,7 +20,10 @@ task_map = dict(
     hellaswag="hellaswag",
     arc="arc_challenge",
     truthfulqa="truthfulqa_mc",
-    mmlu="hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions"
+    mmlu="hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions",
+    winogrande='winogrande',
+    gsm8k='gsm8k',
+    drop='drop'
 )

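task_map translates a leaderboard task name into the lm-evaluation-harness task name(s) to run; multi-subject suites like mmlu expand into a comma-separated list. A sketch of how a caller might consume it (the variable names are mine):

harness_tasks = task_map["mmlu"].split(",")
print(len(harness_tasks))      # 57 hendrycksTest-* subtasks
print(task_map["winogrande"])  # single-task entries map straight through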
@@ -28,10 +31,23 @@ task_to_n_few_shots = dict(
     hellaswag=10,
     arc=25,
     truthfulqa=0,
-    mmlu=5
+    mmlu=5,
+    winogrande=5,
+    gsm8k=5,
+    drop=3
 )


+task_to_metric = dict(
+    hellaswag='acc_norm',
+    arc='acc_norm',
+    truthfulqa='mc2',
+    mmlu='acc',
+    winogrande='acc',
+    gsm8k='acc',
+    drop='f1'
+)
+
 def parse_precision(precision, model="bigdl-llm"):
     result = match(r"([a-zA-Z_]*)(\d+)", precision)
     datatype = result.group(1)
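The new task_to_metric table records which harness metric each task reports (f1 for drop, mc2 for truthfulqa, acc or acc_norm elsewhere), while parse_precision splits a precision string into a datatype prefix and a bit width via the regex shown above. For example:

from re import match

result = match(r"([a-zA-Z_]*)(\d+)", "sym_int4")
print(result.group(1))  # "sym_int" -> datatype
print(result.group(2))  # "4"      -> bit width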