From d19ca21957ea09164248f1ded0acf728f178be07 Mon Sep 17 00:00:00 2001 From: "Chen, Zhentao" Date: Tue, 14 Nov 2023 12:51:39 +0800 Subject: [PATCH] patch bigdl-llm model to harness by binding instead of patch file (#9420) * add run_llb.py * fix args interpret * modify outputs * update workflow * add license * test mixed 4 bit * update readme * use autotokenizer * add timeout * refactor workflow file * fix working directory * fix env * throw exception if some jobs failed * improve terminal outputs * Disable var which cause the run stuck * fix unknown precision * fix key error * directly output config instead * rm harness submodule --- .github/workflows/llm-harness-evaluation.yml | 42 ++--- python/llm/dev/benchmark/harness/README.md | 8 +- .../llm/dev/benchmark/harness/bigdl-llm.patch | 151 ------------------ python/llm/dev/benchmark/harness/bigdl_llm.py | 121 ++++++++++++++ .../harness/harness_to_leaderboard.py | 51 ++++++ python/llm/dev/benchmark/harness/llb.py | 82 ---------- python/llm/dev/benchmark/harness/run_llb.py | 149 +++++++++++++++++ 7 files changed, 347 insertions(+), 257 deletions(-) delete mode 100644 python/llm/dev/benchmark/harness/bigdl-llm.patch create mode 100644 python/llm/dev/benchmark/harness/bigdl_llm.py create mode 100644 python/llm/dev/benchmark/harness/harness_to_leaderboard.py delete mode 100644 python/llm/dev/benchmark/harness/llb.py create mode 100644 python/llm/dev/benchmark/harness/run_llb.py diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml index fe0e0f4f..1bed8aa7 100644 --- a/.github/workflows/llm-harness-evaluation.yml +++ b/.github/workflows/llm-harness-evaluation.yml @@ -20,25 +20,28 @@ on: jobs: llm-cpp-build: uses: ./.github/workflows/llm-binary-build.yml - llm-nightly-harness-test: + llm-harness-evalution: + timeout-minutes: 1000 needs: llm-cpp-build strategy: fail-fast: false matrix: + # include: + # python-version: "3.9" + # model_name: "stablelm-3b-4e1t" + # task: "arc" + # precision: "sym_int4" #options: sym_int4, fp4, nf4, mixed_4bit, fp8 python-version: ["3.9"] model_name: [stablelm-3b-4e1t] task: ["truthfulqa"] - precision: ["int4"] + precision: [sym_int4] #options: sym_int4, fp4, nf4, mixed_4bit, fp8 + runs-on: [self-hosted, llm, accuracy, temp-arc01] env: ANALYTICS_ZOO_ROOT: ${{ github.workspace }} + ORIGIN_DIR: /mnt/disk1/models + HARNESS_HF_HOME: /mnt/disk1/harness_home steps: - - name: Set model and dataset directories - shell: bash - run: | - echo "ORIGIN_DIR=/mnt/disk1/models" >> "$GITHUB_ENV" - echo "HARNESS_HF_HOME=/mnt/disk1/harness_home" >> "$GITHUB_ENV" - - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 @@ -60,16 +63,13 @@ jobs: extra-dependency: "xpu" - name: Install harness + working-directory: ${{ github.workspace }}/python/llm/dev/benchmark/harness/ shell: bash run: | - cd python/llm/dev/benchmark/harness/ git clone https://github.com/EleutherAI/lm-evaluation-harness.git cd lm-evaluation-harness git checkout e81d3cc pip install -e . - git apply ../bigdl-llm.patch - cd .. 
-
       - name: Download models and datasets
         shell: bash
         run: |
@@ -84,17 +84,21 @@
           wget -r -nH --no-verbose --cut-dirs=1 ${LLM_FTP_URL}/llm/${{ matrix.model_name }} -P ${ORIGIN_DIR}
         fi
 
-      - name: Set datasets env
+      - name: Upgrade packages
         shell: bash
         run: |
-          echo "HF_HOME=$HARNESS_HF_HOME" >> "$GITHUB_ENV"
-          echo "HF_DATASETS=$HARNESS_HF_HOME/datasets" >> "$GITHUB_ENV"
-          echo "HF_DATASETS_CACHE=$HARNESS_HF_HOME/datasets" >> "$GITHUB_ENV"
+          pip install --upgrade transformers
 
       - name: Run harness
         shell: bash
+        working-directory: ${{ github.workspace }}/python/llm/dev/benchmark/harness
+        env:
+          USE_XETLA: OFF
+          # SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS: 1
        run: |
-          export USE_XETLA=OFF
+          export HF_HOME=${HARNESS_HF_HOME}
+          export HF_DATASETS=$HARNESS_HF_HOME/datasets
+          export HF_DATASETS_CACHE=$HARNESS_HF_HOME/datasets
           source /opt/intel/oneapi/setvars.sh
-          cd python/llm/dev/benchmark/harness
-          python llb.py --model bigdl-llm --pretrained ${MODEL_PATH} --precision ${{ matrix.precision }} --device xpu --tasks ${{ matrix.task }} --output_dir results/${{ matrix.model_name }} --batch 1
+          python run_llb.py --model bigdl-llm --pretrained ${MODEL_PATH} --precision ${{ matrix.precision }} --device xpu --tasks ${{ matrix.task }} --batch_size 1 --no_cache
+
diff --git a/python/llm/dev/benchmark/harness/README.md b/python/llm/dev/benchmark/harness/README.md
index 8187cd5a..22359e08 100644
--- a/python/llm/dev/benchmark/harness/README.md
+++ b/python/llm/dev/benchmark/harness/README.md
@@ -9,20 +9,18 @@ git clone https://github.com/EleutherAI/lm-evaluation-harness.git
 cd lm-evaluation-harness
 git checkout e81d3cc
 pip install -e .
-git apply ../bigdl-llm.patch
-cd ..
 ```
 
 ## Run
-run `python llb.py`. `llb.py` combines some arguments in `main.py` to make evalutions easier. The mapping of arguments is defined as a dict in [`llb.py`](llb.py).
+run `python run_llb.py`. `run_llb.py` combines some arguments in `main.py` to make evaluations easier. The mapping of arguments is defined as a dict in [`harness_to_leaderboard.py`](harness_to_leaderboard.py).
 ### Evaluation on CPU
 ```python
-python llb.py --model bigdl-llm --pretrained /path/to/model --precision nf3 int4 nf4 --device cpu --tasks hellaswag arc mmlu truthfulqa --output_dir results/output
+python run_llb.py --model bigdl-llm --pretrained /path/to/model --precision nf3 sym_int4 nf4 --device cpu --tasks hellaswag arc mmlu truthfulqa --batch_size 1 --no_cache
 ```
 ### Evaluation on Intel GPU
 ```python
-python llb.py --model bigdl-llm --pretrained /path/to/model --precision nf3 int4 nf4 --device xpu --tasks hellaswag arc mmlu truthfulqa --output_dir results/output
+python run_llb.py --model bigdl-llm --pretrained /path/to/model --precision nf3 sym_int4 nf4 --device xpu --tasks hellaswag arc mmlu truthfulqa --batch_size 1 --no_cache
 ```
 ## Results
 We follow [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) to record our metrics, `acc_norm` for `hellaswag` and `arc_challenge`, `mc2` for `truthful_qa` and `acc` for `mmlu`. For `mmlu`, there are 57 subtasks which means users may need to average them manually to get final result.
diff --git a/python/llm/dev/benchmark/harness/bigdl-llm.patch b/python/llm/dev/benchmark/harness/bigdl-llm.patch
deleted file mode 100644
index b84ea631..00000000
--- a/python/llm/dev/benchmark/harness/bigdl-llm.patch
+++ /dev/null
@@ -1,151 +0,0 @@
-diff --git a/lm_eval/models/__init__.py b/lm_eval/models/__init__.py
-index 8ca27fac..6b581487 100644
---- a/lm_eval/models/__init__.py
-+++ b/lm_eval/models/__init__.py
-@@ -4,6 +4,7 @@ from . 
import anthropic_llms - from . import huggingface - from . import textsynth - from . import dummy -+from . import bigdl_llm - - MODEL_REGISTRY = { - "hf": gpt2.HFLM, -@@ -15,6 +16,7 @@ MODEL_REGISTRY = { - "anthropic": anthropic_llms.AnthropicLM, - "textsynth": textsynth.TextSynthLM, - "dummy": dummy.DummyLM, -+ "bigdl-llm": bigdl_llm.BigDLLM - } - - -diff --git a/lm_eval/models/bigdl_llm.py b/lm_eval/models/bigdl_llm.py -new file mode 100644 -index 00000000..74010da3 ---- /dev/null -+++ b/lm_eval/models/bigdl_llm.py -@@ -0,0 +1,124 @@ -+# -+# Copyright 2016 The BigDL Authors. -+# -+# Licensed under the Apache License, Version 2.0 (the "License"); -+# you may not use this file except in compliance with the License. -+# You may obtain a copy of the License at -+# -+# http://www.apache.org/licenses/LICENSE-2.0 -+# -+# Unless required by applicable law or agreed to in writing, software -+# distributed under the License is distributed on an "AS IS" BASIS, -+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+# See the License for the specific language governing permissions and -+# limitations under the License. -+# -+ -+ -+# this code is copied from llama2 example test, and added performance test -+import os -+import multiprocessing -+ -+from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM -+ -+import torch -+from typing import Optional, Union -+from lm_eval.base import BaseLM -+ -+from transformers import AutoTokenizer, LlamaTokenizer -+ -+def _get_dtype( -+ dtype: Union[str, torch.dtype] -+) -> torch.dtype: -+ """Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig""" -+ if isinstance(dtype, str) and dtype != "auto": -+ # Convert `str` args torch dtype: `float16` -> `torch.float16` -+ _torch_dtype = getattr(torch, dtype) -+ else: -+ _torch_dtype = dtype -+ return _torch_dtype -+ -+class BigDLLM(BaseLM): -+ def __init__( -+ self, -+ device="xpu", -+ pretrained="gpt2", -+ revision="main", -+ low_cpu_mem_usage=None, -+ subfolder=None, -+ tokenizer=None, -+ batch_size=1, -+ load_in_8bit: Optional[bool] = False, -+ trust_remote_code: Optional[bool] = False, -+ load_in_low_bit=None, -+ dtype: Optional[Union[str, torch.dtype]] = "auto", -+ ): -+ super().__init__() -+ -+ assert isinstance(pretrained, str) -+ assert isinstance(batch_size, (int,str)) -+ if 'xpu' in device: -+ import intel_extension_for_pytorch as ipex -+ model = AutoModelForCausalLM.from_pretrained(pretrained, -+ load_in_low_bit=load_in_low_bit, -+ optimize_model=True, -+ trust_remote_code=True, -+ use_cache=True, -+ torch_dtype=_get_dtype(dtype)) -+ print(model) # print model to check precision -+ self._device = device -+ self.model = model.to(device) -+ -+ self.tokenizer = AutoTokenizer.from_pretrained(pretrained, trust_remote_code=True) -+ -+ # setup for automatic batch size detection -+ if batch_size == 'auto': -+ self.batch_size_per_gpu = batch_size -+ else: -+ self.batch_size_per_gpu = int(batch_size) -+ -+ @property -+ def eot_token_id(self): -+ # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* -+ return self.model.token_eos() -+ -+ @property -+ def max_length(self): -+ return 2048 # TODO: how to get this from config -+ -+ @property -+ def max_gen_toks(self): -+ return 256 -+ -+ @property -+ def batch_size(self): -+ # TODO: fix multi-gpu -+ return self.batch_size_per_gpu # * gpus -+ -+ @property -+ def device(self): -+ # TODO: fix multi-gpu -+ return torch.device(self._device) -+ -+ def 
tok_encode(self, string: str): -+ input_ids = self.tokenizer.encode(string) -+ return input_ids -+ -+ def tok_decode(self, tokens): -+ return self.tokenizer.decode(output[0], skip_special_tokens=True) -+ -+ def _model_call(self, inps): -+ """ -+ inps: a torch tensor of shape [batch, sequence] -+ the size of sequence may vary from call to call -+ -+ returns: a torch tensor of shape [batch, sequence, vocab] with the -+ logits returned from the model -+ """ -+ with torch.inference_mode(): -+ inps = inps.to(self.device) -+ res = self.model(inps)[0] -+ return res -+ -+ def _model_generate(self, context, max_length, eos_token_id): -+ return self.model(context, max_tokens=max_length, stop=["Q:", "\n"], echo=True) -\ No newline at end of file diff --git a/python/llm/dev/benchmark/harness/bigdl_llm.py b/python/llm/dev/benchmark/harness/bigdl_llm.py new file mode 100644 index 00000000..6e0bbfa6 --- /dev/null +++ b/python/llm/dev/benchmark/harness/bigdl_llm.py @@ -0,0 +1,121 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import os +import multiprocessing + +from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + +import torch +from typing import Optional, Union +from lm_eval.base import BaseLM + +from transformers import AutoTokenizer, LlamaTokenizer + +def _get_dtype( + dtype: Union[str, torch.dtype] +) -> torch.dtype: + """Converts `dtype` from `str` to torch.dtype when possible. 
Does not use an instantiated HF AutoConfig"""
+    if isinstance(dtype, str) and dtype != "auto":
+        # Convert `str` args torch dtype: `float16` -> `torch.float16`
+        _torch_dtype = getattr(torch, dtype)
+    else:
+        _torch_dtype = dtype
+    return _torch_dtype
+
+class BigDLLM(BaseLM):
+    def __init__(
+        self,
+        device="xpu",
+        pretrained="gpt2",
+        revision="main",
+        low_cpu_mem_usage=None,
+        subfolder=None,
+        tokenizer=None,
+        batch_size=1,
+        load_in_8bit: Optional[bool] = False,
+        trust_remote_code: Optional[bool] = False,
+        load_in_low_bit=None,
+        dtype: Optional[Union[str, torch.dtype]] = "auto",
+    ):
+        super().__init__()
+
+        assert isinstance(pretrained, str)
+        assert isinstance(batch_size, (int, str))
+        if device == 'xpu':
+            import intel_extension_for_pytorch as ipex
+        model = AutoModelForCausalLM.from_pretrained(pretrained,
+                                                     load_in_low_bit=load_in_low_bit,
+                                                     optimize_model=True,
+                                                     trust_remote_code=True,
+                                                     use_cache=True,
+                                                     torch_dtype=_get_dtype(dtype))
+        print(model) # print model to check precision
+        self._device = device
+        self.model = model.to(device)
+
+        self.tokenizer = AutoTokenizer.from_pretrained(pretrained, trust_remote_code=True)
+
+        # setup for automatic batch size detection
+        if batch_size == 'auto':
+            self.batch_size_per_gpu = batch_size
+        else:
+            self.batch_size_per_gpu = int(batch_size)
+
+    @property
+    def eot_token_id(self):
+        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
+        return self.tokenizer.eos_token_id
+
+    @property
+    def max_length(self):
+        return 2048 # TODO: how to get this from config
+
+    @property
+    def max_gen_toks(self):
+        return 256
+
+    @property
+    def batch_size(self):
+        # TODO: fix multi-gpu
+        return self.batch_size_per_gpu # * gpus
+
+    @property
+    def device(self):
+        # TODO: fix multi-gpu
+        return torch.device(self._device)
+
+    def tok_encode(self, string: str):
+        input_ids = self.tokenizer.encode(string)
+        return input_ids
+
+    def tok_decode(self, tokens):
+        return self.tokenizer.decode(tokens, skip_special_tokens=True)
+
+    def _model_call(self, inps):
+        """
+        inps: a torch tensor of shape [batch, sequence]
+        the size of sequence may vary from call to call
+
+        returns: a torch tensor of shape [batch, sequence, vocab] with the
+        logits returned from the model
+        """
+        with torch.inference_mode():
+            inps = inps.to(self.device)
+            res = self.model(inps)[0]
+            return res
+
+    def _model_generate(self, context, max_length, eos_token_id):
+        return self.model.generate(context, max_length=max_length, eos_token_id=eos_token_id, do_sample=False)
\ No newline at end of file
diff --git a/python/llm/dev/benchmark/harness/harness_to_leaderboard.py b/python/llm/dev/benchmark/harness/harness_to_leaderboard.py
new file mode 100644
index 00000000..ce8b2620
--- /dev/null
+++ b/python/llm/dev/benchmark/harness/harness_to_leaderboard.py
@@ -0,0 +1,51 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# +from regex import match + + +task_map = dict( + hellaswag="hellaswag", + arc="arc_challenge", + truthfulqa="truthfulqa_mc", + mmlu="hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions" +) + + +task_to_n_few_shots = dict( + hellaswag=10, + arc=25, + truthfulqa=0, + mmlu=5 +) + + +def parse_precision(precision, model="bigdl-llm"): + result = match(r"([a-zA-Z_]*)(\d+)", precision) + datatype = result.group(1) + bit = int(result.group(2)) + if bit >= 16: + float_map = dict( + bf16="bfloat16", + fp16="float16", + fp32="float32" + ) + return f"dtype={float_map[precision]}" + else: + if model == "hf-causal": + return f"bnb_type={precision}" + if model == "bigdl-llm": + return f"load_in_low_bit={precision}" + raise RuntimeError(f"invald precision {precision}") diff --git a/python/llm/dev/benchmark/harness/llb.py b/python/llm/dev/benchmark/harness/llb.py deleted file mode 100644 index 88e31a51..00000000 --- a/python/llm/dev/benchmark/harness/llb.py +++ /dev/null @@ -1,82 +0,0 @@ -# -# Copyright 2016 The BigDL Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - - -# this code is copied from llama2 example test, and added performance test -import argparse -import os -import subprocess - -task_cmd = "--num_fewshot {} --tasks {}" - -task_map = { - "hellaswag": task_cmd.format(10, "hellaswag"), - "arc": task_cmd.format(25, "arc_challenge"), - "truthfulqa": task_cmd.format(0, "truthfulqa_mc"), - "mmlu": task_cmd.format(5, "hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions") -} - -prec_to_arg = { - "bigdl-llm": { - "int4": "load_in_low_bit=sym_int4", - "nf4": "load_in_low_bit=nf4", - "nf3": "load_in_low_bit=nf3", - "fp8": "load_in_low_bit=fp8", - "fp4": "load_in_low_bit=fp4", - "bf16": "dtype=bfloat16", - "fp16": "dtype=float16", - }, - "hf-causal": { - "nf4": "bnb_type=nf4", - "bf16": "dtype=bfloat16", - "fp16": "dtype=float16", - } -} - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True, type=str) - parser.add_argument("--pretrained", required=True, type=str) - parser.add_argument("--precision", required=True, nargs='+', type=str) - parser.add_argument("--device", required=True, type=str) - parser.add_argument("--batch", default=1, type=int) - parser.add_argument("--tasks", required=True, nargs='+', type=str) - parser.add_argument("--output_dir", type=str) - args = parser.parse_args() - print(args.model) - print(args.tasks) - basic_cmd = "python lm-evaluation-harness/main.py --model {} --model_args pretrained={},{} --no_cache --device {} --batch_size {} {} --output_path {} " - os.makedirs(args.output_dir, exist_ok=True) - index = 1 - total = len(args.precision) * len(args.tasks) - for prec in args.precision: - prec_arg = prec_to_arg[args.model][prec] - for task in args.tasks: - output_path = f"{args.model}_{prec}_{args.device}_{task}" - task_arg 
= task_map[task] - cmd_exec = basic_cmd.format(args.model, args.pretrained, prec_arg, args.device, args.batch, - task_arg, f"{args.output_dir}/{output_path}") - print(f"Running job {index}/{total}:\n{cmd_exec}") - index += 1 - with open(f"{args.output_dir}/log_{output_path}.txt", "w") as f: - return_code = subprocess.call(cmd_exec, shell=True) - if return_code == 0: - print("Successful") - else: - print("Failed") - -main() diff --git a/python/llm/dev/benchmark/harness/run_llb.py b/python/llm/dev/benchmark/harness/run_llb.py new file mode 100644 index 00000000..6e8c7143 --- /dev/null +++ b/python/llm/dev/benchmark/harness/run_llb.py @@ -0,0 +1,149 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import argparse +import json +import logging +import os +from harness_to_leaderboard import * +from lm_eval import tasks, evaluator, utils, models + +from bigdl_llm import BigDLLM +models.MODEL_REGISTRY['bigdl-llm'] = BigDLLM # patch bigdl-llm to harness + +logging.getLogger("openai").setLevel(logging.WARNING) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", required=True) + parser.add_argument("--model_args", default="") + parser.add_argument("--pretrained", required=True, type=str) + parser.add_argument("--tasks", required=True, nargs='+', type=str) + parser.add_argument("--precision", required=True, nargs='+', type=str) + parser.add_argument("--provide_description", action="store_true") + parser.add_argument("--num_fewshot", type=int, default=0) + parser.add_argument("--batch_size", type=str, default=None) + parser.add_argument( + "--max_batch_size", + type=int, + default=None, + help="Maximal batch size to try with --batch_size auto", + ) + parser.add_argument("--device", type=str, default=None) + parser.add_argument("--output_path", default=None) + parser.add_argument( + "--limit", + type=float, + default=None, + help="Limit the number of examples per task. " + "If <1, limit is a percentage of the total number of examples.", + ) + parser.add_argument("--data_sampling", type=float, default=None) + parser.add_argument("--no_cache", action="store_true") + parser.add_argument("--decontamination_ngrams_path", default=None) + parser.add_argument("--description_dict_path", default=None) + parser.add_argument("--check_integrity", action="store_true") + parser.add_argument("--write_out", action="store_true", default=False) + parser.add_argument("--output_base_path", type=str, default=None) + + return parser.parse_args() + + +def main(): + args = parse_args() + + assert not args.provide_description # not implemented + + if args.limit: + print( + "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT." 
+ ) + + # if args.tasks is None: + # task_names = tasks.ALL_TASKS + # else: + # task_names = utils.pattern_match(args.tasks.split(","), tasks.ALL_TASKS) + + print(f"Selected Tasks: {args.tasks}") + + description_dict = {} + if args.description_dict_path: + with open(args.description_dict_path, "r") as f: + description_dict = json.load(f) + + success = [] + fail = [] + for prec in args.precision: + prec_arg = parse_precision(prec, args.model) + model_args = f"pretrained={args.pretrained},{prec_arg}" + if len(args.model_args) > 0: + model_args += args.model_args + for task in args.tasks: + task_names=task_map.get(task, task).split(',') + num_fewshot = task_to_n_few_shots.get(task, args.num_fewshot) + try: + results = evaluator.simple_evaluate( + model=args.model, + model_args=model_args, + tasks=task_names, + num_fewshot=num_fewshot, + batch_size=args.batch_size, + max_batch_size=args.max_batch_size, + device=args.device, + no_cache=args.no_cache, + limit=args.limit, + description_dict=description_dict, + decontamination_ngrams_path=args.decontamination_ngrams_path, + check_integrity=args.check_integrity, + write_out=args.write_out, + output_base_path=args.output_base_path, + ) + if len(results['results']) > 1: + average = {} + for _, subtask in results['results'].items(): + for metric, value in subtask.items(): + average[metric] = average.get(metric, []) + [value] + for k, v in average.items(): + average[k] = sum(average[k]) / len(average[k]) if not k.endswith("_stderr") else 0 + results['results'][f"avg_{task}"] = average + results['versions'][f"avg_{task}"] = 1 + + dumped = json.dumps(results, indent=2) + print(dumped) + + if args.output_path: + dirname = os.path.dirname(args.output_path) + if dirname: + os.makedirs(dirname, exist_ok=True) + with open(args.output_path, "w") as f: + f.write(dumped) + success.append(results) + except Exception as e: + fail.append(f"Job config of task={task}, precision={prec} failed. Error Message: {str(e)}") + print(f"Job config of task={task}, precision={prec} failed. Error Message: {str(e)}") + + ## print all task summary + print("Here are results of all successful tasks:") + for results in success: + print(results['config']) + print(evaluator.make_table(results)) + + if len(fail) > 0: + raise RuntimeError('\n'.join(fail)) + + +if __name__ == "__main__": + main()