patch bigdl-llm model to harness by binding instead of patch file (#9420)
* add run_llb.py
* fix args interpret
* modify outputs
* update workflow
* add license
* test mixed 4 bit
* update readme
* use autotokenizer
* add timeout
* refactor workflow file
* fix working directory
* fix env
* throw exception if some jobs failed
* improve terminal outputs
* disable var which caused the run to get stuck
* fix unknown precision
* fix key error
* directly output config instead
* rm harness submodule
This commit is contained in:
parent 51d07a9fd8
commit d19ca21957
7 changed files with 347 additions and 257 deletions
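The heart of the change: instead of shipping a `bigdl-llm.patch` that had to be applied on top of the lm-evaluation-harness checkout, the new `run_llb.py` binds the `BigDLLM` adapter into the harness at runtime. A minimal sketch of that binding pattern, using the module and registry names that appear in the diffs below:

```python
# Runtime binding instead of a source patch: register the BigDL adapter
# in the lm-evaluation-harness model registry before evaluation starts.
from lm_eval import models      # lm-evaluation-harness, pinned to commit e81d3cc
from bigdl_llm import BigDLLM   # adapter module added by this commit

models.MODEL_REGISTRY['bigdl-llm'] = BigDLLM  # harness now accepts --model bigdl-llm
```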
.github/workflows/llm-harness-evaluation.yml (vendored, 42 lines changed)

@@ -20,25 +20,28 @@ on:
 jobs:
   llm-cpp-build:
     uses: ./.github/workflows/llm-binary-build.yml
-  llm-nightly-harness-test:
+  llm-harness-evalution:
+    timeout-minutes: 1000
     needs: llm-cpp-build
     strategy:
       fail-fast: false
       matrix:
+        # include:
+        #   python-version: "3.9"
+        #   model_name: "stablelm-3b-4e1t"
+        #   task: "arc"
+        #   precision: "sym_int4" #options: sym_int4, fp4, nf4, mixed_4bit, fp8
         python-version: ["3.9"]
         model_name: [stablelm-3b-4e1t]
         task: ["truthfulqa"]
-        precision: ["int4"]
+        precision: [sym_int4] #options: sym_int4, fp4, nf4, mixed_4bit, fp8
     runs-on: [self-hosted, llm, accuracy, temp-arc01]
     env:
       ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
+      ORIGIN_DIR: /mnt/disk1/models
+      HARNESS_HF_HOME: /mnt/disk1/harness_home
     steps:
-      - name: Set model and dataset directories
-        shell: bash
-        run: |
-          echo "ORIGIN_DIR=/mnt/disk1/models" >> "$GITHUB_ENV"
-          echo "HARNESS_HF_HOME=/mnt/disk1/harness_home" >> "$GITHUB_ENV"
-
       - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v4
@@ -60,16 +63,13 @@ jobs:
           extra-dependency: "xpu"

       - name: Install harness
+        working-directory: ${{ github.workspace }}/python/llm/dev/benchmark/harness/
         shell: bash
         run: |
-          cd python/llm/dev/benchmark/harness/
           git clone https://github.com/EleutherAI/lm-evaluation-harness.git
           cd lm-evaluation-harness
           git checkout e81d3cc
           pip install -e .
-          git apply ../bigdl-llm.patch
-          cd ..

       - name: Download models and datasets
         shell: bash
@@ -84,17 +84,21 @@ jobs:
             wget -r -nH --no-verbose --cut-dirs=1 ${LLM_FTP_URL}/llm/${{ matrix.model_name }} -P ${ORIGIN_DIR}
           fi

-      - name: Set datasets env
+      - name: Upgrade packages
         shell: bash
         run: |
-          echo "HF_HOME=$HARNESS_HF_HOME" >> "$GITHUB_ENV"
-          echo "HF_DATASETS=$HARNESS_HF_HOME/datasets" >> "$GITHUB_ENV"
-          echo "HF_DATASETS_CACHE=$HARNESS_HF_HOME/datasets" >> "$GITHUB_ENV"
+          pip install --upgrade transformers

       - name: Run harness
         shell: bash
+        working-directory: ${{ github.workspace }}/python/llm/dev/benchmark/harness
+        env:
+          USE_XETLA: OFF
+          # SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS: 1
         run: |
-          export USE_XETLA=OFF
+          export HF_HOME=${HARNESS_HF_HOME}
+          export HF_DATASETS=$HARNESS_HF_HOME/datasets
+          export HF_DATASETS_CACHE=$HARNESS_HF_HOME/datasets
           source /opt/intel/oneapi/setvars.sh
-          cd python/llm/dev/benchmark/harness
-          python llb.py --model bigdl-llm --pretrained ${MODEL_PATH} --precision ${{ matrix.precision }} --device xpu --tasks ${{ matrix.task }} --output_dir results/${{ matrix.model_name }} --batch 1
+          python run_llb.py --model bigdl-llm --pretrained ${MODEL_PATH} --precision ${{ matrix.precision }} --device xpu --tasks ${{ matrix.task }} --batch_size 1 --no_cache
python/llm/dev/benchmark/harness/README.md

@@ -9,20 +9,18 @@ git clone https://github.com/EleutherAI/lm-evaluation-harness.git
 cd lm-evaluation-harness
 git checkout e81d3cc
 pip install -e .
-git apply ../bigdl-llm.patch
-cd ..
 ```

 ## Run
-run `python llb.py`. `llb.py` combines some arguments in `main.py` to make evaluations easier. The mapping of arguments is defined as a dict in [`llb.py`](llb.py).
+run `python run_llb.py`. `run_llb.py` combines some arguments in `main.py` to make evaluations easier. The mapping of arguments is defined as a dict in [`llb.py`](llb.py).

 ### Evaluation on CPU
 ```python
-python llb.py --model bigdl-llm --pretrained /path/to/model --precision nf3 int4 nf4 --device cpu --tasks hellaswag arc mmlu truthfulqa --output_dir results/output
+python run_llb.py --model bigdl-llm --pretrained /path/to/model --precision nf3 sym_int4 nf4 --device cpu --tasks hellaswag arc mmlu truthfulqa --batch 1 --no_cache
 ```
 ### Evaluation on Intel GPU
 ```python
-python llb.py --model bigdl-llm --pretrained /path/to/model --precision nf3 int4 nf4 --device xpu --tasks hellaswag arc mmlu truthfulqa --output_dir results/output
+python run_llb.py --model bigdl-llm --pretrained /path/to/model --precision nf3 sym_int4 nf4 --device xpu --tasks hellaswag arc mmlu truthfulqa --batch 1 --no_cache
 ```
 ## Results
 We follow [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) to record our metrics, `acc_norm` for `hellaswag` and `arc_challenge`, `mc2` for `truthful_qa` and `acc` for `mmlu`. For `mmlu`, there are 57 subtasks, which means users may need to average them manually to get the final result.
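The averaging mentioned in that README paragraph is what `run_llb.py` (added later in this commit) now does automatically. A condensed sketch of that computation, with a two-subtask stand-in for the real 57-entry `results` dict:

```python
# Condensed from run_llb.py's averaging step: pool each metric across
# subtasks, average it, and zero out the *_stderr entries.
results = {  # stand-in data; real runs have 57 hendrycksTest-* entries
    "hendrycksTest-anatomy":   {"acc": 0.50, "acc_stderr": 0.02},
    "hendrycksTest-astronomy": {"acc": 0.40, "acc_stderr": 0.03},
}
average = {}
for _, subtask in results.items():
    for metric, value in subtask.items():
        average[metric] = average.get(metric, []) + [value]
for k in average:
    average[k] = sum(average[k]) / len(average[k]) if not k.endswith("_stderr") else 0
print(average)  # {'acc': 0.45, 'acc_stderr': 0}
```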
python/llm/dev/benchmark/harness/bigdl-llm.patch (deleted)

@@ -1,151 +0,0 @@
-diff --git a/lm_eval/models/__init__.py b/lm_eval/models/__init__.py
-index 8ca27fac..6b581487 100644
---- a/lm_eval/models/__init__.py
-+++ b/lm_eval/models/__init__.py
-@@ -4,6 +4,7 @@ from . import anthropic_llms
- from . import huggingface
- from . import textsynth
- from . import dummy
-+from . import bigdl_llm
- 
- MODEL_REGISTRY = {
-     "hf": gpt2.HFLM,
-@@ -15,6 +16,7 @@ MODEL_REGISTRY = {
-     "anthropic": anthropic_llms.AnthropicLM,
-     "textsynth": textsynth.TextSynthLM,
-     "dummy": dummy.DummyLM,
-+    "bigdl-llm": bigdl_llm.BigDLLM
- }
- 
- 
-diff --git a/lm_eval/models/bigdl_llm.py b/lm_eval/models/bigdl_llm.py
-new file mode 100644
-index 00000000..74010da3
---- /dev/null
-+++ b/lm_eval/models/bigdl_llm.py
-@@ -0,0 +1,124 @@
-+#
-+# Copyright 2016 The BigDL Authors.
-+#
-+# Licensed under the Apache License, Version 2.0 (the "License");
-+# you may not use this file except in compliance with the License.
-+# You may obtain a copy of the License at
-+#
-+#     http://www.apache.org/licenses/LICENSE-2.0
-+#
-+# Unless required by applicable law or agreed to in writing, software
-+# distributed under the License is distributed on an "AS IS" BASIS,
-+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-+# See the License for the specific language governing permissions and
-+# limitations under the License.
-+#
-+
-+
-+# this code is copied from llama2 example test, and added performance test
-+import os
-+import multiprocessing
-+
-+from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
-+
-+import torch
-+from typing import Optional, Union
-+from lm_eval.base import BaseLM
-+
-+from transformers import AutoTokenizer, LlamaTokenizer
-+
-+def _get_dtype(
-+    dtype: Union[str, torch.dtype]
-+) -> torch.dtype:
-+    """Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig"""
-+    if isinstance(dtype, str) and dtype != "auto":
-+        # Convert `str` args torch dtype: `float16` -> `torch.float16`
-+        _torch_dtype = getattr(torch, dtype)
-+    else:
-+        _torch_dtype = dtype
-+    return _torch_dtype
-+
-+class BigDLLM(BaseLM):
-+    def __init__(
-+        self,
-+        device="xpu",
-+        pretrained="gpt2",
-+        revision="main",
-+        low_cpu_mem_usage=None,
-+        subfolder=None,
-+        tokenizer=None,
-+        batch_size=1,
-+        load_in_8bit: Optional[bool] = False,
-+        trust_remote_code: Optional[bool] = False,
-+        load_in_low_bit=None,
-+        dtype: Optional[Union[str, torch.dtype]] = "auto",
-+    ):
-+        super().__init__()
-+
-+        assert isinstance(pretrained, str)
-+        assert isinstance(batch_size, (int,str))
-+        if 'xpu' in device:
-+            import intel_extension_for_pytorch as ipex
-+        model = AutoModelForCausalLM.from_pretrained(pretrained,
-+                                                     load_in_low_bit=load_in_low_bit,
-+                                                     optimize_model=True,
-+                                                     trust_remote_code=True,
-+                                                     use_cache=True,
-+                                                     torch_dtype=_get_dtype(dtype))
-+        print(model) # print model to check precision
-+        self._device = device
-+        self.model = model.to(device)
-+
-+        self.tokenizer = AutoTokenizer.from_pretrained(pretrained, trust_remote_code=True)
-+
-+        # setup for automatic batch size detection
-+        if batch_size == 'auto':
-+            self.batch_size_per_gpu = batch_size
-+        else:
-+            self.batch_size_per_gpu = int(batch_size)
-+
-+    @property
-+    def eot_token_id(self):
-+        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
-+        return self.model.token_eos()
-+
-+    @property
-+    def max_length(self):
-+        return 2048  # TODO: how to get this from config
-+
-+    @property
-+    def max_gen_toks(self):
-+        return 256
-+
-+    @property
-+    def batch_size(self):
-+        # TODO: fix multi-gpu
-+        return self.batch_size_per_gpu  # * gpus
-+
-+    @property
-+    def device(self):
-+        # TODO: fix multi-gpu
-+        return torch.device(self._device)
-+
-+    def tok_encode(self, string: str):
-+        input_ids = self.tokenizer.encode(string)
-+        return input_ids
-+
-+    def tok_decode(self, tokens):
-+        return self.tokenizer.decode(output[0], skip_special_tokens=True)
-+
-+    def _model_call(self, inps):
-+        """
-+        inps: a torch tensor of shape [batch, sequence]
-+        the size of sequence may vary from call to call
-+
-+        returns: a torch tensor of shape [batch, sequence, vocab] with the
-+        logits returned from the model
-+        """
-+        with torch.inference_mode():
-+            inps = inps.to(self.device)
-+            res = self.model(inps)[0]
-+            return res
-+
-+    def _model_generate(self, context, max_length, eos_token_id):
-+        return self.model(context, max_tokens=max_length, stop=["Q:", "\n"], echo=True)
-\ No newline at end of file
python/llm/dev/benchmark/harness/bigdl_llm.py (new file, 121 lines)

@@ -0,0 +1,121 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import os
+import multiprocessing
+
+from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+
+import torch
+from typing import Optional, Union
+from lm_eval.base import BaseLM
+
+from transformers import AutoTokenizer, LlamaTokenizer
+
+
+def _get_dtype(
+    dtype: Union[str, torch.dtype]
+) -> torch.dtype:
+    """Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig"""
+    if isinstance(dtype, str) and dtype != "auto":
+        # Convert `str` args torch dtype: `float16` -> `torch.float16`
+        _torch_dtype = getattr(torch, dtype)
+    else:
+        _torch_dtype = dtype
+    return _torch_dtype
+
+
+class BigDLLM(BaseLM):
+    def __init__(
+        self,
+        device="xpu",
+        pretrained="gpt2",
+        revision="main",
+        low_cpu_mem_usage=None,
+        subfolder=None,
+        tokenizer=None,
+        batch_size=1,
+        load_in_8bit: Optional[bool] = False,
+        trust_remote_code: Optional[bool] = False,
+        load_in_low_bit=None,
+        dtype: Optional[Union[str, torch.dtype]] = "auto",
+    ):
+        super().__init__()
+
+        assert isinstance(pretrained, str)
+        assert isinstance(batch_size, (int,str))
+        if device == 'xpu':
+            import intel_extension_for_pytorch as ipex
+        model = AutoModelForCausalLM.from_pretrained(pretrained,
+                                                     load_in_low_bit=load_in_low_bit,
+                                                     optimize_model=True,
+                                                     trust_remote_code=True,
+                                                     use_cache=True,
+                                                     torch_dtype=_get_dtype(dtype))
+        print(model)  # print model to check precision
+        self._device = device
+        self.model = model.to(device)
+
+        self.tokenizer = AutoTokenizer.from_pretrained(pretrained, trust_remote_code=True)
+
+        # setup for automatic batch size detection
+        if batch_size == 'auto':
+            self.batch_size_per_gpu = batch_size
+        else:
+            self.batch_size_per_gpu = int(batch_size)
+
+    @property
+    def eot_token_id(self):
+        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
+        return self.model.token_eos()
+
+    @property
+    def max_length(self):
+        return 2048  # TODO: how to get this from config
+
+    @property
+    def max_gen_toks(self):
+        return 256
+
+    @property
+    def batch_size(self):
+        # TODO: fix multi-gpu
+        return self.batch_size_per_gpu  # * gpus
+
+    @property
+    def device(self):
+        # TODO: fix multi-gpu
+        return torch.device(self._device)
+
+    def tok_encode(self, string: str):
+        input_ids = self.tokenizer.encode(string)
+        return input_ids
+
+    def tok_decode(self, tokens):
+        return self.tokenizer.decode(tokens, skip_special_tokens=True)
+
+    def _model_call(self, inps):
+        """
+        inps: a torch tensor of shape [batch, sequence]
+        the size of sequence may vary from call to call
+
+        returns: a torch tensor of shape [batch, sequence, vocab] with the
+        logits returned from the model
+        """
+        with torch.inference_mode():
+            inps = inps.to(self.device)
+            res = self.model(inps)[0]
+            return res
+
+    def _model_generate(self, context, max_length, eos_token_id):
+        return self.model(context, max_tokens=max_length, stop=["Q:", "\n"], echo=True)
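A hypothetical smoke test for the adapter above (not part of the commit): instantiate `BigDLLM` directly and run one forward pass, the same calls the harness makes through the `BaseLM` interface. The model path is a placeholder:

```python
import torch
from bigdl_llm import BigDLLM

# Hypothetical usage sketch; /path/to/model is a placeholder checkpoint.
lm = BigDLLM(device="cpu", pretrained="/path/to/model",
             load_in_low_bit="sym_int4", batch_size=1)
ids = lm.tok_encode("The capital of France is")
logits = lm._model_call(torch.tensor([ids]))  # shape [batch, sequence, vocab]
print(logits.shape)
```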
python/llm/dev/benchmark/harness/harness_to_leaderboard.py (new file, 51 lines)

@@ -0,0 +1,51 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from regex import match
+
+
+task_map = dict(
+    hellaswag="hellaswag",
+    arc="arc_challenge",
+    truthfulqa="truthfulqa_mc",
+    mmlu="hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions"
+)
+
+
+task_to_n_few_shots = dict(
+    hellaswag=10,
+    arc=25,
+    truthfulqa=0,
+    mmlu=5
+)
+
+
+def parse_precision(precision, model="bigdl-llm"):
+    result = match(r"([a-zA-Z_]*)(\d+)", precision)
+    datatype = result.group(1)
+    bit = int(result.group(2))
+    if bit >= 16:
+        float_map = dict(
+            bf16="bfloat16",
+            fp16="float16",
+            fp32="float32"
+        )
+        return f"dtype={float_map[precision]}"
+    else:
+        if model == "hf-causal":
+            return f"bnb_type={precision}"
+        if model == "bigdl-llm":
+            return f"load_in_low_bit={precision}"
+    raise RuntimeError(f"invalid precision {precision}")
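What `parse_precision` produces for the precisions used in this commit; the outputs follow directly from the code above:

```python
from harness_to_leaderboard import parse_precision

print(parse_precision("sym_int4"))          # load_in_low_bit=sym_int4
print(parse_precision("fp16"))              # dtype=float16
print(parse_precision("nf4", "hf-causal"))  # bnb_type=nf4
```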
python/llm/dev/benchmark/harness/llb.py (deleted)

@@ -1,82 +0,0 @@
-#
-# Copyright 2016 The BigDL Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-
-# this code is copied from llama2 example test, and added performance test
-import argparse
-import os
-import subprocess
-
-task_cmd = "--num_fewshot {} --tasks {}"
-
-task_map = {
-    "hellaswag": task_cmd.format(10, "hellaswag"),
-    "arc": task_cmd.format(25, "arc_challenge"),
-    "truthfulqa": task_cmd.format(0, "truthfulqa_mc"),
-    "mmlu": task_cmd.format(5, "hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions")
-}
-
-prec_to_arg = {
-    "bigdl-llm": {
-        "int4": "load_in_low_bit=sym_int4",
-        "nf4": "load_in_low_bit=nf4",
-        "nf3": "load_in_low_bit=nf3",
-        "fp8": "load_in_low_bit=fp8",
-        "fp4": "load_in_low_bit=fp4",
-        "bf16": "dtype=bfloat16",
-        "fp16": "dtype=float16",
-    },
-    "hf-causal": {
-        "nf4": "bnb_type=nf4",
-        "bf16": "dtype=bfloat16",
-        "fp16": "dtype=float16",
-    }
-}
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", required=True, type=str)
-    parser.add_argument("--pretrained", required=True, type=str)
-    parser.add_argument("--precision", required=True, nargs='+', type=str)
-    parser.add_argument("--device", required=True, type=str)
-    parser.add_argument("--batch", default=1, type=int)
-    parser.add_argument("--tasks", required=True, nargs='+', type=str)
-    parser.add_argument("--output_dir", type=str)
-    args = parser.parse_args()
-    print(args.model)
-    print(args.tasks)
-    basic_cmd = "python lm-evaluation-harness/main.py --model {} --model_args pretrained={},{} --no_cache --device {} --batch_size {} {} --output_path {} "
-    os.makedirs(args.output_dir, exist_ok=True)
-    index = 1
-    total = len(args.precision) * len(args.tasks)
-    for prec in args.precision:
-        prec_arg = prec_to_arg[args.model][prec]
-        for task in args.tasks:
-            output_path = f"{args.model}_{prec}_{args.device}_{task}"
-            task_arg = task_map[task]
-            cmd_exec = basic_cmd.format(args.model, args.pretrained, prec_arg, args.device, args.batch,
-                                        task_arg, f"{args.output_dir}/{output_path}")
-            print(f"Running job {index}/{total}:\n{cmd_exec}")
-            index += 1
-            with open(f"{args.output_dir}/log_{output_path}.txt", "w") as f:
-                return_code = subprocess.call(cmd_exec, shell=True)
-            if return_code == 0:
-                print("Successful")
-            else:
-                print("Failed")
-
-main()
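Where the deleted `llb.py` shelled out to `lm-evaluation-harness/main.py` through `subprocess`, its replacement `run_llb.py` (next file) calls the harness in-process, which is what lets it collect results and raise an exception if any job failed. A minimal sketch of that in-process call against the pinned harness commit, with a placeholder model path:

```python
from lm_eval import evaluator

# Assumes the BigDLLM registration shown in run_llb.py has already run;
# minimal in-process equivalent of one llb.py job.
results = evaluator.simple_evaluate(
    model="bigdl-llm",
    model_args="pretrained=/path/to/model,load_in_low_bit=sym_int4",
    tasks=["truthfulqa_mc"],
    num_fewshot=0,
    batch_size=1,
    device="cpu",
    no_cache=True,
)
print(evaluator.make_table(results))
```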
python/llm/dev/benchmark/harness/run_llb.py (new file, 149 lines)

@@ -0,0 +1,149 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import argparse
+import json
+import logging
+import os
+from harness_to_leaderboard import *
+from lm_eval import tasks, evaluator, utils, models
+
+from bigdl_llm import BigDLLM
+models.MODEL_REGISTRY['bigdl-llm'] = BigDLLM  # patch bigdl-llm to harness
+
+logging.getLogger("openai").setLevel(logging.WARNING)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", required=True)
+    parser.add_argument("--model_args", default="")
+    parser.add_argument("--pretrained", required=True, type=str)
+    parser.add_argument("--tasks", required=True, nargs='+', type=str)
+    parser.add_argument("--precision", required=True, nargs='+', type=str)
+    parser.add_argument("--provide_description", action="store_true")
+    parser.add_argument("--num_fewshot", type=int, default=0)
+    parser.add_argument("--batch_size", type=str, default=None)
+    parser.add_argument(
+        "--max_batch_size",
+        type=int,
+        default=None,
+        help="Maximal batch size to try with --batch_size auto",
+    )
+    parser.add_argument("--device", type=str, default=None)
+    parser.add_argument("--output_path", default=None)
+    parser.add_argument(
+        "--limit",
+        type=float,
+        default=None,
+        help="Limit the number of examples per task. "
+        "If <1, limit is a percentage of the total number of examples.",
+    )
+    parser.add_argument("--data_sampling", type=float, default=None)
+    parser.add_argument("--no_cache", action="store_true")
+    parser.add_argument("--decontamination_ngrams_path", default=None)
+    parser.add_argument("--description_dict_path", default=None)
+    parser.add_argument("--check_integrity", action="store_true")
+    parser.add_argument("--write_out", action="store_true", default=False)
+    parser.add_argument("--output_base_path", type=str, default=None)
+
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    assert not args.provide_description  # not implemented
+
+    if args.limit:
+        print(
+            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
+        )
+
+    # if args.tasks is None:
+    #     task_names = tasks.ALL_TASKS
+    # else:
+    #     task_names = utils.pattern_match(args.tasks.split(","), tasks.ALL_TASKS)
+
+    print(f"Selected Tasks: {args.tasks}")
+
+    description_dict = {}
+    if args.description_dict_path:
+        with open(args.description_dict_path, "r") as f:
+            description_dict = json.load(f)
+
+    success = []
+    fail = []
+    for prec in args.precision:
+        prec_arg = parse_precision(prec, args.model)
+        model_args = f"pretrained={args.pretrained},{prec_arg}"
+        if len(args.model_args) > 0:
+            model_args += args.model_args
+        for task in args.tasks:
+            task_names = task_map.get(task, task).split(',')
+            num_fewshot = task_to_n_few_shots.get(task, args.num_fewshot)
+            try:
+                results = evaluator.simple_evaluate(
+                    model=args.model,
+                    model_args=model_args,
+                    tasks=task_names,
+                    num_fewshot=num_fewshot,
+                    batch_size=args.batch_size,
+                    max_batch_size=args.max_batch_size,
+                    device=args.device,
+                    no_cache=args.no_cache,
+                    limit=args.limit,
+                    description_dict=description_dict,
+                    decontamination_ngrams_path=args.decontamination_ngrams_path,
+                    check_integrity=args.check_integrity,
+                    write_out=args.write_out,
+                    output_base_path=args.output_base_path,
+                )
+                if len(results['results']) > 1:
+                    average = {}
+                    for _, subtask in results['results'].items():
+                        for metric, value in subtask.items():
+                            average[metric] = average.get(metric, []) + [value]
+                    for k, v in average.items():
+                        average[k] = sum(average[k]) / len(average[k]) if not k.endswith("_stderr") else 0
+                    results['results'][f"avg_{task}"] = average
+                    results['versions'][f"avg_{task}"] = 1
+
+                dumped = json.dumps(results, indent=2)
+                print(dumped)
+
+                if args.output_path:
+                    dirname = os.path.dirname(args.output_path)
+                    if dirname:
+                        os.makedirs(dirname, exist_ok=True)
+                    with open(args.output_path, "w") as f:
+                        f.write(dumped)
+                success.append(results)
+            except Exception as e:
+                fail.append(f"Job config of task={task}, precision={prec} failed. Error Message: {str(e)}")
+                print(f"Job config of task={task}, precision={prec} failed. Error Message: {str(e)}")
+
+    ## print all task summary
+    print("Here are results of all successful tasks:")
+    for results in success:
+        print(results['config'])
+        print(evaluator.make_table(results))
+
+    if len(fail) > 0:
+        raise RuntimeError('\n'.join(fail))
+
+
+if __name__ == "__main__":
+    main()