patch bigdl-llm model to harness by binding instead of patch file (#9420)
* add run_llb.py
* fix args interpretation
* modify outputs
* update workflow
* add license
* test mixed 4-bit
* update readme
* use AutoTokenizer
* add timeout
* refactor workflow file
* fix working directory
* fix env
* throw exception if some jobs failed
* improve terminal outputs
* disable the var that caused the run to get stuck
* fix unknown precision
* fix key error
* directly output config instead
* rm harness submodule
This commit is contained in:

parent 51d07a9fd8
commit d19ca21957

7 changed files with 347 additions and 257 deletions
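The core change is what the title says: instead of carrying a `bigdl-llm.patch` that had to be `git apply`'d onto the lm-evaluation-harness checkout, the adapter class is now bound into the harness at runtime. The binding is a single registry assignment, exactly as `run_llb.py` below does it:

```python
from lm_eval import models
from bigdl_llm import BigDLLM

# Register the adapter in-process; no patch file needs to be applied
# to the lm-evaluation-harness checkout.
models.MODEL_REGISTRY['bigdl-llm'] = BigDLLM
```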
.github/workflows/llm-harness-evaluation.yml (vendored): 42 changes
```diff
@@ -20,25 +20,28 @@ on:
 jobs:
   llm-cpp-build:
     uses: ./.github/workflows/llm-binary-build.yml
-  llm-nightly-harness-test:
+  llm-harness-evalution:
     timeout-minutes: 1000
     needs: llm-cpp-build
     strategy:
       fail-fast: false
       matrix:
         # include:
         #   python-version: "3.9"
         #   model_name: "stablelm-3b-4e1t"
         #   task: "arc"
         #   precision: "sym_int4" #options: sym_int4, fp4, nf4, mixed_4bit, fp8
         python-version: ["3.9"]
         model_name: [stablelm-3b-4e1t]
         task: ["truthfulqa"]
-        precision: ["int4"]
+        precision: [sym_int4] #options: sym_int4, fp4, nf4, mixed_4bit, fp8
 
     runs-on: [self-hosted, llm, accuracy, temp-arc01]
     env:
       ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
       ORIGIN_DIR: /mnt/disk1/models
       HARNESS_HF_HOME: /mnt/disk1/harness_home
     steps:
       - name: Set model and dataset directories
         shell: bash
         run: |
           echo "ORIGIN_DIR=/mnt/disk1/models" >> "$GITHUB_ENV"
           echo "HARNESS_HF_HOME=/mnt/disk1/harness_home" >> "$GITHUB_ENV"
 
       - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v4
```
```diff
@@ -60,16 +63,13 @@ jobs:
           extra-dependency: "xpu"
 
       - name: Install harness
+        working-directory: ${{ github.workspace }}/python/llm/dev/benchmark/harness/
         shell: bash
         run: |
-          cd python/llm/dev/benchmark/harness/
           git clone https://github.com/EleutherAI/lm-evaluation-harness.git
           cd  lm-evaluation-harness
           git checkout e81d3cc
           pip install -e .
-          git apply ../bigdl-llm.patch
           cd ..
-
-
       - name: Download models and datasets
         shell: bash
```
```diff
@@ -84,17 +84,21 @@ jobs:
             wget -r -nH --no-verbose --cut-dirs=1 ${LLM_FTP_URL}/llm/${{ matrix.model_name }} -P ${ORIGIN_DIR}
           fi
 
-      - name: Set datasets env
+      - name: Upgrade packages
         shell: bash
         run: |
-          echo "HF_HOME=$HARNESS_HF_HOME" >> "$GITHUB_ENV"
-          echo "HF_DATASETS=$HARNESS_HF_HOME/datasets" >> "$GITHUB_ENV"
-          echo "HF_DATASETS_CACHE=$HARNESS_HF_HOME/datasets" >> "$GITHUB_ENV"
+          pip install --upgrade transformers
 
       - name: Run harness
         shell: bash
+        working-directory: ${{ github.workspace }}/python/llm/dev/benchmark/harness
+        env:
+          USE_XETLA: OFF
+          # SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS: 1
         run: |
-          export USE_XETLA=OFF
+          export HF_HOME=${HARNESS_HF_HOME}
+          export HF_DATASETS=$HARNESS_HF_HOME/datasets
+          export HF_DATASETS_CACHE=$HARNESS_HF_HOME/datasets
           source /opt/intel/oneapi/setvars.sh
-          cd python/llm/dev/benchmark/harness
-          python llb.py --model bigdl-llm --pretrained ${MODEL_PATH} --precision ${{ matrix.precision }} --device xpu --tasks ${{ matrix.task }} --output_dir results/${{ matrix.model_name }} --batch 1
+          python run_llb.py --model bigdl-llm --pretrained ${MODEL_PATH} --precision ${{ matrix.precision }} --device xpu --tasks ${{ matrix.task }} --batch_size 1 --no_cache
```
python/llm/dev/benchmark/harness/README.md (modified)

````diff
@@ -9,20 +9,18 @@ git clone https://github.com/EleutherAI/lm-evaluation-harness.git
 cd  lm-evaluation-harness
 git checkout e81d3cc
 pip install -e .
-git apply ../bigdl-llm.patch
 cd ..
 ```
 
 ## Run
-run `python llb.py`. `llb.py` combines some arguments in `main.py` to make evaluations easier. The mapping of arguments is defined as a dict in [`llb.py`](llb.py).
+run `python run_llb.py`. `run_llb.py` combines some arguments in `main.py` to make evaluations easier. The mapping of arguments is defined as a dict in [`harness_to_leaderboard.py`](harness_to_leaderboard.py).
 
 ### Evaluation on CPU
 ```python
-python llb.py --model bigdl-llm --pretrained /path/to/model --precision nf3 int4 nf4 --device cpu --tasks hellaswag arc mmlu truthfulqa --output_dir results/output
+python run_llb.py --model bigdl-llm --pretrained /path/to/model --precision nf3 sym_int4 nf4 --device cpu --tasks hellaswag arc mmlu truthfulqa --batch_size 1 --no_cache
 ```
 ### Evaluation on Intel GPU
 ```python
-python llb.py --model bigdl-llm --pretrained /path/to/model --precision nf3 int4 nf4 --device xpu --tasks hellaswag arc mmlu truthfulqa --output_dir results/output
+python run_llb.py --model bigdl-llm --pretrained /path/to/model --precision nf3 sym_int4 nf4 --device xpu --tasks hellaswag arc mmlu truthfulqa --batch_size 1 --no_cache
 ```
 ## Results
 We follow [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) to record our metrics: `acc_norm` for `hellaswag` and `arc_challenge`, `mc2` for `truthful_qa`, and `acc` for `mmlu`. For `mmlu`, there are 57 subtasks, which means users may need to average them manually to get the final result.
````
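`run_llb.py` below already folds the 57 `hendrycksTest-*` subtask scores into an `avg_mmlu` entry; if you are working from a raw harness results JSON instead, here is a minimal sketch of the same manual averaging (the results path is a placeholder):

```python
import json

# Hypothetical path to a results file produced by lm-evaluation-harness.
with open("results/output.json") as f:
    results = json.load(f)["results"]

# Average each metric across the hendrycksTest-* subtasks, mirroring run_llb.py.
average = {}
for subtask in results.values():
    for metric, value in subtask.items():
        average.setdefault(metric, []).append(value)
averaged = {k: sum(v) / len(v) for k, v in average.items() if not k.endswith("_stderr")}
print(averaged.get("acc"))  # `acc` is the metric recorded for mmlu
```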
python/llm/dev/benchmark/harness/bigdl-llm.patch (deleted): 151 lines

```diff
@@ -1,151 +0,0 @@
diff --git a/lm_eval/models/__init__.py b/lm_eval/models/__init__.py
index 8ca27fac..6b581487 100644
--- a/lm_eval/models/__init__.py
+++ b/lm_eval/models/__init__.py
@@ -4,6 +4,7 @@ from . import anthropic_llms
 from . import huggingface
 from . import textsynth
 from . import dummy
+from . import bigdl_llm
 
 MODEL_REGISTRY = {
     "hf": gpt2.HFLM,
@@ -15,6 +16,7 @@ MODEL_REGISTRY = {
     "anthropic": anthropic_llms.AnthropicLM,
     "textsynth": textsynth.TextSynthLM,
     "dummy": dummy.DummyLM,
+    "bigdl-llm": bigdl_llm.BigDLLM
 }
 
 
diff --git a/lm_eval/models/bigdl_llm.py b/lm_eval/models/bigdl_llm.py
new file mode 100644
index 00000000..74010da3
--- /dev/null
+++ b/lm_eval/models/bigdl_llm.py
@@ -0,0 +1,124 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+# this code is copied from llama2 example test, and added performance test
+import os
+import multiprocessing
+
+from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+
+import torch
+from typing import Optional, Union
+from lm_eval.base import BaseLM
+
+from transformers import AutoTokenizer, LlamaTokenizer
+
+def _get_dtype(
+    dtype: Union[str, torch.dtype]
+) -> torch.dtype:
+    """Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig"""
+    if isinstance(dtype, str) and dtype != "auto":
+        # Convert `str` args torch dtype: `float16` -> `torch.float16`
+        _torch_dtype = getattr(torch, dtype)
+    else:
+        _torch_dtype = dtype
+    return _torch_dtype
+
+class BigDLLM(BaseLM):
+    def __init__(
+        self,
+        device="xpu",
+        pretrained="gpt2",
+        revision="main",
+        low_cpu_mem_usage=None,
+        subfolder=None,
+        tokenizer=None,
+        batch_size=1,
+        load_in_8bit: Optional[bool] = False,
+        trust_remote_code: Optional[bool] = False,
+        load_in_low_bit=None,
+        dtype: Optional[Union[str, torch.dtype]] = "auto",
+    ):
+        super().__init__()
+
+        assert isinstance(pretrained, str)
+        assert isinstance(batch_size, (int,str))
+        if 'xpu' in device:
+            import intel_extension_for_pytorch as ipex
+        model = AutoModelForCausalLM.from_pretrained(pretrained,
+                                          load_in_low_bit=load_in_low_bit,
+                                          optimize_model=True,
+                                          trust_remote_code=True,
+                                          use_cache=True,
+                                          torch_dtype=_get_dtype(dtype))
+        print(model) # print model to check precision
+        self._device = device
+        self.model = model.to(device)
+
+        self.tokenizer = AutoTokenizer.from_pretrained(pretrained, trust_remote_code=True)
+
+        # setup for automatic batch size detection
+        if batch_size == 'auto':
+            self.batch_size_per_gpu = batch_size
+        else:
+            self.batch_size_per_gpu = int(batch_size)
+
+    @property
+    def eot_token_id(self):
+        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
+        return self.model.token_eos()
+
+    @property
+    def max_length(self):
+        return 2048  # TODO: how to get this from config
+
+    @property
+    def max_gen_toks(self):
+        return 256
+
+    @property
+    def batch_size(self):
+        # TODO: fix multi-gpu
+        return self.batch_size_per_gpu  # * gpus
+
+    @property
+    def device(self):
+        # TODO: fix multi-gpu
+        return torch.device(self._device)
+
+    def tok_encode(self, string: str):
+        input_ids = self.tokenizer.encode(string)
+        return input_ids
+
+    def tok_decode(self, tokens):
+        return self.tokenizer.decode(output[0], skip_special_tokens=True)
+
+    def _model_call(self, inps):
+        """
+        inps: a torch tensor of shape [batch, sequence]
+        the size of sequence may vary from call to call
+
+        returns: a torch tensor of shape [batch, sequence, vocab] with the
+        logits returned from the model
+        """
+        with torch.inference_mode():
+            inps = inps.to(self.device)
+            res = self.model(inps)[0]
+            return res
+
+    def _model_generate(self, context, max_length, eos_token_id):
+        return self.model(context, max_tokens=max_length, stop=["Q:", "\n"], echo=True)
\ No newline at end of file
```
python/llm/dev/benchmark/harness/bigdl_llm.py (new file): 121 lines
@@ -0,0 +1,121 @@
```python
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import multiprocessing

from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM

import torch
from typing import Optional, Union
from lm_eval.base import BaseLM

from transformers import AutoTokenizer, LlamaTokenizer


def _get_dtype(
    dtype: Union[str, torch.dtype]
) -> torch.dtype:
    """Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig."""
    if isinstance(dtype, str) and dtype != "auto":
        # Convert `str` args to torch dtype: `float16` -> `torch.float16`
        _torch_dtype = getattr(torch, dtype)
    else:
        _torch_dtype = dtype
    return _torch_dtype


class BigDLLM(BaseLM):
    def __init__(
        self,
        device="xpu",
        pretrained="gpt2",
        revision="main",
        low_cpu_mem_usage=None,
        subfolder=None,
        tokenizer=None,
        batch_size=1,
        load_in_8bit: Optional[bool] = False,
        trust_remote_code: Optional[bool] = False,
        load_in_low_bit=None,
        dtype: Optional[Union[str, torch.dtype]] = "auto",
    ):
        super().__init__()

        assert isinstance(pretrained, str)
        assert isinstance(batch_size, (int, str))
        if device == 'xpu':
            import intel_extension_for_pytorch as ipex
        model = AutoModelForCausalLM.from_pretrained(pretrained,
                                                     load_in_low_bit=load_in_low_bit,
                                                     optimize_model=True,
                                                     trust_remote_code=True,
                                                     use_cache=True,
                                                     torch_dtype=_get_dtype(dtype))
        print(model)  # print model to check precision
        self._device = device
        self.model = model.to(device)

        self.tokenizer = AutoTokenizer.from_pretrained(pretrained, trust_remote_code=True)

        # setup for automatic batch size detection
        if batch_size == 'auto':
            self.batch_size_per_gpu = batch_size
        else:
            self.batch_size_per_gpu = int(batch_size)

    @property
    def eot_token_id(self):
        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*;
        # for HF-style models the EOS id comes from the tokenizer
        return self.tokenizer.eos_token_id

    @property
    def max_length(self):
        return 2048  # TODO: how to get this from config

    @property
    def max_gen_toks(self):
        return 256

    @property
    def batch_size(self):
        # TODO: fix multi-gpu
        return self.batch_size_per_gpu  # * gpus

    @property
    def device(self):
        # TODO: fix multi-gpu
        return torch.device(self._device)

    def tok_encode(self, string: str):
        input_ids = self.tokenizer.encode(string)
        return input_ids

    def tok_decode(self, tokens):
        return self.tokenizer.decode(tokens, skip_special_tokens=True)

    def _model_call(self, inps):
        """
        inps: a torch tensor of shape [batch, sequence]
        the size of sequence may vary from call to call

        returns: a torch tensor of shape [batch, sequence, vocab] with the
        logits returned from the model
        """
        with torch.inference_mode():
            inps = inps.to(self.device)
            res = self.model(inps)[0]
            return res

    def _model_generate(self, context, max_length, eos_token_id):
        # greedy generation through the HF generate API
        return self.model.generate(context, max_length=max_length,
                                   eos_token_id=eos_token_id, do_sample=False)
```
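A minimal sketch of exercising the adapter by hand, outside the harness driver (`/path/to/model` is a placeholder; `run_llb.py` normally constructs the equivalent `model_args` string and lets the harness instantiate the class):

```python
# Hypothetical standalone use of the adapter above.
lm = BigDLLM(pretrained="/path/to/model", device="cpu", load_in_low_bit="sym_int4")
ids = lm.tok_encode("The quick brown fox")
print(len(ids), lm.max_length, lm.batch_size)  # token count, 2048, 1
```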
python/llm/dev/benchmark/harness/harness_to_leaderboard.py (new file): 51 lines

@@ -0,0 +1,51 @@
```python
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from regex import match


task_map = dict(
    hellaswag="hellaswag",
    arc="arc_challenge",
    truthfulqa="truthfulqa_mc",
    mmlu="hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions"
)


task_to_n_few_shots = dict(
    hellaswag=10,
    arc=25,
    truthfulqa=0,
    mmlu=5
)


def parse_precision(precision, model="bigdl-llm"):
    result = match(r"([a-zA-Z_]*)(\d+)", precision)
    datatype = result.group(1)
    bit = int(result.group(2))
    if bit >= 16:
        float_map = dict(
            bf16="bfloat16",
            fp16="float16",
            fp32="float32"
        )
        return f"dtype={float_map[precision]}"
    else:
        if model == "hf-causal":
            return f"bnb_type={precision}"
        if model == "bigdl-llm":
            return f"load_in_low_bit={precision}"
    raise RuntimeError(f"invalid precision {precision}")
```
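A few concrete mappings produced by `parse_precision` above:

```python
parse_precision("sym_int4")                  # -> "load_in_low_bit=sym_int4"
parse_precision("fp16")                      # -> "dtype=float16"
parse_precision("nf4", model="hf-causal")    # -> "bnb_type=nf4"
```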
python/llm/dev/benchmark/harness/llb.py (deleted): 82 lines

@@ -1,82 +0,0 @@
```python
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


# this code is copied from llama2 example test, and added performance test
import argparse
import os
import subprocess

task_cmd = "--num_fewshot {} --tasks {}"

task_map = {
    "hellaswag": task_cmd.format(10, "hellaswag"),
    "arc": task_cmd.format(25, "arc_challenge"),
    "truthfulqa": task_cmd.format(0, "truthfulqa_mc"),
    "mmlu": task_cmd.format(5, "hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions")
}

prec_to_arg = {
    "bigdl-llm": {
        "int4": "load_in_low_bit=sym_int4",
        "nf4": "load_in_low_bit=nf4",
        "nf3": "load_in_low_bit=nf3",
        "fp8": "load_in_low_bit=fp8",
        "fp4": "load_in_low_bit=fp4",
        "bf16": "dtype=bfloat16",
        "fp16": "dtype=float16",
    },
    "hf-causal": {
        "nf4": "bnb_type=nf4",
        "bf16": "dtype=bfloat16",
        "fp16": "dtype=float16",
    }
}


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True, type=str)
    parser.add_argument("--pretrained", required=True, type=str)
    parser.add_argument("--precision", required=True, nargs='+', type=str)
    parser.add_argument("--device", required=True, type=str)
    parser.add_argument("--batch", default=1, type=int)
    parser.add_argument("--tasks", required=True, nargs='+', type=str)
    parser.add_argument("--output_dir", type=str)
    args = parser.parse_args()
    print(args.model)
    print(args.tasks)
    basic_cmd = "python lm-evaluation-harness/main.py --model {} --model_args pretrained={},{} --no_cache --device {} --batch_size {} {} --output_path {} "
    os.makedirs(args.output_dir, exist_ok=True)
    index = 1
    total = len(args.precision) * len(args.tasks)
    for prec in args.precision:
        prec_arg = prec_to_arg[args.model][prec]
        for task in args.tasks:
            output_path = f"{args.model}_{prec}_{args.device}_{task}"
            task_arg = task_map[task]
            cmd_exec = basic_cmd.format(args.model, args.pretrained, prec_arg, args.device, args.batch,
             task_arg, f"{args.output_dir}/{output_path}")
            print(f"Running job {index}/{total}:\n{cmd_exec}")
            index += 1
            with open(f"{args.output_dir}/log_{output_path}.txt", "w") as f:
                return_code = subprocess.call(cmd_exec, shell=True)
            if return_code == 0:
                print("Successful")
            else:
                print("Failed")

main()
```
python/llm/dev/benchmark/harness/run_llb.py (new file): 149 lines

@@ -0,0 +1,149 @@
```python
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import json
import logging
import os
from harness_to_leaderboard import *
from lm_eval import tasks, evaluator, utils, models

from bigdl_llm import BigDLLM
models.MODEL_REGISTRY['bigdl-llm'] = BigDLLM    # patch bigdl-llm to harness

logging.getLogger("openai").setLevel(logging.WARNING)


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True)
    parser.add_argument("--model_args", default="")
    parser.add_argument("--pretrained", required=True, type=str)
    parser.add_argument("--tasks", required=True, nargs='+', type=str)
    parser.add_argument("--precision", required=True, nargs='+', type=str)
    parser.add_argument("--provide_description", action="store_true")
    parser.add_argument("--num_fewshot", type=int, default=0)
    parser.add_argument("--batch_size", type=str, default=None)
    parser.add_argument(
        "--max_batch_size",
        type=int,
        default=None,
        help="Maximal batch size to try with --batch_size auto",
    )
    parser.add_argument("--device", type=str, default=None)
    parser.add_argument("--output_path", default=None)
    parser.add_argument(
        "--limit",
        type=float,
        default=None,
        help="Limit the number of examples per task. "
        "If <1, limit is a percentage of the total number of examples.",
    )
    parser.add_argument("--data_sampling", type=float, default=None)
    parser.add_argument("--no_cache", action="store_true")
    parser.add_argument("--decontamination_ngrams_path", default=None)
    parser.add_argument("--description_dict_path", default=None)
    parser.add_argument("--check_integrity", action="store_true")
    parser.add_argument("--write_out", action="store_true", default=False)
    parser.add_argument("--output_base_path", type=str, default=None)

    return parser.parse_args()


def main():
    args = parse_args()

    assert not args.provide_description  # not implemented

    if args.limit:
        print(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    # if args.tasks is None:
    #     task_names = tasks.ALL_TASKS
    # else:
    #     task_names = utils.pattern_match(args.tasks.split(","), tasks.ALL_TASKS)

    print(f"Selected Tasks: {args.tasks}")

    description_dict = {}
    if args.description_dict_path:
        with open(args.description_dict_path, "r") as f:
            description_dict = json.load(f)

    success = []
    fail = []
    for prec in args.precision:
        prec_arg = parse_precision(prec, args.model)
        model_args = f"pretrained={args.pretrained},{prec_arg}"
        if len(args.model_args) > 0:
            model_args += "," + args.model_args  # extra model_args are comma-separated
        for task in args.tasks:
            task_names = task_map.get(task, task).split(',')
            num_fewshot = task_to_n_few_shots.get(task, args.num_fewshot)
            try:
                results = evaluator.simple_evaluate(
                    model=args.model,
                    model_args=model_args,
                    tasks=task_names,
                    num_fewshot=num_fewshot,
                    batch_size=args.batch_size,
                    max_batch_size=args.max_batch_size,
                    device=args.device,
                    no_cache=args.no_cache,
                    limit=args.limit,
                    description_dict=description_dict,
                    decontamination_ngrams_path=args.decontamination_ngrams_path,
                    check_integrity=args.check_integrity,
                    write_out=args.write_out,
                    output_base_path=args.output_base_path,
                )
                if len(results['results']) > 1:
                    average = {}
                    for _, subtask in results['results'].items():
                        for metric, value in subtask.items():
                            average[metric] = average.get(metric, []) + [value]
                    for k, v in average.items():
                        average[k] = sum(average[k]) / len(average[k]) if not k.endswith("_stderr") else 0
                    results['results'][f"avg_{task}"] = average
                    results['versions'][f"avg_{task}"] = 1

                dumped = json.dumps(results, indent=2)
                print(dumped)

                if args.output_path:
                    dirname = os.path.dirname(args.output_path)
                    if dirname:
                        os.makedirs(dirname, exist_ok=True)
                    with open(args.output_path, "w") as f:
                        f.write(dumped)
                success.append(results)
            except Exception as e:
                message = f"Job config of task={task}, precision={prec} failed. Error Message: {str(e)}"
                fail.append(message)
                print(message)

    # print a summary of all successful tasks
    print("Here are results of all successful tasks:")
    for results in success:
        print(results['config'])
        print(evaluator.make_table(results))

    if len(fail) > 0:
        raise RuntimeError('\n'.join(fail))


if __name__ == "__main__":
    main()
```
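To make the flow concrete, here is a sketch of what `run_llb.py` assembles for a single (precision, task) pair before calling `evaluator.simple_evaluate`; the model path is a placeholder:

```python
from harness_to_leaderboard import parse_precision, task_map, task_to_n_few_shots

prec_arg = parse_precision("sym_int4", "bigdl-llm")   # "load_in_low_bit=sym_int4"
model_args = f"pretrained=/path/to/model,{prec_arg}"  # placeholder model path
task_names = task_map["arc"].split(',')               # ["arc_challenge"]
num_fewshot = task_to_n_few_shots["arc"]              # 25
```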