Merge harness (#9319)
* add harness patch and llb script
* add readme
* add license
* use patch instead
* update readme
* rename tests to evaluation
* fix typo
* remove nano dependency
* add original harness link
* rename title of usage
* rename BigDLGPULM as BigDLLM
* empty commit to rerun job
parent 63b2556ce2
commit d4dffbdb62

3 changed files with 261 additions and 0 deletions
python/llm/dev/benchmark/harness/README.md (new file, 28 lines)
@@ -0,0 +1,28 @@
# Harness Evaluation

[Harness evaluation](https://github.com/EleutherAI/lm-evaluation-harness) allows users to easily obtain accuracy scores on various datasets. Here we have enabled harness evaluation with BigDL-LLM under [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) settings.
Before running, make sure to have [bigdl-llm](../../../README.md) installed.

## Install Harness
```bash
git clone https://github.com/EleutherAI/lm-evaluation-harness.git
cd lm-evaluation-harness
git checkout e81d3cc
pip install -e .
git apply ../bigdl-llm.patch
cd ..
```

## Run
Run `python llb.py`. `llb.py` combines several of `main.py`'s arguments to make evaluation easier; the mapping of arguments is defined as a dict in [`llb.py`](llb.py), and an example of the expanded command is shown after the CPU invocation below.

### Evaluation on CPU
```bash
python llb.py --model bigdl-llm --pretrained /path/to/model --precision nf3 int4 nf4 --device cpu --tasks hellaswag arc mmlu truthfulqa --output_dir results/output
```
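For illustration (not part of the committed README; paths are placeholders): using the `task_map` and `prec_to_arg` dicts in `llb.py`, the invocation above expands into one harness command per precision/task pair, e.g. for `int4` with `hellaswag`:
```bash
python lm-evaluation-harness/main.py --model bigdl-llm \
    --model_args pretrained=/path/to/model,load_in_low_bit=sym_int4 \
    --no_cache --device cpu --batch_size 1 \
    --num_fewshot 10 --tasks hellaswag \
    --output_path results/output/bigdl-llm_int4_cpu_hellaswag
```
Each job's stdout and stderr are redirected to `log_<model>_<precision>_<device>_<task>.txt` under `--output_dir`.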

### Evaluation on Intel GPU
```bash
python llb.py --model bigdl-llm --pretrained /path/to/model --precision nf3 int4 nf4 --device xpu --tasks hellaswag arc mmlu truthfulqa --output_dir results/output
```

## Results
We follow the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) in the metrics we record: `acc_norm` for `hellaswag` and `arc_challenge`, `mc2` for `truthful_qa`, and `acc` for `mmlu`. Note that `mmlu` consists of 57 subtasks, so users may need to average their scores manually to get the final result, for example with the sketch below.
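A minimal sketch of that manual averaging step (an illustration, not part of the commit; it assumes the harness's usual JSON output layout, `{"results": {task: {"acc": ...}}}`, and a result path produced by `llb.py`):
```python
import json

# Load one harness result file written by llb.py (hypothetical path).
with open("results/output/bigdl-llm_int4_cpu_mmlu") as f:
    results = json.load(f)["results"]

# Average `acc` over the 57 hendrycksTest-* subtasks.
accs = [m["acc"] for task, m in results.items() if task.startswith("hendrycksTest-")]
print(f"MMLU average acc over {len(accs)} subtasks: {sum(accs) / len(accs):.4f}")
```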

python/llm/dev/benchmark/harness/bigdl-llm.patch (new file, 151 lines)
@@ -0,0 +1,151 @@
diff --git a/lm_eval/models/__init__.py b/lm_eval/models/__init__.py
index 8ca27fac..6b581487 100644
--- a/lm_eval/models/__init__.py
+++ b/lm_eval/models/__init__.py
@@ -4,6 +4,7 @@ from . import anthropic_llms
 from . import huggingface
 from . import textsynth
 from . import dummy
+from . import bigdl_llm
 
 MODEL_REGISTRY = {
     "hf": gpt2.HFLM,
@@ -15,6 +16,7 @@ MODEL_REGISTRY = {
     "anthropic": anthropic_llms.AnthropicLM,
     "textsynth": textsynth.TextSynthLM,
     "dummy": dummy.DummyLM,
+    "bigdl-llm": bigdl_llm.BigDLLM
 }
 
 
diff --git a/lm_eval/models/bigdl_llm.py b/lm_eval/models/bigdl_llm.py
new file mode 100644
index 00000000..74010da3
--- /dev/null
+++ b/lm_eval/models/bigdl_llm.py
@@ -0,0 +1,124 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+# A BigDL-LLM adapter for lm-evaluation-harness, adapted from the llama2 example test.
+import os
+import multiprocessing
+
+from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+
+import torch
+from typing import Optional, Union
+from lm_eval.base import BaseLM
+
+from transformers import AutoTokenizer, LlamaTokenizer
+
+def _get_dtype(
+    dtype: Union[str, torch.dtype]
+) -> torch.dtype:
+    """Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig"""
+    if isinstance(dtype, str) and dtype != "auto":
+        # Convert `str` args to torch dtype: `float16` -> `torch.float16`
+        _torch_dtype = getattr(torch, dtype)
+    else:
+        _torch_dtype = dtype
+    return _torch_dtype
+
+class BigDLLM(BaseLM):
+    def __init__(
+        self,
+        device="xpu",
+        pretrained="gpt2",
+        revision="main",
+        low_cpu_mem_usage=None,
+        subfolder=None,
+        tokenizer=None,
+        batch_size=1,
+        load_in_8bit: Optional[bool] = False,
+        trust_remote_code: Optional[bool] = False,
+        load_in_low_bit=None,
+        dtype: Optional[Union[str, torch.dtype]] = "auto",
+    ):
+        super().__init__()
+
+        assert isinstance(pretrained, str)
+        assert isinstance(batch_size, (int, str))
+        if device == 'xpu':
+            import intel_extension_for_pytorch as ipex
+        model = AutoModelForCausalLM.from_pretrained(pretrained,
+                                          load_in_low_bit=load_in_low_bit,
+                                          optimize_model=True,
+                                          trust_remote_code=True,
+                                          use_cache=True,
+                                          torch_dtype=_get_dtype(dtype))
+        print(model)  # print model to check precision
+        self._device = device
+        self.model = model.to(device)
+
+        self.tokenizer = LlamaTokenizer.from_pretrained(pretrained, trust_remote_code=True)
+
+        # setup for automatic batch size detection
+        if batch_size == 'auto':
+            self.batch_size_per_gpu = batch_size
+        else:
+            self.batch_size_per_gpu = int(batch_size)
+
+    @property
+    def eot_token_id(self):
+        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
+        return self.tokenizer.eos_token_id
+
+    @property
+    def max_length(self):
+        return 2048  # TODO: how to get this from config
+
+    @property
+    def max_gen_toks(self):
+        return 256
+
+    @property
+    def batch_size(self):
+        # TODO: fix multi-gpu
+        return self.batch_size_per_gpu  # * gpus
+
+    @property
+    def device(self):
+        # TODO: fix multi-gpu
+        return torch.device(self._device)
+
+    def tok_encode(self, string: str):
+        input_ids = self.tokenizer.encode(string)
+        return input_ids
+
+    def tok_decode(self, tokens):
+        return self.tokenizer.decode(tokens, skip_special_tokens=True)
+
+    def _model_call(self, inps):
+        """
+        inps: a torch tensor of shape [batch, sequence]
+        the size of sequence may vary from call to call
+
+        returns: a torch tensor of shape [batch, sequence, vocab] with the
+        logits returned from the model
+        """
+        with torch.inference_mode():
+            inps = inps.to(self.device)
+            res = self.model(inps)[0]
+            return res
+
+    def _model_generate(self, context, max_length, eos_token_id):
+        return self.model.generate(context, max_length=max_length, eos_token_id=eos_token_id)
\ No newline at end of file
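For reference, a minimal smoke test for the adapter added by this patch (an illustration, not part of the commit; the model path is a placeholder, and it assumes the patch has been applied and `bigdl-llm` is installed):
```python
# Hypothetical sanity check: instantiate BigDLLM directly and score one sequence.
import torch
from lm_eval.models.bigdl_llm import BigDLLM

lm = BigDLLM(device="cpu", pretrained="/path/to/model", load_in_low_bit="sym_int4")
inps = torch.tensor([lm.tok_encode("Hello world")])  # shape [batch, sequence]
logits = lm._model_call(inps)                        # shape [batch, sequence, vocab]
print(logits.shape)
```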

python/llm/dev/benchmark/harness/llb.py (new file, 82 lines)
@@ -0,0 +1,82 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


# Wrapper around lm-evaluation-harness's main.py: expands each
# --precision/--tasks combination into a separate harness run.
import argparse
import os
import subprocess

task_cmd = "--num_fewshot {} --tasks {}"

task_map = {
    "hellaswag": task_cmd.format(10, "hellaswag"),
    "arc": task_cmd.format(25, "arc_challenge"),
    "truthfulqa": task_cmd.format(0, "truthfulqa_mc"),
    "mmlu": task_cmd.format(5, "hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions")
}

prec_to_arg = {
    "bigdl-llm": {
        "int4": "load_in_low_bit=sym_int4",
        "nf4": "load_in_low_bit=nf4",
        "nf3": "load_in_low_bit=nf3",
        "fp8": "load_in_low_bit=fp8",
        "fp4": "load_in_low_bit=fp4",
        "bf16": "dtype=bfloat16",
        "fp16": "dtype=float16",
    },
    "hf-causal": {
        "nf4": "bnb_type=nf4",
        "bf16": "dtype=bfloat16",
        "fp16": "dtype=float16",
    }
}


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True, type=str)
    parser.add_argument("--pretrained", required=True, type=str)
    parser.add_argument("--precision", required=True, nargs='+', type=str)
    parser.add_argument("--device", required=True, type=str)
    parser.add_argument("--batch", default=1, type=int)
    parser.add_argument("--tasks", required=True, nargs='+', type=str)
    parser.add_argument("--output_dir", required=True, type=str)
    args = parser.parse_args()
    print(args.model)
    print(args.tasks)
    basic_cmd = "python lm-evaluation-harness/main.py --model {} --model_args pretrained={},{} --no_cache --device {} --batch_size {} {} --output_path {} "
    os.makedirs(args.output_dir, exist_ok=True)
    index = 1
    total = len(args.precision) * len(args.tasks)
    for prec in args.precision:
        prec_arg = prec_to_arg[args.model][prec]
        for task in args.tasks:
            output_path = f"{args.model}_{prec}_{args.device}_{task}"
            task_arg = task_map[task]
            cmd_exec = basic_cmd.format(args.model, args.pretrained, prec_arg, args.device,
                                        args.batch, task_arg, f"{args.output_dir}/{output_path}")
            print(f"Running job {index}/{total}:\n{cmd_exec}")
            index += 1
            # Redirect each harness run's stdout and stderr to a per-job log file.
            with open(f"{args.output_dir}/log_{output_path}.txt", "w") as f:
                return_code = subprocess.call(cmd_exec, shell=True, stderr=f, stdout=f)
            if return_code == 0:
                print("Successful")
            else:
                print("Failed")


if __name__ == "__main__":
    main()