harness tests on pvc multiple xpus (#9908)
* add run_multi_llb.py
* update readme
* add job hint
This commit is contained in:
parent
27b19106f3
commit
301425e377
2 changed files with 169 additions and 0 deletions
@@ -22,5 +22,10 @@ python run_llb.py --model bigdl-llm --pretrained /path/to/model --precision nf3
```python
python run_llb.py --model bigdl-llm --pretrained /path/to/model --precision nf3 sym_int4 nf4 --device xpu --tasks hellaswag arc mmlu truthfulqa --batch 1 --no_cache
```

### Evaluation using multiple Intel GPUs

```python
python run_multi_llb.py --model bigdl-llm --pretrained /path/to/model --precision nf3 sym_int4 nf4 --device xpu:0,2,3 --tasks hellaswag arc mmlu truthfulqa --batch 1 --no_cache
```

In the example above, the script forks three processes, one per XPU, to execute the tasks.
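As a minimal sketch of that device expansion (the function name `expand_devices` is illustrative; the actual logic lives in `parse_device` in `run_multi_llb.py`, shown below), a device string such as `xpu:0,2,3` yields one device per process:

```python
def expand_devices(device: str) -> list:
    # "xpu:0,2,3" -> ["xpu:0", "xpu:2", "xpu:3"]; a bare "xpu" stays as ["xpu"]
    kind, _, indices = device.partition(':')
    return [f"{kind}:{i}" for i in indices.split(',')] if indices else [kind]

print(expand_devices("xpu:0,2,3"))  # ['xpu:0', 'xpu:2', 'xpu:3']
```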

## Results

We follow the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) in how we record our metrics: `acc_norm` for `hellaswag` and `arc_challenge`, `mc2` for `truthful_qa`, and `acc` for `mmlu`. Note that `mmlu` consists of 57 subtasks, so users may need to average their scores manually to get the final result.
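For illustration, a minimal averaging sketch, assuming a harness `result.json` whose `results` section maps each `mmlu` subtask to its metrics (the path below is hypothetical; substitute the one written for your run):

```python
import json

# Hypothetical path; substitute the result.json produced by your run.
with open("results/my-model/xpu/nf4/mmlu/result.json") as f:
    results = json.load(f)["results"]

# Average the plain accuracy over all mmlu subtasks.
accs = [m["acc"] for m in results.values() if "acc" in m]
print(f"mmlu: average acc over {len(accs)} subtasks = {sum(accs) / len(accs):.4f}")
```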

python/llm/dev/benchmark/harness/run_multi_llb.py (new file, 164 lines)

@@ -0,0 +1,164 @@
```python
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import argparse
import json
import logging
import os

from harness_to_leaderboard import *
from lm_eval import tasks, evaluator, utils, models
from multiprocessing import Queue, Process
import multiprocessing as mp
from contextlib import redirect_stdout, redirect_stderr

from bigdl_llm import BigDLLM

models.MODEL_REGISTRY['bigdl-llm'] = BigDLLM  # patch bigdl-llm into the harness

logging.getLogger("openai").setLevel(logging.WARNING)


def parse_device(device):
    # Expand "xpu:0,2,3" into ["xpu:0", "xpu:2", "xpu:3"].
    device = device.split(':')
    if len(device) == 1:  # no indices given, e.g. a bare "xpu"
        return device
    device_indices = device[1].split(',')
    return list(map(lambda i: f"{device[0]}:{i}", device_indices))


def run_job(device, prec, task, args, device_pool, result_pool):
    print(f"Current Job: device={device}, precision={prec}, task={task}")
    device_type = device.split(':')[0]
    description_dict = {}
    if args.description_dict_path:
        with open(args.description_dict_path, "r") as f:
            description_dict = json.load(f)

    model_name = os.path.basename(os.path.realpath(args.pretrained))
    output_path = args.output_path if args.output_path else "results"

    prec_arg = parse_precision(prec, args.model)
    model_args = f"pretrained={args.pretrained},{prec_arg}"
    if len(args.model_args) > 0:
        model_args = f"{model_args},{args.model_args}"
    task_names = task_map.get(task, task).split(',')
    num_fewshot = task_to_n_few_shots.get(task, args.num_fewshot)
    log_dir = f"{output_path}/{model_name}/{device_type}/{prec}/{task}"
    os.makedirs(log_dir, exist_ok=True)

    # Redirect this worker's stdout/stderr into its own log file.
    with open(f"{log_dir}/log.txt", 'w') as f, redirect_stderr(f), redirect_stdout(f):
        results = evaluator.simple_evaluate(
            model=args.model,
            model_args=model_args,
            tasks=task_names,
            num_fewshot=num_fewshot,
            batch_size=args.batch_size,
            max_batch_size=args.max_batch_size,
            device=device,
            no_cache=args.no_cache,
            limit=args.limit,
            description_dict=description_dict,
            decontamination_ngrams_path=args.decontamination_ngrams_path,
            check_integrity=args.check_integrity,
            write_out=args.write_out,
            output_base_path=log_dir
        )
        # For benchmarks with several subtasks (e.g. mmlu), average the
        # per-subtask metrics into a single entry for the parent task.
        if len(results['results']) > 1:
            average = {}
            for _, subtask in results['results'].items():
                for metric, value in subtask.items():
                    average[metric] = average.get(metric, []) + [value]
            for k, v in average.items():
                average[k] = sum(v) / len(v) if not k.endswith("_stderr") else 0
            results['results'][task] = average
            results['versions'][task] = 1

        dumped = json.dumps(results, indent=2)
        print(dumped)

        if args.output_path:
            with open(f"{log_dir}/result.json", "w") as f:
                f.write(dumped)
    result_pool.put(results)
    device_pool.put(device)  # hand the device back so the next job can use it


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True)
    parser.add_argument("--model_args", default="")
    parser.add_argument("--pretrained", required=True, type=str)
    parser.add_argument("--tasks", required=True, nargs='+', type=str)
    parser.add_argument("--precision", required=True, nargs='+', type=str)
    parser.add_argument("--provide_description", action="store_true")
    parser.add_argument("--num_fewshot", type=int, default=0)
    parser.add_argument("--batch_size", type=str, default=None)
    parser.add_argument(
        "--max_batch_size",
        type=int,
        default=None,
        help="Maximal batch size to try with --batch_size auto",
    )
    parser.add_argument("--device", type=str, default=None)
    parser.add_argument("--output_path", default=None)
    parser.add_argument(
        "--limit",
        type=float,
        default=None,
        help="Limit the number of examples per task. "
        "If <1, limit is a percentage of the total number of examples.",
    )
    parser.add_argument("--data_sampling", type=float, default=None)
    parser.add_argument("--no_cache", action="store_true")
    parser.add_argument("--decontamination_ngrams_path", default=None)
    parser.add_argument("--description_dict_path", default=None)
    parser.add_argument("--check_integrity", action="store_true")
    parser.add_argument("--write_out", action="store_true", default=False)
    parser.add_argument("--output_base_path", type=str, default=None)

    return parser.parse_args()


def main():
    mp.set_start_method('spawn')
    args = parse_args()

    assert not args.provide_description  # not implemented

    if args.limit:
        print(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )
    print(f"Selected Tasks: {args.tasks}")

    # The device queue doubles as a semaphore: a job must take a device
    # before starting and returns it when finished.
    device_pool = Queue()
    result_pool = Queue()
    for device in parse_device(args.device):
        device_pool.put(device)

    jobs = []
    for prec in args.precision:
        for task in args.tasks:
            device = device_pool.get()  # blocks until a device is free
            p = Process(target=run_job, args=(device, prec, task, args, device_pool, result_pool))
            p.start()
            jobs.append(p)

    for j in jobs:
        j.join()

    while not result_pool.empty():
        result = result_pool.get()
        print(result if isinstance(result, str) else evaluator.make_table(result))


if __name__ == "__main__":
    main()
```
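A note on the scheduling design above: the device `Queue` acts as a counting semaphore, so at most one job runs per XPU at a time; `device_pool.get()` in `main` blocks until some worker hands its device back via `device_pool.put(device)`. Jobs are the cross product of precisions and tasks, so an invocation with `--precision nf4 sym_int4 --tasks hellaswag arc --device xpu:0,2,3` creates four jobs scheduled across three XPUs.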