#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This file is adapted from
# https://github.com/THUDM/LongBench/blob/main/eval.py
# and
# https://github.com/FasterDecoding/SnapKV/blob/main/experiments/LongBench/eval.py

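# Usage sketch (assumptions: this file is saved as eval.py next to config.yaml,
# config/model2maxlen.json, and the pred_*/ directories of .jsonl predictions it
# scores): run `python eval.py`; a result.json is written into each prediction
# directory.
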
import os
import json
import argparse
import numpy as np

current_dir = os.path.dirname(os.path.realpath(__file__))

from metrics import (
    qa_f1_score,
    rouge_zh_score,
    qa_f1_zh_score,
    rouge_score,
    classification_score,
    retrieval_score,
    retrieval_zh_score,
    count_score,
    code_sim_score,
)

dataset2metric = {
    "narrativeqa": qa_f1_score,
    "qasper": qa_f1_score,
    "multifieldqa_en": qa_f1_score,
    "multifieldqa_zh": qa_f1_zh_score,
    "hotpotqa": qa_f1_score,
    "2wikimqa": qa_f1_score,
    "musique": qa_f1_score,
    "dureader": rouge_zh_score,
    "gov_report": rouge_score,
    "qmsum": rouge_score,
    "multi_news": rouge_score,
    "vcsum": rouge_zh_score,
    "trec": classification_score,
    "triviaqa": qa_f1_score,
    "samsum": rouge_score,
    "lsht": classification_score,
    "passage_retrieval_en": retrieval_score,
    "passage_count": count_score,
    "passage_retrieval_zh": retrieval_zh_score,
    "lcc": code_sim_score,
    "repobench-p": code_sim_score,
}


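# NOTE: parse_args mirrors the upstream eval.py command-line interface; the
# __main__ block below reads its settings from config.yaml instead and does not
# call it.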
def parse_args(args=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, default=None)
    parser.add_argument('--e', action='store_true', help="Evaluate on LongBench-E")
    return parser.parse_args(args)


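# scorer_e buckets each example's score by input length (0-4k / 4-8k / 8k+),
# following the LongBench-E protocol, and reports the mean per bucket, while
# scorer below returns a single macro-averaged score over all examples.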
def scorer_e(dataset, predictions, answers, lengths, all_classes):
    scores = {"0-4k": [], "4-8k": [], "8k+": []}
    for (prediction, ground_truths, length) in zip(predictions, answers, lengths):
        score = 0.
        if dataset in ["trec", "triviaqa", "samsum", "lsht"]:
            prediction = prediction.lstrip('\n').split('\n')[0]
        for ground_truth in ground_truths:
            score = max(score, dataset2metric[dataset](prediction, ground_truth, all_classes=all_classes))
        if length < 4000:
            scores["0-4k"].append(score)
        elif length < 8000:
            scores["4-8k"].append(score)
        else:
            scores["8k+"].append(score)
    for key in scores.keys():
        scores[key] = round(100 * np.mean(scores[key]), 2)
    return scores


def scorer(dataset, predictions, answers, all_classes):
    total_score = 0.
    for (prediction, ground_truths) in zip(predictions, answers):
        score = 0.
        if dataset in ["trec", "triviaqa", "samsum", "lsht"]:
            prediction = prediction.lstrip('\n').split('\n')[0]
        for ground_truth in ground_truths:
            score = max(score, dataset2metric[dataset](prediction, ground_truth, all_classes=all_classes))
        total_score += score
    return round(100 * total_score / len(predictions), 2)


def result_path_range(full_kv: bool, configs: list[str], model_name: str, fa_name: str):
    if full_kv:
        yield f"{fa_name}/{model_name}"

    for config in configs:
        yield f"{fa_name}/{model_name}_{config}"


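# The block below walks every prediction directory for each model / compress-KV
# configuration and scores the .jsonl files inside it. Illustration only, with
# hypothetical arguments: result_path_range(True, ["my_cfg"], "my-model",
# "pred_4096") yields "pred_4096/my-model" and then "pred_4096/my-model_my_cfg".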
if __name__ == '__main__':
    from omegaconf import OmegaConf
    conf = OmegaConf.load(f'{current_dir}/config.yaml')

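    # A minimal sketch of the config.yaml consumed above; the keys (model_name,
    # full_kv, e, compress_kv) are the ones read below, while the values here
    # are hypothetical placeholders:
    #
    #   model_name: my-model               # a single name or a list of names
    #   full_kv: true                      # also score the full-KV (uncompressed) runs
    #   e: false                           # a bool or a list of bools (LongBench-E)
    #   compress_kv: my_compresskv_config  # a suffix or a list of suffixes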
    model_names = conf['model_name'] if OmegaConf.is_list(conf['model_name']) else [conf['model_name']]
    full_kv = conf['full_kv']
    ees = conf['e'] if OmegaConf.is_list(conf['e']) else [conf['e']]
    compresskv_configs = conf['compress_kv'] if OmegaConf.is_list(conf['compress_kv']) else [conf['compress_kv']]

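    # config/model2maxlen.json is assumed to map each model name to the max
    # prompt length used when its predictions were generated, e.g. a made-up
    # entry {"my-model": 4096}; the value only feeds the pred_* directory name
    # below.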
    model2maxlen = json.load(open(f"{current_dir}/config/model2maxlen.json", "r"))

    for model_name in model_names:
        max_length = model2maxlen[model_name]
        for e in ees:
            fa_dir_name = f"pred_{'e_' if e else ''}{max_length}"
            for path in result_path_range(full_kv, compresskv_configs, model_name, fa_dir_name):
                scores = dict()
                all_files = os.listdir(path)
                print("Evaluating on:", all_files)
                for filename in all_files:
                    if not filename.endswith("jsonl"):
                        continue
                    predictions, answers, lengths = [], [], []
                    dataset = filename.split('.')[0]
                    with open(f"{path}/{filename}", "r", encoding="utf-8") as f:
                        for line in f:
                            data = json.loads(line)
                            predictions.append(data["pred"])
                            answers.append(data["answers"])
                            all_classes = data["all_classes"]
                            if "length" in data:
                                lengths.append(data["length"])
                    if e:
                        score = scorer_e(dataset, predictions, answers, lengths, all_classes)
                    else:
                        score = scorer(dataset, predictions, answers, all_classes)
                        if dataset == 'qasper':
                            score_e = scorer_e(dataset, predictions, answers, lengths, all_classes)
                    scores[dataset] = score

                out_path = f"{path}/result.json"
                with open(out_path, "w") as f:
                    json.dump(scores, f, ensure_ascii=False, indent=4)