Add ChatGLM C-Eval Evaluator (#10095)

* Add ChatGLM ceval evaluator

* Modify ChatGLM Evaluator Reference
Yuxuan Xia 2024-02-07 11:27:06 +08:00 committed by GitHub
parent 5e9710cec4
commit 3832eb0ce0
2 changed files with 232 additions and 4 deletions


@@ -24,6 +24,7 @@ from tqdm import tqdm
 from bigdl.llm.utils.common.log4Error import invalidInputError
 from evaluators.qwen import QwenEvaluator
 from evaluators.llama import LlamaEvaluator
+from evaluators.chatglm import ChatGLMEvaluator
 TASK_NAME_MAPPING = {
@@ -280,7 +281,6 @@ def main(args, evaluator):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--model_family", type=str, default="llama")
     parser.add_argument("--model_path", type=str, default="meta-llama/Llama-2-7b-chat-hf")
     parser.add_argument("--eval_type", type=str, default="validation")
     parser.add_argument("--device", type=str, default="xpu")
@ -289,22 +289,39 @@ if __name__ == "__main__":
args = parser.parse_args() args = parser.parse_args()
if args.model_family == "llama": # decide the model family
model_families = ['llama', 'qwen', 'chatglm']
model_family = None
for family in model_families:
if family in args.model_path.lower():
model_family = family
assert model_family is not None, f"Model {args.model_path}'s model family is not implemented"
if model_family == "llama":
evaluator = LlamaEvaluator( evaluator = LlamaEvaluator(
choices=choices, choices=choices,
model_path=args.model_path, model_path=args.model_path,
device=args.device, device=args.device,
qtype=args.qtype qtype=args.qtype
) )
elif args.model_family == "qwen": elif model_family == "qwen":
evaluator = QwenEvaluator( evaluator = QwenEvaluator(
choices=choices, choices=choices,
model_path=args.model_path, model_path=args.model_path,
device=args.device, device=args.device,
qtype=args.qtype qtype=args.qtype
) )
elif model_family == "chatglm":
evaluator = ChatGLMEvaluator(
choices=choices,
model_path=args.model_path,
device=args.device,
qtype=args.qtype
)
else: else:
invalidInputError( invalidInputError(
False, False,
"Invalid model_family, currently support llama and qwen only.") "Invalid model_family, currently support llama, qwen, and chatglm only.")
main(args, evaluator=evaluator) main(args, evaluator=evaluator)
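
Note: with this change the evaluation script no longer takes a --model_family argument; the family is inferred from --model_path. A minimal sketch of the detection behaviour, using a hypothetical model path:

# Illustrative only: mirrors the substring-based detection added in the diff above.
model_path = "THUDM/chatglm2-6b"   # hypothetical --model_path value
model_families = ['llama', 'qwen', 'chatglm']
model_family = None
for family in model_families:
    if family in model_path.lower():
        model_family = family
print(model_family)   # prints "chatglm"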


@@ -0,0 +1,211 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# refer to https://github.com/THUDM/ChatGLM2-6B/blob/main/evaluation/evaluate_ceval.py
import re
import torch
from tqdm import tqdm
from thefuzz import process
from transformers import AutoTokenizer
from evaluators.evaluator import Evaluator
from bigdl.llm.transformers import AutoModelForCausalLM
from transformers.generation.utils import LogitsProcessorList
from transformers.generation.logits_process import LogitsProcessor
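
# Guard against NaN/inf logits during ChatGLM generation: zero the scores and force
# one fixed token (index 5) to the top so that decoding can continue (the same
# safeguard used in the upstream ChatGLM generation code).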
class InvalidScoreLogitsProcessor(LogitsProcessor):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        if torch.isnan(scores).any() or torch.isinf(scores).any():
            scores.zero_()
            scores[..., 5] = 5e4
        return scores

class ChatGLMEvaluator(Evaluator):
    def __init__(self, choices, model_path="THUDM/chatglm-6b", device="xpu", qtype="sym_int4"):
        super(ChatGLMEvaluator, self).__init__(choices, model_path, device, qtype)
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_path,
            trust_remote_code=True
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_path,
            load_in_low_bit=self.qtype,
            optimize_model=True,
            use_cache=True,
            trust_remote_code=True
        ).eval().to(self.device)

    def generate_few_shot_prompt(self, subject, dev_df, cot=False):
        message = []
        k = self.k
        if self.k == -1:
            k = dev_df.shape[0]
        message.append(self.format_example(dev_df.iloc[0, :], cot=cot, add_prompt=f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"))
        for i in range(1, k):
            message.append(self.format_example(dev_df.iloc[i, :], cot=cot))
        return message

    def format_example(self, line, include_answer=True, cot=False, add_prompt=''):
        example = add_prompt + line['question']
        # print(example)
        for choice in self.choices:
            example += f'\n{choice}. {line[f"{choice}"]}'
        example += '\n答案:'
        if include_answer:
            if cot:
                ans = "让我们一步一步思考,\n" + line["explanation"] + f"\n所以答案是{line['answer']}"
            else:
                ans = line["answer"]
            m = (example, ans)
            return m
        return example
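
    # Example (illustrative): with include_answer=False, a row whose question is "1+1=?"
    # and whose choices A-D are "1", "2", "3", "4" is rendered as
    # "1+1=?\nA. 1\nB. 2\nC. 3\nD. 4\n答案:".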

    def extract_cot_answer(self, line, gen_ans):
        m = re.findall(r'所以答案是(.+?)。', gen_ans, re.M)
        if len(m) > 0 and m[-1] in self.choices:
            return m[-1], True
        answer_patterns = [
            r'([ABCD])是正确的',
            r'选项([ABCD])正确',
            r'答案为([ABCD])',
            r'答案是([ABCD])',
            r'答案([ABCD])',
            r'选择([ABCD])',
            r'答案:([ABCD])',
            r'选择答案([ABCD])'
        ]
        # RE extraction
        for answer_pattern in answer_patterns:
            m = re.search(answer_pattern, gen_ans, re.M)
            if m:
                answer = m.group(1)
                return answer, False
        # only containing one choice-character
        m = re.findall(r'[ABCD]', gen_ans, re.M)
        if len(m) == 1:
            answer = m[0]
            return answer, False
        answer_word_counter = 0
        # only containing one choice-context
        for c in self.choices:
            if str(line[f'{c}']) in gen_ans:
                answer = c
                answer_word_counter += 1
        if answer_word_counter == 1:
            return answer, False
        return '-', False

    def build_prompt(self, text):
        return "[Round {}]\n\n问:{}\n\n答:".format(1, text)

    def generate_dist(self, model, tokenizer, query, history, max_length=2048,
                      do_sample=False, logits_processor=None):
        if history is None:
            history = []
        if logits_processor is None:
            logits_processor = LogitsProcessorList()
        logits_processor.append(InvalidScoreLogitsProcessor())
        if not history:
            prompt = query
        else:
            prompt = ""
            for i, (old_query, response) in enumerate(history):
                prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
            prompt += "[Round {}]\n问:{}\n答:".format(len(history), query)
        # first round prompt
        inputs = tokenizer([prompt], padding=True, return_tensors="pt",
                           truncation=True, max_length=max_length).to(model.device)
        # first round generation
        outputs = model.generate(**inputs, do_sample=do_sample, max_new_tokens=512)
        # organize intermediate_outputs
        intermediate_outputs = []
        for idx in range(len(outputs)):
            output = outputs.tolist()[idx][len(inputs["input_ids"][idx]):]
            response = tokenizer.decode(output)
            intermediate_outputs.append(response)
        # prepare second round prompt
        extraction_prompt = '综上所述ABCD中正确的选项是'
        answer_texts = [query + intermediate + "\n" + extraction_prompt for intermediate in intermediate_outputs]
        input_tokens = [self.build_prompt(answer_text) for answer_text in answer_texts]
        inputs = tokenizer(input_tokens, padding=True, return_tensors="pt", truncation=True, max_length=2048).to(model.device)
        # second round generation
        outputs = model(**inputs, return_last_logit=True)
        logits = outputs.logits[:, -1]
        choice_tokens = [tokenizer.encode(choice, add_special_tokens=False)[0] for choice in self.choices]
        logits = logits[:, choice_tokens]
        preds = logits.argmax(dim=-1)
        return self.choices[preds]

    @torch.no_grad()
    def eval_subject(
        self,
        subject_name,
        test_df,
        eval_type="validation",  # "test","validation",
        dev_df=None,
        few_shot=False,
        cot=False,
    ):
        if eval_type == "validation":
            correct_num = 0
            if few_shot:
                history = self.generate_few_shot_prompt(subject_name, dev_df, cot=cot)
            else:
                history = []
            answers = list(test_df['answer'])
            for row_index, row in tqdm(test_df.iterrows(), total=len(test_df)):
                question = self.format_example(row, include_answer=False, cot=cot)
                if few_shot:
                    response, _ = self.model.chat(self.tokenizer, question, do_sample=False, history=history)
                    response = response.strip()
                    # For ChatGLM, we use answer extraction in answer-only mode too.
                    ans, direct_extract = self.extract_cot_answer(row, response)
                else:  # zero-shot by extracting answer from distribution
                    ans = self.generate_dist(self.model, self.tokenizer, question, do_sample=False, max_length=2048, history=history)
                if ans == answers[row_index]:
                    correct_num += 1
            correct_ratio = 100*correct_num/len(answers)
            return correct_ratio, None
        elif eval_type == "test":
            answers = {}
            for i, row in tqdm(test_df.iterrows(), total=len(test_df)):
                question = self.format_example(row)
                response, _ = self.model.chat(
                    self.tokenizer,
                    question,
                    history=None,
                )
                pred = self.extract_answer(response, row)
                answers[str(i)] = pred
            return None, answers
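
A minimal usage sketch of the new evaluator (the checkpoint path and the one-row DataFrame below are hypothetical, with columns named as in format_example above; run it next to the evaluators package so the import resolves):

import pandas as pd
from evaluators.chatglm import ChatGLMEvaluator

# Hypothetical single-question validation set in C-Eval layout.
test_df = pd.DataFrame([{
    "question": "Which of the following is a prime number?",
    "A": "4", "B": "6", "C": "7", "D": "9",
    "answer": "C",
}])

evaluator = ChatGLMEvaluator(
    choices=["A", "B", "C", "D"],
    model_path="THUDM/chatglm2-6b",   # hypothetical model path
    device="xpu",
    qtype="sym_int4",
)
# Zero-shot validation returns (accuracy in percent, None).
acc, _ = evaluator.eval_subject("example_subject", test_df, eval_type="validation", few_shot=False)
print(acc)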