diff --git a/.github/workflows/llm-ceval.yml b/.github/workflows/llm-ceval.yml
new file mode 100644
index 00000000..88ddf77c
--- /dev/null
+++ b/.github/workflows/llm-ceval.yml
@@ -0,0 +1,200 @@
+name: LLM C-Eval
+
+# Cancel previous runs in the PR when you push new commits
+concurrency:
+  group: ${{ github.workflow }}-llm-nightly-test-${{ github.event.pull_request.number || github.run_id }}
+  cancel-in-progress: true
+
+# Controls when the action will run.
+on:
+  schedule:
+    - cron: "00 15 * * 5" # GMT time, 15:00 GMT == 23:00 Beijing Time
+  pull_request:
+    branches: [main]
+    paths:
+      - ".github/workflows/llm-ceval.yml"
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
+    inputs:
+      model_name:
+        description: 'Model names, comma-separated; each must be quoted.'
+        required: true
+        type: string
+      precision:
+        description: 'Precisions, comma-separated; each must be quoted.'
+        required: true
+        type: string
+      runs-on:
+        description: 'Labels to filter the runners, comma-separated; each must be quoted.'
+        default: "accuracy"
+        required: false
+        type: string
+
+# A workflow run is made up of one or more jobs that can run sequentially
+jobs:
+  llm-cpp-build:
+    uses: ./.github/workflows/llm-binary-build.yml
+  # Set the testing matrix based on the event (schedule, PR, or manual dispatch)
+  set-matrix:
+    runs-on: ubuntu-latest
+    outputs:
+      model_name: ${{ steps.set-matrix.outputs.model_name }}
+      precision: ${{ steps.set-matrix.outputs.precision }}
+      runner: ${{ steps.set-matrix.outputs.runner }}
+    steps:
+      - name: set-nightly-env
+        if: ${{github.event_name == 'schedule'}}
+        env:
+          NIGHTLY_MATRIX_MODEL_NAME: '["chatglm2-6b","chinese-llama2-7b", "Qwen-7B-Chat"]'
+          NIGHTLY_MATRIX_PRECISION: '["sym_int4", "fp8_e5m2"]'
+          NIGHTLY_LABELS: '["self-hosted", "llm", "accuracy-nightly"]'
+        run: |
+          echo "model_name=$NIGHTLY_MATRIX_MODEL_NAME" >> $GITHUB_ENV
+          echo "precision=$NIGHTLY_MATRIX_PRECISION" >> $GITHUB_ENV
+          echo "runner=$NIGHTLY_LABELS" >> $GITHUB_ENV
+
+      - name: set-pr-env
+        if: ${{github.event_name == 'pull_request'}}
+        env:
+          PR_MATRIX_MODEL_NAME: '["Qwen-7B-Chat"]'
+          PR_MATRIX_PRECISION: '["sym_int4", "fp8_e5m2"]'
+          PR_LABELS: '["self-hosted", "llm", "temp-arc01"]'
+        run: |
+          echo "model_name=$PR_MATRIX_MODEL_NAME" >> $GITHUB_ENV
+          echo "precision=$PR_MATRIX_PRECISION" >> $GITHUB_ENV
+          echo "runner=$PR_LABELS" >> $GITHUB_ENV
+      - name: set-manual-env
+        if: ${{github.event_name == 'workflow_dispatch'}}
+        env:
+          MANUAL_MATRIX_MODEL_NAME: ${{format('[ {0} ]', inputs.model_name)}}
+          MANUAL_MATRIX_PRECISION: ${{format('[ {0} ]', inputs.precision)}}
+          MANUAL_LABELS: ${{format('["self-hosted", "llm", {0}]', inputs.runs-on)}}
+        run: |
+          echo "model_name=$MANUAL_MATRIX_MODEL_NAME" >> $GITHUB_ENV
+          echo "precision=$MANUAL_MATRIX_PRECISION" >> $GITHUB_ENV
+          echo "runner=$MANUAL_LABELS" >> $GITHUB_ENV
+      - name: set-matrix
+        id: set-matrix
+        run: |
+          echo "model_name=$model_name" >> $GITHUB_OUTPUT
+          echo "precision=$precision" >> $GITHUB_OUTPUT
+          echo "runner=$runner" >> $GITHUB_OUTPUT
+  llm-ceval-evalution:
+    timeout-minutes: 1200
+    needs: [llm-cpp-build, set-matrix]
+    strategy:
+      fail-fast: false
+      matrix:
+        # include:
+        #   python-version: "3.9"
+        #   model_name: "stablelm-3b-4e1t"
+        #   task: "arc"
+        #   precision: "sym_int4" #options: sym_int4, fp4, mixed_fp4, sym_int8, fp8, mixed_fp8
+        python-version: ["3.9"]
+        model_name: ${{ fromJson(needs.set-matrix.outputs.model_name) }}
+        precision: ${{ fromJson(needs.set-matrix.outputs.precision) }}
+        device: [xpu]
+
+    runs-on: ${{ fromJson(needs.set-matrix.outputs.runner) }}
+    env:
+      ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
+      ORIGIN_DIR: /mnt/disk1/models
+      CEVAL_HF_HOME: /mnt/disk1/ceval_home
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        shell: bash
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install --upgrade setuptools==58.0.4
+          python -m pip install --upgrade wheel
+
+      - name: Download llm binary
+        uses: ./.github/actions/llm/download-llm-binary
+
+      - name: Run LLM install (all) test
+        uses: ./.github/actions/llm/setup-llm-env
+        with:
+          extra-dependency: "xpu_2.1"
+
+      - name: Download models and datasets
+        shell: bash
+        run: |
+          echo "MODEL_PATH=${ORIGIN_DIR}/${{ matrix.model_name }}/" >> "$GITHUB_ENV"
+          MODEL_PATH=${ORIGIN_DIR}/${{ matrix.model_name }}/
+          if [ ! -d $CEVAL_HF_HOME ]; then
+            mkdir -p $CEVAL_HF_HOME
+          fi
+          if [ ! -d "$CEVAL_HF_HOME/data" ]; then
+            mkdir -p "$CEVAL_HF_HOME/data"
+          fi
+          if [ -d "$CEVAL_HF_HOME/data/dev" ]; then
+            rm -rf "$CEVAL_HF_HOME/data/dev"
+          fi
+
+          if [ -d "$CEVAL_HF_HOME/data/test" ]; then
+            rm -rf "$CEVAL_HF_HOME/data/test"
+          fi
+
+          if [ -d "$CEVAL_HF_HOME/data/val" ]; then
+            rm -rf "$CEVAL_HF_HOME/data/val"
+          fi
+          wget -r -nH -nc --no-verbose --cut-dirs=1 ${LLM_FTP_URL}/llm/ceval-exam.zip -P "$CEVAL_HF_HOME/data"
+          echo "DATA_PATH=$CEVAL_HF_HOME/data" >> "$GITHUB_ENV"
+          DATA_PATH=$CEVAL_HF_HOME/data
+          unzip -o "$CEVAL_HF_HOME/data/ceval-exam.zip" -d "$CEVAL_HF_HOME/data"
+          wget -r -nH -nc --no-verbose --cut-dirs=1 ${LLM_FTP_URL}/llm/${{ matrix.model_name }} -P ${ORIGIN_DIR}
+
+      - name: Install Dependencies
+        shell: bash
+        run: |
+          pip install transformers==4.31.0
+          pip install thefuzz
+          pip install tiktoken
+          pip install transformers_stream_generator
+
+      - name: Run C-Eval
+        shell: bash
+        working-directory: ${{ github.workspace }}/python/llm/dev/benchmark/ceval
+        env:
+          USE_XETLA: OFF
+          SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS: 1
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          python eval.py \
+          --model_path ${MODEL_PATH} \
+          --eval_type validation \
+          --device xpu \
+          --eval_data_path ${DATA_PATH} \
+          --qtype ${{ matrix.precision }}
+
+      - uses: actions/upload-artifact@v3
+        with:
+          name: ceval_results
+          path:
+            ${{ github.workspace }}/python/llm/dev/benchmark/ceval/results/**
+
+  llm-ceval-summary:
+    if: ${{ always() }}
+    needs: llm-ceval-evalution
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python 3.9
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.9
+      - name: Download all results
+        uses: actions/download-artifact@v3
+        with:
+          name: ceval_results
+          path: results
+      - name: Summarize the results
+        shell: bash
+        run: |
+          ls results
+          python ${{ github.workspace }}/python/llm/dev/benchmark/ceval/organize_results.py results/
diff --git a/python/llm/dev/benchmark/ceval/README.md b/python/llm/dev/benchmark/ceval/README.md
index 608c170c..03e83567 100644
--- a/python/llm/dev/benchmark/ceval/README.md
+++ b/python/llm/dev/benchmark/ceval/README.md
@@ -19,7 +19,6 @@ bash run.sh
 + `run.sh`
 ```shell
 python eval.py \
-    --model_family llama \
     --model_path "path to model" \
     --eval_type validation \
     --device xpu \
diff --git a/python/llm/dev/benchmark/ceval/eval.py b/python/llm/dev/benchmark/ceval/eval.py
index 61ad6f91..e0530d46 100644
--- a/python/llm/dev/benchmark/ceval/eval.py
+++ b/python/llm/dev/benchmark/ceval/eval.py
@@ -222,7 +222,7 @@ hard_list = [
 
 choices = ["A", "B", "C", "D"]
 
-def cal_ceval(res):
+def cal_ceval(res, model_path, qtype):
     acc_sum_dict = dict()
     acc_norm_sum_dict = dict()
     cnt_dict = dict()
@@ -244,13 +244,22 @@ def cal_ceval(res):
             hard_acc_sum += float(res[tt])
         acc_sum_dict[class_] += float(res[tt])
         cnt_dict[class_] += 1
-    print("\n\n\n")
-    for k in ["STEM", "Social Science", "Humanities", "Other"]:
-        if k in cnt_dict:
-            print("%s acc: %.2f " % (k, acc_sum_dict[k] / cnt_dict[k]))
-    if hard_cnt > 0:
-        print("Hard acc:%.2f " % (hard_acc_sum / hard_cnt))
-    print("AVERAGE acc:%.2f " % (acc_sum / cnt))
+
+    result_lst = []
+    subject_names = ["STEM", "Social Science", "Humanities", "Other", "Hard", "Average"]
+    for value in subject_names:
+        if value == "Hard":
+            result_lst.append(f"{hard_acc_sum / hard_cnt:.2f}")
+        elif value == "Average":
+            result_lst.append(f"{acc_sum / cnt:.2f}")
+        else:
+            result_lst.append(f"{acc_sum_dict[value] / cnt_dict[value]:.2f}")
+
+    if not os.path.exists('results/'):
+        os.mkdir('results/')
+
+    dump_dict = {"Model Name": model_path.split('/')[-2], "Precision": qtype, "Results": result_lst}
+    json.dump(dump_dict, open(f'results/{dump_dict["Model Name"]}_{dump_dict["Precision"]}.json','w'), ensure_ascii=False, indent=4)
 
 
 def main(args, evaluator):
@@ -262,8 +271,9 @@ def main(args, evaluator):
             )
             val_df = pd.read_csv(val_file_path)
             score, _ = evaluator.eval_subject(subject_name, val_df, args.eval_type)
+            torch.xpu.empty_cache()
             result[subject_name] = score
-        cal_ceval(result)
+        cal_ceval(result, args.model_path, args.qtype)
     elif args.eval_type == "test":
         all_answers = {}
         for subject_name in tqdm(TASK_NAME_MAPPING.keys()):
@@ -272,6 +282,7 @@ def main(args, evaluator):
             )
             test_df = pd.read_csv(test_file_path)
             _, answers = evaluator.eval_subject(subject_name, test_df, args.eval_type)
+            torch.xpu.empty_cache()
             all_answers[subject_name] = answers
         json.dump(all_answers, open('submission.json','w'), ensure_ascii=False, indent=4)
     else:
@@ -297,7 +308,7 @@ if __name__ == "__main__":
         if family in args.model_path.lower():
             model_family = family
 
-    assert model_family is not None, f"Model {args.model_path}'s model family is not implemented"
+    assert model_family is not None, f"Model {args.model_path}'s evaluator is not implemented"
 
     if model_family == "llama":
         evaluator = LlamaEvaluator(
diff --git a/python/llm/dev/benchmark/ceval/evaluators/chatglm.py b/python/llm/dev/benchmark/ceval/evaluators/chatglm.py
index 435a4317..b501c9a2 100644
--- a/python/llm/dev/benchmark/ceval/evaluators/chatglm.py
+++ b/python/llm/dev/benchmark/ceval/evaluators/chatglm.py
@@ -60,7 +60,7 @@ class ChatGLMEvaluator(Evaluator):
             message.append(self.format_example(dev_df.iloc[i, :], cot=cot))
         return message
 
-    def format_example(self, line, include_answer=True, cot=False, add_prompt=''):
+    def format_example(self, line, include_answer=False, cot=False, add_prompt=''):
         example = add_prompt + line['question']
         # print(example)
         for choice in self.choices:
@@ -110,6 +110,51 @@ class ChatGLMEvaluator(Evaluator):
                 return answer, False
         return '-', False
 
+    def extract_choice(self, gen, prompt, choice_list):
+        res = re.search(
+            r"(?:(?:选|选择|选定)[：:]?\s*|(?:(?:答案|选项)(?![^ABCD]{0,10}?(?:不|非)[^ABCD]{0,10}?(?:是|选|为|：|:|】))[^ABCD]{0,10}?(?:是|选|为|：|:|】))[^ABCD]{0,10}?)(A|B|C|D)(?:选项)?(?:\)|。|\.|，|,|．|、|A|B|C|D|$|：|:|\)|）)",
+            gen,
+        )
+
+        if res is None:
+            res = re.search(
+                r"(A|B|C|D)(?:选?项)?(?![^ABCD]{0,4}?(?:不|非)[^ABCD]{0,4}?(?:正确|对[的，。：]|符合))[^ABCD]{0,4}?(?:正确|对[的，。：]|符合)",
+                gen,
+            )
+
+        if res is None:
+            res = re.search(r"^[\(（]?(A|B|C|D)(?:。|\)|）|\.|，|,|．|：|:|$)", gen)
+
+        if res is None:
+            res = re.search(r"(?<![a-zA-Z])(A|B|C|D)(?![a-zA-Z=])", gen)
+
+        if res is None:
+            return random.choice(choice_list)
+
+        return res.group(1)
+
+    def process_before_extraction(self, gen, prompt, choice_dict):
+        question_split = prompt.rstrip("。").split("。")[-1].split("_")
+
+        if len(question_split[0].strip()) > 4:
+            gen = gen.replace(question_split[0], "答案是")
+        if len(question_split[-1].strip()) > 4:
+            gen = gen.replace(question_split[-1], "")
+
+        for key, val in sorted(choice_dict.items(), key=lambda x: len(x[1]), reverse=True):
+            gen = gen.replace(val.rstrip("。"), key)
+        return gen
+
+    def extract_answer(self, response, row):
+        prompt = row["question"]
+        gen = self.process_before_extraction(
+            response, prompt, {choice: row[choice] for choice in self.choices}
+        )
+        if not isinstance(prompt, str):
+            prompt = prompt[0]
+        pred = self.extract_choice(gen, prompt, [row[choice] for choice in self.choices])
+        return pred
+
     def build_prompt(self, text):
         return "[Round {}]\n\n问：{}\n\n答：".format(1, text)
 
@@ -168,7 +213,7 @@ class ChatGLMEvaluator(Evaluator):
         eval_type="validation", # "test","validation",
         dev_df=None,
         few_shot=False,
-        cot=False,
+        cot=True,
     ):
         if eval_type == "validation":
             correct_num = 0
@@ -200,12 +245,7 @@ class ChatGLMEvaluator(Evaluator):
         elif eval_type == "test":
             answers = {}
             for i, row in tqdm(test_df.iterrows(), total=len(test_df)):
-                question = self.format_example(row)
-                response, _ = self.model.chat(
-                    self.tokenizer,
-                    question,
-                    history=None,
-                )
-                pred = self.extract_answer(response, row)
-                answers[str(i)] = pred
+                question = self.format_example(row, include_answer=False, cot=cot)
+                answers[str(i)] = self.generate_dist(self.model, self.tokenizer, question, do_sample=False, max_length=2048, history=[])
+
         return None, answers
\ No newline at end of file
diff --git a/python/llm/dev/benchmark/ceval/organize_results.py b/python/llm/dev/benchmark/ceval/organize_results.py
new file mode 100644
index 00000000..a3c41110
--- /dev/null
+++ b/python/llm/dev/benchmark/ceval/organize_results.py
@@ -0,0 +1,68 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+import sys
+import json
+
+if __name__ == '__main__':
+    result_path = sys.argv[1]
+
+    column_size = [25, 15, 10, 18, 15, 10, 10, 10]
+    pad_string = lambda x, l: [i.ljust(j) for i, j in zip(x, l)]
+    column_names = ["Model Name", "Precision", "STEM", "Social Science", "Humanities", "Other", "Hard", "Average"]
+
+    print(f'\nDumping results for C-Eval score:\n')
+    print(' '.join(pad_string(column_names, column_size)))
+    print()
+
+    file_lst = os.listdir(result_path)
+    file_lst = [f'{result_path}/{i}' for i in file_lst]
+
+    organized_dict = {} # {'Qwen-7B': {'sym_int4': [], 'mixed_fp4': }}
+    for file in file_lst:
+        # Read the JSON file
+        with open(file, 'r') as file:
+            data = json.load(file)
+
+        result_lst = [data['Model Name'], data['Precision']]
+
+        result_lst += data['Results']
+
+        # store in the organized dictionary
+        try:
+            organized_dict[data['Model Name']][data['Precision']] = result_lst
+        except:
+            organized_dict[data['Model Name']] = {}
+            organized_dict[data['Model Name']][data['Precision']] = result_lst
+
+    # define the print precision order
+    precision_order = ['sym_int4', 'mixed_fp4', 'fp4', 'sym_int8', 'fp8_e4m3', 'fp8_e5m2', 'mixed_fp8']
+
+    # print the results
+    for model_name in organized_dict.keys():
+        for precision in precision_order:
+            try:
+                print(' '.join(pad_string(organized_dict[model_name][precision], column_size)))
+            except KeyError:
+                continue
+
+        # separate between models
+        print()
+
+
+
+
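For reference, a minimal sketch of the hand-off between the two new pieces above: `cal_ceval()` in `eval.py` writes one `results/<Model Name>_<Precision>.json` per run, and `organize_results.py` folds every such file into a padded table. The model name, precision, and score values below are illustrative placeholders, and the script is assumed to be invoked from `python/llm/dev/benchmark/ceval`.

```python
import json
import os
import subprocess

# Fake a single result file with the same shape that cal_ceval() dumps:
# "Results" holds six "%.2f"-formatted strings in the fixed order
# STEM, Social Science, Humanities, Other, Hard, Average.
os.makedirs("results", exist_ok=True)
sample = {
    "Model Name": "Qwen-7B-Chat",   # placeholder model directory name
    "Precision": "sym_int4",        # one of the precisions in precision_order
    "Results": ["0.50", "0.50", "0.50", "0.50", "0.50", "0.50"],  # placeholder scores
}
with open("results/Qwen-7B-Chat_sym_int4.json", "w") as f:
    json.dump(sample, f, ensure_ascii=False, indent=4)

# organize_results.py takes the results directory as its only argument and
# prints one padded row per (model, precision) pair found there.
subprocess.run(["python", "organize_results.py", "results/"], check=True)
```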