From 16febc949cdecb38dc8dc7683ef16c23d59bdea8 Mon Sep 17 00:00:00 2001
From: Mingyu Wei <76120304+Mingyu-Wei@users.noreply.github.com>
Date: Wed, 13 Dec 2023 18:13:06 +0800
Subject: [PATCH] [LLM] Add exclude option in all-in-one performance test
 (#9632)

* add exclude option in all-in-one perf test

* update arc-perf-test.yaml

* Exclude in_out_pairs in main function

* fix some bugs

* address Kai's comments

* define excludes at the beginning

* add bigscience/bloomz-7b1:2048 to exclude
---
 python/llm/dev/benchmark/all-in-one/run.py   | 11 ++++++++++-
 python/llm/test/benchmark/arc-perf-test.yaml |  4 ++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index 3f3f6011..eb24dac7 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -42,6 +42,7 @@ CHATGLM_IDS = ['THUDM/chatglm-6b', 'THUDM/chatglm2-6b', 'THUDM/chatglm3-6b']
 LLAVA_IDS = ['liuhaotian/llava-v1.5-7b']
 
 results = []
+excludes = []
 
 
 def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1, num_trials=3, num_beams=1, low_bit='sym_int4', cpu_embedding=False):
@@ -748,11 +749,19 @@ if __name__ == '__main__':
     from omegaconf import OmegaConf
     conf = OmegaConf.load(f'{current_dir}/config.yaml')
     today = date.today()
+    if 'exclude' in conf:
+        excludes = conf['exclude']
     
     import pandas as pd
     for api in conf.test_api:
         for model in conf.repo_id:
-            run_model(model, api, conf['in_out_pairs'], conf['local_model_hub'], conf['warm_up'], conf['num_trials'], conf['num_beams'],
+            in_out_pairs = conf['in_out_pairs'].copy()
+            if excludes:
+                for in_out in conf['in_out_pairs']:
+                    model_id_input = model + ':' + in_out.split('-')[0]
+                    if model_id_input in excludes:
+                        in_out_pairs.remove(in_out)
+            run_model(model, api, in_out_pairs, conf['local_model_hub'], conf['warm_up'], conf['num_trials'], conf['num_beams'],
                       conf['low_bit'], conf['cpu_embedding'])
         df = pd.DataFrame(results, columns=['model', '1st token avg latency (ms)', '2+ avg latency (ms/token)', 'encoder time (ms)',
                                             'input/output tokens', 'actual input/output tokens', 'num_beams', 'low_bit', 'cpu_embedding', 
diff --git a/python/llm/test/benchmark/arc-perf-test.yaml b/python/llm/test/benchmark/arc-perf-test.yaml
index 82f64d04..bd605028 100644
--- a/python/llm/test/benchmark/arc-perf-test.yaml
+++ b/python/llm/test/benchmark/arc-perf-test.yaml
@@ -27,3 +27,7 @@ in_out_pairs:
 test_api:
   - "transformer_int4_gpu"  # on Intel GPU
 cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
+exclude:
+  - 'fnlp/moss-moon-003-sft:1024'
+  - 'fnlp/moss-moon-003-sft:2048'
+  - 'bigscience/bloomz-7b1:2048'