diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index 16b825dc..00d6161c 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -365,12 +365,17 @@ def run_transformer_int4_gpu(repo_id,
     # Load model in 4 bit,
     # which convert the relevant layers in the model into INT4 format
     st = time.perf_counter()
-    if repo_id in CHATGLM_IDS:
-        model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True,
-                                          trust_remote_code=True, use_cache=True).eval()
+    origin_repo_id = repo_id.replace("-4bit", "")
+    if origin_repo_id in CHATGLM_IDS:
+        if "4bit" in repo_id:
+            model = AutoModel.load_low_bit(model_path, optimize_model=True,
+                                           trust_remote_code=True, use_cache=True).eval()
+        else:
+            model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True,
+                                              trust_remote_code=True, use_cache=True).eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         model = model.to('xpu')
-    elif repo_id in LLAMA_IDS:
+    elif origin_repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True,
                                                      use_cache=True).eval()
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
diff --git a/python/llm/test/benchmark/arc-perf-test.yaml b/python/llm/test/benchmark/arc-perf-test.yaml
index fdf7adac..61b8d454 100644
--- a/python/llm/test/benchmark/arc-perf-test.yaml
+++ b/python/llm/test/benchmark/arc-perf-test.yaml
@@ -2,6 +2,7 @@ repo_id:
   - 'meta-llama/Llama-2-7b-chat-hf'
   - 'meta-llama/Llama-2-13b-chat-hf'
   - 'THUDM/chatglm2-6b'
+  - 'THUDM/chatglm3-6b-4bit'
   - 'tiiuae/falcon-7b-instruct-with-patch'
   - 'mosaicml/mpt-7b-chat'
   - 'redpajama/gptneox-7b-redpajama-bf16'
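
The patch teaches run_transformer_int4_gpu to recognize repo ids carrying a "-4bit" suffix (such as the new 'THUDM/chatglm3-6b-4bit' entry in arc-perf-test.yaml): the suffix is stripped so the model still matches CHATGLM_IDS/LLAMA_IDS, and the weights are loaded from an already-converted low-bit checkpoint via AutoModel.load_low_bit instead of being re-quantized on every run with from_pretrained(load_in_low_bit=...). Below is a minimal sketch, assuming the bigdl.llm.transformers API already used in this benchmark script, of how such a pre-converted checkpoint could be produced once with save_low_bit and re-loaded afterwards; the paths and the "sym_int4" precision are illustrative assumptions, not part of this patch.

    # Minimal sketch (assumption: bigdl.llm.transformers, as used by the benchmark script).
    from bigdl.llm.transformers import AutoModel
    from transformers import AutoTokenizer

    origin_path = "THUDM/chatglm3-6b"      # original checkpoint (hypothetical path)
    low_bit_path = "./chatglm3-6b-4bit"    # directory for the converted model (hypothetical)

    # One-off conversion: quantize the relevant layers to 4 bit and persist the result.
    model = AutoModel.from_pretrained(origin_path, load_in_low_bit="sym_int4",
                                      optimize_model=True, trust_remote_code=True)
    model.save_low_bit(low_bit_path)
    AutoTokenizer.from_pretrained(origin_path, trust_remote_code=True).save_pretrained(low_bit_path)

    # Later runs (the path the patched benchmark takes for "-4bit" repo ids)
    # load the saved low-bit weights directly and skip the conversion step.
    model = AutoModel.load_low_bit(low_bit_path, optimize_model=True,
                                   trust_remote_code=True, use_cache=True).eval()
    tokenizer = AutoTokenizer.from_pretrained(low_bit_path, trust_remote_code=True)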