LLM: add compressed chatglm3 model (#9892)
* LLM: add compressed chatglm3 model
* small fix
* revert github action
This commit is contained in:

parent 9e2ac5291b
commit 100e0a87e5

2 changed files with 10 additions and 4 deletions
@@ -365,12 +365,17 @@ def run_transformer_int4_gpu(repo_id,
     # Load model in 4 bit,
     # which convert the relevant layers in the model into INT4 format
     st = time.perf_counter()
-    if repo_id in CHATGLM_IDS:
-        model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True,
-                                          trust_remote_code=True, use_cache=True).eval()
+    origin_repo_id = repo_id.replace("-4bit", "")
+    if origin_repo_id in CHATGLM_IDS:
+        if "4bit" in repo_id:
+            model = AutoModel.load_low_bit(model_path, optimize_model=True,
+                                           trust_remote_code=True, use_cache=True).eval()
+        else:
+            model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True,
+                                              trust_remote_code=True, use_cache=True).eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         model = model.to('xpu')
-    elif repo_id in LLAMA_IDS:
+    elif origin_repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True,
                                                      use_cache=True).eval()
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
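For context, the new branch calls AutoModel.load_low_bit, which loads weights that were already converted to INT4, instead of quantizing a full-precision checkpoint at load time. Below is a minimal sketch of how such a pre-converted checkpoint would typically be produced and then consumed with bigdl-llm; the save_low_bit step, the "sym_int4" value, and the local paths are illustrative assumptions, not part of this commit.

# Minimal sketch, assuming bigdl-llm's AutoModel API; paths and the
# one-time save_low_bit conversion are illustrative, not from this commit.
from bigdl.llm.transformers import AutoModel
from transformers import AutoTokenizer

ORIGIN_REPO = "THUDM/chatglm3-6b"    # original full-precision checkpoint
LOW_BIT_DIR = "./chatglm3-6b-4bit"   # hypothetical local save path

# One-time conversion: quantize while loading, then persist the INT4 weights.
model = AutoModel.from_pretrained(ORIGIN_REPO, load_in_low_bit="sym_int4",
                                  optimize_model=True, trust_remote_code=True)
model.save_low_bit(LOW_BIT_DIR)

# Later runs (what the benchmark's new "4bit" branch does): load the
# pre-converted weights directly, skipping on-the-fly quantization.
model = AutoModel.load_low_bit(LOW_BIT_DIR, optimize_model=True,
                               trust_remote_code=True, use_cache=True).eval()
tokenizer = AutoTokenizer.from_pretrained(LOW_BIT_DIR, trust_remote_code=True)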
@@ -2,6 +2,7 @@ repo_id:
   - 'meta-llama/Llama-2-7b-chat-hf'
   - 'meta-llama/Llama-2-13b-chat-hf'
   - 'THUDM/chatglm2-6b'
+  - 'THUDM/chatglm3-6b-4bit'
   - 'tiiuae/falcon-7b-instruct-with-patch'
   - 'mosaicml/mpt-7b-chat'
   - 'redpajama/gptneox-7b-redpajama-bf16'
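The "-4bit" suffix in the new config entry is what routes the benchmark into the load_low_bit branch added above: the suffix is stripped to recover the original repo id for the CHATGLM_IDS lookup, while the presence of "4bit" in the full id selects the pre-converted load path. A small sketch of that routing follows; the CHATGLM_IDS contents are assumed for illustration.

# Sketch of the routing implied by the diff; CHATGLM_IDS contents are assumed.
CHATGLM_IDS = {"THUDM/chatglm2-6b", "THUDM/chatglm3-6b"}

repo_id = "THUDM/chatglm3-6b-4bit"             # the new entry in the config
origin_repo_id = repo_id.replace("-4bit", "")  # -> "THUDM/chatglm3-6b"

if origin_repo_id in CHATGLM_IDS:
    use_preconverted = "4bit" in repo_id        # True -> AutoModel.load_low_bit(...)
    print(origin_repo_id, use_preconverted)     # THUDM/chatglm3-6b True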