From fd81d6604781307e1330f8073fdbacafe4a1afcc Mon Sep 17 00:00:00 2001
From: WeiguangHan
Date: Mon, 4 Mar 2024 17:53:03 +0800
Subject: [PATCH] LLM: Compress some models to save space (#10315)

* LLM: compress some models to save space

* add deleted comments
---
 python/llm/dev/benchmark/all-in-one/run.py   | 20 ++++++++++++--------
 python/llm/test/benchmark/arc-perf-test.yaml | 14 +++++++-------
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index 74332cb8..12d654b0 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -398,15 +398,19 @@ def run_transformer_int4_gpu(repo_id,
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
         model = model.to('xpu')
     else:
-        if 'starcoder' in repo_id:
-            # Load starcoder-15.5b model in bf16 format to avoid CPU OOM.
-            model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
-                                                         trust_remote_code=True, use_cache=True, torch_dtype=torch.bfloat16).eval()
-            # Convert the low-bit model back to fp32 for performance considerations.
-            model = model.float()
+        if "4bit" in repo_id:
+            model = AutoModelForCausalLM.load_low_bit(model_path, optimize_model=True,
+                                                      trust_remote_code=True, use_cache=True).eval()
         else:
-            model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
-                                                         trust_remote_code=True, use_cache=True).eval()
+            if 'starcoder' in repo_id:
+                # Load starcoder-15.5b model in bf16 format to avoid CPU OOM.
+                model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
+                                                             trust_remote_code=True, use_cache=True, torch_dtype=torch.bfloat16).eval()
+                # Convert the low-bit model back to fp32 for performance considerations.
+                model = model.float()
+            else:
+                model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
+                                                             trust_remote_code=True, use_cache=True).eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         model = model.to('xpu')
         if isinstance(model, GPTJForCausalLM):
diff --git a/python/llm/test/benchmark/arc-perf-test.yaml b/python/llm/test/benchmark/arc-perf-test.yaml
index 2e1a6da8..190c11df 100644
--- a/python/llm/test/benchmark/arc-perf-test.yaml
+++ b/python/llm/test/benchmark/arc-perf-test.yaml
@@ -6,7 +6,7 @@ repo_id:
   - 'tiiuae/falcon-7b-instruct-with-patch'
   - 'mosaicml/mpt-7b-chat'
   - 'redpajama/gptneox-7b-redpajama-bf16'
-  - 'bigcode/starcoder-15.5b'
+  - 'bigcode/starcoder-15.5b-4bit'
   - 'databricks/dolly-v1-6b'
   - 'databricks/dolly-v2-7b'
   - 'databricks/dolly-v2-12b'
@@ -14,9 +14,9 @@ repo_id:
   - 'Qwen/Qwen-7B-Chat'
   - 'BAAI/AquilaChat-7B'
   - 'baichuan-inc/Baichuan2-7B-Chat'
-  - 'baichuan-inc/Baichuan2-13B-Chat'
+  - 'baichuan-inc/Baichuan2-13B-Chat-4bit'
   - 'bigscience/bloomz-7b1'
-  - 'fnlp/moss-moon-003-sft'
+  - 'fnlp/moss-moon-003-sft-4bit'
 local_model_hub: '/mnt/disk1/models'
 warm_up: 1
 num_trials: 3
@@ -31,8 +31,8 @@ test_api:
   - "transformer_int4_gpu" # on Intel GPU
 cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
 exclude:
-  - 'fnlp/moss-moon-003-sft:1024'
-  - 'fnlp/moss-moon-003-sft:2048'
-  - 'baichuan-inc/Baichuan2-13B-Chat:1024'
-  - 'baichuan-inc/Baichuan2-13B-Chat:2048'
+  - 'fnlp/moss-moon-003-sft-4bit:1024'
+  - 'fnlp/moss-moon-003-sft-4bit:2048'
+  - 'baichuan-inc/Baichuan2-13B-Chat-4bit:1024'
+  - 'baichuan-inc/Baichuan2-13B-Chat-4bit:2048'
   - 'bigscience/bloomz-7b1:2048'
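
For reference, the new "-4bit" entries in arc-perf-test.yaml assume model folders that were already converted and saved in a low-bit format, so the benchmark can read them directly with AutoModelForCausalLM.load_low_bit() instead of quantizing on the fly. A minimal sketch of how such a folder could be prepared, assuming the bigdl-llm transformers API (from_pretrained with load_in_low_bit, paired with save_low_bit); the source and destination paths below are hypothetical examples, not taken from the patch:

    # Sketch: pre-compress a model once so the benchmark can later load it with load_low_bit().
    # Assumptions: bigdl-llm transformers API; paths are hypothetical examples.
    from bigdl.llm.transformers import AutoModelForCausalLM
    from transformers import AutoTokenizer

    src_path = '/mnt/disk1/models/Baichuan2-13B-Chat'        # hypothetical source folder
    dst_path = '/mnt/disk1/models/Baichuan2-13B-Chat-4bit'   # folder a '-4bit' repo_id would map to

    # Quantize to 4-bit while loading, then persist the compressed weights to disk.
    model = AutoModelForCausalLM.from_pretrained(src_path, load_in_low_bit='sym_int4',
                                                 optimize_model=True, trust_remote_code=True)
    model.save_low_bit(dst_path)

    # Save the tokenizer alongside so the compressed folder is self-contained.
    tokenizer = AutoTokenizer.from_pretrained(src_path, trust_remote_code=True)
    tokenizer.save_pretrained(dst_path)

Loading the resulting folder skips the in-process conversion step, which is why run.py now branches on "4bit" in repo_id before falling back to the generic from_pretrained path.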