From 2d88bb9b4b875d0e32bedd3a692ba5981e0fb7f7 Mon Sep 17 00:00:00 2001
From: yb-peng <75617475+pengyb2001@users.noreply.github.com>
Date: Sun, 7 Apr 2024 15:47:17 +0800
Subject: [PATCH] add test api transformer_int4_fp16_gpu (#10627)

* add test api transformer_int4_fp16_gpu

* update config.yaml and README.md in all-in-one

* modify run.py in all-in-one

* re-order test-api

* re-order test-api in config

* modify README.md in all-in-one

* modify README.md in all-in-one

* modify config.yaml

---------

Co-authored-by: pengyb2001
Co-authored-by: ivy-lv11
---
 python/llm/dev/benchmark/all-in-one/README.md | 27 ++++++++++++-------
 .../llm/dev/benchmark/all-in-one/config.yaml  | 26 +++++++++---------
 python/llm/dev/benchmark/all-in-one/run.py    | 15 ++++++++---
 3 files changed, 41 insertions(+), 27 deletions(-)

diff --git a/python/llm/dev/benchmark/all-in-one/README.md b/python/llm/dev/benchmark/all-in-one/README.md
index 780ee4cc..1407b240 100644
--- a/python/llm/dev/benchmark/all-in-one/README.md
+++ b/python/llm/dev/benchmark/all-in-one/README.md
@@ -2,7 +2,7 @@
 
 All in one benchmark test allows users to test all the benchmarks and record them in a result CSV. Users can provide models and related information in `config.yaml`.
 
-Before running, make sure to have [bigdl-llm](../../../README.md).
+Before running, make sure to have [ipex-llm](../../../../../README.md) installed.
 
 ## Dependencies
 
@@ -23,8 +23,7 @@ Config YAML file has following format
 
 ```yaml
 repo_id:
-  - 'THUDM/chatglm-6b'
-  - 'THUDM/chatglm2-6b'
+  # - 'THUDM/chatglm2-6b'
   - 'meta-llama/Llama-2-7b-chat-hf'
   # - 'liuhaotian/llava-v1.5-7b' # requires a LLAVA_REPO_DIR env variables pointing to the llava dir; added only for gpu win related test_api now
 local_model_hub: 'path to your local model hub'
@@ -37,22 +36,30 @@ in_out_pairs:
   - '32-32'
   - '1024-128'
 test_api:
-  - "transformer_int4"
-  - "native_int4"
-  - "optimize_model"
-  - "pytorch_autocast_bf16"
-  # - "transformer_autocast_bf16"
+  - "transformer_int4_gpu"                  # on Intel GPU
+  # - "transformer_int4_fp16_gpu"           # on Intel GPU, use fp16 for non-linear layer
   # - "ipex_fp16_gpu"                       # on Intel GPU
   # - "bigdl_fp16_gpu"                      # on Intel GPU
-  # - "transformer_int4_gpu"                # on Intel GPU
   # - "optimize_model_gpu"                  # on Intel GPU
-  # - "deepspeed_transformer_int4_cpu"      # on Intel SPR Server
   # - "transformer_int4_gpu_win"            # on Intel GPU for Windows
   # - "transformer_int4_fp16_gpu_win"       # on Intel GPU for Windows, use fp16 for non-linear layer
   # - "transformer_int4_loadlowbit_gpu_win" # on Intel GPU for Windows using load_low_bit API. Please make sure you have used the save.py to save the converted low bit model
+  # - "deepspeed_optimize_model_gpu"        # deepspeed autotp on Intel GPU
+  # - "speculative_gpu"
+  # - "transformer_int4"
+  # - "native_int4"
+  # - "optimize_model"
+  # - "pytorch_autocast_bf16"
+  # - "transformer_autocast_bf16"
+  # - "bigdl_ipex_bf16"
+  # - "bigdl_ipex_int4"
+  # - "bigdl_ipex_int8"
+  # - "speculative_cpu"
+  # - "deepspeed_transformer_int4_cpu"      # on Intel SPR Server
 cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
 streaming: False # whether output in streaming way (only avaiable now for gpu win related test_api)
+
 ```
 
 ## (Optional) Save model in low bit
 
diff --git a/python/llm/dev/benchmark/all-in-one/config.yaml b/python/llm/dev/benchmark/all-in-one/config.yaml
index 0331271b..9132910c 100644
--- a/python/llm/dev/benchmark/all-in-one/config.yaml
+++ b/python/llm/dev/benchmark/all-in-one/config.yaml
@@ -1,6 +1,5 @@
 repo_id:
-  - 'THUDM/chatglm-6b'
-  - 'THUDM/chatglm2-6b'
+  # - 'THUDM/chatglm2-6b'
   - 'meta-llama/Llama-2-7b-chat-hf'
   # - 'liuhaotian/llava-v1.5-7b' # requires a LLAVA_REPO_DIR env variables pointing to the llava dir; added only for gpu win related test_api now
 local_model_hub: 'path to your local model hub'
@@ -13,24 +12,25 @@ in_out_pairs:
   - '32-32'
   - '1024-128'
 test_api:
-  - "transformer_int4"
-  - "native_int4"
-  - "optimize_model"
-  - "pytorch_autocast_bf16"
-  # - "transformer_autocast_bf16"
-  # - "bigdl_ipex_bf16"
-  # - "bigdl_ipex_int4"
-  # - "bigdl_ipex_int8"
+  - "transformer_int4_gpu"                  # on Intel GPU
+  # - "transformer_int4_fp16_gpu"           # on Intel GPU, use fp16 for non-linear layer
   # - "ipex_fp16_gpu"                       # on Intel GPU
   # - "bigdl_fp16_gpu"                      # on Intel GPU
-  # - "transformer_int4_gpu"                # on Intel GPU
   # - "optimize_model_gpu"                  # on Intel GPU
-  # - "deepspeed_transformer_int4_cpu"      # on Intel SPR Server
   # - "transformer_int4_gpu_win"            # on Intel GPU for Windows
   # - "transformer_int4_fp16_gpu_win"       # on Intel GPU for Windows, use fp16 for non-linear layer
   # - "transformer_int4_loadlowbit_gpu_win" # on Intel GPU for Windows using load_low_bit API. Please make sure you have used the save.py to save the converted low bit model
   # - "deepspeed_optimize_model_gpu"        # deepspeed autotp on Intel GPU
-  # - "speculative_cpu"
   # - "speculative_gpu"
+  # - "transformer_int4"
+  # - "native_int4"
+  # - "optimize_model"
+  # - "pytorch_autocast_bf16"
+  # - "transformer_autocast_bf16"
+  # - "bigdl_ipex_bf16"
+  # - "bigdl_ipex_int4"
+  # - "bigdl_ipex_int8"
+  # - "speculative_cpu"
+  # - "deepspeed_transformer_int4_cpu"      # on Intel SPR Server
 cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
 streaming: False # whether output in streaming way (only avaiable now for gpu win related test_api)
diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index 8d57bc88..682e4880 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -74,6 +74,8 @@ def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1,
         result = run_optimize_model(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size)
     elif test_api == 'transformer_int4_gpu':
         result = run_transformer_int4_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size)
+    elif test_api == 'transformer_int4_fp16_gpu':
+        result = run_transformer_int4_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size, fp16=True)
     elif test_api == 'optimize_model_gpu':
         result = run_optimize_model_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size)
     elif test_api == 'pytorch_autocast_bf16':
@@ -388,7 +390,8 @@ def run_transformer_int4_gpu(repo_id,
                              num_trials,
                              num_beams,
                              low_bit,
-                             batch_size):
+                             batch_size,
+                             fp16=False):
     from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
     from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
     import intel_extension_for_pytorch as ipex
@@ -405,12 +408,10 @@ def run_transformer_int4_gpu(repo_id,
         model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True,
                                           trust_remote_code=True, use_cache=True).eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-        model = model.to('xpu')
     elif origin_repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True,
                                                      use_cache=True).eval()
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
-        model = model.to('xpu')
     else:
         if "4bit" in repo_id:
             model = AutoModelForCausalLM.load_low_bit(model_path, optimize_model=True,
@@ -426,7 +427,13 @@ def run_transformer_int4_gpu(repo_id,
             model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True,
                                                          load_in_low_bit=low_bit, trust_remote_code=True, use_cache=True).eval()
             tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-        model = model.to('xpu')
+
+    if fp16:
+        model = model.half()
+        print("Convert model to half precision")
+
+    model = model.to('xpu')
+
     end = time.perf_counter()
     load_time = end - st
     print(">> loading of model costs {}s and {}GB".format(load_time, torch.xpu.memory.memory_reserved()/(1024**3)))
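For reference, below is a minimal standalone sketch (not part of the patch) of the code path the new `transformer_int4_fp16_gpu` test API exercises: load the model in int4 through ipex-llm, optionally cast the remaining non-quantized layers to fp16 with `.half()`, then move the model to the Intel GPU (`'xpu'`). The model id, prompt, token count, and the `'sym_int4'` low-bit value are illustrative placeholders; in the benchmark these come from `config.yaml`.

```python
import torch
import intel_extension_for_pytorch as ipex  # registers the 'xpu' device
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_path = 'meta-llama/Llama-2-7b-chat-hf'  # placeholder; or a path under local_model_hub

# int4 load, mirroring run_transformer_int4_gpu in run.py
model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True,
                                             load_in_low_bit='sym_int4',
                                             trust_remote_code=True, use_cache=True).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

fp16 = True  # what selecting 'transformer_int4_fp16_gpu' toggles in run.py
if fp16:
    model = model.half()  # fp16 for the non-linear (non-quantized) layers

model = model.to('xpu')

with torch.inference_mode():
    input_ids = tokenizer("What is AI?", return_tensors="pt").input_ids.to('xpu')
    output = model.generate(input_ids, num_beams=1, max_new_tokens=32)
    print(tokenizer.decode(output[0], skip_special_tokens=True))
```

In the benchmark itself this path is selected from `config.yaml`: uncomment `transformer_int4_fp16_gpu` under `test_api` and run `run.py`, which dispatches to `run_transformer_int4_gpu(..., fp16=True)`.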