add test api transformer_int4_fp16_gpu (#10627)

* add test api transformer_int4_fp16_gpu

* update config.yaml and README.md in all-in-one

* modify run.py in all-in-one

* re-order test-api

* re-order test-api in config

* modify README.md in all-in-one

* modify README.md in all-in-one

* modify config.yaml

---------

Co-authored-by: pengyb2001 <arda@arda-arc21.sh.intel.com>
Co-authored-by: ivy-lv11 <zhicunlv@gmail.com>
Authored by yb-peng on 2024-04-07 15:47:17 +08:00; committed by GitHub
parent 47cabe8fcc
commit 2d88bb9b4b
3 changed files with 41 additions and 27 deletions

README.md (all-in-one benchmark)

@@ -2,7 +2,7 @@
 All in one benchmark test allows users to test all the benchmarks and record them in a result CSV. Users can provide models and related information in `config.yaml`.
-Before running, make sure to have [bigdl-llm](../../../README.md).
+Before running, make sure to have [ipex-llm](../../../../../README.md) installed.
 ## Dependencies
@@ -23,8 +23,7 @@ Config YAML file has following format
 ```yaml
 repo_id:
-  - 'THUDM/chatglm-6b'
-  - 'THUDM/chatglm2-6b'
+  # - 'THUDM/chatglm2-6b'
   - 'meta-llama/Llama-2-7b-chat-hf'
   # - 'liuhaotian/llava-v1.5-7b' # requires a LLAVA_REPO_DIR env variables pointing to the llava dir; added only for gpu win related test_api now
 local_model_hub: 'path to your local model hub'
@@ -37,22 +36,30 @@ in_out_pairs:
   - '32-32'
   - '1024-128'
 test_api:
-  - "transformer_int4"
-  - "native_int4"
-  - "optimize_model"
-  - "pytorch_autocast_bf16"
-  # - "transformer_autocast_bf16"
+  - "transformer_int4_gpu" # on Intel GPU
+  # - "transformer_int4_fp16_gpu" # on Intel GPU, use fp16 for non-linear layer
   # - "ipex_fp16_gpu" # on Intel GPU
   # - "bigdl_fp16_gpu" # on Intel GPU
-  # - "transformer_int4_gpu" # on Intel GPU
   # - "optimize_model_gpu" # on Intel GPU
-  # - "deepspeed_transformer_int4_cpu" # on Intel SPR Server
   # - "transformer_int4_gpu_win" # on Intel GPU for Windows
   # - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, use fp16 for non-linear layer
   # - "transformer_int4_loadlowbit_gpu_win" # on Intel GPU for Windows using load_low_bit API. Please make sure you have used the save.py to save the converted low bit model
+  # - "deepspeed_optimize_model_gpu" # deepspeed autotp on Intel GPU
+  # - "speculative_gpu"
+  # - "transformer_int4"
+  # - "native_int4"
+  # - "optimize_model"
+  # - "pytorch_autocast_bf16"
+  # - "transformer_autocast_bf16"
+  # - "bigdl_ipex_bf16"
+  # - "bigdl_ipex_int4"
+  # - "bigdl_ipex_int8"
+  # - "speculative_cpu"
+  # - "deepspeed_transformer_int4_cpu" # on Intel SPR Server
 cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
 streaming: False # whether output in streaming way (only avaiable now for gpu win related test_api)
 ```
 ## (Optional) Save model in low bit
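Editor's note: the `config.yaml` snippet above is what drives the benchmark; each entry in `test_api` is run against every `repo_id` and every `in_out_pairs` value. Below is a minimal, self-contained sketch of that loop. It assumes PyYAML and uses hypothetical names (`load_config`, the print placeholder); the actual wiring in run.py may differ.

```python
# Illustrative only: shows how a config shaped like the YAML above can drive
# a benchmark loop. Not the real run.py logic.
import yaml  # assumes PyYAML is installed


def load_config(path="config.yaml"):
    with open(path) as f:
        return yaml.safe_load(f)


if __name__ == "__main__":
    conf = load_config()
    for api in conf["test_api"]:              # e.g. "transformer_int4_gpu"
        for model_id in conf["repo_id"]:      # e.g. "meta-llama/Llama-2-7b-chat-hf"
            for pair in conf["in_out_pairs"]:  # e.g. "32-32"
                in_len, out_len = map(int, pair.split("-"))
                print(f"run {api} on {model_id}: prompt={in_len} tokens, generate={out_len} tokens")
```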

config.yaml (all-in-one benchmark)

@@ -1,6 +1,5 @@
 repo_id:
-  - 'THUDM/chatglm-6b'
-  - 'THUDM/chatglm2-6b'
+  # - 'THUDM/chatglm2-6b'
   - 'meta-llama/Llama-2-7b-chat-hf'
   # - 'liuhaotian/llava-v1.5-7b' # requires a LLAVA_REPO_DIR env variables pointing to the llava dir; added only for gpu win related test_api now
 local_model_hub: 'path to your local model hub'
@@ -13,24 +12,25 @@ in_out_pairs:
   - '32-32'
   - '1024-128'
 test_api:
-  - "transformer_int4"
-  - "native_int4"
-  - "optimize_model"
-  - "pytorch_autocast_bf16"
-  # - "transformer_autocast_bf16"
-  # - "bigdl_ipex_bf16"
-  # - "bigdl_ipex_int4"
-  # - "bigdl_ipex_int8"
+  - "transformer_int4_gpu" # on Intel GPU
+  # - "transformer_int4_fp16_gpu" # on Intel GPU, use fp16 for non-linear layer
   # - "ipex_fp16_gpu" # on Intel GPU
   # - "bigdl_fp16_gpu" # on Intel GPU
-  # - "transformer_int4_gpu" # on Intel GPU
   # - "optimize_model_gpu" # on Intel GPU
-  # - "deepspeed_transformer_int4_cpu" # on Intel SPR Server
   # - "transformer_int4_gpu_win" # on Intel GPU for Windows
   # - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, use fp16 for non-linear layer
   # - "transformer_int4_loadlowbit_gpu_win" # on Intel GPU for Windows using load_low_bit API. Please make sure you have used the save.py to save the converted low bit model
   # - "deepspeed_optimize_model_gpu" # deepspeed autotp on Intel GPU
-  # - "speculative_cpu"
   # - "speculative_gpu"
+  # - "transformer_int4"
+  # - "native_int4"
+  # - "optimize_model"
+  # - "pytorch_autocast_bf16"
+  # - "transformer_autocast_bf16"
+  # - "bigdl_ipex_bf16"
+  # - "bigdl_ipex_int4"
+  # - "bigdl_ipex_int8"
+  # - "speculative_cpu"
+  # - "deepspeed_transformer_int4_cpu" # on Intel SPR Server
 cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
 streaming: False # whether output in streaming way (only avaiable now for gpu win related test_api)

run.py (all-in-one benchmark)

@@ -74,6 +74,8 @@ def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1,
         result = run_optimize_model(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size)
     elif test_api == 'transformer_int4_gpu':
         result = run_transformer_int4_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size)
+    elif test_api == 'transformer_int4_fp16_gpu':
+        result = run_transformer_int4_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size, fp16=True)
     elif test_api == 'optimize_model_gpu':
         result = run_optimize_model_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size)
     elif test_api == 'pytorch_autocast_bf16':
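Editor's note: the new `transformer_int4_fp16_gpu` case reuses the existing `run_transformer_int4_gpu` runner and only passes a keyword flag, so existing call sites keep working unchanged. A stripped-down sketch of that dispatch pattern (the function names below are stand-ins, not the real runners):

```python
# Illustrative dispatch pattern: one runner, a keyword flag selects the fp16 variant.
def run_gpu_benchmark(repo_id, fp16=False):
    # Stand-in for run_transformer_int4_gpu; defaults keep the original behavior.
    mode = "int4 + fp16 non-linear layers" if fp16 else "int4"
    return f"{repo_id}: {mode}"


def run_model(repo_id, test_api):
    if test_api == "transformer_int4_gpu":
        return run_gpu_benchmark(repo_id)
    elif test_api == "transformer_int4_fp16_gpu":
        return run_gpu_benchmark(repo_id, fp16=True)
    raise ValueError(f"unknown test_api: {test_api}")


print(run_model("meta-llama/Llama-2-7b-chat-hf", "transformer_int4_fp16_gpu"))
```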
@@ -388,7 +390,8 @@ def run_transformer_int4_gpu(repo_id,
                              num_trials,
                              num_beams,
                              low_bit,
-                             batch_size):
+                             batch_size,
+                             fp16=False):
     from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
     from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
     import intel_extension_for_pytorch as ipex
@@ -405,12 +408,10 @@ def run_transformer_int4_gpu(repo_id,
         model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True,
                                           trust_remote_code=True, use_cache=True).eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-        model = model.to('xpu')
     elif origin_repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True,
                                                      use_cache=True).eval()
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
-        model = model.to('xpu')
     else:
         if "4bit" in repo_id:
             model = AutoModelForCausalLM.load_low_bit(model_path, optimize_model=True,
@@ -426,7 +427,13 @@ def run_transformer_int4_gpu(repo_id,
             model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
                                                          trust_remote_code=True, use_cache=True).eval()
             tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    if fp16:
+        model = model.half()
+        print("Convert model to half precision")
     model = model.to('xpu')
     end = time.perf_counter()
     load_time = end - st
     print(">> loading of model costs {}s and {}GB".format(load_time, torch.xpu.memory.memory_reserved()/(1024**3)))