add test api transformer_int4_fp16_gpu (#10627)
* add test api transformer_int4_fp16_gpu
* update config.yaml and README.md in all-in-one
* modify run.py in all-in-one
* re-order test-api
* re-order test-api in config
* modify README.md in all-in-one
* modify README.md in all-in-one
* modify config.yaml

Co-authored-by: pengyb2001 <arda@arda-arc21.sh.intel.com>
Co-authored-by: ivy-lv11 <zhicunlv@gmail.com>
parent 47cabe8fcc
commit 2d88bb9b4b

3 changed files with 41 additions and 27 deletions
**all-in-one/README.md**

````diff
@@ -2,7 +2,7 @@

 All in one benchmark test allows users to test all the benchmarks and record them in a result CSV. Users can provide models and related information in `config.yaml`.

-Before running, make sure to have [bigdl-llm](../../../README.md).
+Before running, make sure to have [ipex-llm](../../../../../README.md) installed.

 ## Dependencies

@@ -23,8 +23,7 @@ Config YAML file has following format

 ```yaml
 repo_id:
-  - 'THUDM/chatglm-6b'
-  - 'THUDM/chatglm2-6b'
+  # - 'THUDM/chatglm2-6b'
   - 'meta-llama/Llama-2-7b-chat-hf'
   # - 'liuhaotian/llava-v1.5-7b' # requires a LLAVA_REPO_DIR env variables pointing to the llava dir; added only for gpu win related test_api now
 local_model_hub: 'path to your local model hub'
@@ -37,22 +36,30 @@ in_out_pairs:
   - '32-32'
   - '1024-128'
 test_api:
-  - "transformer_int4"
-  - "native_int4"
-  - "optimize_model"
-  - "pytorch_autocast_bf16"
-  # - "transformer_autocast_bf16"
+  - "transformer_int4_gpu" # on Intel GPU
+  # - "transformer_int4_fp16_gpu" # on Intel GPU, use fp16 for non-linear layer
   # - "ipex_fp16_gpu" # on Intel GPU
   # - "bigdl_fp16_gpu" # on Intel GPU
-  # - "transformer_int4_gpu" # on Intel GPU
   # - "optimize_model_gpu" # on Intel GPU
-  # - "deepspeed_transformer_int4_cpu" # on Intel SPR Server
   # - "transformer_int4_gpu_win" # on Intel GPU for Windows
   # - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, use fp16 for non-linear layer
   # - "transformer_int4_loadlowbit_gpu_win" # on Intel GPU for Windows using load_low_bit API. Please make sure you have used the save.py to save the converted low bit model
+  # - "deepspeed_optimize_model_gpu" # deepspeed autotp on Intel GPU
+  # - "speculative_gpu"
+  # - "transformer_int4"
+  # - "native_int4"
+  # - "optimize_model"
+  # - "pytorch_autocast_bf16"
+  # - "transformer_autocast_bf16"
+  # - "bigdl_ipex_bf16"
+  # - "bigdl_ipex_int4"
+  # - "bigdl_ipex_int8"
+  # - "speculative_cpu"
+  # - "deepspeed_transformer_int4_cpu" # on Intel SPR Server
 cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
 streaming: False # whether output in streaming way (only avaiable now for gpu win related test_api)

 ```

 ## (Optional) Save model in low bit
````
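For orientation, here is a minimal sketch (not part of this commit, and not the actual `run.py`, which supports many more options) of how a `config.yaml` like the one above drives the benchmark loop; `run_one` is a hypothetical stand-in for the per-API runner functions:

```python
import yaml  # assumes PyYAML is installed


def run_one(api, repo_id, in_out_pair, **kwargs):
    # Hypothetical stand-in for the per-API runners in run.py (e.g. run_transformer_int4_gpu).
    print(f"[{api}] {repo_id} {in_out_pair} {kwargs}")


def main(config_path="config.yaml"):
    with open(config_path) as f:
        conf = yaml.safe_load(f)
    for api in conf["test_api"]:              # e.g. "transformer_int4_gpu"
        for repo_id in conf["repo_id"]:       # e.g. "meta-llama/Llama-2-7b-chat-hf"
            for pair in conf["in_out_pairs"]: # e.g. "1024-128"
                run_one(api, repo_id, pair,
                        local_model_hub=conf["local_model_hub"],
                        cpu_embedding=conf.get("cpu_embedding", False),
                        streaming=conf.get("streaming", False))


if __name__ == "__main__":
    main()
```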
**all-in-one/config.yaml**

```diff
@@ -1,6 +1,5 @@
 repo_id:
-  - 'THUDM/chatglm-6b'
-  - 'THUDM/chatglm2-6b'
+  # - 'THUDM/chatglm2-6b'
   - 'meta-llama/Llama-2-7b-chat-hf'
   # - 'liuhaotian/llava-v1.5-7b' # requires a LLAVA_REPO_DIR env variables pointing to the llava dir; added only for gpu win related test_api now
 local_model_hub: 'path to your local model hub'
@@ -13,24 +12,25 @@ in_out_pairs:
   - '32-32'
   - '1024-128'
 test_api:
-  - "transformer_int4"
-  - "native_int4"
-  - "optimize_model"
-  - "pytorch_autocast_bf16"
-  # - "transformer_autocast_bf16"
-  # - "bigdl_ipex_bf16"
-  # - "bigdl_ipex_int4"
-  # - "bigdl_ipex_int8"
+  - "transformer_int4_gpu" # on Intel GPU
+  # - "transformer_int4_fp16_gpu" # on Intel GPU, use fp16 for non-linear layer
   # - "ipex_fp16_gpu" # on Intel GPU
   # - "bigdl_fp16_gpu" # on Intel GPU
-  # - "transformer_int4_gpu" # on Intel GPU
   # - "optimize_model_gpu" # on Intel GPU
-  # - "deepspeed_transformer_int4_cpu" # on Intel SPR Server
   # - "transformer_int4_gpu_win" # on Intel GPU for Windows
   # - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, use fp16 for non-linear layer
   # - "transformer_int4_loadlowbit_gpu_win" # on Intel GPU for Windows using load_low_bit API. Please make sure you have used the save.py to save the converted low bit model
   # - "deepspeed_optimize_model_gpu" # deepspeed autotp on Intel GPU
-  # - "speculative_cpu"
   # - "speculative_gpu"
+  # - "transformer_int4"
+  # - "native_int4"
+  # - "optimize_model"
+  # - "pytorch_autocast_bf16"
+  # - "transformer_autocast_bf16"
+  # - "bigdl_ipex_bf16"
+  # - "bigdl_ipex_int4"
+  # - "bigdl_ipex_int8"
+  # - "speculative_cpu"
+  # - "deepspeed_transformer_int4_cpu" # on Intel SPR Server
 cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
 streaming: False # whether output in streaming way (only avaiable now for gpu win related test_api)
```
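Each `in_out_pairs` entry packs the input and output token lengths into an `'<in>-<out>'` string (e.g. `'1024-128'`). A small sketch of how such an entry can be split into integers; the helper name is illustrative and not part of `run.py`:

```python
def parse_in_out_pair(pair: str) -> tuple[int, int]:
    """Split an in_out_pairs entry such as '1024-128' into (input_len, output_len)."""
    in_len, out_len = (int(x) for x in pair.split("-"))
    return in_len, out_len


assert parse_in_out_pair("32-32") == (32, 32)
assert parse_in_out_pair("1024-128") == (1024, 128)
```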
**all-in-one/run.py**

```diff
@@ -74,6 +74,8 @@ def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1,
         result = run_optimize_model(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size)
     elif test_api == 'transformer_int4_gpu':
         result = run_transformer_int4_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size)
+    elif test_api == 'transformer_int4_fp16_gpu':
+        result = run_transformer_int4_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size, fp16=True)
     elif test_api == 'optimize_model_gpu':
         result = run_optimize_model_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size)
     elif test_api == 'pytorch_autocast_bf16':
```
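`run_model` dispatches on the `test_api` string with an `if`/`elif` chain; the new `transformer_int4_fp16_gpu` branch reuses `run_transformer_int4_gpu` and only flips `fp16=True`. A table-driven sketch of the same mapping (an illustration using `functools.partial`, not how `run.py` is actually written):

```python
from functools import partial


def run_transformer_int4_gpu(repo_id, *args, fp16=False, **kwargs):
    # Stub standing in for the real runner in run.py.
    print(f"benchmarking {repo_id} on Intel GPU, fp16 for non-linear layers: {fp16}")


RUNNERS = {
    "transformer_int4_gpu": run_transformer_int4_gpu,
    # The new test_api shares the same runner; only the fp16 flag differs.
    "transformer_int4_fp16_gpu": partial(run_transformer_int4_gpu, fp16=True),
}

RUNNERS["transformer_int4_fp16_gpu"]("meta-llama/Llama-2-7b-chat-hf")
```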
```diff
@@ -388,7 +390,8 @@ def run_transformer_int4_gpu(repo_id,
                              num_trials,
                              num_beams,
                              low_bit,
-                             batch_size):
+                             batch_size,
+                             fp16=False):
     from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
     from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
     import intel_extension_for_pytorch as ipex
@@ -405,12 +408,10 @@ def run_transformer_int4_gpu(repo_id,
         model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True,
                                           trust_remote_code=True, use_cache=True).eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-        model = model.to('xpu')
     elif origin_repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True,
                                                      use_cache=True).eval()
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
-        model = model.to('xpu')
     else:
         if "4bit" in repo_id:
             model = AutoModelForCausalLM.load_low_bit(model_path, optimize_model=True,
@@ -426,7 +427,13 @@
             model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
                                                          trust_remote_code=True, use_cache=True).eval()
             tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+
+    if fp16:
+        model = model.half()
+        print("Convert model to half precision")
+
     model = model.to('xpu')
+
     end = time.perf_counter()
     load_time = end - st
     print(">> loading of model costs {}s and {}GB".format(load_time, torch.xpu.memory.memory_reserved()/(1024**3)))
```
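The behavioural core of the change is the load sequence in `run_transformer_int4_gpu`: load the model in the chosen low-bit format, optionally cast it to half precision when `fp16=True` (per the config comment, this puts the non-linear layers in fp16 while the INT4 linear weights stay low-bit), and only then move it to the XPU. A condensed sketch of that sequence using the same calls as the diff; the model path and `low_bit` value below are placeholders:

```python
import intel_extension_for_pytorch as ipex  # noqa: F401  (registers the XPU backend)
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_path = "/path/to/local/Llama-2-7b-chat-hf"  # placeholder local model path
low_bit = "sym_int4"                              # placeholder low-bit format
fp16 = True                                       # what test_api 'transformer_int4_fp16_gpu' selects

model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
                                             trust_remote_code=True, use_cache=True).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

if fp16:
    # Cast the remaining floating-point modules to fp16; quantized linears keep their low-bit weights.
    model = model.half()

model = model.to('xpu')  # move to the Intel GPU only after the optional fp16 cast
```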