add test api transformer_int4_fp16_gpu (#10627)
* add test api transformer_int4_fp16_gpu
* update config.yaml and README.md in all-in-one
* modify run.py in all-in-one
* re-order test-api
* re-order test-api in config
* modify README.md in all-in-one
* modify README.md in all-in-one
* modify config.yaml

---------

Co-authored-by: pengyb2001 <arda@arda-arc21.sh.intel.com>
Co-authored-by: ivy-lv11 <zhicunlv@gmail.com>
parent 47cabe8fcc
commit 2d88bb9b4b
3 changed files with 41 additions and 27 deletions
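At a glance, the new `transformer_int4_fp16_gpu` test API reuses the existing INT4 GPU runner and only threads through an `fp16=True` flag. Below is a minimal, hedged sketch of that dispatch pattern; the stub stands in for the real `run_transformer_int4_gpu` in run.py and is only meant to show the routing, not the benchmark itself.

```python
# Sketch of the dispatch added by this commit. The stub below is NOT the real
# run_transformer_int4_gpu from run.py; only the routing logic is the point.
def run_transformer_int4_gpu_stub(repo_id, fp16=False):
    mode = "INT4 on Intel GPU" + (", fp16 non-linear layers" if fp16 else "")
    return f"{repo_id}: {mode}"

def run_model(repo_id, test_api):
    if test_api == "transformer_int4_gpu":
        return run_transformer_int4_gpu_stub(repo_id)
    elif test_api == "transformer_int4_fp16_gpu":
        # New test API: same runner as above, with fp16 enabled.
        return run_transformer_int4_gpu_stub(repo_id, fp16=True)
    raise ValueError(f"unsupported test_api: {test_api}")

print(run_model("meta-llama/Llama-2-7b-chat-hf", "transformer_int4_fp16_gpu"))
```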
README.md
@@ -2,7 +2,7 @@
All in one benchmark test allows users to test all the benchmarks and record them in a result CSV. Users can provide models and related information in `config.yaml`.

Before running, make sure to have [bigdl-llm](../../../README.md).
Before running, make sure to have [ipex-llm](../../../../../README.md) installed.

## Dependencies
@@ -23,8 +23,7 @@ Config YAML file has following format
```yaml
repo_id:
  - 'THUDM/chatglm-6b'
  - 'THUDM/chatglm2-6b'
  # - 'THUDM/chatglm2-6b'
  - 'meta-llama/Llama-2-7b-chat-hf'
  # - 'liuhaotian/llava-v1.5-7b' # requires a LLAVA_REPO_DIR env variable pointing to the llava dir; added only for gpu win related test_api now
local_model_hub: 'path to your local model hub'

@@ -37,22 +36,30 @@ in_out_pairs:
  - '32-32'
  - '1024-128'
test_api:
  - "transformer_int4"
  - "native_int4"
  - "optimize_model"
  - "pytorch_autocast_bf16"
  # - "transformer_autocast_bf16"
  - "transformer_int4_gpu" # on Intel GPU
  # - "transformer_int4_fp16_gpu" # on Intel GPU, use fp16 for non-linear layer
  # - "ipex_fp16_gpu" # on Intel GPU
  # - "bigdl_fp16_gpu" # on Intel GPU
  # - "transformer_int4_gpu" # on Intel GPU
  # - "optimize_model_gpu" # on Intel GPU
  # - "deepspeed_transformer_int4_cpu" # on Intel SPR Server
  # - "transformer_int4_gpu_win" # on Intel GPU for Windows
  # - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, use fp16 for non-linear layer
  # - "transformer_int4_loadlowbit_gpu_win" # on Intel GPU for Windows using load_low_bit API. Please make sure you have used the save.py to save the converted low bit model
  # - "deepspeed_optimize_model_gpu" # deepspeed autotp on Intel GPU
  # - "speculative_gpu"
  # - "transformer_int4"
  # - "native_int4"
  # - "optimize_model"
  # - "pytorch_autocast_bf16"
  # - "transformer_autocast_bf16"
  # - "bigdl_ipex_bf16"
  # - "bigdl_ipex_int4"
  # - "bigdl_ipex_int8"
  # - "speculative_cpu"
  # - "deepspeed_transformer_int4_cpu" # on Intel SPR Server
cpu_embedding: False # whether put embedding to CPU (only available now for gpu win related test_api)
streaming: False # whether output in streaming way (only available now for gpu win related test_api)
```
## (Optional) Save model in low bit
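Before the config.yaml diff itself, a small hedged sketch of how the entries above are consumed: the benchmark reads `config.yaml`, iterates over the enabled `test_api` values, and records results in a CSV. The PyYAML loading shown here is illustrative only; the real parsing and benchmarking live in run.py.

```python
# Illustrative only: read config.yaml and list the enabled test_api entries.
# Assumes PyYAML is installed; this helper is not part of the repo.
import yaml

with open("config.yaml") as f:
    conf = yaml.safe_load(f)

print("models:", conf["repo_id"])
print("in/out pairs:", conf["in_out_pairs"])
for api in conf["test_api"]:
    # e.g. 'transformer_int4_gpu' or the new 'transformer_int4_fp16_gpu'
    print("will run test_api:", api)
```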
config.yaml
@@ -1,6 +1,5 @@
repo_id:
  - 'THUDM/chatglm-6b'
  - 'THUDM/chatglm2-6b'
  # - 'THUDM/chatglm2-6b'
  - 'meta-llama/Llama-2-7b-chat-hf'
  # - 'liuhaotian/llava-v1.5-7b' # requires a LLAVA_REPO_DIR env variable pointing to the llava dir; added only for gpu win related test_api now
local_model_hub: 'path to your local model hub'
@@ -13,24 +12,25 @@ in_out_pairs:
  - '32-32'
  - '1024-128'
test_api:
  - "transformer_int4"
  - "native_int4"
  - "optimize_model"
  - "pytorch_autocast_bf16"
  # - "transformer_autocast_bf16"
  # - "bigdl_ipex_bf16"
  # - "bigdl_ipex_int4"
  # - "bigdl_ipex_int8"
  - "transformer_int4_gpu" # on Intel GPU
  # - "transformer_int4_fp16_gpu" # on Intel GPU, use fp16 for non-linear layer
  # - "ipex_fp16_gpu" # on Intel GPU
  # - "bigdl_fp16_gpu" # on Intel GPU
  # - "transformer_int4_gpu" # on Intel GPU
  # - "optimize_model_gpu" # on Intel GPU
  # - "deepspeed_transformer_int4_cpu" # on Intel SPR Server
  # - "transformer_int4_gpu_win" # on Intel GPU for Windows
  # - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, use fp16 for non-linear layer
  # - "transformer_int4_loadlowbit_gpu_win" # on Intel GPU for Windows using load_low_bit API. Please make sure you have used the save.py to save the converted low bit model
  # - "deepspeed_optimize_model_gpu" # deepspeed autotp on Intel GPU
  # - "speculative_cpu"
  # - "speculative_gpu"
  # - "transformer_int4"
  # - "native_int4"
  # - "optimize_model"
  # - "pytorch_autocast_bf16"
  # - "transformer_autocast_bf16"
  # - "bigdl_ipex_bf16"
  # - "bigdl_ipex_int4"
  # - "bigdl_ipex_int8"
  # - "speculative_cpu"
  # - "deepspeed_transformer_int4_cpu" # on Intel SPR Server
cpu_embedding: False # whether put embedding to CPU (only available now for gpu win related test_api)
streaming: False # whether output in streaming way (only available now for gpu win related test_api)
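As an aside, the `in_out_pairs` entries such as '32-32' and '1024-128' are read as "input tokens-output tokens". The hypothetical helper below shows that interpretation; run.py performs the equivalent split internally, and this standalone function does not exist in the repo.

```python
# Hypothetical helper, for illustration only: interpret an in_out_pairs entry
# like '1024-128' as (input_tokens, output_tokens).
def parse_in_out_pair(pair: str) -> tuple:
    in_len, out_len = pair.split("-")
    return int(in_len), int(out_len)

print(parse_in_out_pair("32-32"))     # (32, 32)
print(parse_in_out_pair("1024-128"))  # (1024, 128)
```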
run.py
@@ -74,6 +74,8 @@ def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1,
        result = run_optimize_model(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size)
    elif test_api == 'transformer_int4_gpu':
        result = run_transformer_int4_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size)
    elif test_api == 'transformer_int4_fp16_gpu':
        result = run_transformer_int4_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size, fp16=True)
    elif test_api == 'optimize_model_gpu':
        result = run_optimize_model_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size)
    elif test_api == 'pytorch_autocast_bf16':
@@ -388,7 +390,8 @@ def run_transformer_int4_gpu(repo_id,
                             num_trials,
                             num_beams,
                             low_bit,
                             batch_size):
                             batch_size,
                             fp16=False):
    from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
    from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
    import intel_extension_for_pytorch as ipex
@@ -405,12 +408,10 @@ def run_transformer_int4_gpu(repo_id,
        model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True,
                                          trust_remote_code=True, use_cache=True).eval()
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        model = model.to('xpu')
    elif origin_repo_id in LLAMA_IDS:
        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True,
                                                     use_cache=True).eval()
        tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
        model = model.to('xpu')
    else:
        if "4bit" in repo_id:
            model = AutoModelForCausalLM.load_low_bit(model_path, optimize_model=True,
@@ -426,7 +427,13 @@
            model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
                                                         trust_remote_code=True, use_cache=True).eval()
            tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    if fp16:
        model = model.half()
        print("Convert model to half precision")

    model = model.to('xpu')

    end = time.perf_counter()
    load_time = end - st
    print(">> loading of model costs {}s and {}GB".format(load_time, torch.xpu.memory.memory_reserved()/(1024**3)))
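The core of the new path is the `fp16` block above: after loading, the whole model is cast to half precision before being moved to the XPU device. A self-contained illustration of that cast follows; `torch.nn.Linear` is only a stand-in for the benchmarked LLM, and the commented-out `.to('xpu')` call assumes an Intel GPU with intel_extension_for_pytorch installed.

```python
# Self-contained illustration of the fp16 flag's effect; nn.Linear is a
# stand-in for the benchmarked model, not what run.py actually loads.
import torch

model = torch.nn.Linear(16, 16).eval()   # placeholder model with fp32 weights
fp16 = True

if fp16:
    model = model.half()                 # cast parameters to torch.float16
    print("Convert model to half precision")

print(next(model.parameters()).dtype)    # torch.float16
# model = model.to('xpu')                # requires an Intel GPU plus IPEX
```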