Speed up gpt-j in gpubenchmark (#9000)

* Speedup gpt-j in gpubenchmark

* Address code review comments
Xin Qiu 2023-09-19 14:22:28 +08:00 committed by GitHub
parent 2a05581da7
commit 37bb0cbf8f


@@ -276,7 +276,7 @@ def run_transformer_int4_gpu(repo_id,
                              warm_up,
                              num_trials):
     from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
-    from transformers import AutoTokenizer
+    from transformers import AutoTokenizer, GPTJForCausalLM
     import intel_extension_for_pytorch as ipex
     model_path = get_model_path(repo_id, local_model_hub)
     # Load model in 4 bit,
@@ -286,14 +286,18 @@ def run_transformer_int4_gpu(repo_id,
         model = AutoModel.from_pretrained(model_path, load_in_4bit=True, optimize_model=True, trust_remote_code=True,
                                           use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.to('xpu')
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_4bit=True,
                                                      trust_remote_code=True, use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.to('xpu')
+        if isinstance(model, GPTJForCausalLM):
+            # For gpt-j model family, this optimization can provide a better performance.
+            model = ipex.optimize(model.eval(), inplace=True)
     end = time.perf_counter()
     print(">> loading of model costs {}s".format(end - st))
-    model = model.to('xpu')
 
     model = BenchmarkWrapper(model)
     result = {}
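For context, the int4 GPU load path after this change looks roughly like the sketch below. It is a minimal, self-contained illustration of the pattern the hunks above introduce, not the full benchmark script; the model path is a hypothetical placeholder.

import time
import intel_extension_for_pytorch as ipex
from bigdl.llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM

model_path = "/path/to/gpt-j-6b"  # hypothetical local checkpoint path

st = time.perf_counter()
# Load in 4 bit with bigdl-llm optimizations enabled
model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_4bit=True,
                                             trust_remote_code=True, use_cache=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Move to the Intel GPU before applying ipex.optimize
model = model.to('xpu')
if isinstance(model, GPTJForCausalLM):
    # For gpt-j model family, this optimization can provide a better performance.
    model = ipex.optimize(model.eval(), inplace=True)
end = time.perf_counter()
print(">> loading of model costs {}s".format(end - st))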
@@ -330,7 +334,7 @@ def run_optimize_model_gpu(repo_id,
                           in_out_pairs,
                           warm_up,
                           num_trials):
-    from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
+    from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM
     from bigdl.llm import optimize_model
     import intel_extension_for_pytorch as ipex
     model_path = get_model_path(repo_id, local_model_hub)
@@ -342,15 +346,19 @@ def run_optimize_model_gpu(repo_id,
                                           trust_remote_code=True, use_cache=True)
         model = optimize_model(model)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.to('xpu')
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype='auto', low_cpu_mem_usage=True,
                                                      trust_remote_code=True, use_cache=True)
         model = optimize_model(model)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.to('xpu')
+        if isinstance(model, GPTJForCausalLM):
+            # For gpt-j model family, this optimization can provide a better performance.
+            model = ipex.optimize(model.eval(), inplace=True)
     end = time.perf_counter()
     print(">> loading of model costs {}s".format(end - st))
-    model = model.to('xpu')
 
     model = BenchmarkWrapper(model)
     result = {}
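The optimize_model path follows the same pattern, but loads the checkpoint with stock transformers and then applies bigdl.llm.optimize_model. A minimal sketch under the same assumptions (placeholder model path, not a value from the benchmark config):

import time
import intel_extension_for_pytorch as ipex
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM
from bigdl.llm import optimize_model

model_path = "/path/to/gpt-j-6b"  # hypothetical local checkpoint path

st = time.perf_counter()
# Load with stock transformers, then apply bigdl-llm low-bit optimizations
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype='auto', low_cpu_mem_usage=True,
                                             trust_remote_code=True, use_cache=True)
model = optimize_model(model)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Move to the Intel GPU before applying ipex.optimize
model = model.to('xpu')
if isinstance(model, GPTJForCausalLM):
    # For gpt-j model family, this optimization can provide a better performance.
    model = ipex.optimize(model.eval(), inplace=True)
end = time.perf_counter()
print(">> loading of model costs {}s".format(end - st))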