Speed up gpt-j in gpubenchmark (#9000)

* Speedup gpt-j in gpubenchmark
* Meet code review

parent 2a05581da7
commit 37bb0cbf8f

1 changed file with 13 additions and 5 deletions
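
At its core, the change moves the model = model.to('xpu') call into the load branches and, when the loaded model belongs to the GPT-J family, additionally runs it through ipex.optimize before benchmarking. Below is a minimal sketch of that pattern using only the APIs that appear in this diff; the model path is a placeholder, not something taken from the benchmark config.

    # Sketch only: BigDL-LLM INT4 load, move to the Intel GPU, then the extra
    # IPEX optimization for the GPT-J family. 'path/to/gpt-j-6b' is a placeholder.
    import intel_extension_for_pytorch as ipex
    from transformers import GPTJForCausalLM
    from bigdl.llm.transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained('path/to/gpt-j-6b', load_in_4bit=True,
                                                 optimize_model=True, trust_remote_code=True,
                                                 use_cache=True)
    model = model.to('xpu')
    if isinstance(model, GPTJForCausalLM):
        # For the gpt-j model family, this can provide better performance.
        model = ipex.optimize(model.eval(), inplace=True)
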
@@ -276,7 +276,7 @@ def run_transformer_int4_gpu(repo_id,
                              warm_up,
                              num_trials):
     from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
-    from transformers import AutoTokenizer
+    from transformers import AutoTokenizer, GPTJForCausalLM
     import intel_extension_for_pytorch as ipex
     model_path = get_model_path(repo_id, local_model_hub)
     # Load model in 4 bit,
@@ -286,14 +286,18 @@ def run_transformer_int4_gpu(repo_id,
         model = AutoModel.from_pretrained(model_path, load_in_4bit=True, optimize_model=True, trust_remote_code=True,
                                           use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.to('xpu')
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_4bit=True,
                                                      trust_remote_code=True, use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.to('xpu')
+        if isinstance(model, GPTJForCausalLM):
+            # For gpt-j model family, this optimization can provide a better performance.
+            model = ipex.optimize(model.eval(), inplace=True)
     end = time.perf_counter()
     print(">> loading of model costs {}s".format(end - st))
 
-    model = model.to('xpu')
     model = BenchmarkWrapper(model)
 
     result = {}
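
Once loaded, moved to the XPU and (for GPT-J) optimized, the model is wrapped in BenchmarkWrapper and driven by the benchmark loop, which is outside this diff. The snippet below is only a rough sketch of how a single timed generate call on the XPU could look, assuming model and tokenizer are the objects loaded above; the prompt and token count are placeholders.

    # Rough sketch of one timed generate call on the XPU; assumes
    # intel_extension_for_pytorch has already been imported (as in the function above).
    import time
    import torch

    prompt = "Once upon a time"  # placeholder prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
    with torch.inference_mode():
        st = time.perf_counter()
        output_ids = model.generate(input_ids, max_new_tokens=32)
        torch.xpu.synchronize()  # wait for the XPU work to finish before stopping the timer
        end = time.perf_counter()
    print(">> generation costs {}s for {} new tokens".format(end - st, output_ids.shape[1] - input_ids.shape[1]))
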
@@ -330,7 +334,7 @@ def run_optimize_model_gpu(repo_id,
                            in_out_pairs,
                            warm_up,
                            num_trials):
-    from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
+    from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM
     from bigdl.llm import optimize_model
     import intel_extension_for_pytorch as ipex
     model_path = get_model_path(repo_id, local_model_hub)
@@ -342,15 +346,19 @@ def run_optimize_model_gpu(repo_id,
                                           trust_remote_code=True, use_cache=True)
         model = optimize_model(model)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.to('xpu')
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype='auto', low_cpu_mem_usage=True,
                                                      trust_remote_code=True, use_cache=True)
         model = optimize_model(model)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.to('xpu')
+        if isinstance(model, GPTJForCausalLM):
+            # For gpt-j model family, this optimization can provide a better performance.
+            model = ipex.optimize(model.eval(), inplace=True)
     end = time.perf_counter()
     print(">> loading of model costs {}s".format(end - st))
 
-    model = model.to('xpu')
     model = BenchmarkWrapper(model)
 
     result = {}
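
run_optimize_model_gpu gets the same treatment, except that the model is loaded with the stock transformers classes and then passed through bigdl.llm.optimize_model. A minimal sketch of that ordering, again with a placeholder model path and only the calls shown in the hunk above:

    # Sketch only: stock transformers load + bigdl.llm.optimize_model, then the XPU move
    # and the GPT-J-specific IPEX optimization. 'path/to/gpt-j-6b' is a placeholder.
    import intel_extension_for_pytorch as ipex
    from transformers import AutoModelForCausalLM, GPTJForCausalLM
    from bigdl.llm import optimize_model

    model = AutoModelForCausalLM.from_pretrained('path/to/gpt-j-6b', torch_dtype='auto',
                                                 low_cpu_mem_usage=True, trust_remote_code=True,
                                                 use_cache=True)
    model = optimize_model(model)
    model = model.to('xpu')
    if isinstance(model, GPTJForCausalLM):
        model = ipex.optimize(model.eval(), inplace=True)
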
@@ -393,4 +401,4 @@ if __name__ == '__main__':
             run_model(model, api, conf['in_out_pairs'], conf['local_model_hub'], conf['warm_up'], conf['num_trials'])
         df = pd.DataFrame(results, columns=['model', '1st token avg latency (s)', '2+ avg latency (s/token)', 'encoder time (s)', 'input/output tokens'])
         df.to_csv(f'{current_dir}/{api}-results-{today}.csv')
         results = []
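
At the end of a run the collected rows are turned into a DataFrame with the columns shown above and written to CSV. Purely for illustration, one such row could be appended like this; the latency numbers, the row layout beyond the column names, and the output file name are assumptions:

    # Illustrative only: one result row per benchmarked configuration, matching the
    # column names above. The numbers and the file name are placeholders.
    import pandas as pd

    results = [['gpt-j-6b', 0.35, 0.045, 0.0, '1024-128']]
    df = pd.DataFrame(results, columns=['model', '1st token avg latency (s)',
                                        '2+ avg latency (s/token)', 'encoder time (s)',
                                        'input/output tokens'])
    df.to_csv('transformer_int4_gpu-results-placeholder.csv')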