All-in-one benchmark update regarding performance mode for input length threshold (#11920)

* All-in-one benchmark update regarding performance mode input length threshold

* typo fix
This commit is contained in:
Yuwen Hu 2024-08-26 18:52:13 +08:00 committed by GitHub
parent 019f725d4d
commit a0bbd8e28d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@@ -22,6 +22,7 @@ import gc
import traceback import traceback
import threading import threading
import csv import csv
import warnings
import numpy as np import numpy as np
from datetime import date from datetime import date
@@ -554,6 +555,15 @@ def run_transformer_int4_gpu(repo_id,
input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu') input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
actual_in_len = input_ids.shape[1] actual_in_len = input_ids.shape[1]
result[in_out] = [] result[in_out] = []
if not lookahead and os.environ.get("IPEX_LLM_PERFORMANCE_MODE", None) == "1":
from ipex_llm.transformers.lookup import PERFORMANCE_MODE_LOOKUP_INPUT_THRESHOLD
if actual_in_len < PERFORMANCE_MODE_LOOKUP_INPUT_THRESHOLD:
warnings.warn(
"All-in-one benchmark currently does not support IPEX_LLM_PERFORMANCE_MODE "
f"with actual input token length < {PERFORMANCE_MODE_LOOKUP_INPUT_THRESHOLD}. "
f"Skip benchmarking in-out pair {in_out} for model {repo_id}."
)
continue
thread = threading.Thread(target=run_model_in_thread, args=(model, in_out, tokenizer, result, warm_up, num_beams, input_ids, out_len, actual_in_len, num_trials, load_time, lookahead)) thread = threading.Thread(target=run_model_in_thread, args=(model, in_out, tokenizer, result, warm_up, num_beams, input_ids, out_len, actual_in_len, num_trials, load_time, lookahead))
thread.start() thread.start()
thread.join() thread.join()