All-in-one benchmark: skip in-out pairs below the performance mode input length threshold (#11920)

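* All-in-one benchmark: when IPEX_LLM_PERFORMANCE_MODE=1 and lookahead is not used, warn and skip in-out pairs whose actual input token length is below the performance mode lookup input threshold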

* typo fix
This commit is contained in:
Yuwen Hu 2024-08-26 18:52:13 +08:00 committed by GitHub
parent 019f725d4d
commit a0bbd8e28d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -22,6 +22,7 @@ import gc
import traceback
import threading
import csv
import warnings
import numpy as np
from datetime import date
@ -554,6 +555,15 @@ def run_transformer_int4_gpu(repo_id,
input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu')
actual_in_len = input_ids.shape[1]
result[in_out] = []
if not lookahead and os.environ.get("IPEX_LLM_PERFORMANCE_MODE", None) == "1":
from ipex_llm.transformers.lookup import PERFORMANCE_MODE_LOOKUP_INPUT_THRESHOLD
if actual_in_len < PERFORMANCE_MODE_LOOKUP_INPUT_THRESHOLD:
warnings.warn(
"All-in-one benchmark currently does not support IPEX_LLM_PERFORMANCE_MODE "
f"with actual input token length < {PERFORMANCE_MODE_LOOKUP_INPUT_THRESHOLD}. "
f"Skip benchmarking in-out pair {in_out} for model {repo_id}."
)
continue
thread = threading.Thread(target=run_model_in_thread, args=(model, in_out, tokenizer, result, warm_up, num_beams, input_ids, out_len, actual_in_len, num_trials, load_time, lookahead))
thread.start()
thread.join()