LLM: Fix ipex torchscript=True error (#10832)
* remove
* update
* remove torchscript
Parent: fc33aa3721
Commit: 23c6a52fb0
9 changed files with 14 additions and 20 deletions
@@ -1210,15 +1210,15 @@ def run_bigdl_ipex_bf16(repo_id,
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                          use_cache=True, torchscript=True)
+                                          use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if not hasattr(model.config, "token_latency"):
         model.config.token_latency = True
@@ -1280,15 +1280,15 @@ def run_bigdl_ipex_int4(repo_id,
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit='sym_int4', trust_remote_code=True, torch_dtype='auto',
-                                          use_cache=True, torchscript=True)
+                                          use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int4', trust_remote_code=True, torch_dtype='auto',
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int4', trust_remote_code=True, torch_dtype='auto',
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if not hasattr(model.config, "token_latency"):
         model.config.token_latency = True
@@ -1350,15 +1350,15 @@ def run_bigdl_ipex_int8(repo_id,
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit='sym_int8', trust_remote_code=True, torch_dtype='auto',
-                                          use_cache=True, torchscript=True)
+                                          use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int8', trust_remote_code=True, torch_dtype='auto',
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int8', trust_remote_code=True, torch_dtype='auto',
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if not hasattr(model.config, "token_latency"):
         model.config.token_latency = True
@@ -1537,15 +1537,15 @@ def run_speculative_cpu(repo_id,
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                          use_cache=True, torchscript=True, speculative=True)
+                                          use_cache=True, speculative=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                                     use_cache=True, torchscript=True, speculative=True)
+                                                     use_cache=True, speculative=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                                     use_cache=True, torchscript=True, speculative=True)
+                                                     use_cache=True, speculative=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
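For orientation, here is a minimal sketch of the post-fix bf16 load path that these benchmark hunks converge on. The ipex_llm.transformers import path and the model_path value are assumptions for illustration, not taken from this diff:

# Sketch only: assumes the low-bit AutoModelForCausalLM wrapper is importable
# from ipex_llm.transformers; model_path is a placeholder.
import torch
from transformers import AutoTokenizer
from ipex_llm.transformers import AutoModelForCausalLM

model_path = "/path/to/local/model"  # placeholder

# torchscript=True is no longer passed at load time, which is the error this commit removes.
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_low_bit='bf16',
                                             trust_remote_code=True,
                                             torch_dtype=torch.bfloat16,
                                             use_cache=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
if not hasattr(model.config, "token_latency"):
    model.config.token_latency = True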
@@ -59,7 +59,6 @@ if __name__ == '__main__':
                                                  optimize_model=True,
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
-                                                 torchscript=True,
                                                  speculative=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
@@ -57,7 +57,6 @@ if __name__ == '__main__':
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
                                                  trust_remote_code=True,
-                                                 torchscript=True,
                                                  use_cache=True)
 
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
@@ -74,7 +74,6 @@ if __name__ == '__main__':
                                                  optimize_model=True,
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
-                                                 torchscript=True,
                                                  speculative=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
@@ -65,7 +65,6 @@ if __name__ == '__main__':
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
-                                                 torchscript=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
 
@@ -51,7 +51,6 @@ if __name__ == '__main__':
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
-                                                 torchscript=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
 
@@ -69,7 +69,6 @@ if __name__ == '__main__':
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
-                                                 torchscript=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
 
@@ -51,7 +51,6 @@ if __name__ == '__main__':
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
-                                                 torchscript=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
 
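The example-script hunks above all make the same one-line change. A sketch of the resulting speculative-decoding load call, under the same assumed import and placeholder path as the earlier sketch:

# Sketch only: same assumptions as above (ipex_llm wrapper, placeholder model_path).
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             optimize_model=True,
                                             torch_dtype=torch.bfloat16,
                                             load_in_low_bit="bf16",
                                             speculative=True,
                                             trust_remote_code=True,
                                             use_cache=True)  # torchscript=True dropped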
@@ -150,7 +150,8 @@ def _ipex_optimize_model(model, rms_classes, qtype):
     _ipex_optimize_attention(model, is_tpp=is_tpp, is_woq=is_woq)
     _ipex_optimize_decoder(model, is_tpp=is_tpp, is_woq=is_woq)
 
-    model.register_forward_hook(output_hook, with_kwargs=True)
+    # need to register_forward_hook after torch.jit.trace
+    # model.register_forward_hook(output_hook, with_kwargs=True)
 
     return model
 
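The comment added in the last hunk states that the forward hook must be registered only after torch.jit.trace. A standalone toy sketch of that ordering; the Tiny module, the inputs, and the hook body are hypothetical and not the repository's actual output_hook or tracing code:

import torch
import torch.nn as nn

class Tiny(nn.Module):
    def forward(self, x):
        return x * 2

def output_hook(module, args, kwargs, output):
    # placeholder post-processing of the traced module's output
    return output

model = Tiny().eval()
traced = torch.jit.trace(model, torch.randn(2, 3))
# Register the hook on the traced module, i.e. only after torch.jit.trace has run,
# so the hook wraps the traced call instead of being captured during tracing.
traced.register_forward_hook(output_hook, with_kwargs=True)
out = traced(torch.randn(2, 3))

This mirrors the hunk above: the eager-model registration is commented out and, per the added note, deferred until after tracing.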