LLM: Fix ipex torchscript=True error (#10832)

* remove

* update

* remove torchscript
Wang, Jian4 authored on 2024-04-22 15:53:09 +08:00, committed by GitHub
parent fc33aa3721
commit 23c6a52fb0
9 changed files with 14 additions and 20 deletions
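
The change itself is mechanical: every affected from_pretrained call simply stops passing torchscript=True, the argument behind the error referenced in the title, while all other loading options stay the same. A minimal sketch of the resulting call shape, assuming the ipex_llm.transformers wrapper and a placeholder model id (neither is taken from the diff below):

    # Hedged sketch only; the import path and model id are assumptions.
    import torch
    from ipex_llm.transformers import AutoModelForCausalLM
    from transformers import AutoTokenizer

    model_path = "meta-llama/Llama-2-7b-chat-hf"  # placeholder model id
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 load_in_low_bit='bf16',
                                                 trust_remote_code=True,
                                                 torch_dtype=torch.bfloat16,
                                                 use_cache=True)  # torchscript=True removed
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)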


@@ -1210,15 +1210,15 @@ def run_bigdl_ipex_bf16(repo_id,
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                          use_cache=True, torchscript=True)
+                                          use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if not hasattr(model.config, "token_latency"):
         model.config.token_latency = True
@@ -1280,15 +1280,15 @@ def run_bigdl_ipex_int4(repo_id,
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit='sym_int4', trust_remote_code=True, torch_dtype='auto',
-                                          use_cache=True, torchscript=True)
+                                          use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int4', trust_remote_code=True, torch_dtype='auto',
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int4', trust_remote_code=True, torch_dtype='auto',
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if not hasattr(model.config, "token_latency"):
         model.config.token_latency = True
@@ -1350,15 +1350,15 @@ def run_bigdl_ipex_int8(repo_id,
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit='sym_int8', trust_remote_code=True, torch_dtype='auto',
-                                          use_cache=True, torchscript=True)
+                                          use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int8', trust_remote_code=True, torch_dtype='auto',
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int8', trust_remote_code=True, torch_dtype='auto',
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if not hasattr(model.config, "token_latency"):
         model.config.token_latency = True
@@ -1537,15 +1537,15 @@ def run_speculative_cpu(repo_id,
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                          use_cache=True, torchscript=True, speculative=True)
+                                          use_cache=True, speculative=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                                     use_cache=True, torchscript=True, speculative=True)
+                                                     use_cache=True, speculative=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                                     use_cache=True, torchscript=True, speculative=True)
+                                                     use_cache=True, speculative=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token


@@ -59,7 +59,6 @@ if __name__ == '__main__':
                                                  optimize_model=True,
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
-                                                 torchscript=True,
                                                  speculative=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)


@@ -57,7 +57,6 @@ if __name__ == '__main__':
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
                                                  trust_remote_code=True,
-                                                 torchscript=True,
                                                  use_cache=True)
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)


@@ -74,7 +74,6 @@ if __name__ == '__main__':
                                                  optimize_model=True,
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
-                                                 torchscript=True,
                                                  speculative=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)


@@ -65,7 +65,6 @@ if __name__ == '__main__':
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
-                                                 torchscript=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)


@@ -51,7 +51,6 @@ if __name__ == '__main__':
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
-                                                 torchscript=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)


@@ -69,7 +69,6 @@ if __name__ == '__main__':
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
-                                                 torchscript=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)


@@ -51,7 +51,6 @@ if __name__ == '__main__':
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
-                                                 torchscript=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)


@@ -150,7 +150,8 @@ def _ipex_optimize_model(model, rms_classes, qtype):
     _ipex_optimize_attention(model, is_tpp=is_tpp, is_woq=is_woq)
     _ipex_optimize_decoder(model, is_tpp=is_tpp, is_woq=is_woq)
-    model.register_forward_hook(output_hook, with_kwargs=True)
+    # need to register_forward_hook after torch.jit.trace
+    # model.register_forward_hook(output_hook, with_kwargs=True)
     return model
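
The final hunk disables the eager-mode output hook: per the new in-line comment, register_forward_hook should only be called after torch.jit.trace has produced the module that will actually run, so the registration at this point is commented out rather than deleted. For reference, a standalone sketch of what a with_kwargs=True forward hook does on a plain nn.Module; the Toy module and hook body below are illustrative stand-ins, not the repository's output_hook:

    # Illustration of register_forward_hook(..., with_kwargs=True);
    # the module and hook here are made up for the example.
    import torch
    import torch.nn as nn

    class Toy(nn.Module):
        def forward(self, x, scale=1.0):
            return x * scale

    def output_hook(module, args, kwargs, output):
        # Runs after forward(); receives the positional args, keyword args,
        # and the raw output, and may return a replacement output.
        return output + 1

    model = Toy()
    handle = model.register_forward_hook(output_hook, with_kwargs=True)
    print(model(torch.ones(2), scale=3.0))  # tensor([4., 4.]) after the hook
    handle.remove()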