LLM: Fix ipex torchscript=True error (#10832)
* remove
* update
* remove torchscript
This commit is contained in:
parent fc33aa3721
commit 23c6a52fb0

9 changed files with 14 additions and 20 deletions
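The change is mechanical across the benchmark loaders below: the torchscript=True keyword is dropped from every from_pretrained call while the rest of the low-bit load is kept. A minimal sketch of the resulting call, under the assumption that the loaders use the ipex_llm.transformers auto-model classes (the import path and model path here are illustrative, not taken from the hunks):

# Sketch only: the import path and model_path are assumptions for illustration.
import torch
from ipex_llm.transformers import AutoModelForCausalLM   # assumed import location
from transformers import AutoTokenizer

model_path = "/path/to/local/model"   # placeholder
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_low_bit='bf16',
                                             trust_remote_code=True,
                                             torch_dtype=torch.bfloat16,
                                             use_cache=True)   # no torchscript=True
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)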
@@ -1210,15 +1210,15 @@ def run_bigdl_ipex_bf16(repo_id,
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                          use_cache=True, torchscript=True)
+                                          use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if not hasattr(model.config, "token_latency"):
         model.config.token_latency = True
@@ -1280,15 +1280,15 @@ def run_bigdl_ipex_int4(repo_id,
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit='sym_int4', trust_remote_code=True, torch_dtype='auto',
-                                          use_cache=True, torchscript=True)
+                                          use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int4', trust_remote_code=True, torch_dtype='auto',
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int4', trust_remote_code=True, torch_dtype='auto',
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if not hasattr(model.config, "token_latency"):
         model.config.token_latency = True
@@ -1350,15 +1350,15 @@ def run_bigdl_ipex_int8(repo_id,
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit='sym_int8', trust_remote_code=True, torch_dtype='auto',
-                                          use_cache=True, torchscript=True)
+                                          use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int8', trust_remote_code=True, torch_dtype='auto',
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int8', trust_remote_code=True, torch_dtype='auto',
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if not hasattr(model.config, "token_latency"):
         model.config.token_latency = True
@@ -1537,15 +1537,15 @@ def run_speculative_cpu(repo_id,
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                          use_cache=True, torchscript=True, speculative=True)
+                                          use_cache=True, speculative=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                                     use_cache=True, torchscript=True, speculative=True)
+                                                     use_cache=True, speculative=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                                     use_cache=True, torchscript=True, speculative=True)
+                                                     use_cache=True, speculative=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
@@ -59,7 +59,6 @@ if __name__ == '__main__':
                                                  optimize_model=True,
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
-                                                 torchscript=True,
                                                  speculative=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
@@ -57,7 +57,6 @@ if __name__ == '__main__':
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
                                                  trust_remote_code=True,
-                                                 torchscript=True,
                                                  use_cache=True)

     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
@@ -74,7 +74,6 @@ if __name__ == '__main__':
                                                  optimize_model=True,
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
-                                                 torchscript=True,
                                                  speculative=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
@@ -65,7 +65,6 @@ if __name__ == '__main__':
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
-                                                 torchscript=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)

@@ -51,7 +51,6 @@ if __name__ == '__main__':
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
-                                                 torchscript=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)

@@ -69,7 +69,6 @@ if __name__ == '__main__':
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
-                                                 torchscript=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)

@@ -51,7 +51,6 @@ if __name__ == '__main__':
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
-                                                 torchscript=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)

@@ -150,7 +150,8 @@ def _ipex_optimize_model(model, rms_classes, qtype):
     _ipex_optimize_attention(model, is_tpp=is_tpp, is_woq=is_woq)
     _ipex_optimize_decoder(model, is_tpp=is_tpp, is_woq=is_woq)

-    model.register_forward_hook(output_hook, with_kwargs=True)
+    # need to register_forward_hook after torch.jit.trace
+    # model.register_forward_hook(output_hook, with_kwargs=True)
     return model

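The last hunk replaces the unconditional hook registration with a note that the forward hook has to be registered after torch.jit.trace. A minimal, self-contained sketch of that ordering, with illustrative names (TinyBlock, output_hook) that are not taken from the ipex-llm sources:

# Sketch of the "hook after trace" ordering; TinyBlock/output_hook are stand-ins.
import torch
import torch.nn as nn

class TinyBlock(nn.Module):
    def forward(self, x):
        return x + 1

def output_hook(module, args, kwargs, output):
    # Inspect or post-process each forward output here.
    return output

block = TinyBlock()
example = torch.randn(4)

traced = torch.jit.trace(block, example)                    # trace first, with no hooks attached
block.register_forward_hook(output_hook, with_kwargs=True)  # register the kwargs-style hook only afterwards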