diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index 9b73d729..23148cce 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -1210,15 +1210,15 @@ def run_bigdl_ipex_bf16(repo_id,
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                          use_cache=True, torchscript=True)
+                                          use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if not hasattr(model.config, "token_latency"):
         model.config.token_latency = True
@@ -1280,15 +1280,15 @@ def run_bigdl_ipex_int4(repo_id,
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit='sym_int4', trust_remote_code=True, torch_dtype='auto',
-                                          use_cache=True, torchscript=True)
+                                          use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int4', trust_remote_code=True, torch_dtype='auto',
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int4', trust_remote_code=True, torch_dtype='auto',
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if not hasattr(model.config, "token_latency"):
         model.config.token_latency = True
@@ -1350,15 +1350,15 @@ def run_bigdl_ipex_int8(repo_id,
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit='sym_int8', trust_remote_code=True, torch_dtype='auto',
-                                          use_cache=True, torchscript=True)
+                                          use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int8', trust_remote_code=True, torch_dtype='auto',
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int8', trust_remote_code=True, torch_dtype='auto',
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if not hasattr(model.config, "token_latency"):
         model.config.token_latency = True
@@ -1537,15 +1537,15 @@ def run_speculative_cpu(repo_id,
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                          use_cache=True, torchscript=True, speculative=True)
+                                          use_cache=True, speculative=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                                     use_cache=True, torchscript=True, speculative=True)
+                                                     use_cache=True, speculative=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                                     use_cache=True, torchscript=True, speculative=True)
+                                                     use_cache=True, speculative=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
diff --git a/python/llm/example/CPU/Speculative-Decoding/baichuan2/speculative.py b/python/llm/example/CPU/Speculative-Decoding/baichuan2/speculative.py
index 84cb9112..fa59c410 100644
--- a/python/llm/example/CPU/Speculative-Decoding/baichuan2/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/baichuan2/speculative.py
@@ -59,7 +59,6 @@ if __name__ == '__main__':
                                                  optimize_model=True,
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
-                                                 torchscript=True,
                                                  speculative=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
diff --git a/python/llm/example/CPU/Speculative-Decoding/chatglm3/speculative.py b/python/llm/example/CPU/Speculative-Decoding/chatglm3/speculative.py
index 5ec9a67c..7df302aa 100644
--- a/python/llm/example/CPU/Speculative-Decoding/chatglm3/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/chatglm3/speculative.py
@@ -57,7 +57,6 @@ if __name__ == '__main__':
                                       load_in_low_bit="bf16",
                                       speculative=True,
                                       trust_remote_code=True,
-                                      torchscript=True,
                                       use_cache=True)
 
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
diff --git a/python/llm/example/CPU/Speculative-Decoding/llama2/speculative.py b/python/llm/example/CPU/Speculative-Decoding/llama2/speculative.py
index f870a094..43da3fd6 100644
--- a/python/llm/example/CPU/Speculative-Decoding/llama2/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/llama2/speculative.py
@@ -74,7 +74,6 @@ if __name__ == '__main__':
                                                  optimize_model=True,
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
-                                                 torchscript=True,
                                                  speculative=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
diff --git a/python/llm/example/CPU/Speculative-Decoding/mistral/speculative.py b/python/llm/example/CPU/Speculative-Decoding/mistral/speculative.py
index 1968ccaa..16563b81 100644
--- a/python/llm/example/CPU/Speculative-Decoding/mistral/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/mistral/speculative.py
@@ -65,7 +65,6 @@ if __name__ == '__main__':
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
-                                                 torchscript=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
diff --git a/python/llm/example/CPU/Speculative-Decoding/starcoder/speculative.py b/python/llm/example/CPU/Speculative-Decoding/starcoder/speculative.py
index c35b0b65..852c528e 100644
--- a/python/llm/example/CPU/Speculative-Decoding/starcoder/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/starcoder/speculative.py
@@ -51,7 +51,6 @@ if __name__ == '__main__':
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
-                                                 torchscript=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
diff --git a/python/llm/example/CPU/Speculative-Decoding/vicuna/speculative.py b/python/llm/example/CPU/Speculative-Decoding/vicuna/speculative.py
index 73970f67..9c9e36ba 100644
--- a/python/llm/example/CPU/Speculative-Decoding/vicuna/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/vicuna/speculative.py
@@ -69,7 +69,6 @@ if __name__ == '__main__':
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
-                                                 torchscript=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
diff --git a/python/llm/example/CPU/Speculative-Decoding/ziya/speculative.py b/python/llm/example/CPU/Speculative-Decoding/ziya/speculative.py
index a8a82474..4a957db6 100644
--- a/python/llm/example/CPU/Speculative-Decoding/ziya/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/ziya/speculative.py
@@ -51,7 +51,6 @@ if __name__ == '__main__':
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
-                                                 torchscript=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
diff --git a/python/llm/src/ipex_llm/transformers/convert_ipex.py b/python/llm/src/ipex_llm/transformers/convert_ipex.py
index 6368d13a..a934a1dd 100644
--- a/python/llm/src/ipex_llm/transformers/convert_ipex.py
+++ b/python/llm/src/ipex_llm/transformers/convert_ipex.py
@@ -150,7 +150,8 @@ def _ipex_optimize_model(model, rms_classes, qtype):
     _ipex_optimize_attention(model, is_tpp=is_tpp, is_woq=is_woq)
     _ipex_optimize_decoder(model, is_tpp=is_tpp, is_woq=is_woq)
 
-    model.register_forward_hook(output_hook, with_kwargs=True)
+    # need to register_forward_hook after torch.jit.trace
+    # model.register_forward_hook(output_hook, with_kwargs=True)
 
     return model
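
Note: the comment added in convert_ipex.py states the rationale for disabling the register_forward_hook call there: the output hook should be registered after torch.jit.trace, i.e. on the module object that is actually called at inference time, not inside _ipex_optimize_model, which runs before tracing. The snippet below is only a minimal, self-contained sketch of that ordering under stated assumptions; ToyModel, optimize() and Wrapped are hypothetical stand-ins and not ipex-llm code, and the eager wrapper is just one way to attach a Python-side hook to a traced module.

# Ordering sketch: trace first, then attach the forward hook.
import torch
import torch.nn as nn


class ToyModel(nn.Module):
    def forward(self, x):
        return x + 1


def output_hook(module, inputs, output):
    # A forward hook may return a value, which replaces the module's output;
    # here it just scales the result for demonstration.
    return output * 2


def optimize(model):
    # Stand-in for _ipex_optimize_model: rewrites only, no hook registration
    # here any more (as in this patch).
    return model


model = optimize(ToyModel().eval())
example = torch.randn(2, 3)

# 1) trace first
traced = torch.jit.trace(model, example)

# 2) then attach the hook to the object used at inference time.
#    Whether a traced ScriptModule accepts Python hooks directly depends on
#    the PyTorch version, so a thin eager wrapper keeps the sketch portable.
class Wrapped(nn.Module):
    def __init__(self, mod):
        super().__init__()
        self.mod = mod

    def forward(self, x):
        return self.mod(x)


wrapped = Wrapped(traced)
wrapped.register_forward_hook(output_hook)
print(wrapped(example))  # prints (x + 1) * 2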