LLM: Fix ipex torchscript=True error (#10832)
* remove
* update
* remove torchscript
Parent: fc33aa3721
Commit: 23c6a52fb0
9 changed files with 14 additions and 20 deletions
@@ -1210,15 +1210,15 @@ def run_bigdl_ipex_bf16(repo_id,
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                          use_cache=True, torchscript=True)
+                                          use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if not hasattr(model.config, "token_latency"):
         model.config.token_latency = True
@@ -1280,15 +1280,15 @@ def run_bigdl_ipex_int4(repo_id,
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit='sym_int4', trust_remote_code=True, torch_dtype='auto',
-                                          use_cache=True, torchscript=True)
+                                          use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int4', trust_remote_code=True, torch_dtype='auto',
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int4', trust_remote_code=True, torch_dtype='auto',
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if not hasattr(model.config, "token_latency"):
         model.config.token_latency = True
@@ -1350,15 +1350,15 @@ def run_bigdl_ipex_int8(repo_id,
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit='sym_int8', trust_remote_code=True, torch_dtype='auto',
-                                          use_cache=True, torchscript=True)
+                                          use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int8', trust_remote_code=True, torch_dtype='auto',
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int8', trust_remote_code=True, torch_dtype='auto',
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if not hasattr(model.config, "token_latency"):
         model.config.token_latency = True
@@ -1537,15 +1537,15 @@ def run_speculative_cpu(repo_id,
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                          use_cache=True, torchscript=True, speculative=True)
+                                          use_cache=True, speculative=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                                     use_cache=True, torchscript=True, speculative=True)
+                                                     use_cache=True, speculative=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                                     use_cache=True, torchscript=True, speculative=True)
+                                                     use_cache=True, speculative=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
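For orientation, here is a minimal sketch of the post-fix bf16 load path that these benchmark hunks converge on. The ipex_llm.transformers import path and the model_path value are assumptions for illustration, not taken from this diff:

# Sketch only: assumes the low-bit AutoModelForCausalLM wrapper is importable
# from ipex_llm.transformers; model_path is a placeholder.
import torch
from transformers import AutoTokenizer
from ipex_llm.transformers import AutoModelForCausalLM

model_path = "/path/to/local/model"  # placeholder

# torchscript=True is no longer passed at load time, which is the error this commit removes.
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_low_bit='bf16',
                                             trust_remote_code=True,
                                             torch_dtype=torch.bfloat16,
                                             use_cache=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
if not hasattr(model.config, "token_latency"):
    model.config.token_latency = True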
@@ -59,7 +59,6 @@ if __name__ == '__main__':
                                                  optimize_model=True,
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
-                                                 torchscript=True,
                                                  speculative=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
@@ -57,7 +57,6 @@ if __name__ == '__main__':
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
                                                  trust_remote_code=True,
-                                                 torchscript=True,
                                                  use_cache=True)
 
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
@@ -74,7 +74,6 @@ if __name__ == '__main__':
                                                  optimize_model=True,
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
-                                                 torchscript=True,
                                                  speculative=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
@@ -65,7 +65,6 @@ if __name__ == '__main__':
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
-                                                 torchscript=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
 
@@ -51,7 +51,6 @@ if __name__ == '__main__':
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
-                                                 torchscript=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
 
@@ -69,7 +69,6 @@ if __name__ == '__main__':
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
-                                                 torchscript=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
 
@@ -51,7 +51,6 @@ if __name__ == '__main__':
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
-                                                 torchscript=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
 
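The example-script hunks above all make the same one-line change. A sketch of the resulting speculative-decoding load call, under the same assumed import and placeholder path as the earlier sketch:

# Sketch only: same assumptions as above (ipex_llm wrapper, placeholder model_path).
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             optimize_model=True,
                                             torch_dtype=torch.bfloat16,
                                             load_in_low_bit="bf16",
                                             speculative=True,
                                             trust_remote_code=True,
                                             use_cache=True)  # torchscript=True dropped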
@@ -150,7 +150,8 @@ def _ipex_optimize_model(model, rms_classes, qtype):
     _ipex_optimize_attention(model, is_tpp=is_tpp, is_woq=is_woq)
     _ipex_optimize_decoder(model, is_tpp=is_tpp, is_woq=is_woq)
 
-    model.register_forward_hook(output_hook, with_kwargs=True)
+    # need to register_forward_hook after torch.jit.trace
+    # model.register_forward_hook(output_hook, with_kwargs=True)
 
     return model
 
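The comment added in the last hunk states that the forward hook must be registered only after torch.jit.trace. A standalone toy sketch of that ordering; the Tiny module, the inputs, and the hook body are hypothetical and not the repository's actual output_hook or tracing code:

import torch
import torch.nn as nn

class Tiny(nn.Module):
    def forward(self, x):
        return x * 2

def output_hook(module, args, kwargs, output):
    # placeholder post-processing of the traced module's output
    return output

model = Tiny().eval()
traced = torch.jit.trace(model, torch.randn(2, 3))
# Register the hook on the traced module, i.e. only after torch.jit.trace has run,
# so the hook wraps the traced call instead of being captured during tracing.
traced.register_forward_hook(output_hook, with_kwargs=True)
out = traced(torch.randn(2, 3))

This mirrors the hunk above: the eager-model registration is commented out and, per the added note, deferred until after tracing.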