LLM: add use_cache=True for all gpu examples (#8971)

JinBridge authored on 2023-09-15 09:54:38 +08:00, committed by GitHub
parent d1b62ef2f2
commit c12b8f24b6
11 changed files with 22 additions and 12 deletions
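
For context, every hunk below applies the same change: pass use_cache=True when loading the model so that generation on the GPU uses the key/value cache. A minimal sketch of the resulting pattern is shown here, assuming the bigdl.llm.transformers import path used by these examples (not shown in this diff); the model path and prompt are placeholders, not taken from this commit:

import torch
from transformers import AutoTokenizer
from bigdl.llm.transformers import AutoModelForCausalLM  # assumed import path for these examples

model_path = "meta-llama/Llama-2-7b-chat-hf"  # placeholder model

# Load in 4-bit and enable the KV cache, as the hunks below do
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             optimize_model=False,
                                             trust_remote_code=True,
                                             use_cache=True)
model = model.to('xpu')

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
input_ids = tokenizer("What is AI?", return_tensors="pt").input_ids.to('xpu')

with torch.inference_mode():
    output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))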

View file

@@ -43,7 +43,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
     # Load tokenizer

View file

@@ -45,7 +45,8 @@ if __name__ == '__main__':
     model = AutoModel.from_pretrained(model_path,
                                       load_in_4bit=True,
                                       optimize_model=True,
-                                      trust_remote_code=True)
+                                      trust_remote_code=True,
+                                      use_cache=True)
     model = model.to('xpu')
     # Load tokenizer

View file

@@ -45,7 +45,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
     # Load tokenizer

View file

@@ -43,7 +43,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
     model = ipex.optimize(model.eval(), dtype="float16", inplace=True)

View file

@@ -44,7 +44,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
     # Load tokenizer

View file

@@ -58,7 +58,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=True,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
     # Load tokenizer

View file

@@ -45,7 +45,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
     # Load tokenizer

View file

@@ -43,7 +43,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
     # Load tokenizer

View file

@@ -43,7 +43,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
     # Load tokenizer

View file

@@ -89,11 +89,11 @@ if __name__ == '__main__':
     processor = WhisperProcessor.from_pretrained(whisper_model_path)
     # generate token ids
-    whisper = AutoModelForSpeechSeq2Seq.from_pretrained(whisper_model_path, load_in_4bit=True, optimize_model=False)
+    whisper = AutoModelForSpeechSeq2Seq.from_pretrained(whisper_model_path, load_in_4bit=True, optimize_model=False, use_cache=True)
     whisper.config.forced_decoder_ids = None
     whisper = whisper.to('xpu')
-    llama_model = AutoModelForCausalLM.from_pretrained(llama_model_path, load_in_4bit=True, trust_remote_code=True, optimize_model=False)
+    llama_model = AutoModelForCausalLM.from_pretrained(llama_model_path, load_in_4bit=True, trust_remote_code=True, optimize_model=False, use_cache=True)
     llama_model = llama_model.to('xpu')
     tokenizer = LlamaTokenizer.from_pretrained(llama_model_path)
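
This file is the voice-assistant example, which chains the two models: Whisper transcribes the recorded audio and the transcript is handed to the 4-bit Llama model. A rough sketch of that flow using the objects loaded in the hunk above; the audio handling and generation settings here are illustrative assumptions, not code from the example:

import torch

def respond(audio_array, sampling_rate=16000):
    # Feature extraction with the WhisperProcessor loaded above
    features = processor(audio_array, sampling_rate=sampling_rate,
                         return_tensors="pt").input_features.to('xpu')
    with torch.inference_mode():
        predicted_ids = whisper.generate(features)  # decoding reuses the KV cache (use_cache=True)
    transcript = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    # Feed the transcript to the Llama model loaded above
    prompt_ids = tokenizer(transcript, return_tensors="pt").input_ids.to('xpu')
    with torch.inference_mode():
        answer_ids = llama_model.generate(prompt_ids, max_new_tokens=64)
    return tokenizer.decode(answer_ids[0], skip_special_tokens=True)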

View file

@@ -45,7 +45,8 @@ if __name__ == '__main__':
     # which convert the relevant layers in the model into INT4 format
     model = AutoModelForSpeechSeq2Seq.from_pretrained(model_path,
                                                       load_in_4bit=True,
-                                                      optimize_model=False)
+                                                      optimize_model=False,
+                                                      use_cache=True)
     model.to('xpu')
     model.config.forced_decoder_ids = None
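
What the added flag changes: with use_cache=True the model keeps the key/value states of already-generated tokens, so each decoding step only computes attention for the new token instead of re-running the whole prefix. The same switch can also be passed per generate() call; a hypothetical quick comparison, where model and input_ids stand for the objects prepared in any of the examples above:

import time
import torch

def time_generate(model, input_ids, use_cache):
    # Rough wall-clock timing; precise XPU timings would need a device synchronize
    start = time.time()
    with torch.inference_mode():
        model.generate(input_ids, max_new_tokens=64, use_cache=use_cache)
    return time.time() - start

print("with KV cache:   ", time_generate(model, input_ids, use_cache=True))
print("without KV cache:", time_generate(model, input_ids, use_cache=False))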