From c12b8f24b6dddbe691763207c58df6ed8a942896 Mon Sep 17 00:00:00 2001
From: JinBridge <89779290+JinBridger@users.noreply.github.com>
Date: Fri, 15 Sep 2023 09:54:38 +0800
Subject: [PATCH] LLM: add use_cache=True for all gpu examples (#8971)

---
 .../example/gpu/hf-transformers-models/baichuan/generate.py   | 3 ++-
 .../example/gpu/hf-transformers-models/chatglm2/generate.py   | 3 ++-
 .../llm/example/gpu/hf-transformers-models/falcon/generate.py | 3 ++-
 .../llm/example/gpu/hf-transformers-models/gpt-j/generate.py  | 3 ++-
 .../example/gpu/hf-transformers-models/internlm/generate.py   | 3 ++-
 .../llm/example/gpu/hf-transformers-models/llama2/generate.py | 3 ++-
 python/llm/example/gpu/hf-transformers-models/mpt/generate.py | 3 ++-
 .../llm/example/gpu/hf-transformers-models/qwen/generate.py   | 3 ++-
 .../example/gpu/hf-transformers-models/starcoder/generate.py  | 3 ++-
 .../gpu/hf-transformers-models/voiceassistant/generate.py     | 4 ++--
 .../example/gpu/hf-transformers-models/whisper/recognize.py   | 3 ++-
 11 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/python/llm/example/gpu/hf-transformers-models/baichuan/generate.py b/python/llm/example/gpu/hf-transformers-models/baichuan/generate.py
index cc7bee1d..e5c099ba 100644
--- a/python/llm/example/gpu/hf-transformers-models/baichuan/generate.py
+++ b/python/llm/example/gpu/hf-transformers-models/baichuan/generate.py
@@ -43,7 +43,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
 
     # Load tokenizer
diff --git a/python/llm/example/gpu/hf-transformers-models/chatglm2/generate.py b/python/llm/example/gpu/hf-transformers-models/chatglm2/generate.py
index e0a0b716..bc69ae6a 100644
--- a/python/llm/example/gpu/hf-transformers-models/chatglm2/generate.py
+++ b/python/llm/example/gpu/hf-transformers-models/chatglm2/generate.py
@@ -45,7 +45,8 @@ if __name__ == '__main__':
     model = AutoModel.from_pretrained(model_path,
                                       load_in_4bit=True,
                                       optimize_model=True,
-                                      trust_remote_code=True)
+                                      trust_remote_code=True,
+                                      use_cache=True)
     model = model.to('xpu')
 
     # Load tokenizer
diff --git a/python/llm/example/gpu/hf-transformers-models/falcon/generate.py b/python/llm/example/gpu/hf-transformers-models/falcon/generate.py
index f84b3869..41113d46 100644
--- a/python/llm/example/gpu/hf-transformers-models/falcon/generate.py
+++ b/python/llm/example/gpu/hf-transformers-models/falcon/generate.py
@@ -45,7 +45,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
 
     # Load tokenizer
diff --git a/python/llm/example/gpu/hf-transformers-models/gpt-j/generate.py b/python/llm/example/gpu/hf-transformers-models/gpt-j/generate.py
index 7f98f15e..28c385dd 100644
--- a/python/llm/example/gpu/hf-transformers-models/gpt-j/generate.py
+++ b/python/llm/example/gpu/hf-transformers-models/gpt-j/generate.py
@@ -43,7 +43,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
 
     model = ipex.optimize(model.eval(), dtype="float16", inplace=True)
diff --git a/python/llm/example/gpu/hf-transformers-models/internlm/generate.py b/python/llm/example/gpu/hf-transformers-models/internlm/generate.py
index 92b610be..fd035ab6 100644
--- a/python/llm/example/gpu/hf-transformers-models/internlm/generate.py
+++ b/python/llm/example/gpu/hf-transformers-models/internlm/generate.py
@@ -44,7 +44,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
 
     # Load tokenizer
diff --git a/python/llm/example/gpu/hf-transformers-models/llama2/generate.py b/python/llm/example/gpu/hf-transformers-models/llama2/generate.py
index 9d4e8e58..3959a667 100644
--- a/python/llm/example/gpu/hf-transformers-models/llama2/generate.py
+++ b/python/llm/example/gpu/hf-transformers-models/llama2/generate.py
@@ -58,7 +58,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=True,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
 
     # Load tokenizer
diff --git a/python/llm/example/gpu/hf-transformers-models/mpt/generate.py b/python/llm/example/gpu/hf-transformers-models/mpt/generate.py
index a64148e5..647bed99 100644
--- a/python/llm/example/gpu/hf-transformers-models/mpt/generate.py
+++ b/python/llm/example/gpu/hf-transformers-models/mpt/generate.py
@@ -45,7 +45,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
 
     # Load tokenizer
diff --git a/python/llm/example/gpu/hf-transformers-models/qwen/generate.py b/python/llm/example/gpu/hf-transformers-models/qwen/generate.py
index 7418ddf0..35815e68 100644
--- a/python/llm/example/gpu/hf-transformers-models/qwen/generate.py
+++ b/python/llm/example/gpu/hf-transformers-models/qwen/generate.py
@@ -43,7 +43,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
 
     # Load tokenizer
diff --git a/python/llm/example/gpu/hf-transformers-models/starcoder/generate.py b/python/llm/example/gpu/hf-transformers-models/starcoder/generate.py
index 98e2c8fa..1a3b1672 100644
--- a/python/llm/example/gpu/hf-transformers-models/starcoder/generate.py
+++ b/python/llm/example/gpu/hf-transformers-models/starcoder/generate.py
@@ -43,7 +43,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
 
     # Load tokenizer
diff --git a/python/llm/example/gpu/hf-transformers-models/voiceassistant/generate.py b/python/llm/example/gpu/hf-transformers-models/voiceassistant/generate.py
index e669af29..64f9dea4 100644
--- a/python/llm/example/gpu/hf-transformers-models/voiceassistant/generate.py
+++ b/python/llm/example/gpu/hf-transformers-models/voiceassistant/generate.py
@@ -89,11 +89,11 @@ if __name__ == '__main__':
     processor = WhisperProcessor.from_pretrained(whisper_model_path)
 
     # generate token ids
-    whisper = AutoModelForSpeechSeq2Seq.from_pretrained(whisper_model_path, load_in_4bit=True, optimize_model=False)
+    whisper = AutoModelForSpeechSeq2Seq.from_pretrained(whisper_model_path, load_in_4bit=True, optimize_model=False, use_cache=True)
     whisper.config.forced_decoder_ids = None
     whisper = whisper.to('xpu')
 
-    llama_model = AutoModelForCausalLM.from_pretrained(llama_model_path, load_in_4bit=True, trust_remote_code=True, optimize_model=False)
+    llama_model = AutoModelForCausalLM.from_pretrained(llama_model_path, load_in_4bit=True, trust_remote_code=True, optimize_model=False, use_cache=True)
     llama_model = llama_model.to('xpu')
 
     tokenizer = LlamaTokenizer.from_pretrained(llama_model_path)
diff --git a/python/llm/example/gpu/hf-transformers-models/whisper/recognize.py b/python/llm/example/gpu/hf-transformers-models/whisper/recognize.py
index f31f02ec..268e78da 100644
--- a/python/llm/example/gpu/hf-transformers-models/whisper/recognize.py
+++ b/python/llm/example/gpu/hf-transformers-models/whisper/recognize.py
@@ -45,7 +45,8 @@ if __name__ == '__main__':
     # which convert the relevant layers in the model into INT4 format
     model = AutoModelForSpeechSeq2Seq.from_pretrained(model_path,
                                                       load_in_4bit=True,
-                                                      optimize_model=False)
+                                                      optimize_model=False,
+                                                      use_cache=True)
     model.to('xpu')
     model.config.forced_decoder_ids = None
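The pattern every example in this patch converges on is the same: pass use_cache=True to from_pretrained so the KV cache is reused during decoding, then move the 4-bit model to the XPU before calling generate. The snippet below is a minimal sketch of that post-patch loading pattern; the bigdl.llm.transformers and IPEX import paths, the model id, and the tokenizer/generate calls are assumptions drawn from the example layout, not part of this patch.

# Minimal sketch of the post-patch loading pattern (assumed imports and model id).
import torch
import intel_extension_for_pytorch as ipex  # assumed: registers the 'xpu' device backend
from transformers import AutoTokenizer
from bigdl.llm.transformers import AutoModelForCausalLM  # assumed import path for these examples

model_path = 'meta-llama/Llama-2-7b-chat-hf'  # hypothetical model id

# Load in 4-bit and keep the KV cache enabled so each decoding step
# reuses past key/values instead of recomputing the full prefix.
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             optimize_model=True,
                                             trust_remote_code=True,
                                             use_cache=True)
model = model.to('xpu')

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
with torch.inference_mode():
    input_ids = tokenizer('What is AI?', return_tensors='pt').input_ids.to('xpu')
    output = model.generate(input_ids, max_new_tokens=32)
    print(tokenizer.decode(output[0], skip_special_tokens=True))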