LLM: add use_cache=True for all gpu examples (#8971)

JinBridge authored on 2023-09-15 09:54:38 +08:00, committed by GitHub
parent d1b62ef2f2
commit c12b8f24b6
11 changed files with 22 additions and 12 deletions
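
For context, every hunk below applies the same change: pass use_cache=True when loading the model so that generation on the GPU uses the key/value cache. A minimal sketch of the resulting pattern is shown here, assuming the bigdl.llm.transformers import path used by these examples (not shown in this diff); the model path and prompt are placeholders, not taken from this commit:

import torch
from transformers import AutoTokenizer
from bigdl.llm.transformers import AutoModelForCausalLM  # assumed import path for these examples

model_path = "meta-llama/Llama-2-7b-chat-hf"  # placeholder model

# Load in 4-bit and enable the KV cache, as the hunks below do
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             optimize_model=False,
                                             trust_remote_code=True,
                                             use_cache=True)
model = model.to('xpu')

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
input_ids = tokenizer("What is AI?", return_tensors="pt").input_ids.to('xpu')

with torch.inference_mode():
    output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))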

View file

@@ -43,7 +43,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
     # Load tokenizer

View file

@@ -45,7 +45,8 @@ if __name__ == '__main__':
     model = AutoModel.from_pretrained(model_path,
                                       load_in_4bit=True,
                                       optimize_model=True,
-                                      trust_remote_code=True)
+                                      trust_remote_code=True,
+                                      use_cache=True)
     model = model.to('xpu')
     # Load tokenizer

View file

@@ -45,7 +45,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
     # Load tokenizer

View file

@@ -43,7 +43,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
     model = ipex.optimize(model.eval(), dtype="float16", inplace=True)

View file

@@ -44,7 +44,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
     # Load tokenizer

View file

@@ -58,7 +58,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=True,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
     # Load tokenizer

View file

@@ -45,7 +45,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
     # Load tokenizer

View file

@@ -43,7 +43,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
     # Load tokenizer

View file

@@ -43,7 +43,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
     # Load tokenizer

View file

@@ -89,11 +89,11 @@ if __name__ == '__main__':
     processor = WhisperProcessor.from_pretrained(whisper_model_path)
     # generate token ids
-    whisper = AutoModelForSpeechSeq2Seq.from_pretrained(whisper_model_path, load_in_4bit=True, optimize_model=False)
+    whisper = AutoModelForSpeechSeq2Seq.from_pretrained(whisper_model_path, load_in_4bit=True, optimize_model=False, use_cache=True)
     whisper.config.forced_decoder_ids = None
     whisper = whisper.to('xpu')
-    llama_model = AutoModelForCausalLM.from_pretrained(llama_model_path, load_in_4bit=True, trust_remote_code=True, optimize_model=False)
+    llama_model = AutoModelForCausalLM.from_pretrained(llama_model_path, load_in_4bit=True, trust_remote_code=True, optimize_model=False, use_cache=True)
     llama_model = llama_model.to('xpu')
     tokenizer = LlamaTokenizer.from_pretrained(llama_model_path)
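
This file is the voice-assistant example, which chains the two models: Whisper transcribes the recorded audio and the transcript is handed to the 4-bit Llama model. A rough sketch of that flow using the objects loaded in the hunk above; the audio handling and generation settings here are illustrative assumptions, not code from the example:

import torch

def respond(audio_array, sampling_rate=16000):
    # Feature extraction with the WhisperProcessor loaded above
    features = processor(audio_array, sampling_rate=sampling_rate,
                         return_tensors="pt").input_features.to('xpu')
    with torch.inference_mode():
        predicted_ids = whisper.generate(features)  # decoding reuses the KV cache (use_cache=True)
    transcript = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    # Feed the transcript to the Llama model loaded above
    prompt_ids = tokenizer(transcript, return_tensors="pt").input_ids.to('xpu')
    with torch.inference_mode():
        answer_ids = llama_model.generate(prompt_ids, max_new_tokens=64)
    return tokenizer.decode(answer_ids[0], skip_special_tokens=True)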

View file

@@ -45,7 +45,8 @@ if __name__ == '__main__':
     # which convert the relevant layers in the model into INT4 format
     model = AutoModelForSpeechSeq2Seq.from_pretrained(model_path,
                                                       load_in_4bit=True,
-                                                      optimize_model=False)
+                                                      optimize_model=False,
+                                                      use_cache=True)
     model.to('xpu')
     model.config.forced_decoder_ids = None
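
What the added flag changes: with use_cache=True the model keeps the key/value states of already-generated tokens, so each decoding step only computes attention for the new token instead of re-running the whole prefix. The same switch can also be passed per generate() call; a hypothetical quick comparison, where model and input_ids stand for the objects prepared in any of the examples above:

import time
import torch

def time_generate(model, input_ids, use_cache):
    # Rough wall-clock timing; precise XPU timings would need a device synchronize
    start = time.time()
    with torch.inference_mode():
        model.generate(input_ids, max_new_tokens=64, use_cache=use_cache)
    return time.time() - start

print("with KV cache:   ", time_generate(model, input_ids, use_cache=True))
print("without KV cache:", time_generate(model, input_ids, use_cache=False))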