LLM: add use_cache=True for all gpu examples (#8971)
parent d1b62ef2f2
commit c12b8f24b6
11 changed files with 22 additions and 12 deletions
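Every hunk below makes the same change: the `from_pretrained` call in each GPU example gains `use_cache=True`, so the model keeps a key/value cache during autoregressive generation and reuses past attention states instead of re-encoding the whole prefix at every decoding step. A minimal sketch of the resulting call pattern, written against the upstream Hugging Face API because the examples' own import lines are not part of this diff; the model path and prompt are placeholders, not taken from this commit:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    MODEL_PATH = "meta-llama/Llama-2-7b-chat-hf"  # placeholder, not from this commit

    # use_cache=True is stored on model.config, so every later generate()
    # call reuses each layer's past key/value tensors: each new step attends
    # over cached states instead of recomputing attention for the full prefix.
    model = AutoModelForCausalLM.from_pretrained(MODEL_PATH,
                                                 trust_remote_code=True,
                                                 use_cache=True)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

    inputs = tokenizer("What is AI?", return_tensors="pt")
    with torch.inference_mode():
        output = model.generate(**inputs, max_new_tokens=32)
    print(tokenizer.decode(output[0], skip_special_tokens=True))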
@@ -43,7 +43,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
 
     # Load tokenizer

@@ -45,7 +45,8 @@ if __name__ == '__main__':
     model = AutoModel.from_pretrained(model_path,
                                       load_in_4bit=True,
                                       optimize_model=True,
-                                      trust_remote_code=True)
+                                      trust_remote_code=True,
+                                      use_cache=True)
     model = model.to('xpu')
 
     # Load tokenizer

@@ -45,7 +45,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
 
     # Load tokenizer

@@ -43,7 +43,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
     model = ipex.optimize(model.eval(), dtype="float16", inplace=True)
 

@@ -44,7 +44,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
 
     # Load tokenizer

@@ -58,7 +58,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=True,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
 
     # Load tokenizer

@@ -45,7 +45,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
 
     # Load tokenizer

@@ -43,7 +43,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
 
     # Load tokenizer

@@ -43,7 +43,8 @@ if __name__ == '__main__':
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
                                                  optimize_model=False,
-                                                 trust_remote_code=True)
+                                                 trust_remote_code=True,
+                                                 use_cache=True)
     model = model.to('xpu')
 
     # Load tokenizer

@@ -89,11 +89,11 @@ if __name__ == '__main__':
     processor = WhisperProcessor.from_pretrained(whisper_model_path)
 
     # generate token ids
-    whisper = AutoModelForSpeechSeq2Seq.from_pretrained(whisper_model_path, load_in_4bit=True, optimize_model=False)
+    whisper = AutoModelForSpeechSeq2Seq.from_pretrained(whisper_model_path, load_in_4bit=True, optimize_model=False, use_cache=True)
     whisper.config.forced_decoder_ids = None
     whisper = whisper.to('xpu')
 
-    llama_model = AutoModelForCausalLM.from_pretrained(llama_model_path, load_in_4bit=True, trust_remote_code=True, optimize_model=False)
+    llama_model = AutoModelForCausalLM.from_pretrained(llama_model_path, load_in_4bit=True, trust_remote_code=True, optimize_model=False, use_cache=True)
     llama_model = llama_model.to('xpu')
     tokenizer = LlamaTokenizer.from_pretrained(llama_model_path)
 
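For the encoder-decoder Whisper model in the hunk above, the cache covers both the decoder's self-attention states and the cross-attention keys and values projected from the encoder output; the latter are computed once and then reused for every generated token.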

@@ -45,7 +45,8 @@ if __name__ == '__main__':
     # which convert the relevant layers in the model into INT4 format
     model = AutoModelForSpeechSeq2Seq.from_pretrained(model_path,
                                                       load_in_4bit=True,
-                                                      optimize_model=False)
+                                                      optimize_model=False,
+                                                      use_cache=True)
     model.to('xpu')
     model.config.forced_decoder_ids = None
 
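Setting `use_cache=True` at load time writes the flag into `model.config`, so every subsequent `model.generate()` call in these examples picks it up without a per-call argument. The equivalent per-call form, shown here only for contrast with the approach this commit takes, would be:

    # Hypothetical per-call alternative (not what this commit does):
    output = model.generate(**inputs, max_new_tokens=32, use_cache=True)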