Replace with IPEX-LLM in example comments (#10671)
* Replace with IPEX-LLM in example comments
* More replacement
* revert some changes
parent 08018a18df
commit 10ee786920
159 changed files with 183 additions and 183 deletions
@@ -48,7 +48,7 @@ import ssl
 import urllib.request
 import os
 import json
-# code change to import from bigdl-llm API instead of using transformers API
+# code change to import from IPEX-LLM API instead of using transformers API
 from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import LlamaTokenizer
 import intel_extension_for_pytorch as ipex
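For reference, a minimal sketch of the transformers-style loading pattern these renamed examples use; the checkpoint name and prompt below are placeholder assumptions, not taken from the patch.

from ipex_llm.transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer

model_path = "meta-llama/Llama-2-7b-chat-hf"   # assumed example checkpoint

# load_in_4bit=True applies the INT4 optimization the example comments refer to
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True)
tokenizer = LlamaTokenizer.from_pretrained(model_path)

input_ids = tokenizer.encode("What is AI?", return_tensors="pt")
output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))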
@@ -87,7 +87,7 @@ if __name__ == '__main__':
 replace_method="auto"
 )

-# Apply BigDL-LLM INT4 optimizations on transformers
+# Apply IPEX-LLM INT4 optimizations on transformers
 model = optimize_model(model.module.to(f'cpu'), low_bit='sym_int4')

 model = model.to(f'cpu:{local_rank}')

@@ -111,7 +111,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 do_sample=False,
 max_new_tokens=args.n_predict)

@@ -59,7 +59,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -44,7 +44,7 @@ if __name__ == '__main__':

 model_path = args.model

-# Load gguf model and vocab, then convert them to bigdl-llm model and huggingface tokenizer
+# Load gguf model and vocab, then convert them to IPEX-LLM model and huggingface tokenizer
 model, tokenizer = AutoModelForCausalLM.from_gguf(model_path, low_bit = args.low_bit,)

 # Generate predicted tokens
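As a side note, a minimal sketch of the GGUF flow this hunk touches; the local file path is a placeholder assumption.

from ipex_llm.transformers import AutoModelForCausalLM

gguf_path = "./llama-2-7b-chat.Q4_0.gguf"   # assumed local GGUF file

# from_gguf returns both an IPEX-LLM low-bit model and a Hugging Face tokenizer
model, tokenizer = AutoModelForCausalLM.from_gguf(gguf_path, low_bit="sym_int4")

input_ids = tokenizer.encode("What is AI?", return_tensors="pt")
output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))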
@@ -60,7 +60,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -56,7 +56,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -56,7 +56,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -56,7 +56,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -45,7 +45,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 model = AutoModelForCausalLM.from_pretrained(model_path,
 load_in_4bit=True,
 trust_remote_code=True,
@@ -56,7 +56,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -57,7 +57,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -57,7 +57,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -57,7 +57,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -55,7 +55,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -56,7 +56,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations

 output = model.generate(input_ids, max_new_tokens=args.n_predict)
@@ -61,7 +61,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -61,7 +61,7 @@ if __name__ == '__main__':
 st = time.time()
 # enabling `use_cache=True` allows the model to utilize the previous
 # key/values attentions to speed up decoding;
-# to obtain optimal performance with BigDL-LLM INT4 optimizations,
+# to obtain optimal performance with IPEX-LLM INT4 optimizations,
 # it is important to set use_cache=True for Dolly v1 models
 output = model.generate(input_ids,
 use_cache=True,

@@ -64,7 +64,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict,
 pad_token_id=tokenizer.pad_token_id,
@@ -57,7 +57,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -60,7 +60,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -38,7 +38,7 @@ if __name__ == '__main__':
 image = Image.open(args.image_path)

 # Load model
-# For successful BigDL-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization
+# For successful IPEX-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization
 model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cpu',
 load_in_4bit = True,
 trust_remote_code=True,
@@ -61,7 +61,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -37,7 +37,7 @@ if __name__ == '__main__':
 image = args.image_path

 # Load model
-# For successful BigDL-LLM optimization on InternLM-XComposer, skip the 'qkv' module during optimization
+# For successful IPEX-LLM optimization on InternLM-XComposer, skip the 'qkv' module during optimization
 model = AutoModelForCausalLM.from_pretrained(model_path, device='cpu', load_in_4bit=True,
 trust_remote_code=True, modules_to_not_convert=['qkv'])
@@ -57,7 +57,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -56,7 +56,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -60,7 +60,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -55,7 +55,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -61,7 +61,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -56,7 +56,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -55,7 +55,7 @@ if __name__ == '__main__':
 input_ids = tokenizer.encode(prompt, return_tensors="pt")
 # enabling `use_cache=True` allows the model to utilize the previous
 # key/values attentions to speed up decoding;
-# to obtain optimal performance with BigDL-LLM INT4 optimizations,
+# to obtain optimal performance with IPEX-LLM INT4 optimizations,
 # it is important to set use_cache=True for MPT models
 mpt_generation_config = GenerationConfig(
 max_new_tokens=args.n_predict,

@@ -58,7 +58,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations

 # Note that phi-1_5 uses GenerationConfig to enable 'use_cache'
 output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict, generation_config = generation_config)

@@ -58,7 +58,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations

 model.generation_config.pad_token_id = model.generation_config.eos_token_id
 # Note that phi-2 uses GenerationConfig to enable 'use_cache'

@@ -58,7 +58,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations

 # Note that phixtral uses GenerationConfig to enable 'use_cache'
 output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict, generation_config = generation_config)
@@ -55,7 +55,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 do_sample=False,
 max_new_tokens=args.n_predict)

@@ -36,7 +36,7 @@ if __name__ == '__main__':
 model_path = args.repo_id_or_model_path

 # Load model
-# For successful BigDL-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization
+# For successful IPEX-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization
 model = AutoModelForCausalLM.from_pretrained(model_path,
 load_in_4bit=True,
 device_map="cpu",
@@ -64,7 +64,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -56,7 +56,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(**inputs,
 max_new_tokens=args.n_predict,
 do_sample=True,
@@ -55,7 +55,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -55,7 +55,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -59,7 +59,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -55,7 +55,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -53,7 +53,7 @@ if __name__ == '__main__':
 st = time.time()
 # enabling `use_cache=True` allows the model to utilize the previous
 # key/values attentions to speed up decoding;
-# to obtain optimal performance with BigDL-LLM INT4 optimizations,
+# to obtain optimal performance with IPEX-LLM INT4 optimizations,
 # it is important to set use_cache=True for vicuna-v1.3 models
 output = model.generate(input_ids,
 use_cache=True,
@@ -63,7 +63,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 predicted_ids = model.generate(input_features,
 forced_decoder_ids=forced_decoder_ids)
 end = time.time()

@@ -58,7 +58,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -61,7 +61,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -57,7 +57,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 outputs = model.generate(inputs, do_sample=True, top_k=5, max_length=args.n_predict)
 end_time = time.time()
@@ -42,7 +42,7 @@ if __name__ == '__main__':
 from ipex_llm.transformers import AutoModelForCausalLM
 # enabling `use_cache=True` allows the model to utilize the previous
 # key/values attentions to speed up decoding;
-# to obtain optimal performance with BigDL-LLM INT4 optimizations,
+# to obtain optimal performance with IPEX-LLM INT4 optimizations,
 # it is important to set use_cache=True for Ziya models
 model = AutoModelForCausalLM.from_pretrained(model_path,
 load_in_4bit=True,

@@ -163,7 +163,7 @@ def messages_to_prompt(messages):
 def main(args):
 embed_model = HuggingFaceEmbedding(model_name=args.embedding_model_path)

-# Use custom LLM in BigDL
+# Use custom LLM in IPEX-LLM
 from ipex_llm.llamaindex.llms import BigdlLLM
 llm = BigdlLLM(
 model_name=args.model_path,
@@ -59,7 +59,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -56,8 +56,8 @@ def load(model_path, model_family, n_threads):
 def inference(llm, repo_id_or_model_path, model_family, prompt):

 if model_family in ['llama', 'gptneox', 'bloom', 'starcoder', 'chatglm']:
-# ------ Option 1: Use bigdl-llm based tokenizer
-print('-'*20, ' bigdl-llm based tokenizer ', '-'*20)
+# ------ Option 1: Use IPEX-LLM based tokenizer
+print('-'*20, ' IPEX-LLM based tokenizer ', '-'*20)
 st = time.time()

 # please note that the prompt here can either be a string or a list of string
@@ -126,13 +126,13 @@ def main():
 if args.model_family == 'llama2':
 args.model_family = 'llama'

-# Step 1: convert original model to BigDL llm model
-bigdl_llm_path = convert(repo_id_or_model_path=repo_id_or_model_path,
+# Step 1: convert original model to IPEX-LLM model
+ipex_llm_path = convert(repo_id_or_model_path=repo_id_or_model_path,
 model_family=args.model_family,
 tmp_path=args.tmp_path)

 # Step 2: load int4 model
-llm = load(model_path=bigdl_llm_path,
+llm = load(model_path=ipex_llm_path,
 model_family=args.model_family,
 n_threads=args.thread_num)

@@ -44,7 +44,7 @@ if __name__ == '__main__':
 torch_dtype='auto',
 low_cpu_mem_usage=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer
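The `optimize_model` hunks that follow all share the same one-line pattern. A minimal sketch of that pattern is below; the checkpoint name is a placeholder assumption, not part of the patch.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from ipex_llm import optimize_model

model_path = "tiiuae/falcon-7b"   # placeholder; any Hugging Face causal LM

model = AutoModelForCausalLM.from_pretrained(model_path,
                                             torch_dtype='auto',
                                             low_cpu_mem_usage=True,
                                             trust_remote_code=True)

# With only one line to enable IPEX-LLM (INT4 by default) optimization on the model
model = optimize_model(model)

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
input_ids = tokenizer.encode("What is AI?", return_tensors="pt")
with torch.inference_mode():
    output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))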
@@ -38,7 +38,7 @@ if __name__ == '__main__':
 model = Bark.init_from_config(config)
 model.load_checkpoint(config, checkpoint_dir=model_path, eval=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Synthesize speech with the given input

@@ -38,7 +38,7 @@ if __name__ == '__main__':
 torch_dtype="auto",
 low_cpu_mem_usage=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -40,7 +40,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer
@@ -41,7 +41,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModel.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -41,7 +41,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModel.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -41,7 +41,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -40,7 +40,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer
@@ -50,7 +50,7 @@ if __name__ == '__main__':
 trust_remote_code=True,
 )

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -46,7 +46,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -40,7 +40,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForSeq2SeqLM.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 # "wo" module is not converted due to some issues of T5 model
 # (https://github.com/huggingface/transformers/issues/20287),
 # "lm_head" module is not converted to generate outputs with better quality
@@ -40,8 +40,8 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cpu', trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
-# For successful BigDL-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization
+# With only one line to enable IPEX-LLM optimization on model
+# For successful IPEX-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization
 model = optimize_model(model,
 low_bit='sym_int4',
 modules_to_not_convert=['vision_embed_tokens'])
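For the multimodal examples (Fuyu here; Qwen-VL-Chat and InternLM-XComposer below), the optimization skips certain modules. A hedged sketch of that call, with an assumed checkpoint id:

from transformers import AutoModelForCausalLM
from ipex_llm import optimize_model

model_path = "adept/fuyu-8b"   # assumed checkpoint for illustration
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             device_map='cpu',
                                             trust_remote_code=True)

# Skip the vision embedding module so the INT4 conversion leaves it untouched
model = optimize_model(model,
                       low_bit='sym_int4',
                       modules_to_not_convert=['vision_embed_tokens'])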
@@ -38,8 +38,8 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, device='cpu', trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
-# For successful BigDL-LLM optimization on InternLM-XComposer, skip the 'qkv' module during optimization
+# With only one line to enable IPEX-LLM optimization on model
+# For successful IPEX-LLM optimization on InternLM-XComposer, skip the 'qkv' module during optimization
 model = optimize_model(model,
 low_bit='sym_int4',
 modules_to_not_convert=['qkv'])

@@ -45,7 +45,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -292,7 +292,7 @@ if __name__ == '__main__':
 model_base=None,
 model_name=model_name)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Generate image tensor
@@ -42,7 +42,7 @@ if __name__ == '__main__':
 # Load model
 model = MambaLMHeadModel.from_pretrained(model_path)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model, low_bit='asym_int4', modules_to_not_convert=["dt_proj", "x_proj", "out_proj"])

 # Load tokenizer

@@ -44,7 +44,7 @@ if __name__ == '__main__':
 torch_dtype='auto',
 low_cpu_mem_usage=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -44,7 +44,7 @@ if __name__ == '__main__':
 torch_dtype='auto',
 low_cpu_mem_usage=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -46,7 +46,7 @@ if __name__ == '__main__':
 # Load whisper model under pytorch framework
 model = whisper.load_model(args.model_name)

-# With only one line to enable bigdl optimize on a pytorch model
+# With only one line to enable IPEX-LLM optimize on a pytorch model
 model = optimize_model(model)

 st = time.time()
@@ -42,7 +42,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -42,7 +42,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -37,8 +37,8 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, device_map="cpu", trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
-# For successful BigDL-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization
+# With only one line to enable IPEX-LLM optimization on model
+# For successful IPEX-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization
 model = optimize_model(model,
 low_bit='sym_int4',
 modules_to_not_convert=['c_fc', 'out_proj'])
@@ -41,7 +41,7 @@ if __name__ == '__main__':
 model = AutoModelForCausalLM.from_pretrained(model_path,
 trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -45,7 +45,7 @@ if __name__ == '__main__':
 torch_dtype=torch.float16,
 trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -44,7 +44,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -54,7 +54,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -48,7 +48,7 @@ if __name__ == '__main__':
 print("Creating model...")
 model = AutoModelForCausalLM.from_pretrained(model_path, device_map="cpu", trust_remote_code=True, torch_dtype=torch.float16).eval()

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 prompt = YUAN2_PROMPT_FORMAT.format(prompt=args.prompt)

@@ -59,7 +59,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 outputs = model.generate(inputs, do_sample=True, top_k=5, max_length=args.n_predict)
 end_time = time.time()

@@ -42,7 +42,7 @@ if __name__ == '__main__':
 from ipex_llm import optimize_model
 # enabling `use_cache=True` allows the model to utilize the previous
 # key/values attentions to speed up decoding;
-# to obtain optimal performance with BigDL-LLM `optimization_model` API optimizations,
+# to obtain optimal performance with IPEX-LLM `optimization_model` API optimizations,
 # it is important to set use_cache=True for Ziya models
 model = AutoModelForCausalLM.from_pretrained(model_path,
 trust_remote_code=True,
@@ -49,7 +49,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 # `low_bit` param support `sym_int4`, `asym_int4`, `sym_int5`, `asym_int5` and `sym_int8`
 # By specifying `low_bit` param, relevant low bit optimizations will be applied to the model
 model = optimize_model(model, low_bit=low_bit)
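The comment in this hunk lists the supported `low_bit` values; a minimal sketch of picking one follows (the model id and the choice of 'sym_int8' are illustrative assumptions):

from transformers import AutoModelForCausalLM
from ipex_llm import optimize_model

model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b",   # assumed
                                             trust_remote_code=True)

# `low_bit` accepts 'sym_int4', 'asym_int4', 'sym_int5', 'asym_int5' and 'sym_int8'
model = optimize_model(model, low_bit='sym_int8')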
@@ -49,7 +49,7 @@ from utils.prompter import Prompter
 from transformers import BitsAndBytesConfig
 from ipex_llm.transformers import AutoModelForCausalLM

-# import them from ipex_llm.transformers.qlora to get a BigDL-LLM compatible Peft model
+# import them from ipex_llm.transformers.qlora to get a IPEX-LLM compatible Peft model
 from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training, LoraConfig
 from ipex_llm.utils.isa_checker import ISAChecker

@@ -64,7 +64,7 @@ def get_int_from_env(env_keys, default):
 def train(
 # model/data params
 base_model: str = "meta-llama/Llama-2-7b-hf", # the only required argument, default to be "meta-llama/Llama-2-7b-hf"
-saved_low_bit_model: str = None, # optional, the path to the saved model with bigdl-llm low-bit optimization
+saved_low_bit_model: str = None, # optional, the path to the saved model with ipex-llm low-bit optimization
 data_path: str = "yahma/alpaca-cleaned",
 output_dir: str = "./bigdl-qlora-alpaca",
 # training hyperparams

@@ -256,7 +256,7 @@ def train(
 ] # could be sped up, probably
 return tokenized_full_prompt

-# Prepare a BigDL-LLM compatible Peft model
+# Prepare a IPEX-LLM compatible Peft model
 model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing)

 config = LoraConfig(
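A rough sketch of how the imports and calls in these QLoRA hunks fit together; the LoRA hyperparameters and the use of load_in_4bit here are illustrative assumptions, not values from this repository.

from ipex_llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training, LoraConfig

base_model = "meta-llama/Llama-2-7b-hf"   # default named in the diff
model = AutoModelForCausalLM.from_pretrained(base_model,
                                             load_in_4bit=True,   # the real example may load a different low-bit format
                                             trust_remote_code=True)
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

config = LoraConfig(r=8, lora_alpha=32,                   # assumed values
                    target_modules=["q_proj", "v_proj"],  # assumed target modules
                    lora_dropout=0.05, bias="none",
                    task_type="CAUSAL_LM")
# Prepare an IPEX-LLM compatible Peft model
model = get_peft_model(model, config)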
@@ -63,7 +63,7 @@ if __name__ == '__main__':
 low_bit = args.low_bit

 # First use CPU as accelerator
-# Convert to deepspeed model and apply bigdl-llm optimization on CPU to decrease GPU memory usage
+# Convert to deepspeed model and apply IPEX-LLM optimization on CPU to decrease GPU memory usage
 current_accel = CPU_Accelerator()
 set_accelerator(current_accel)
 model = AutoModelForCausalLM.from_pretrained(args.repo_id_or_model_path,

@@ -80,7 +80,7 @@ if __name__ == '__main__':
 replace_method="auto",
 )

-# Use bigdl-llm `optimize_model` to convert the model into optimized low bit format
+# Use IPEX-LLM `optimize_model` to convert the model into optimized low bit format
 # Convert the rest of the model into float16 to reduce allreduce traffic
 model = optimize_model(model.module.to(f'cpu'), low_bit=low_bit).to(torch.float16)

@@ -119,7 +119,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 do_sample=False,
 max_new_tokens=args.n_predict)
@@ -58,7 +58,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -71,7 +71,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM Low Bit optimizations
+# to obtain optimal performance with IPEX-LLM Low Bit optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict,
 repetition_penalty=1.1)

@@ -44,7 +44,7 @@ if __name__ == '__main__':

 model_path = args.model

-# Load gguf model and vocab, then convert them to bigdl-llm model and huggingface tokenizer
+# Load gguf model and vocab, then convert them to IPEX-LLM model and huggingface tokenizer
 model, tokenizer = AutoModelForCausalLM.from_gguf(model_path)
 model = model.to('xpu')
@@ -59,7 +59,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -60,7 +60,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 torch.xpu.synchronize()

@@ -60,7 +60,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 torch.xpu.synchronize()

@@ -64,7 +64,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 torch.xpu.synchronize()
@@ -44,7 +44,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function.
 # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.
 model = AutoModelForCausalLM.from_pretrained(model_path,
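The GPU-side hunks repeat the same flow; a minimal sketch under the assumption of an Intel GPU (XPU) environment, with a placeholder model id:

import torch
import intel_extension_for_pytorch as ipex
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_path = "meta-llama/Llama-2-7b-chat-hf"   # assumed checkpoint
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True,
                                             cpu_embedding=True)   # recommended for Intel iGPUs
model = model.to('xpu')

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
input_ids = tokenizer.encode("What is AI?", return_tensors="pt").to('xpu')

output = model.generate(input_ids, max_new_tokens=32)
torch.xpu.synchronize()   # wait for XPU kernels before reading the result
print(tokenizer.decode(output[0].cpu(), skip_special_tokens=True))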
@@ -64,7 +64,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 torch.xpu.synchronize()

@@ -67,7 +67,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 torch.xpu.synchronize()

@@ -67,7 +67,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 torch.xpu.synchronize()
@@ -57,7 +57,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly to obtain optimal
-# performance with BigDL-LLM INT4 optimizations
+# performance with IPEX-LLM INT4 optimizations
 # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function.
 # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.
 model = AutoModelForCausalLM.from_pretrained(model_path,

@@ -67,7 +67,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 torch.xpu.synchronize()
@@ -55,7 +55,7 @@ if __name__ == '__main__':
 cpu_embedding=True
 )

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = model.to('xpu')
 # Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_path)

@@ -64,7 +64,7 @@ if __name__ == '__main__':
 st = time.time()
 # enabling `use_cache=True` allows the model to utilize the previous
 # key/values attentions to speed up decoding;
-# to obtain optimal performance with BigDL-LLM INT4 optimizations,
+# to obtain optimal performance with IPEX-LLM INT4 optimizations,
 # it is important to set use_cache=True for Dolly v1 models
 output = model.generate(input_ids,
 use_cache=True,

@@ -67,7 +67,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict,
 pad_token_id=tokenizer.pad_token_id,
@@ -67,7 +67,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 torch.xpu.synchronize()

@@ -69,7 +69,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 torch.xpu.synchronize()

@@ -68,7 +68,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 torch.xpu.synchronize()
Some files were not shown because too many files have changed in this diff.