Replace with IPEX-LLM in example comments (#10671)
* Replace with IPEX-LLM in example comments
* More replacement
* revert some changes

parent 08018a18df
commit 10ee786920

159 changed files with 183 additions and 183 deletions
@@ -48,7 +48,7 @@ import ssl
 import urllib.request
 import os
 import json
-# code change to import from bigdl-llm API instead of using transformers API
+# code change to import from IPEX-LLM API instead of using transformers API
 from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import LlamaTokenizer
 import intel_extension_for_pytorch as ipex
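The comment in the hunk above refers to the one-line import swap these examples rely on. A minimal sketch of that pattern, assuming a placeholder Llama-2 checkpoint id that is not taken from the diff:

    from ipex_llm.transformers import AutoModelForCausalLM   # drop-in for the transformers class
    from transformers import LlamaTokenizer

    model_path = "meta-llama/Llama-2-7b-chat-hf"              # placeholder checkpoint id
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 load_in_4bit=True)   # INT4 optimizations
    tokenizer = LlamaTokenizer.from_pretrained(model_path)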
@@ -87,7 +87,7 @@ if __name__ == '__main__':
 replace_method="auto"
 )

-# Apply BigDL-LLM INT4 optimizations on transformers
+# Apply IPEX-LLM INT4 optimizations on transformers
 model = optimize_model(model.module.to(f'cpu'), low_bit='sym_int4')

 model = model.to(f'cpu:{local_rank}')

@@ -111,7 +111,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 do_sample=False,
 max_new_tokens=args.n_predict)
@@ -59,7 +59,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
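The recurring comment block above is advice rather than a code change. A hedged sketch of what it recommends, assuming `model` and `tokenizer` were loaded through `ipex_llm.transformers` as in these examples, with a placeholder prompt and token budget:

    prompt = "What is AI?"                                    # placeholder prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output = model.generate(input_ids,
                            use_cache=True,                   # set explicitly, per the comment
                            max_new_tokens=32)
    print(tokenizer.decode(output[0], skip_special_tokens=True))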
@@ -44,7 +44,7 @@ if __name__ == '__main__':

 model_path = args.model

-# Load gguf model and vocab, then convert them to bigdl-llm model and huggingface tokenizer
+# Load gguf model and vocab, then convert them to IPEX-LLM model and huggingface tokenizer
 model, tokenizer = AutoModelForCausalLM.from_gguf(model_path, low_bit = args.low_bit,)

 # Generate predicted tokens
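The hunk above touches the GGUF loading path. A short sketch of that call, with a placeholder GGUF file name:

    from ipex_llm.transformers import AutoModelForCausalLM

    # Loads the GGUF weights and vocab, returning an IPEX-LLM model plus a
    # Hugging Face tokenizer, as the comment describes.
    model, tokenizer = AutoModelForCausalLM.from_gguf("llama-2-7b-chat.Q4_0.gguf",  # placeholder path
                                                      low_bit="sym_int4")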
@@ -60,7 +60,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -56,7 +56,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -56,7 +56,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -56,7 +56,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -45,7 +45,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 model = AutoModelForCausalLM.from_pretrained(model_path,
 load_in_4bit=True,
 trust_remote_code=True,

@@ -56,7 +56,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -57,7 +57,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -57,7 +57,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -57,7 +57,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -55,7 +55,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -56,7 +56,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations

 output = model.generate(input_ids, max_new_tokens=args.n_predict)


@@ -61,7 +61,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -61,7 +61,7 @@ if __name__ == '__main__':
 st = time.time()
 # enabling `use_cache=True` allows the model to utilize the previous
 # key/values attentions to speed up decoding;
-# to obtain optimal performance with BigDL-LLM INT4 optimizations,
+# to obtain optimal performance with IPEX-LLM INT4 optimizations,
 # it is important to set use_cache=True for Dolly v1 models
 output = model.generate(input_ids,
 use_cache=True,

@@ -64,7 +64,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict,
 pad_token_id=tokenizer.pad_token_id,

@@ -57,7 +57,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -60,7 +60,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -38,7 +38,7 @@ if __name__ == '__main__':
 image = Image.open(args.image_path)

 # Load model
-# For successful BigDL-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization
+# For successful IPEX-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization
 model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cpu',
 load_in_4bit = True,
 trust_remote_code=True,

@@ -61,7 +61,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -37,7 +37,7 @@ if __name__ == '__main__':
 image = args.image_path

 # Load model
-# For successful BigDL-LLM optimization on InternLM-XComposer, skip the 'qkv' module during optimization
+# For successful IPEX-LLM optimization on InternLM-XComposer, skip the 'qkv' module during optimization
 model = AutoModelForCausalLM.from_pretrained(model_path, device='cpu', load_in_4bit=True,
 trust_remote_code=True, modules_to_not_convert=['qkv'])

@@ -57,7 +57,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -56,7 +56,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -60,7 +60,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -55,7 +55,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -61,7 +61,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -56,7 +56,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -55,7 +55,7 @@ if __name__ == '__main__':
 input_ids = tokenizer.encode(prompt, return_tensors="pt")
 # enabling `use_cache=True` allows the model to utilize the previous
 # key/values attentions to speed up decoding;
-# to obtain optimal performance with BigDL-LLM INT4 optimizations,
+# to obtain optimal performance with IPEX-LLM INT4 optimizations,
 # it is important to set use_cache=True for MPT models
 mpt_generation_config = GenerationConfig(
 max_new_tokens=args.n_predict,

@@ -58,7 +58,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations

 # Note that phi-1_5 uses GenerationConfig to enable 'use_cache'
 output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict, generation_config = generation_config)

@@ -58,7 +58,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations

 model.generation_config.pad_token_id = model.generation_config.eos_token_id
 # Note that phi-2 uses GenerationConfig to enable 'use_cache'

@@ -58,7 +58,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations

 # Note that phixtral uses GenerationConfig to enable 'use_cache'
 output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict, generation_config = generation_config)
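The phi-1_5, phi-2 and phixtral hunks above mention enabling `use_cache` through `GenerationConfig`. A hedged sketch of that setup, assuming `model` and `input_ids` are prepared as in these examples and with a placeholder token budget:

    from transformers import GenerationConfig

    generation_config = GenerationConfig(use_cache=True)     # per the 'Note that ...' comments
    output = model.generate(input_ids,
                            do_sample=False,
                            max_new_tokens=32,
                            generation_config=generation_config)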
@@ -55,7 +55,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 do_sample=False,
 max_new_tokens=args.n_predict)

@@ -36,7 +36,7 @@ if __name__ == '__main__':
 model_path = args.repo_id_or_model_path

 # Load model
-# For successful BigDL-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization
+# For successful IPEX-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization
 model = AutoModelForCausalLM.from_pretrained(model_path,
 load_in_4bit=True,
 device_map="cpu",

@@ -64,7 +64,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -56,7 +56,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(**inputs,
 max_new_tokens=args.n_predict,
 do_sample=True,
@@ -55,7 +55,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -55,7 +55,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -59,7 +59,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -55,7 +55,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -53,7 +53,7 @@ if __name__ == '__main__':
 st = time.time()
 # enabling `use_cache=True` allows the model to utilize the previous
 # key/values attentions to speed up decoding;
-# to obtain optimal performance with BigDL-LLM INT4 optimizations,
+# to obtain optimal performance with IPEX-LLM INT4 optimizations,
 # it is important to set use_cache=True for vicuna-v1.3 models
 output = model.generate(input_ids,
 use_cache=True,

@@ -63,7 +63,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 predicted_ids = model.generate(input_features,
 forced_decoder_ids=forced_decoder_ids)
 end = time.time()
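The last hunk above comes from a Whisper speech-recognition example. A sketch of the surrounding decoding context, assuming a `WhisperProcessor`, an IPEX-LLM-loaded Whisper model, and placeholder audio features:

    from transformers import WhisperProcessor

    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")      # placeholder checkpoint
    forced_decoder_ids = processor.get_decoder_prompt_ids(language="english",
                                                          task="transcribe")
    predicted_ids = model.generate(input_features,                           # log-mel features from the processor (placeholder)
                                   forced_decoder_ids=forced_decoder_ids)
    text = processor.batch_decode(predicted_ids, skip_special_tokens=True)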
@@ -58,7 +58,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -61,7 +61,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -57,7 +57,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 outputs = model.generate(inputs, do_sample=True, top_k=5, max_length=args.n_predict)
 end_time = time.time()


@@ -42,7 +42,7 @@ if __name__ == '__main__':
 from ipex_llm.transformers import AutoModelForCausalLM
 # enabling `use_cache=True` allows the model to utilize the previous
 # key/values attentions to speed up decoding;
-# to obtain optimal performance with BigDL-LLM INT4 optimizations,
+# to obtain optimal performance with IPEX-LLM INT4 optimizations,
 # it is important to set use_cache=True for Ziya models
 model = AutoModelForCausalLM.from_pretrained(model_path,
 load_in_4bit=True,
@@ -163,7 +163,7 @@ def messages_to_prompt(messages):
 def main(args):
 embed_model = HuggingFaceEmbedding(model_name=args.embedding_model_path)

-# Use custom LLM in BigDL
+# Use custom LLM in IPEX-LLM
 from ipex_llm.llamaindex.llms import BigdlLLM
 llm = BigdlLLM(
 model_name=args.model_path,

@@ -59,7 +59,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -56,8 +56,8 @@ def load(model_path, model_family, n_threads):
 def inference(llm, repo_id_or_model_path, model_family, prompt):

 if model_family in ['llama', 'gptneox', 'bloom', 'starcoder', 'chatglm']:
-# ------ Option 1: Use bigdl-llm based tokenizer
-print('-'*20, ' bigdl-llm based tokenizer ', '-'*20)
+# ------ Option 1: Use IPEX-LLM based tokenizer
+print('-'*20, ' IPEX-LLM based tokenizer ', '-'*20)
 st = time.time()

 # please note that the prompt here can either be a string or a list of string

@@ -126,13 +126,13 @@ def main():
 if args.model_family == 'llama2':
 args.model_family = 'llama'

-# Step 1: convert original model to BigDL llm model
-bigdl_llm_path = convert(repo_id_or_model_path=repo_id_or_model_path,
+# Step 1: convert original model to IPEX-LLM model
+ipex_llm_path = convert(repo_id_or_model_path=repo_id_or_model_path,
 model_family=args.model_family,
 tmp_path=args.tmp_path)

 # Step 2: load int4 model
-llm = load(model_path=bigdl_llm_path,
+llm = load(model_path=ipex_llm_path,
 model_family=args.model_family,
 n_threads=args.thread_num)

@@ -44,7 +44,7 @@ if __name__ == '__main__':
 torch_dtype='auto',
 low_cpu_mem_usage=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer
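The hunks from here on use the `optimize_model` entry point rather than the transformers-style classes. A minimal sketch of that one-line pattern, assuming the top-level `optimize_model` import and a generic Hugging Face causal LM with a placeholder checkpoint id:

    from transformers import AutoModelForCausalLM
    from ipex_llm import optimize_model

    model = AutoModelForCausalLM.from_pretrained("openlm-research/open_llama_3b",  # placeholder
                                                 torch_dtype="auto",
                                                 low_cpu_mem_usage=True)
    model = optimize_model(model)   # one line to apply IPEX-LLM low-bit optimization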
@@ -38,7 +38,7 @@ if __name__ == '__main__':
 model = Bark.init_from_config(config)
 model.load_checkpoint(config, checkpoint_dir=model_path, eval=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Synthesize speech with the given input

@@ -38,7 +38,7 @@ if __name__ == '__main__':
 torch_dtype="auto",
 low_cpu_mem_usage=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -40,7 +40,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -41,7 +41,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModel.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -41,7 +41,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModel.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -41,7 +41,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -40,7 +40,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -50,7 +50,7 @@ if __name__ == '__main__':
 trust_remote_code=True,
 )

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -46,7 +46,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer
@@ -40,7 +40,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForSeq2SeqLM.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 # "wo" module is not converted due to some issues of T5 model
 # (https://github.com/huggingface/transformers/issues/20287),
 # "lm_head" module is not converted to generate outputs with better quality

@@ -40,8 +40,8 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cpu', trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
-# For successful BigDL-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization
+# With only one line to enable IPEX-LLM optimization on model
+# For successful IPEX-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization
 model = optimize_model(model,
 low_bit='sym_int4',
 modules_to_not_convert=['vision_embed_tokens'])

@@ -38,8 +38,8 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, device='cpu', trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
-# For successful BigDL-LLM optimization on InternLM-XComposer, skip the 'qkv' module during optimization
+# With only one line to enable IPEX-LLM optimization on model
+# For successful IPEX-LLM optimization on InternLM-XComposer, skip the 'qkv' module during optimization
 model = optimize_model(model,
 low_bit='sym_int4',
 modules_to_not_convert=['qkv'])

@@ -45,7 +45,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -292,7 +292,7 @@ if __name__ == '__main__':
 model_base=None,
 model_name=model_name)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Generate image tensor
@ -42,7 +42,7 @@ if __name__ == '__main__':
|
||||||
# Load model
|
# Load model
|
||||||
model = MambaLMHeadModel.from_pretrained(model_path)
|
model = MambaLMHeadModel.from_pretrained(model_path)
|
||||||
|
|
||||||
# With only one line to enable BigDL-LLM optimization on model
|
# With only one line to enable IPEX-LLM optimization on model
|
||||||
model = optimize_model(model, low_bit='asym_int4', modules_to_not_convert=["dt_proj", "x_proj", "out_proj"])
|
model = optimize_model(model, low_bit='asym_int4', modules_to_not_convert=["dt_proj", "x_proj", "out_proj"])
|
||||||
|
|
||||||
# Load tokenizer
|
# Load tokenizer
|
||||||
|
|
|
||||||
|
|
@ -44,7 +44,7 @@ if __name__ == '__main__':
|
||||||
torch_dtype='auto',
|
torch_dtype='auto',
|
||||||
low_cpu_mem_usage=True)
|
low_cpu_mem_usage=True)
|
||||||
|
|
||||||
# With only one line to enable BigDL-LLM optimization on model
|
# With only one line to enable IPEX-LLM optimization on model
|
||||||
model = optimize_model(model)
|
model = optimize_model(model)
|
||||||
|
|
||||||
# Load tokenizer
|
# Load tokenizer
|
||||||
|
|
|
||||||
|
|
@ -44,7 +44,7 @@ if __name__ == '__main__':
|
||||||
torch_dtype='auto',
|
torch_dtype='auto',
|
||||||
low_cpu_mem_usage=True)
|
low_cpu_mem_usage=True)
|
||||||
|
|
||||||
# With only one line to enable BigDL-LLM optimization on model
|
# With only one line to enable IPEX-LLM optimization on model
|
||||||
model = optimize_model(model)
|
model = optimize_model(model)
|
||||||
|
|
||||||
# Load tokenizer
|
# Load tokenizer
|
||||||
|
|
|
||||||
|
|
@ -46,7 +46,7 @@ if __name__ == '__main__':
|
||||||
# Load whisper model under pytorch framework
|
# Load whisper model under pytorch framework
|
||||||
model = whisper.load_model(args.model_name)
|
model = whisper.load_model(args.model_name)
|
||||||
|
|
||||||
# With only one line to enable bigdl optimize on a pytorch model
|
# With only one line to enable IPEX-LLM optimize on a pytorch model
|
||||||
model = optimize_model(model)
|
model = optimize_model(model)
|
||||||
|
|
||||||
st = time.time()
|
st = time.time()
|
||||||
|
|
|
||||||
|
|
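The Whisper hunk above is the simplest form of the pattern: load any PyTorch model with its native API, then pass it through `optimize_model` once. A hedged sketch follows; the `openai-whisper` dependency, model size, and audio path are assumptions, not taken from the commit.

# Sketch only; "tiny" and "audio.wav" are placeholders.
import whisper                      # assumption: pip install openai-whisper
from ipex_llm import optimize_model

model = whisper.load_model("tiny")  # any Whisper checkpoint size
model = optimize_model(model)       # one line: convert supported layers to low-bit

result = model.transcribe("audio.wav")
print(result["text"])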
@ -42,7 +42,7 @@ if __name__ == '__main__':
# Load model
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

# With only one line to enable IPEX-LLM optimization on model
model = optimize_model(model)

# Load tokenizer

@ -42,7 +42,7 @@ if __name__ == '__main__':
# Load model
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

# With only one line to enable IPEX-LLM optimization on model
model = optimize_model(model)

# Load tokenizer

@ -37,8 +37,8 @@ if __name__ == '__main__':
# Load model
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="cpu", trust_remote_code=True)

# With only one line to enable IPEX-LLM optimization on model
# For successful IPEX-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization
model = optimize_model(model,
low_bit='sym_int4',
modules_to_not_convert=['c_fc', 'out_proj'])

@ -41,7 +41,7 @@ if __name__ == '__main__':
model = AutoModelForCausalLM.from_pretrained(model_path,
trust_remote_code=True)

# With only one line to enable IPEX-LLM optimization on model
model = optimize_model(model)

# Load tokenizer

@ -45,7 +45,7 @@ if __name__ == '__main__':
torch_dtype=torch.float16,
trust_remote_code=True)

# With only one line to enable IPEX-LLM optimization on model
model = optimize_model(model)

# Load tokenizer

@ -44,7 +44,7 @@ if __name__ == '__main__':
# Load model
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

# With only one line to enable IPEX-LLM optimization on model
model = optimize_model(model)

# Load tokenizer

@ -54,7 +54,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()
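The four-line comment above recurs in almost every example touched by this commit: when a model config ships with `"use_cache": false`, pass `use_cache=True` to `generate` yourself so the low-bit decode path can reuse past key/value states. A hedged end-to-end sketch follows; the model id and prompt are placeholders, and `load_in_4bit=True` is the usual IPEX-LLM loading shortcut, assumed here rather than shown in this hunk.

# Sketch only; model id and prompt are placeholders.
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_path = "meta-llama/Llama-2-7b-chat-hf"
model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

input_ids = tokenizer("What is AI?", return_tensors="pt").input_ids
# Explicit use_cache=True keeps the KV cache on even if the config disables it.
output = model.generate(input_ids, use_cache=True, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))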
@ -48,7 +48,7 @@ if __name__ == '__main__':
print("Creating model...")
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="cpu", trust_remote_code=True, torch_dtype=torch.float16).eval()

# With only one line to enable IPEX-LLM optimization on model
model = optimize_model(model)

prompt = YUAN2_PROMPT_FORMAT.format(prompt=args.prompt)

@ -59,7 +59,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with IPEX-LLM INT4 optimizations
outputs = model.generate(inputs, do_sample=True, top_k=5, max_length=args.n_predict)
end_time = time.time()

@ -42,7 +42,7 @@ if __name__ == '__main__':
from ipex_llm import optimize_model
# enabling `use_cache=True` allows the model to utilize the previous
# key/values attentions to speed up decoding;
# to obtain optimal performance with IPEX-LLM `optimization_model` API optimizations,
# it is important to set use_cache=True for Ziya models
model = AutoModelForCausalLM.from_pretrained(model_path,
trust_remote_code=True,

@ -49,7 +49,7 @@ if __name__ == '__main__':
# Load model
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

# With only one line to enable IPEX-LLM optimization on model
# `low_bit` param support `sym_int4`, `asym_int4`, `sym_int5`, `asym_int5` and `sym_int8`
# By specifying `low_bit` param, relevant low bit optimizations will be applied to the model
model = optimize_model(model, low_bit=low_bit)
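The hunk above lists the precisions `optimize_model` accepts through `low_bit`. Below is a small sketch of wiring that choice to a command-line flag; the argument name and the tiny placeholder model are illustrative only, not part of the commit.

# Sketch only; "gpt2" is just a small placeholder model.
import argparse
from transformers import AutoModelForCausalLM
from ipex_llm import optimize_model

parser = argparse.ArgumentParser()
parser.add_argument('--low-bit', default='sym_int4',
                    choices=['sym_int4', 'asym_int4', 'sym_int5', 'asym_int5', 'sym_int8'])
args = parser.parse_args()

model = AutoModelForCausalLM.from_pretrained("gpt2")
# The chosen value decides which low-bit kernels optimize_model installs.
model = optimize_model(model, low_bit=args.low_bit)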
@ -49,7 +49,7 @@ from utils.prompter import Prompter
from transformers import BitsAndBytesConfig
from ipex_llm.transformers import AutoModelForCausalLM

# import them from ipex_llm.transformers.qlora to get a IPEX-LLM compatible Peft model
from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training, LoraConfig
from ipex_llm.utils.isa_checker import ISAChecker

@ -64,7 +64,7 @@ def get_int_from_env(env_keys, default):
def train(
# model/data params
base_model: str = "meta-llama/Llama-2-7b-hf", # the only required argument, default to be "meta-llama/Llama-2-7b-hf"
saved_low_bit_model: str = None, # optional, the path to the saved model with ipex-llm low-bit optimization
data_path: str = "yahma/alpaca-cleaned",
output_dir: str = "./bigdl-qlora-alpaca",
# training hyperparams

@ -256,7 +256,7 @@ def train(
] # could be sped up, probably
return tokenized_full_prompt

# Prepare a IPEX-LLM compatible Peft model
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing)

config = LoraConfig(
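The three QLoRA hunks above show the moving parts: import the Peft helpers from `ipex_llm.transformers.qlora` instead of stock `peft`, make the low-bit model trainable with `prepare_model_for_kbit_training`, then attach a `LoraConfig`. A hedged sketch of how they fit together follows; the base model id, the `load_in_low_bit="nf4"` choice, and the LoRA hyper-parameters are assumptions, not values from this diff.

# Sketch only; hyper-parameters and model id are illustrative.
from ipex_llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training, LoraConfig

base_model = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(base_model,
                                             load_in_low_bit="nf4",   # assumed QLoRA-style 4-bit load
                                             trust_remote_code=True)

# Same two steps as the hunks above: enable training on the low-bit model, then add LoRA adapters.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
config = LoraConfig(r=8, lora_alpha=32, lora_dropout=0.05,
                    target_modules=["q_proj", "k_proj", "v_proj"],    # illustrative targets
                    bias="none", task_type="CAUSAL_LM")
model = get_peft_model(model, config)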
@ -63,7 +63,7 @@ if __name__ == '__main__':
low_bit = args.low_bit

# First use CPU as accelerator
# Convert to deepspeed model and apply IPEX-LLM optimization on CPU to decrease GPU memory usage
current_accel = CPU_Accelerator()
set_accelerator(current_accel)
model = AutoModelForCausalLM.from_pretrained(args.repo_id_or_model_path,

@ -80,7 +80,7 @@ if __name__ == '__main__':
replace_method="auto",
)

# Use IPEX-LLM `optimize_model` to convert the model into optimized low bit format
# Convert the rest of the model into float16 to reduce allreduce traffic
model = optimize_model(model.module.to(f'cpu'), low_bit=low_bit).to(torch.float16)
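The two DeepSpeed hunks above encode a specific order of operations: shard the model with DeepSpeed AutoTP while a CPU accelerator is active, low-bit-convert each shard with `optimize_model` on the CPU, and only then cast the remainder to float16 so allreduce traffic stays small. Below is a condensed sketch of that sequence, wrapped in a hypothetical helper; the function name and arguments are illustrative, and the `deepspeed.init_inference` parameters simply mirror the lines shown above.

# Hypothetical helper condensing the hunks above; not part of the commit.
import torch
import deepspeed
from ipex_llm import optimize_model

def shard_and_quantize(model, world_size, low_bit="sym_int4"):
    # Step 1: let DeepSpeed AutoTP shard the model across ranks.
    model = deepspeed.init_inference(model,
                                     mp_size=world_size,
                                     dtype=torch.float16,
                                     replace_method="auto")
    # Step 2: low-bit convert the local shard on CPU, keep the rest in float16.
    return optimize_model(model.module.to('cpu'), low_bit=low_bit).to(torch.float16)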
@ -119,7 +119,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
do_sample=False,
max_new_tokens=args.n_predict)

@ -58,7 +58,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

@ -71,7 +71,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with IPEX-LLM Low Bit optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict,
repetition_penalty=1.1)

@ -44,7 +44,7 @@ if __name__ == '__main__':

model_path = args.model

# Load gguf model and vocab, then convert them to IPEX-LLM model and huggingface tokenizer
model, tokenizer = AutoModelForCausalLM.from_gguf(model_path)
model = model.to('xpu')
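The GGUF hunk above relies on `AutoModelForCausalLM.from_gguf`, which returns both the converted model and a matching Hugging Face tokenizer in one call. A minimal sketch follows; the .gguf file name, prompt, and the `xpu` device are placeholders and assumptions echoing the hunk.

# Sketch only; the GGUF path and prompt are placeholders.
from ipex_llm.transformers import AutoModelForCausalLM

gguf_path = "llama-2-7b-chat.Q4_0.gguf"
# from_gguf converts the GGUF weights and vocab into an IPEX-LLM model plus a HF tokenizer.
model, tokenizer = AutoModelForCausalLM.from_gguf(gguf_path)
model = model.to('xpu')               # Intel GPU, as in the hunk above

input_ids = tokenizer("Tell me a joke.", return_tensors="pt").input_ids.to('xpu')
output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))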
@ -59,7 +59,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

@ -60,7 +60,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
torch.xpu.synchronize()

@ -60,7 +60,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
torch.xpu.synchronize()

@ -64,7 +64,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
torch.xpu.synchronize()

@ -44,7 +44,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with IPEX-LLM INT4 optimizations
# When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function.
# This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.
model = AutoModelForCausalLM.from_pretrained(model_path,
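Several GPU hunks above carry the same Windows iGPU note: pass `cpu_embedding=True` to `from_pretrained` so the memory-hungry embedding table stays in host memory while the rest of the model runs on the iGPU. A hedged sketch follows; the model id and prompt are placeholders, and `load_in_4bit=True` is the usual loading shortcut, assumed rather than shown in these hunks.

# Sketch only; model id and prompt are placeholders.
import torch
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_path = "Qwen/Qwen-7B-Chat"
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             cpu_embedding=True,   # keep the embedding layer on the CPU (iGPU advice above)
                                             trust_remote_code=True)
model = model.to('xpu')

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
input_ids = tokenizer("What is AI?", return_tensors="pt").input_ids.to('xpu')
with torch.inference_mode():
    output = model.generate(input_ids, use_cache=True, max_new_tokens=32)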
@ -64,7 +64,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
torch.xpu.synchronize()

@ -67,7 +67,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
torch.xpu.synchronize()

@ -67,7 +67,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
torch.xpu.synchronize()

@ -57,7 +57,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly to obtain optimal
# performance with IPEX-LLM INT4 optimizations
# When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function.
# This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.
model = AutoModelForCausalLM.from_pretrained(model_path,

@ -67,7 +67,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
torch.xpu.synchronize()

@ -55,7 +55,7 @@ if __name__ == '__main__':
cpu_embedding=True
)

# With only one line to enable IPEX-LLM optimization on model
model = model.to('xpu')
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

@ -64,7 +64,7 @@ if __name__ == '__main__':
st = time.time()
# enabling `use_cache=True` allows the model to utilize the previous
# key/values attentions to speed up decoding;
# to obtain optimal performance with IPEX-LLM INT4 optimizations,
# it is important to set use_cache=True for Dolly v1 models
output = model.generate(input_ids,
use_cache=True,

@ -67,7 +67,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict,
pad_token_id=tokenizer.pad_token_id,

@ -67,7 +67,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
torch.xpu.synchronize()

@ -69,7 +69,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
torch.xpu.synchronize()

@ -68,7 +68,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
torch.xpu.synchronize()
Some files were not shown because too many files have changed in this diff.