Replace with IPEX-LLM in example comments (#10671)

* Replace with IPEX-LLM in example comments

* More replacement

* revert some changes
Jin Qiao authored on 2024-04-07 13:29:51 +08:00, committed by GitHub
parent 08018a18df
commit 10ee786920
159 changed files with 183 additions and 183 deletions

View file

@ -48,7 +48,7 @@ import ssl
import urllib.request
import os
import json
# code change to import from bigdl-llm API instead of using transformers API
# code change to import from IPEX-LLM API instead of using transformers API
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer
import intel_extension_for_pytorch as ipex
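As a point of reference for readers skimming the diff, the renamed comment refers to the drop-in import pattern these CPU examples share: `ipex_llm.transformers` replaces the `transformers` loading class, and the rest of the Hugging Face workflow is unchanged. A minimal sketch follows; the model ID and prompt are illustrative placeholders, not values from this repository.

```python
# Minimal sketch (placeholders, not from this repo): drop-in IPEX-LLM import with INT4 loading.
from ipex_llm.transformers import AutoModelForCausalLM  # replaces transformers.AutoModelForCausalLM
from transformers import AutoTokenizer

model_path = "meta-llama/Llama-2-7b-chat-hf"  # placeholder model ID

# load_in_4bit=True applies the INT4 optimizations at load time
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

input_ids = tokenizer.encode("What is AI?", return_tensors="pt")
# use_cache=True keeps KV-cache decoding enabled even if the model config disables it
output = model.generate(input_ids, use_cache=True, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```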

View file

@ -87,7 +87,7 @@ if __name__ == '__main__':
replace_method="auto"
)
# Apply BigDL-LLM INT4 optimizations on transformers
# Apply IPEX-LLM INT4 optimizations on transformers
model = optimize_model(model.module.to(f'cpu'), low_bit='sym_int4')
model = model.to(f'cpu:{local_rank}')
@ -111,7 +111,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
do_sample=False,
max_new_tokens=args.n_predict)

View file

@ -59,7 +59,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -44,7 +44,7 @@ if __name__ == '__main__':
model_path = args.model
# Load gguf model and vocab, then convert them to bigdl-llm model and huggingface tokenizer
# Load gguf model and vocab, then convert them to IPEX-LLM model and huggingface tokenizer
model, tokenizer = AutoModelForCausalLM.from_gguf(model_path, low_bit = args.low_bit,)
# Generate predicted tokens
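For context on the `from_gguf` comment above, a rough sketch of the full loading path is shown below; the `.gguf` file path, prompt, and `low_bit` value are assumptions for illustration.

```python
# Hypothetical GGUF loading sketch; the file path and low_bit value are placeholders.
from ipex_llm.transformers import AutoModelForCausalLM

gguf_path = "/path/to/llama-2-7b-chat.Q4_0.gguf"  # placeholder path

# from_gguf converts GGUF weights into an IPEX-LLM model plus a Hugging Face tokenizer
model, tokenizer = AutoModelForCausalLM.from_gguf(gguf_path, low_bit="sym_int4")

input_ids = tokenizer.encode("What is AI?", return_tensors="pt")
output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```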

View file

@ -60,7 +60,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -56,7 +56,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -56,7 +56,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -56,7 +56,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -45,7 +45,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
model = AutoModelForCausalLM.from_pretrained(model_path,
load_in_4bit=True,
trust_remote_code=True,

View file

@ -56,7 +56,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -57,7 +57,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -57,7 +57,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -57,7 +57,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -55,7 +55,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -56,7 +56,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids, max_new_tokens=args.n_predict)

View file

@ -61,7 +61,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -61,7 +61,7 @@ if __name__ == '__main__':
st = time.time()
# enabling `use_cache=True` allows the model to utilize the previous
# key/values attentions to speed up decoding;
# to obtain optimal performance with BigDL-LLM INT4 optimizations,
# to obtain optimal performance with IPEX-LLM INT4 optimizations,
# it is important to set use_cache=True for Dolly v1 models
output = model.generate(input_ids,
use_cache=True,

View file

@ -64,7 +64,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict,
pad_token_id=tokenizer.pad_token_id,

View file

@ -57,7 +57,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -60,7 +60,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -38,7 +38,7 @@ if __name__ == '__main__':
image = Image.open(args.image_path)
# Load model
# For successful BigDL-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization
# For successful IPEX-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization
model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cpu',
load_in_4bit = True,
trust_remote_code=True,

View file

@ -61,7 +61,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -37,7 +37,7 @@ if __name__ == '__main__':
image = args.image_path
# Load model
# For successful BigDL-LLM optimization on InternLM-XComposer, skip the 'qkv' module during optimization
# For successful IPEX-LLM optimization on InternLM-XComposer, skip the 'qkv' module during optimization
model = AutoModelForCausalLM.from_pretrained(model_path, device='cpu', load_in_4bit=True,
trust_remote_code=True, modules_to_not_convert=['qkv'])
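The "skip the 'qkv' module" wording refers to the `modules_to_not_convert` argument visible in the context line above; a hedged sketch of the same pattern in isolation is below. The model ID mirrors the example's subject but should be treated as a placeholder.

```python
# Sketch of excluding specific modules from low-bit conversion (placeholder model ID).
from ipex_llm.transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "internlm/internlm-xcomposer-vl-7b",   # placeholder model ID
    device='cpu',
    load_in_4bit=True,
    trust_remote_code=True,
    modules_to_not_convert=['qkv'])        # keep the 'qkv' projection in full precision
```

The Fuyu and Qwen-VL-Chat examples elsewhere in this diff use the same argument with different module names ('vision_embed_tokens', 'c_fc'/'out_proj').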

View file

@ -57,7 +57,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -56,7 +56,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -60,7 +60,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -55,7 +55,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -61,7 +61,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -56,7 +56,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -55,7 +55,7 @@ if __name__ == '__main__':
input_ids = tokenizer.encode(prompt, return_tensors="pt")
# enabling `use_cache=True` allows the model to utilize the previous
# key/values attentions to speed up decoding;
# to obtain optimal performance with BigDL-LLM INT4 optimizations,
# to obtain optimal performance with IPEX-LLM INT4 optimizations,
# it is important to set use_cache=True for MPT models
mpt_generation_config = GenerationConfig(
max_new_tokens=args.n_predict,

View file

@ -58,7 +58,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
# Note that phi-1_5 uses GenerationConfig to enable 'use_cache'
output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict, generation_config = generation_config)
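Because phi-style models read `use_cache` from a `GenerationConfig` rather than a `generate` keyword (as the note above says), a small self-contained sketch of that variant may help; the model ID and prompt are placeholders.

```python
# Sketch of enabling the KV cache through GenerationConfig (placeholder model ID and prompt).
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer, GenerationConfig

model_path = "microsoft/phi-1_5"  # placeholder model ID
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

generation_config = GenerationConfig(use_cache=True)  # enable KV-cache decoding explicitly
input_ids = tokenizer.encode("def fibonacci(n):", return_tensors="pt")
output = model.generate(input_ids,
                        do_sample=False,
                        max_new_tokens=32,
                        generation_config=generation_config)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```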

View file

@ -58,7 +58,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
model.generation_config.pad_token_id = model.generation_config.eos_token_id
# Note that phi-2 uses GenerationConfig to enable 'use_cache'

View file

@ -58,7 +58,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
# Note that phixtral uses GenerationConfig to enable 'use_cache'
output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict, generation_config = generation_config)

View file

@ -55,7 +55,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
do_sample=False,
max_new_tokens=args.n_predict)

View file

@ -36,7 +36,7 @@ if __name__ == '__main__':
model_path = args.repo_id_or_model_path
# Load model
# For successful BigDL-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization
# For successful IPEX-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization
model = AutoModelForCausalLM.from_pretrained(model_path,
load_in_4bit=True,
device_map="cpu",

View file

@ -64,7 +64,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -56,7 +56,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(**inputs,
max_new_tokens=args.n_predict,
do_sample=True,

View file

@ -55,7 +55,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -55,7 +55,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -59,7 +59,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -55,7 +55,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -53,7 +53,7 @@ if __name__ == '__main__':
st = time.time()
# enabling `use_cache=True` allows the model to utilize the previous
# key/values attentions to speed up decoding;
# to obtain optimal performance with BigDL-LLM INT4 optimizations,
# to obtain optimal performance with IPEX-LLM INT4 optimizations,
# it is important to set use_cache=True for vicuna-v1.3 models
output = model.generate(input_ids,
use_cache=True,

View file

@ -63,7 +63,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
predicted_ids = model.generate(input_features,
forced_decoder_ids=forced_decoder_ids)
end = time.time()

View file

@ -58,7 +58,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -61,7 +61,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -57,7 +57,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
outputs = model.generate(inputs, do_sample=True, top_k=5, max_length=args.n_predict)
end_time = time.time()

View file

@ -42,7 +42,7 @@ if __name__ == '__main__':
from ipex_llm.transformers import AutoModelForCausalLM
# enabling `use_cache=True` allows the model to utilize the previous
# key/values attentions to speed up decoding;
# to obtain optimal performance with BigDL-LLM INT4 optimizations,
# to obtain optimal performance with IPEX-LLM INT4 optimizations,
# it is important to set use_cache=True for Ziya models
model = AutoModelForCausalLM.from_pretrained(model_path,
load_in_4bit=True,

View file

@ -163,7 +163,7 @@ def messages_to_prompt(messages):
def main(args):
embed_model = HuggingFaceEmbedding(model_name=args.embedding_model_path)
# Use custom LLM in BigDL
# Use custom LLM in IPEX-LLM
from ipex_llm.llamaindex.llms import BigdlLLM
llm = BigdlLLM(
model_name=args.model_path,

View file

@ -59,7 +59,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -56,8 +56,8 @@ def load(model_path, model_family, n_threads):
def inference(llm, repo_id_or_model_path, model_family, prompt):
if model_family in ['llama', 'gptneox', 'bloom', 'starcoder', 'chatglm']:
# ------ Option 1: Use bigdl-llm based tokenizer
print('-'*20, ' bigdl-llm based tokenizer ', '-'*20)
# ------ Option 1: Use IPEX-LLM based tokenizer
print('-'*20, ' IPEX-LLM based tokenizer ', '-'*20)
st = time.time()
# please note that the prompt here can either be a string or a list of string
@ -126,13 +126,13 @@ def main():
if args.model_family == 'llama2':
args.model_family = 'llama'
# Step 1: convert original model to BigDL llm model
bigdl_llm_path = convert(repo_id_or_model_path=repo_id_or_model_path,
# Step 1: convert original model to IPEX-LLM model
ipex_llm_path = convert(repo_id_or_model_path=repo_id_or_model_path,
model_family=args.model_family,
tmp_path=args.tmp_path)
# Step 2: load int4 model
llm = load(model_path=bigdl_llm_path,
llm = load(model_path=ipex_llm_path,
model_family=args.model_family,
n_threads=args.thread_num)

View file

@ -44,7 +44,7 @@ if __name__ == '__main__':
torch_dtype='auto',
low_cpu_mem_usage=True)
# With only one line to enable BigDL-LLM optimization on model
# With only one line to enable IPEX-LLM optimization on model
model = optimize_model(model)
# Load tokenizer
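For readers unfamiliar with the PyTorch-API examples, the "one line" being renamed in these hunks is `optimize_model`: the model is loaded with the stock Hugging Face API first and optimized in place afterwards. A sketch with a placeholder model ID:

```python
# Sketch of the PyTorch-API path (placeholder model ID): load with stock transformers,
# then apply IPEX-LLM low-bit optimization in one line.
from transformers import AutoModelForCausalLM, AutoTokenizer
from ipex_llm import optimize_model

model_path = "openlm-research/open_llama_3b"  # placeholder model ID
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             torch_dtype="auto",
                                             low_cpu_mem_usage=True)
# optimize_model defaults to INT4; `low_bit` (e.g. 'sym_int4', 'sym_int8') and
# `modules_to_not_convert` allow finer control where a model needs it.
model = optimize_model(model)

tokenizer = AutoTokenizer.from_pretrained(model_path)
```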

View file

@ -38,7 +38,7 @@ if __name__ == '__main__':
model = Bark.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir=model_path, eval=True)
# With only one line to enable BigDL-LLM optimization on model
# With only one line to enable IPEX-LLM optimization on model
model = optimize_model(model)
# Synthesize speech with the given input

View file

@ -38,7 +38,7 @@ if __name__ == '__main__':
torch_dtype="auto",
low_cpu_mem_usage=True)
# With only one line to enable BigDL-LLM optimization on model
# With only one line to enable IPEX-LLM optimization on model
model = optimize_model(model)
# Load tokenizer

View file

@ -40,7 +40,7 @@ if __name__ == '__main__':
# Load model
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
# With only one line to enable BigDL-LLM optimization on model
# With only one line to enable IPEX-LLM optimization on model
model = optimize_model(model)
# Load tokenizer

View file

@ -41,7 +41,7 @@ if __name__ == '__main__':
# Load model
model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
# With only one line to enable BigDL-LLM optimization on model
# With only one line to enable IPEX-LLM optimization on model
model = optimize_model(model)
# Load tokenizer

View file

@ -41,7 +41,7 @@ if __name__ == '__main__':
# Load model
model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
# With only one line to enable BigDL-LLM optimization on model
# With only one line to enable IPEX-LLM optimization on model
model = optimize_model(model)
# Load tokenizer

View file

@ -41,7 +41,7 @@ if __name__ == '__main__':
# Load model
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
# With only one line to enable BigDL-LLM optimization on model
# With only one line to enable IPEX-LLM optimization on model
model = optimize_model(model)
# Load tokenizer

View file

@ -40,7 +40,7 @@ if __name__ == '__main__':
# Load model
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
# With only one line to enable BigDL-LLM optimization on model
# With only one line to enable IPEX-LLM optimization on model
model = optimize_model(model)
# Load tokenizer

View file

@ -50,7 +50,7 @@ if __name__ == '__main__':
trust_remote_code=True,
)
# With only one line to enable BigDL-LLM optimization on model
# With only one line to enable IPEX-LLM optimization on model
model = optimize_model(model)
# Load tokenizer

View file

@ -46,7 +46,7 @@ if __name__ == '__main__':
# Load model
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
# With only one line to enable BigDL-LLM optimization on model
# With only one line to enable IPEX-LLM optimization on model
model = optimize_model(model)
# Load tokenizer

View file

@ -40,7 +40,7 @@ if __name__ == '__main__':
# Load model
model = AutoModelForSeq2SeqLM.from_pretrained(model_path, trust_remote_code=True)
# With only one line to enable BigDL-LLM optimization on model
# With only one line to enable IPEX-LLM optimization on model
# "wo" module is not converted due to some issues of T5 model
# (https://github.com/huggingface/transformers/issues/20287),
# "lm_head" module is not converted to generate outputs with better quality

View file

@ -40,8 +40,8 @@ if __name__ == '__main__':
# Load model
model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cpu', trust_remote_code=True)
# With only one line to enable BigDL-LLM optimization on model
# For successful BigDL-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization
# With only one line to enable IPEX-LLM optimization on model
# For successful IPEX-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization
model = optimize_model(model,
low_bit='sym_int4',
modules_to_not_convert=['vision_embed_tokens'])

View file

@ -38,8 +38,8 @@ if __name__ == '__main__':
# Load model
model = AutoModelForCausalLM.from_pretrained(model_path, device='cpu', trust_remote_code=True)
# With only one line to enable BigDL-LLM optimization on model
# For successful BigDL-LLM optimization on InternLM-XComposer, skip the 'qkv' module during optimization
# With only one line to enable IPEX-LLM optimization on model
# For successful IPEX-LLM optimization on InternLM-XComposer, skip the 'qkv' module during optimization
model = optimize_model(model,
low_bit='sym_int4',
modules_to_not_convert=['qkv'])

View file

@ -45,7 +45,7 @@ if __name__ == '__main__':
# Load model
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
# With only one line to enable BigDL-LLM optimization on model
# With only one line to enable IPEX-LLM optimization on model
model = optimize_model(model)
# Load tokenizer

View file

@ -292,7 +292,7 @@ if __name__ == '__main__':
model_base=None,
model_name=model_name)
# With only one line to enable BigDL-LLM optimization on model
# With only one line to enable IPEX-LLM optimization on model
model = optimize_model(model)
# Generate image tensor

View file

@ -42,7 +42,7 @@ if __name__ == '__main__':
# Load model
model = MambaLMHeadModel.from_pretrained(model_path)
# With only one line to enable BigDL-LLM optimization on model
# With only one line to enable IPEX-LLM optimization on model
model = optimize_model(model, low_bit='asym_int4', modules_to_not_convert=["dt_proj", "x_proj", "out_proj"])
# Load tokenizer

View file

@ -44,7 +44,7 @@ if __name__ == '__main__':
torch_dtype='auto',
low_cpu_mem_usage=True)
# With only one line to enable BigDL-LLM optimization on model
# With only one line to enable IPEX-LLM optimization on model
model = optimize_model(model)
# Load tokenizer

View file

@ -44,7 +44,7 @@ if __name__ == '__main__':
torch_dtype='auto',
low_cpu_mem_usage=True)
# With only one line to enable BigDL-LLM optimization on model
# With only one line to enable IPEX-LLM optimization on model
model = optimize_model(model)
# Load tokenizer

View file

@ -46,7 +46,7 @@ if __name__ == '__main__':
# Load whisper model under pytorch framework
model = whisper.load_model(args.model_name)
# With only one line to enable bigdl optimize on a pytorch model
# With only one line to enable IPEX-LLM optimize on a pytorch model
model = optimize_model(model)
st = time.time()
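This example applies `optimize_model` to a plain PyTorch model (openai-whisper) rather than a Hugging Face one; a short sketch of that flow is below, with the model size and audio path as placeholders.

```python
# Sketch of optimizing a non-Hugging-Face PyTorch model (placeholder model size and audio path).
import whisper                      # the openai-whisper package
from ipex_llm import optimize_model

model = whisper.load_model("tiny")  # placeholder model size
model = optimize_model(model)       # same one-line low-bit optimization as the HF examples

result = model.transcribe("/path/to/audio.wav")  # placeholder audio path
print(result["text"])
```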

View file

@ -42,7 +42,7 @@ if __name__ == '__main__':
# Load model
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
# With only one line to enable BigDL-LLM optimization on model
# With only one line to enable IPEX-LLM optimization on model
model = optimize_model(model)
# Load tokenizer

View file

@ -42,7 +42,7 @@ if __name__ == '__main__':
# Load model
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
# With only one line to enable BigDL-LLM optimization on model
# With only one line to enable IPEX-LLM optimization on model
model = optimize_model(model)
# Load tokenizer

View file

@ -37,8 +37,8 @@ if __name__ == '__main__':
# Load model
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="cpu", trust_remote_code=True)
# With only one line to enable BigDL-LLM optimization on model
# For successful BigDL-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization
# With only one line to enable IPEX-LLM optimization on model
# For successful IPEX-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization
model = optimize_model(model,
low_bit='sym_int4',
modules_to_not_convert=['c_fc', 'out_proj'])

View file

@ -41,7 +41,7 @@ if __name__ == '__main__':
model = AutoModelForCausalLM.from_pretrained(model_path,
trust_remote_code=True)
# With only one line to enable BigDL-LLM optimization on model
# With only one line to enable IPEX-LLM optimization on model
model = optimize_model(model)
# Load tokenizer

View file

@ -45,7 +45,7 @@ if __name__ == '__main__':
torch_dtype=torch.float16,
trust_remote_code=True)
# With only one line to enable BigDL-LLM optimization on model
# With only one line to enable IPEX-LLM optimization on model
model = optimize_model(model)
# Load tokenizer

View file

@ -44,7 +44,7 @@ if __name__ == '__main__':
# Load model
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
# With only one line to enable BigDL-LLM optimization on model
# With only one line to enable IPEX-LLM optimization on model
model = optimize_model(model)
# Load tokenizer

View file

@ -54,7 +54,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -48,7 +48,7 @@ if __name__ == '__main__':
print("Creating model...")
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="cpu", trust_remote_code=True, torch_dtype=torch.float16).eval()
# With only one line to enable BigDL-LLM optimization on model
# With only one line to enable IPEX-LLM optimization on model
model = optimize_model(model)
prompt = YUAN2_PROMPT_FORMAT.format(prompt=args.prompt)
@ -59,7 +59,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
outputs = model.generate(inputs, do_sample=True, top_k=5, max_length=args.n_predict)
end_time = time.time()

View file

@ -42,7 +42,7 @@ if __name__ == '__main__':
from ipex_llm import optimize_model
# enabling `use_cache=True` allows the model to utilize the previous
# key/values attentions to speed up decoding;
# to obtain optimal performance with BigDL-LLM `optimization_model` API optimizations,
# to obtain optimal performance with IPEX-LLM `optimization_model` API optimizations,
# it is important to set use_cache=True for Ziya models
model = AutoModelForCausalLM.from_pretrained(model_path,
trust_remote_code=True,

View file

@ -49,7 +49,7 @@ if __name__ == '__main__':
# Load model
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
# With only one line to enable BigDL-LLM optimization on model
# With only one line to enable IPEX-LLM optimization on model
# `low_bit` param support `sym_int4`, `asym_int4`, `sym_int5`, `asym_int5` and `sym_int8`
# By specifying `low_bit` param, relevant low bit optimizations will be applied to the model
model = optimize_model(model, low_bit=low_bit)

View file

@ -49,7 +49,7 @@ from utils.prompter import Prompter
from transformers import BitsAndBytesConfig
from ipex_llm.transformers import AutoModelForCausalLM
# import them from ipex_llm.transformers.qlora to get a BigDL-LLM compatible Peft model
# import them from ipex_llm.transformers.qlora to get an IPEX-LLM compatible Peft model
from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training, LoraConfig
from ipex_llm.utils.isa_checker import ISAChecker
@ -64,7 +64,7 @@ def get_int_from_env(env_keys, default):
def train(
# model/data params
base_model: str = "meta-llama/Llama-2-7b-hf", # the only required argument, default to be "meta-llama/Llama-2-7b-hf"
saved_low_bit_model: str = None, # optional, the path to the saved model with bigdl-llm low-bit optimization
saved_low_bit_model: str = None, # optional, the path to the saved model with ipex-llm low-bit optimization
data_path: str = "yahma/alpaca-cleaned",
output_dir: str = "./bigdl-qlora-alpaca",
# training hyperparams
@ -256,7 +256,7 @@ def train(
] # could be sped up, probably
return tokenized_full_prompt
# Prepare a BigDL-LLM compatible Peft model
# Prepare an IPEX-LLM compatible Peft model
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing)
config = LoraConfig(
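The QLoRA imports in this hunk come from `ipex_llm.transformers.qlora`; a heavily hedged sketch of how they typically fit together is shown below. The base model ID, LoRA hyperparameters, and target modules are illustrative assumptions rather than values from this training script.

```python
# Hypothetical QLoRA preparation sketch; model ID and LoRA settings are placeholders.
from ipex_llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers.qlora import (LoraConfig, get_peft_model,
                                         prepare_model_for_kbit_training)

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf",  # placeholder model ID
                                             load_in_4bit=True,           # real scripts may choose a different low-bit format
                                             trust_remote_code=True)
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

config = LoraConfig(r=8, lora_alpha=32,                                   # illustrative LoRA settings
                    target_modules=["q_proj", "k_proj", "v_proj"],
                    lora_dropout=0.05, bias="none", task_type="CAUSAL_LM")
model = get_peft_model(model, config)  # returns an IPEX-LLM compatible PEFT model
```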

View file

@ -63,7 +63,7 @@ if __name__ == '__main__':
low_bit = args.low_bit
# First use CPU as accelerator
# Convert to deepspeed model and apply bigdl-llm optimization on CPU to decrease GPU memory usage
# Convert to deepspeed model and apply IPEX-LLM optimization on CPU to decrease GPU memory usage
current_accel = CPU_Accelerator()
set_accelerator(current_accel)
model = AutoModelForCausalLM.from_pretrained(args.repo_id_or_model_path,
@ -80,7 +80,7 @@ if __name__ == '__main__':
replace_method="auto",
)
# Use bigdl-llm `optimize_model` to convert the model into optimized low bit format
# Use IPEX-LLM `optimize_model` to convert the model into optimized low bit format
# Convert the rest of the model into float16 to reduce allreduce traffic
model = optimize_model(model.module.to(f'cpu'), low_bit=low_bit).to(torch.float16)
@ -119,7 +119,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
do_sample=False,
max_new_tokens=args.n_predict)

View file

@ -58,7 +58,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -71,7 +71,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM Low Bit optimizations
# to obtain optimal performance with IPEX-LLM Low Bit optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict,
repetition_penalty=1.1)

View file

@ -44,7 +44,7 @@ if __name__ == '__main__':
model_path = args.model
# Load gguf model and vocab, then convert them to bigdl-llm model and huggingface tokenizer
# Load gguf model and vocab, then convert them to IPEX-LLM model and huggingface tokenizer
model, tokenizer = AutoModelForCausalLM.from_gguf(model_path)
model = model.to('xpu')

View file

@ -59,7 +59,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
end = time.time()

View file

@ -60,7 +60,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
torch.xpu.synchronize()
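The GPU hunks in this part of the diff all follow the same shape: load with INT4 optimizations, move the model and inputs to the `xpu` device, then synchronize before timing or decoding. A hedged sketch follows; the model ID and prompt are placeholders.

```python
# Sketch of the Intel GPU (XPU) flow; model ID and prompt are placeholders.
import torch
import intel_extension_for_pytorch as ipex  # noqa: F401  (registers the 'xpu' device)
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_path = "meta-llama/Llama-2-7b-chat-hf"  # placeholder model ID
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True,
                                             cpu_embedding=True)  # keep the embedding layer on CPU (helpful on iGPUs)
model = model.to('xpu')
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

input_ids = tokenizer.encode("What is AI?", return_tensors="pt").to('xpu')
output = model.generate(input_ids, max_new_tokens=32)
torch.xpu.synchronize()  # wait for XPU kernels to finish before timing or decoding
print(tokenizer.decode(output[0].cpu(), skip_special_tokens=True))
```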

View file

@ -60,7 +60,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
torch.xpu.synchronize()

View file

@ -64,7 +64,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
torch.xpu.synchronize()

View file

@ -44,7 +44,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
# When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function.
# This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.
model = AutoModelForCausalLM.from_pretrained(model_path,

View file

@ -64,7 +64,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
torch.xpu.synchronize()

View file

@ -67,7 +67,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
torch.xpu.synchronize()

View file

@ -67,7 +67,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
torch.xpu.synchronize()

View file

@ -57,7 +57,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly to obtain optimal
# performance with BigDL-LLM INT4 optimizations
# performance with IPEX-LLM INT4 optimizations
# When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function.
# This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.
model = AutoModelForCausalLM.from_pretrained(model_path,

View file

@ -67,7 +67,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
torch.xpu.synchronize()

View file

@ -55,7 +55,7 @@ if __name__ == '__main__':
cpu_embedding=True
)
# With only one line to enable BigDL-LLM optimization on model
# With only one line to enable IPEX-LLM optimization on model
model = model.to('xpu')
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

View file

@ -64,7 +64,7 @@ if __name__ == '__main__':
st = time.time()
# enabling `use_cache=True` allows the model to utilize the previous
# key/values attentions to speed up decoding;
# to obtain optimal performance with BigDL-LLM INT4 optimizations,
# to obtain optimal performance with IPEX-LLM INT4 optimizations,
# it is important to set use_cache=True for Dolly v1 models
output = model.generate(input_ids,
use_cache=True,

View file

@ -67,7 +67,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict,
pad_token_id=tokenizer.pad_token_id,

View file

@ -67,7 +67,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
torch.xpu.synchronize()

View file

@ -69,7 +69,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
torch.xpu.synchronize()

View file

@ -68,7 +68,7 @@ if __name__ == '__main__':
# if your selected model is capable of utilizing previous key/value attentions
# to enhance decoding speed, but has `"use_cache": false` in its model config,
# it is important to set `use_cache=True` explicitly in the `generate` function
# to obtain optimal performance with BigDL-LLM INT4 optimizations
# to obtain optimal performance with IPEX-LLM INT4 optimizations
output = model.generate(input_ids,
max_new_tokens=args.n_predict)
torch.xpu.synchronize()

Some files were not shown because too many files have changed in this diff.