From 10ee786920e8d9a1deab3e2bd7c20e00ebdf6a84 Mon Sep 17 00:00:00 2001
From: Jin Qiao <89779290+JinBridger@users.noreply.github.com>
Date: Sun, 7 Apr 2024 13:29:51 +0800
Subject: [PATCH] Replace with IPEX-LLM in example comments (#10671)

* Replace with IPEX-LLM in example comments
* More replacement
* revert some changes
---
 .../Applications/streaming-llm/streaming_llm/utils.py | 2 +-
 .../example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py | 4 ++--
 .../Advanced-Quantizations/AWQ/generate.py | 2 +-
 .../Advanced-Quantizations/GGUF/generate.py | 2 +-
 .../Advanced-Quantizations/GPTQ/generate.py | 2 +-
 .../Model/aquila/generate.py | 2 +-
 .../Model/aquila2/generate.py | 2 +-
 .../Model/baichuan/generate.py | 2 +-
 .../Model/baichuan2/generate.py | 2 +-
 .../Model/bluelm/generate.py | 2 +-
 .../Model/chatglm/generate.py | 2 +-
 .../Model/chatglm2/generate.py | 2 +-
 .../Model/chatglm3/generate.py | 2 +-
 .../Model/codellama/generate.py | 2 +-
 .../Model/codeshell/generate.py | 2 +-
 .../Model/deepseek/generate.py | 2 +-
 .../Model/dolly_v1/generate.py | 2 +-
 .../Model/dolly_v2/generate.py | 2 +-
 .../Model/falcon/generate.py | 2 +-
 .../Model/flan-t5/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/fuyu/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/gemma/generate.py | 2 +-
 .../Model/internlm-xcomposer/chat.py | 2 +-
 .../Model/internlm/generate.py | 2 +-
 .../Model/internlm2/generate.py | 2 +-
 .../Model/llama2/generate.py | 2 +-
 .../Model/mistral/generate.py | 2 +-
 .../Model/mixtral/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/moss/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/mpt/generate.py | 2 +-
 .../Model/phi-1_5/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/phi-2/generate.py | 2 +-
 .../Model/phixtral/generate.py | 2 +-
 .../Model/phoenix/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/qwen-vl/chat.py | 2 +-
 .../HF-Transformers-AutoModels/Model/qwen/generate.py | 2 +-
 .../Model/redpajama/generate.py | 2 +-
 .../Model/replit/generate.py | 2 +-
 .../Model/skywork/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/solar/generate.py | 2 +-
 .../Model/starcoder/generate.py | 2 +-
 .../Model/vicuna/generate.py | 2 +-
 .../Model/whisper/recognize.py | 2 +-
 .../Model/wizardcoder-python/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/yi/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/yuan2/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/ziya/generate.py | 2 +-
 python/llm/example/CPU/LlamaIndex/rag.py | 2 +-
 python/llm/example/CPU/ModelScope-Models/generate.py | 2 +-
 .../example/CPU/Native-Models/native_int4_pipeline.py | 10 +++++-----
 .../CPU/PyTorch-Models/Model/aquila2/generate.py | 2 +-
 .../CPU/PyTorch-Models/Model/bark/synthesize_speech.py | 2 +-
 .../CPU/PyTorch-Models/Model/bert/extract_feature.py | 2 +-
 .../CPU/PyTorch-Models/Model/bluelm/generate.py | 2 +-
 .../CPU/PyTorch-Models/Model/chatglm/generate.py | 2 +-
 .../CPU/PyTorch-Models/Model/chatglm3/generate.py | 2 +-
 .../CPU/PyTorch-Models/Model/codellama/generate.py | 2 +-
 .../CPU/PyTorch-Models/Model/codeshell/generate.py | 2 +-
 .../CPU/PyTorch-Models/Model/deciLM-7b/generate.py | 2 +-
 .../CPU/PyTorch-Models/Model/deepseek/generate.py | 2 +-
 .../CPU/PyTorch-Models/Model/flan-t5/generate.py | 2 +-
 .../example/CPU/PyTorch-Models/Model/fuyu/generate.py | 4 ++--
 .../PyTorch-Models/Model/internlm-xcomposer/chat.py | 4 ++--
 .../CPU/PyTorch-Models/Model/llama2/generate.py | 2 +-
 .../example/CPU/PyTorch-Models/Model/llava/generate.py | 2 +-
 .../example/CPU/PyTorch-Models/Model/mamba/generate.py | 2 +-
 .../CPU/PyTorch-Models/Model/mistral/generate.py | 2 +-
 .../CPU/PyTorch-Models/Model/mixtral/generate.py | 2 +-
 .../PyTorch-Models/Model/openai-whisper/recognize.py | 2 +-
 .../CPU/PyTorch-Models/Model/phi-1_5/generate.py | 2 +-
 .../example/CPU/PyTorch-Models/Model/phi-2/generate.py | 2 +-
 .../example/CPU/PyTorch-Models/Model/qwen-vl/chat.py | 4 ++--
 .../CPU/PyTorch-Models/Model/skywork/generate.py | 2 +-
 .../example/CPU/PyTorch-Models/Model/solar/generate.py | 2 +-
 .../Model/wizardcoder-python/generate.py | 2 +-
 .../example/CPU/PyTorch-Models/Model/yi/generate.py | 2 +-
 .../example/CPU/PyTorch-Models/Model/yuan2/generate.py | 4 ++--
 .../example/CPU/PyTorch-Models/Model/ziya/generate.py | 2 +-
 .../CPU/PyTorch-Models/More-Data-Types/generate.py | 2 +-
 .../alpaca-qlora/alpaca_qlora_finetuning_cpu.py | 6 +++---
 .../example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py | 6 +++---
 .../Advanced-Quantizations/AWQ/generate.py | 2 +-
 .../Advanced-Quantizations/GGUF-IQ2/generate.py | 2 +-
 .../Advanced-Quantizations/GGUF/generate.py | 2 +-
 .../Advanced-Quantizations/GPTQ/generate.py | 2 +-
 .../Model/aquila/generate.py | 2 +-
 .../Model/aquila2/generate.py | 2 +-
 .../Model/baichuan/generate.py | 2 +-
 .../Model/baichuan2/generate.py | 2 +-
 .../Model/bluelm/generate.py | 2 +-
 .../Model/chatglm2/generate.py | 2 +-
 .../Model/chatglm3/generate.py | 2 +-
 .../Model/chinese-llama2/generate.py | 2 +-
 .../Model/codellama/generate.py | 2 +-
 .../Model/deciLM-7b/generate.py | 2 +-
 .../Model/dolly-v1/generate.py | 2 +-
 .../Model/dolly-v2/generate.py | 2 +-
 .../Model/falcon/generate.py | 2 +-
 .../Model/flan-t5/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/gemma/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/gpt-j/generate.py | 2 +-
 .../Model/internlm/generate.py | 2 +-
 .../Model/internlm2/generate.py | 2 +-
 .../Model/llama2/generate.py | 2 +-
 .../Model/mistral/generate.py | 2 +-
 .../Model/mixtral/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/mpt/generate.py | 2 +-
 .../Model/phi-1_5/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/phi-2/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/qwen-vl/chat.py | 2 +-
 .../HF-Transformers-AutoModels/Model/qwen/generate.py | 2 +-
 .../Model/redpajama/generate.py | 2 +-
 .../Model/replit/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/solar/generate.py | 2 +-
 .../Model/starcoder/generate.py | 2 +-
 .../Model/vicuna/generate.py | 2 +-
 .../Model/whisper/recognize.py | 2 +-
 .../HF-Transformers-AutoModels/Model/yi/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/yuan2/generate.py | 2 +-
 .../example/GPU/LLM-Finetuning/DPO/dpo_finetuning.py | 2 +-
 .../GPU/LLM-Finetuning/HF-PEFT/alpaca-lora/finetune.py | 2 +-
 .../GPU/LLM-Finetuning/LoRA/alpaca_lora_finetuning.py | 6 +++---
 .../LLM-Finetuning/QA-LoRA/alpaca_qalora_finetuning.py | 6 +++---
 .../QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py | 6 +++---
 .../LLM-Finetuning/ReLora/alpaca_relora_finetuning.py | 6 +++---
 python/llm/example/GPU/LlamaIndex/rag.py | 2 +-
 python/llm/example/GPU/ModelScope-Models/generate.py | 2 +-
 .../GPU/Pipeline-Parallel-Inference/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/aquila2/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/baichuan/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/baichuan2/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/bark/synthesize_speech.py | 2 +-
 .../GPU/PyTorch-Models/Model/bluelm/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/chatglm2/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/chatglm2/streamchat.py | 2 +-
 .../GPU/PyTorch-Models/Model/chatglm3/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/chatglm3/streamchat.py | 2 +-
 .../GPU/PyTorch-Models/Model/codellama/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/deciLM-7b/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/deepseek/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/dolly-v1/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/dolly-v2/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/flan-t5/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/llama2/generate.py | 2 +-
 .../example/GPU/PyTorch-Models/Model/llava/generate.py | 2 +-
 .../example/GPU/PyTorch-Models/Model/mamba/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/mistral/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/mixtral/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/phi-1_5/generate.py | 2 +-
 .../example/GPU/PyTorch-Models/Model/phi-2/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/phixtral/generate.py | 2 +-
 .../example/GPU/PyTorch-Models/Model/qwen-vl/chat.py | 4 ++--
 .../GPU/PyTorch-Models/Model/replit/generate.py | 2 +-
 .../example/GPU/PyTorch-Models/Model/solar/generate.py | 2 +-
 .../Model/speech-t5/synthesize_speech.py | 2 +-
 .../GPU/PyTorch-Models/Model/starcoder/generate.py | 2 +-
 .../example/GPU/PyTorch-Models/Model/yi/generate.py | 4 ++--
 .../example/GPU/PyTorch-Models/Model/yuan2/generate.py | 4 ++--
 .../GPU/PyTorch-Models/More-Data-Types/generate.py | 2 +-
 159 files changed, 183 insertions(+), 183 deletions(-)

diff --git a/python/llm/example/CPU/Applications/streaming-llm/streaming_llm/utils.py b/python/llm/example/CPU/Applications/streaming-llm/streaming_llm/utils.py
index 163ccc71..61ca1b4a 100644
--- a/python/llm/example/CPU/Applications/streaming-llm/streaming_llm/utils.py
+++ b/python/llm/example/CPU/Applications/streaming-llm/streaming_llm/utils.py
@@ -48,7 +48,7 @@ import ssl
 import urllib.request
 import os
 import json
-# code change to import from bigdl-llm API instead of using transformers API
+# code change to import from IPEX-LLM API instead of using transformers API
 from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import LlamaTokenizer
 import intel_extension_for_pytorch as ipex
diff --git a/python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py b/python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py
index 63f5660a..77383dd8 100644
--- a/python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py
+++ b/python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py
@@ -87,7 +87,7 @@ if __name__ == '__main__':
         replace_method="auto"
     )
-    # Apply BigDL-LLM INT4 optimizations on transformers
+    # Apply IPEX-LLM INT4 optimizations on transformers
     model = optimize_model(model.module.to(f'cpu'), low_bit='sym_int4')
     model = model.to(f'cpu:{local_rank}')
@@ -111,7 +111,7 @@ if __name__ == '__main__':
     # if your selected model is capable of utilizing previous key/value attentions
     # to enhance decoding speed, but has `"use_cache": false` in its model config,
     # it is important to set `use_cache=True` explicitly in the `generate` function
-    # to obtain optimal performance with BigDL-LLM INT4 optimizations
+    # to obtain optimal performance with IPEX-LLM INT4 optimizations
     output = model.generate(input_ids,
                             do_sample=False,
                             max_new_tokens=args.n_predict)
diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py
index 37843751..0a975d60 100644
---
a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py @@ -59,7 +59,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py index 4acad805..c16cfb63 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py @@ -44,7 +44,7 @@ if __name__ == '__main__': model_path = args.model - # Load gguf model and vocab, then convert them to bigdl-llm model and huggingface tokenizer + # Load gguf model and vocab, then convert them to IPEX-LLM model and huggingface tokenizer model, tokenizer = AutoModelForCausalLM.from_gguf(model_path, low_bit = args.low_bit,) # Generate predicted tokens diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py index 1f5852b6..2929194b 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py @@ -60,7 +60,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/generate.py index 69d9045f..40133167 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/generate.py @@ -56,7 +56,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila2/generate.py index b9bc0ee2..45e6e001 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila2/generate.py +++ 
b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila2/generate.py @@ -56,7 +56,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan/generate.py index df64f80e..0961f679 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan/generate.py @@ -56,7 +56,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py index 59dccfe8..7a3adcfb 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py @@ -45,7 +45,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/bluelm/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/bluelm/generate.py index 07a4359e..be839ca3 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/bluelm/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/bluelm/generate.py @@ -56,7 +56,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/generate.py index e38f56c4..5f7ed211 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/generate.py +++ 
b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/generate.py @@ -57,7 +57,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py index fb1423fa..605bcf0d 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py @@ -57,7 +57,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py index d3d8daae..a9386ba5 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py @@ -57,7 +57,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codellama/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codellama/generate.py index b8329d61..332aba75 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codellama/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codellama/generate.py @@ -55,7 +55,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codeshell/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codeshell/generate.py index adc79339..0a8cc749 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codeshell/generate.py +++ 
b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codeshell/generate.py @@ -56,7 +56,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek/generate.py index 679fe2e6..5ec2eaea 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek/generate.py @@ -61,7 +61,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v1/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v1/generate.py index ee043e0b..3dd419a1 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v1/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v1/generate.py @@ -61,7 +61,7 @@ if __name__ == '__main__': st = time.time() # enabling `use_cache=True` allows the model to utilize the previous # key/values attentions to speed up decoding; - # to obtain optimal performance with BigDL-LLM INT4 optimizations, + # to obtain optimal performance with IPEX-LLM INT4 optimizations, # it is important to set use_cache=True for Dolly v1 models output = model.generate(input_ids, use_cache=True, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v2/generate.py index 18867636..6f119c0f 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v2/generate.py @@ -64,7 +64,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict, pad_token_id=tokenizer.pad_token_id, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/generate.py index 6419aa5a..4f66c335 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/generate.py @@ -57,7 +57,7 @@ if __name__ == '__main__': # if your selected model is capable 
of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py index 91b8addc..54dae699 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py @@ -60,7 +60,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/fuyu/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/fuyu/generate.py index 271b1d4f..6968b408 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/fuyu/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/fuyu/generate.py @@ -38,7 +38,7 @@ if __name__ == '__main__': image = Image.open(args.image_path) # Load model - # For successful BigDL-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization + # For successful IPEX-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cpu', load_in_4bit = True, trust_remote_code=True, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/gemma/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/gemma/generate.py index 4606e2b5..20b894f2 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/gemma/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/gemma/generate.py @@ -61,7 +61,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm-xcomposer/chat.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm-xcomposer/chat.py index 6834f582..9868bb4e 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm-xcomposer/chat.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm-xcomposer/chat.py @@ -37,7 +37,7 @@ if __name__ == '__main__': image = args.image_path # Load model - # For successful BigDL-LLM optimization on InternLM-XComposer, skip the 'qkv' module during optimization + # For successful IPEX-LLM optimization on 
InternLM-XComposer, skip the 'qkv' module during optimization model = AutoModelForCausalLM.from_pretrained(model_path, device='cpu', load_in_4bit=True, trust_remote_code=True, modules_to_not_convert=['qkv']) diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm/generate.py index 1c33a1fe..87bb5653 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm/generate.py @@ -57,7 +57,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm2/generate.py index 7e05e153..cd0bc62c 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm2/generate.py @@ -56,7 +56,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama2/generate.py index ed5c93b0..dd5080ef 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama2/generate.py @@ -60,7 +60,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mistral/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mistral/generate.py index 94e6ab48..ac90570a 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mistral/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mistral/generate.py @@ -55,7 +55,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal 
performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mixtral/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mixtral/generate.py index cd8b9f60..361ed02e 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mixtral/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mixtral/generate.py @@ -61,7 +61,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/moss/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/moss/generate.py index 786f7f0e..cc56bd84 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/moss/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/moss/generate.py @@ -56,7 +56,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mpt/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mpt/generate.py index e4caa938..7e54712a 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mpt/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mpt/generate.py @@ -55,7 +55,7 @@ if __name__ == '__main__': input_ids = tokenizer.encode(prompt, return_tensors="pt") # enabling `use_cache=True` allows the model to utilize the previous # key/values attentions to speed up decoding; - # to obtain optimal performance with BigDL-LLM INT4 optimizations, + # to obtain optimal performance with IPEX-LLM INT4 optimizations, # it is important to set use_cache=True for MPT models mpt_generation_config = GenerationConfig( max_new_tokens=args.n_predict, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py index 710d2a39..07ae8dc5 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py @@ -58,7 +58,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations # Note that phi-1_5 uses GenerationConfig to enable 'use_cache' output = 
model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict, generation_config = generation_config) diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-2/generate.py index 91930b72..882aef9f 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-2/generate.py @@ -58,7 +58,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations model.generation_config.pad_token_id = model.generation_config.eos_token_id # Note that phi-2 uses GenerationConfig to enable 'use_cache' diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phixtral/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phixtral/generate.py index 395481ae..50e2a76a 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phixtral/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phixtral/generate.py @@ -58,7 +58,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations # Note that phixtral uses GenerationConfig to enable 'use_cache' output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict, generation_config = generation_config) diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phoenix/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phoenix/generate.py index 264f27e2..83026fd5 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phoenix/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phoenix/generate.py @@ -55,7 +55,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict) diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py index 5796177f..af5d3e04 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py @@ -36,7 +36,7 @@ if __name__ == '__main__': model_path = args.repo_id_or_model_path # Load model - # For successful BigDL-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization + # For successful IPEX-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization model = 
AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, device_map="cpu", diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/generate.py index 4f260181..3e43b632 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/generate.py @@ -64,7 +64,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/redpajama/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/redpajama/generate.py index da5f69ee..7eaec908 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/redpajama/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/redpajama/generate.py @@ -56,7 +56,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(**inputs, max_new_tokens=args.n_predict, do_sample=True, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/replit/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/replit/generate.py index 0599df2c..183e0bc4 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/replit/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/replit/generate.py @@ -55,7 +55,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/skywork/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/skywork/generate.py index 47bc1c79..e288f914 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/skywork/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/skywork/generate.py @@ -55,7 +55,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git 
a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/solar/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/solar/generate.py index b84d7b61..36fbfa76 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/solar/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/solar/generate.py @@ -59,7 +59,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/starcoder/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/starcoder/generate.py index e6a80a71..1b654a26 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/starcoder/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/starcoder/generate.py @@ -55,7 +55,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/vicuna/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/vicuna/generate.py index 118b6084..074f9552 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/vicuna/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/vicuna/generate.py @@ -53,7 +53,7 @@ if __name__ == '__main__': st = time.time() # enabling `use_cache=True` allows the model to utilize the previous # key/values attentions to speed up decoding; - # to obtain optimal performance with BigDL-LLM INT4 optimizations, + # to obtain optimal performance with IPEX-LLM INT4 optimizations, # it is important to set use_cache=True for vicuna-v1.3 models output = model.generate(input_ids, use_cache=True, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/recognize.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/recognize.py index 60de9751..a5bbb759 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/recognize.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/recognize.py @@ -63,7 +63,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/wizardcoder-python/generate.py 
b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/wizardcoder-python/generate.py index 72d5dd97..8a41149f 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/wizardcoder-python/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/wizardcoder-python/generate.py @@ -58,7 +58,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yi/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yi/generate.py index f809c44b..a28a5a88 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yi/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yi/generate.py @@ -61,7 +61,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yuan2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yuan2/generate.py index 46115bc0..17e918d8 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yuan2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yuan2/generate.py @@ -57,7 +57,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations outputs = model.generate(inputs, do_sample=True, top_k=5, max_length=args.n_predict) end_time = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/ziya/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/ziya/generate.py index cf4914c2..5980ac60 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/ziya/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/ziya/generate.py @@ -42,7 +42,7 @@ if __name__ == '__main__': from ipex_llm.transformers import AutoModelForCausalLM # enabling `use_cache=True` allows the model to utilize the previous # key/values attentions to speed up decoding; - # to obtain optimal performance with BigDL-LLM INT4 optimizations, + # to obtain optimal performance with IPEX-LLM INT4 optimizations, # it is important to set use_cache=True for Ziya models model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, diff --git a/python/llm/example/CPU/LlamaIndex/rag.py b/python/llm/example/CPU/LlamaIndex/rag.py index c4c4c8f8..9f26e55d 100644 --- a/python/llm/example/CPU/LlamaIndex/rag.py +++ 
b/python/llm/example/CPU/LlamaIndex/rag.py @@ -163,7 +163,7 @@ def messages_to_prompt(messages): def main(args): embed_model = HuggingFaceEmbedding(model_name=args.embedding_model_path) - # Use custom LLM in BigDL + # Use custom LLM in IPEX-LLM from ipex_llm.llamaindex.llms import BigdlLLM llm = BigdlLLM( model_name=args.model_path, diff --git a/python/llm/example/CPU/ModelScope-Models/generate.py b/python/llm/example/CPU/ModelScope-Models/generate.py index 274566f3..0e770065 100644 --- a/python/llm/example/CPU/ModelScope-Models/generate.py +++ b/python/llm/example/CPU/ModelScope-Models/generate.py @@ -59,7 +59,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/Native-Models/native_int4_pipeline.py b/python/llm/example/CPU/Native-Models/native_int4_pipeline.py index aa349c29..e576cfd1 100644 --- a/python/llm/example/CPU/Native-Models/native_int4_pipeline.py +++ b/python/llm/example/CPU/Native-Models/native_int4_pipeline.py @@ -56,8 +56,8 @@ def load(model_path, model_family, n_threads): def inference(llm, repo_id_or_model_path, model_family, prompt): if model_family in ['llama', 'gptneox', 'bloom', 'starcoder', 'chatglm']: - # ------ Option 1: Use bigdl-llm based tokenizer - print('-'*20, ' bigdl-llm based tokenizer ', '-'*20) + # ------ Option 1: Use IPEX-LLM based tokenizer + print('-'*20, ' IPEX-LLM based tokenizer ', '-'*20) st = time.time() # please note that the prompt here can either be a string or a list of string @@ -126,13 +126,13 @@ def main(): if args.model_family == 'llama2': args.model_family = 'llama' - # Step 1: convert original model to BigDL llm model - bigdl_llm_path = convert(repo_id_or_model_path=repo_id_or_model_path, + # Step 1: convert original model to IPEX-LLM model + ipex_llm_path = convert(repo_id_or_model_path=repo_id_or_model_path, model_family=args.model_family, tmp_path=args.tmp_path) # Step 2: load int4 model - llm = load(model_path=bigdl_llm_path, + llm = load(model_path=ipex_llm_path, model_family=args.model_family, n_threads=args.thread_num) diff --git a/python/llm/example/CPU/PyTorch-Models/Model/aquila2/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/aquila2/generate.py index 9b219452..fe9186d8 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/aquila2/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/aquila2/generate.py @@ -44,7 +44,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/bark/synthesize_speech.py b/python/llm/example/CPU/PyTorch-Models/Model/bark/synthesize_speech.py index 1811c36b..7229adbe 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/bark/synthesize_speech.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/bark/synthesize_speech.py @@ -38,7 +38,7 @@ if __name__ == '__main__': model = Bark.init_from_config(config) model.load_checkpoint(config, checkpoint_dir=model_path, 
eval=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Synthesize speech with the given input diff --git a/python/llm/example/CPU/PyTorch-Models/Model/bert/extract_feature.py b/python/llm/example/CPU/PyTorch-Models/Model/bert/extract_feature.py index cd0c73b7..25f66521 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/bert/extract_feature.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/bert/extract_feature.py @@ -38,7 +38,7 @@ if __name__ == '__main__': torch_dtype="auto", low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/bluelm/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/bluelm/generate.py index d16d2331..00db9920 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/bluelm/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/bluelm/generate.py @@ -40,7 +40,7 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/chatglm/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/chatglm/generate.py index 89d26761..76f152bc 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/chatglm/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/chatglm/generate.py @@ -41,7 +41,7 @@ if __name__ == '__main__': # Load model model = AutoModel.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/chatglm3/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/chatglm3/generate.py index 22fdeaad..ce2e7d4b 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/chatglm3/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/chatglm3/generate.py @@ -41,7 +41,7 @@ if __name__ == '__main__': # Load model model = AutoModel.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/codellama/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/codellama/generate.py index 12266e99..e7e93dc2 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/codellama/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/codellama/generate.py @@ -41,7 +41,7 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/codeshell/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/codeshell/generate.py index a0610bc6..4963051c 100644 --- 
a/python/llm/example/CPU/PyTorch-Models/Model/codeshell/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/codeshell/generate.py @@ -40,7 +40,7 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/deciLM-7b/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/deciLM-7b/generate.py index 8714b419..cb713533 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/deciLM-7b/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/deciLM-7b/generate.py @@ -50,7 +50,7 @@ if __name__ == '__main__': trust_remote_code=True, ) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/deepseek/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/deepseek/generate.py index 1a2cbaec..5bd9740b 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/deepseek/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/deepseek/generate.py @@ -46,7 +46,7 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/flan-t5/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/flan-t5/generate.py index 35e1b25d..c4a3e220 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/flan-t5/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/flan-t5/generate.py @@ -40,7 +40,7 @@ if __name__ == '__main__': # Load model model = AutoModelForSeq2SeqLM.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # "wo" module is not converted due to some issues of T5 model # (https://github.com/huggingface/transformers/issues/20287), # "lm_head" module is not converted to generate outputs with better quality diff --git a/python/llm/example/CPU/PyTorch-Models/Model/fuyu/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/fuyu/generate.py index 8e2397ba..8503357e 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/fuyu/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/fuyu/generate.py @@ -40,8 +40,8 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cpu', trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model - # For successful BigDL-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization + # With only one line to enable IPEX-LLM optimization on model + # For successful IPEX-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization model = optimize_model(model, low_bit='sym_int4', modules_to_not_convert=['vision_embed_tokens']) diff --git a/python/llm/example/CPU/PyTorch-Models/Model/internlm-xcomposer/chat.py b/python/llm/example/CPU/PyTorch-Models/Model/internlm-xcomposer/chat.py index dc664493..13b1c1ca 100644 --- 
a/python/llm/example/CPU/PyTorch-Models/Model/internlm-xcomposer/chat.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/internlm-xcomposer/chat.py @@ -38,8 +38,8 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, device='cpu', trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model - # For successful BigDL-LLM optimization on InternLM-XComposer, skip the 'qkv' module during optimization + # With only one line to enable IPEX-LLM optimization on model + # For successful IPEX-LLM optimization on InternLM-XComposer, skip the 'qkv' module during optimization model = optimize_model(model, low_bit='sym_int4', modules_to_not_convert=['qkv']) diff --git a/python/llm/example/CPU/PyTorch-Models/Model/llama2/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/llama2/generate.py index 6c4ab17a..5888e896 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/llama2/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/llama2/generate.py @@ -45,7 +45,7 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/llava/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/llava/generate.py index c27d1a50..5f3316e3 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/llava/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/llava/generate.py @@ -292,7 +292,7 @@ if __name__ == '__main__': model_base=None, model_name=model_name) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Generate image tensor diff --git a/python/llm/example/CPU/PyTorch-Models/Model/mamba/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/mamba/generate.py index 9462474a..77c9d1ec 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/mamba/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/mamba/generate.py @@ -42,7 +42,7 @@ if __name__ == '__main__': # Load model model = MambaLMHeadModel.from_pretrained(model_path) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model, low_bit='asym_int4', modules_to_not_convert=["dt_proj", "x_proj", "out_proj"]) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/mistral/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/mistral/generate.py index 37958b67..98f3c3e2 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/mistral/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/mistral/generate.py @@ -44,7 +44,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/mixtral/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/mixtral/generate.py index 330b4349..ebb7bf92 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/mixtral/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/mixtral/generate.py @@ -44,7 +44,7 @@ if __name__ == '__main__': 
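For readers skimming this patch, the renamed comments in the CPU PyTorch-Models hunks above all describe one pattern: load a checkpoint with stock Hugging Face transformers, then apply IPEX-LLM optimization with a single optimize_model call. The sketch below is illustrative only and is not part of the patch; the checkpoint path is a placeholder, and the low_bit / modules_to_not_convert values mirror the Fuyu and InternLM-XComposer hunks above.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from ipex_llm import optimize_model

model_path = "meta-llama/Llama-2-7b-chat-hf"  # placeholder checkpoint

model = AutoModelForCausalLM.from_pretrained(model_path,
                                             trust_remote_code=True,
                                             torch_dtype="auto",
                                             low_cpu_mem_usage=True)

# One line applies IPEX-LLM low-bit (INT4 by default) optimization on the model
model = optimize_model(model)

# Multimodal variant (e.g. Fuyu): choose the low-bit format explicitly and skip
# modules that should stay unconverted, as in the hunks above
# model = optimize_model(model, low_bit='sym_int4',
#                        modules_to_not_convert=['vision_embed_tokens'])

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
inputs = tokenizer("What is AI?", return_tensors="pt")
with torch.inference_mode():
    output = model.generate(inputs.input_ids, use_cache=True, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))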
torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/openai-whisper/recognize.py b/python/llm/example/CPU/PyTorch-Models/Model/openai-whisper/recognize.py index 1b071d57..3e202310 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/openai-whisper/recognize.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/openai-whisper/recognize.py @@ -46,7 +46,7 @@ if __name__ == '__main__': # Load whisper model under pytorch framework model = whisper.load_model(args.model_name) - # With only one line to enable bigdl optimize on a pytorch model + # With only one line to enable IPEX-LLM optimize on a pytorch model model = optimize_model(model) st = time.time() diff --git a/python/llm/example/CPU/PyTorch-Models/Model/phi-1_5/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/phi-1_5/generate.py index f70da15d..6b69ca07 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/phi-1_5/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/phi-1_5/generate.py @@ -42,7 +42,7 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/phi-2/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/phi-2/generate.py index 319c009f..745ed4de 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/phi-2/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/phi-2/generate.py @@ -42,7 +42,7 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/qwen-vl/chat.py b/python/llm/example/CPU/PyTorch-Models/Model/qwen-vl/chat.py index d92cabbc..0bd286d9 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/qwen-vl/chat.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/qwen-vl/chat.py @@ -37,8 +37,8 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, device_map="cpu", trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model - # For successful BigDL-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization + # With only one line to enable IPEX-LLM optimization on model + # For successful IPEX-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization model = optimize_model(model, low_bit='sym_int4', modules_to_not_convert=['c_fc', 'out_proj']) diff --git a/python/llm/example/CPU/PyTorch-Models/Model/skywork/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/skywork/generate.py index fa52a2e9..0ee0f7e9 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/skywork/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/skywork/generate.py @@ -41,7 +41,7 @@ if __name__ == '__main__': model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on 
model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/solar/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/solar/generate.py index 2ddd48af..ed566894 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/solar/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/solar/generate.py @@ -45,7 +45,7 @@ if __name__ == '__main__': torch_dtype=torch.float16, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/wizardcoder-python/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/wizardcoder-python/generate.py index 832f7623..e6760953 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/wizardcoder-python/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/wizardcoder-python/generate.py @@ -44,7 +44,7 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/yi/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/yi/generate.py index bf6af053..a67f8361 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/yi/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/yi/generate.py @@ -54,7 +54,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/PyTorch-Models/Model/yuan2/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/yuan2/generate.py index ea71ad76..3e011231 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/yuan2/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/yuan2/generate.py @@ -48,7 +48,7 @@ if __name__ == '__main__': print("Creating model...") model = AutoModelForCausalLM.from_pretrained(model_path, device_map="cpu", trust_remote_code=True, torch_dtype=torch.float16).eval() - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) prompt = YUAN2_PROMPT_FORMAT.format(prompt=args.prompt) @@ -59,7 +59,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations outputs = model.generate(inputs, do_sample=True, top_k=5, max_length=args.n_predict) end_time = time.time() diff --git a/python/llm/example/CPU/PyTorch-Models/Model/ziya/generate.py 
b/python/llm/example/CPU/PyTorch-Models/Model/ziya/generate.py index e6f2c02d..580e198d 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/ziya/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/ziya/generate.py @@ -42,7 +42,7 @@ if __name__ == '__main__': from ipex_llm import optimize_model # enabling `use_cache=True` allows the model to utilize the previous # key/values attentions to speed up decoding; - # to obtain optimal performance with BigDL-LLM `optimization_model` API optimizations, + # to obtain optimal performance with IPEX-LLM `optimization_model` API optimizations, # it is important to set use_cache=True for Ziya models model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, diff --git a/python/llm/example/CPU/PyTorch-Models/More-Data-Types/generate.py b/python/llm/example/CPU/PyTorch-Models/More-Data-Types/generate.py index 59c01f63..e4e10129 100644 --- a/python/llm/example/CPU/PyTorch-Models/More-Data-Types/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/More-Data-Types/generate.py @@ -49,7 +49,7 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # `low_bit` param support `sym_int4`, `asym_int4`, `sym_int5`, `asym_int5` and `sym_int8` # By specifying `low_bit` param, relevant low bit optimizations will be applied to the model model = optimize_model(model, low_bit=low_bit) diff --git a/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning_cpu.py b/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning_cpu.py index cdf3196c..c2362267 100644 --- a/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning_cpu.py +++ b/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning_cpu.py @@ -49,7 +49,7 @@ from utils.prompter import Prompter from transformers import BitsAndBytesConfig from ipex_llm.transformers import AutoModelForCausalLM -# import them from ipex_llm.transformers.qlora to get a BigDL-LLM compatible Peft model +# import them from ipex_llm.transformers.qlora to get a IPEX-LLM compatible Peft model from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training, LoraConfig from ipex_llm.utils.isa_checker import ISAChecker @@ -64,7 +64,7 @@ def get_int_from_env(env_keys, default): def train( # model/data params base_model: str = "meta-llama/Llama-2-7b-hf", # the only required argument, default to be "meta-llama/Llama-2-7b-hf" - saved_low_bit_model: str = None, # optional, the path to the saved model with bigdl-llm low-bit optimization + saved_low_bit_model: str = None, # optional, the path to the saved model with ipex-llm low-bit optimization data_path: str = "yahma/alpaca-cleaned", output_dir: str = "./bigdl-qlora-alpaca", # training hyperparams @@ -256,7 +256,7 @@ def train( ] # could be sped up, probably return tokenized_full_prompt - # Prepare a BigDL-LLM compatible Peft model + # Prepare a IPEX-LLM compatible Peft model model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing) config = LoraConfig( diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py index 3f221425..b2fa7aaf 100644 --- a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py +++ 
b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py @@ -63,7 +63,7 @@ if __name__ == '__main__': low_bit = args.low_bit # First use CPU as accelerator - # Convert to deepspeed model and apply bigdl-llm optimization on CPU to decrease GPU memory usage + # Convert to deepspeed model and apply IPEX-LLM optimization on CPU to decrease GPU memory usage current_accel = CPU_Accelerator() set_accelerator(current_accel) model = AutoModelForCausalLM.from_pretrained(args.repo_id_or_model_path, @@ -80,7 +80,7 @@ if __name__ == '__main__': replace_method="auto", ) - # Use bigdl-llm `optimize_model` to convert the model into optimized low bit format + # Use IPEX-LLM `optimize_model` to convert the model into optimized low bit format # Convert the rest of the model into float16 to reduce allreduce traffic model = optimize_model(model.module.to(f'cpu'), low_bit=low_bit).to(torch.float16) @@ -119,7 +119,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py index c6ff5241..66ef8ff9 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py @@ -58,7 +58,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/generate.py index 6834bf88..b9353a09 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/generate.py @@ -71,7 +71,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM Low Bit optimizations + # to obtain optimal performance with IPEX-LLM Low Bit optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict, repetition_penalty=1.1) diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py index 9272b727..e43afcbb 100644 --- 
a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py @@ -44,7 +44,7 @@ if __name__ == '__main__': model_path = args.model - # Load gguf model and vocab, then convert them to bigdl-llm model and huggingface tokenizer + # Load gguf model and vocab, then convert them to IPEX-LLM model and huggingface tokenizer model, tokenizer = AutoModelForCausalLM.from_gguf(model_path) model = model.to('xpu') diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py index 4317730f..6a39250d 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py @@ -59,7 +59,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila/generate.py index ec729d12..8715c38d 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila/generate.py @@ -60,7 +60,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila2/generate.py index cc75bba7..3037a392 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila2/generate.py @@ -60,7 +60,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan/generate.py index 8524fa65..1237bca7 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan/generate.py +++ 
b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan/generate.py @@ -64,7 +64,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py index d4f53a89..476d0946 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py @@ -44,7 +44,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = AutoModelForCausalLM.from_pretrained(model_path, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/bluelm/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/bluelm/generate.py index 940ac45a..30a7cc1a 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/bluelm/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/bluelm/generate.py @@ -64,7 +64,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py index a580b92e..53272834 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py @@ -67,7 +67,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py 
b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py index 9930441e..92190bca 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py @@ -67,7 +67,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chinese-llama2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chinese-llama2/generate.py index cedf26dd..b9584fa0 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chinese-llama2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chinese-llama2/generate.py @@ -57,7 +57,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly to obtain optimal - # performance with BigDL-LLM INT4 optimizations + # performance with IPEX-LLM INT4 optimizations # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = AutoModelForCausalLM.from_pretrained(model_path, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codellama/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codellama/generate.py index b4833107..c772d8c2 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codellama/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codellama/generate.py @@ -67,7 +67,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/deciLM-7b/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/deciLM-7b/generate.py index 728ae71f..eb41e090 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/deciLM-7b/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/deciLM-7b/generate.py @@ -55,7 +55,7 @@ if __name__ == '__main__': cpu_embedding=True ) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = model.to('xpu') # Load tokenizer tokenizer = AutoTokenizer.from_pretrained(model_path) diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v1/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v1/generate.py index 
21d2a43b..7e5b1f62 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v1/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v1/generate.py @@ -64,7 +64,7 @@ if __name__ == '__main__': st = time.time() # enabling `use_cache=True` allows the model to utilize the previous # key/values attentions to speed up decoding; - # to obtain optimal performance with BigDL-LLM INT4 optimizations, + # to obtain optimal performance with IPEX-LLM INT4 optimizations, # it is important to set use_cache=True for Dolly v1 models output = model.generate(input_ids, use_cache=True, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v2/generate.py index b5182e39..fbec6f3f 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v2/generate.py @@ -67,7 +67,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict, pad_token_id=tokenizer.pad_token_id, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/generate.py index 3326a6b7..88f739bb 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/generate.py @@ -67,7 +67,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py index e28efcaf..c6158dde 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py @@ -69,7 +69,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gemma/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gemma/generate.py index 1b69e57f..bbe4f68b 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gemma/generate.py +++ 
b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gemma/generate.py @@ -68,7 +68,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gpt-j/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gpt-j/generate.py index baa777a4..4fd3a37d 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gpt-j/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gpt-j/generate.py @@ -65,7 +65,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm/generate.py index c1cd1425..34ee285c 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm/generate.py @@ -66,7 +66,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2/generate.py index 33e81f77..ecd12b6b 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2/generate.py @@ -72,7 +72,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/generate.py index d724e899..09f389ad 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/generate.py +++ 
b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/generate.py @@ -79,7 +79,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mistral/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mistral/generate.py index 390c7129..3734724a 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mistral/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mistral/generate.py @@ -65,7 +65,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mixtral/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mixtral/generate.py index 3a9e2da0..2795883b 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mixtral/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mixtral/generate.py @@ -65,7 +65,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mpt/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mpt/generate.py index 93448b67..7ff3326a 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mpt/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mpt/generate.py @@ -65,7 +65,7 @@ if __name__ == '__main__': # start inference # enabling `use_cache=True` allows the model to utilize the previous # key/values attentions to speed up decoding; - # to obtain optimal performance with BigDL-LLM INT4 optimizations, + # to obtain optimal performance with IPEX-LLM INT4 optimizations, # it is important to set use_cache=True for MPT models mpt_generation_config = GenerationConfig( max_new_tokens=args.n_predict, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py index 165e9e63..f3b766c7 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py @@ -68,7 +68,7 @@ if __name__ == '__main__': # if your selected model is 
capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations # Note that phi-1_5 uses GenerationConfig to enable 'use_cache' output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict, generation_config = generation_config) diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-2/generate.py index 6ca9a192..09cc8e95 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-2/generate.py @@ -69,7 +69,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations # Note that phi-2 uses GenerationConfig to enable 'use_cache' output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict, generation_config = generation_config) diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py index 03127136..668cbe9f 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py @@ -37,7 +37,7 @@ if __name__ == '__main__': model_path = args.repo_id_or_model_path # Load model - # For successful BigDL-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization + # For successful IPEX-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. 
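The use_cache note that this patch touches in nearly every GPU example reduces to the generate-time pattern sketched below. The sketch is illustrative only, not part of the patch: it assumes an XPU-enabled PyTorch stack (intel-extension-for-pytorch) is installed, and the checkpoint path and token count are placeholders; load_in_4bit and cpu_embedding follow the from_pretrained guidance quoted in the hunks above.

import torch
from transformers import AutoTokenizer
from ipex_llm.transformers import AutoModelForCausalLM

model_path = "meta-llama/Llama-2-7b-chat-hf"  # placeholder checkpoint

# `load_in_4bit=True` applies IPEX-LLM INT4 optimization at load time;
# `cpu_embedding=True` keeps the memory-intensive embedding layer on the CPU,
# which the comments above recommend for Intel iGPUs on Windows.
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True,
                                             cpu_embedding=True)
model = model.to('xpu')

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
input_ids = tokenizer("What is AI?", return_tensors="pt").input_ids.to('xpu')

with torch.inference_mode():
    # If the model config sets `"use_cache": false`, pass `use_cache=True`
    # explicitly so INT4 decoding reuses previous key/value attentions.
    output = model.generate(input_ids, use_cache=True, max_new_tokens=32)
    torch.xpu.synchronize()
print(tokenizer.decode(output[0], skip_special_tokens=True))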
model = AutoModelForCausalLM.from_pretrained(model_path, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/generate.py index 6035eb1d..2d0a5f8a 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/generate.py @@ -73,7 +73,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/redpajama/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/redpajama/generate.py index fa55600f..2f387d6b 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/redpajama/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/redpajama/generate.py @@ -70,7 +70,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(**inputs, max_new_tokens=args.n_predict, do_sample=True, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/replit/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/replit/generate.py index b5001366..3cf6e51c 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/replit/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/replit/generate.py @@ -66,7 +66,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/solar/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/solar/generate.py index 93c67f11..c2d90c91 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/solar/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/solar/generate.py @@ -67,7 +67,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git 
a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/starcoder/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/starcoder/generate.py index 8e61fd76..dcc558f9 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/starcoder/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/starcoder/generate.py @@ -66,7 +66,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/vicuna/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/vicuna/generate.py index df9686f8..1cf63a2c 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/vicuna/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/vicuna/generate.py @@ -56,7 +56,7 @@ if __name__ == '__main__': st = time.time() # enabling `use_cache=True` allows the model to utilize the previous # key/values attentions to speed up decoding; - # to obtain optimal performance with BigDL-LLM INT4 optimizations, + # to obtain optimal performance with IPEX-LLM INT4 optimizations, # it is important to set use_cache=True for vicuna-v1.3 models output = model.generate(input_ids, use_cache=True, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/whisper/recognize.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/whisper/recognize.py index 4a0ca795..fbdb8feb 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/whisper/recognize.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/whisper/recognize.py @@ -67,7 +67,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids) end = time.time() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yi/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yi/generate.py index 60ef19fc..f9a0e544 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yi/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yi/generate.py @@ -71,7 +71,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yuan2/generate.py 
b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yuan2/generate.py index e84e0e46..8b7a358e 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yuan2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yuan2/generate.py @@ -68,7 +68,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations outputs = model.generate(inputs, do_sample=True, top_k=5, max_length=args.n_predict) end_time = time.time() diff --git a/python/llm/example/GPU/LLM-Finetuning/DPO/dpo_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/DPO/dpo_finetuning.py index cb4a35f6..54e05c9a 100644 --- a/python/llm/example/GPU/LLM-Finetuning/DPO/dpo_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/DPO/dpo_finetuning.py @@ -128,7 +128,7 @@ if __name__ == "__main__": # modules_to_not_convert=["lm_head"],) model = model.to('xpu') - # Prepare a BigDL-LLM compatible Peft model + # Prepare a IPEX-LLM compatible Peft model model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing) model = get_peft_model(model, peft_config) model.config.use_cache = False diff --git a/python/llm/example/GPU/LLM-Finetuning/HF-PEFT/alpaca-lora/finetune.py b/python/llm/example/GPU/LLM-Finetuning/HF-PEFT/alpaca-lora/finetune.py index a829cd40..7471d6a6 100644 --- a/python/llm/example/GPU/LLM-Finetuning/HF-PEFT/alpaca-lora/finetune.py +++ b/python/llm/example/GPU/LLM-Finetuning/HF-PEFT/alpaca-lora/finetune.py @@ -33,7 +33,7 @@ from ipex_llm import llm_patch llm_patch(train=True) -# The following is the original LLM finetuning code using PEFT (without BigDL-LLM) +# The following is the original LLM finetuning code using PEFT (without IPEX-LLM) import os import sys from typing import List diff --git a/python/llm/example/GPU/LLM-Finetuning/LoRA/alpaca_lora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/LoRA/alpaca_lora_finetuning.py index 4af84ed6..d4b593d1 100644 --- a/python/llm/example/GPU/LLM-Finetuning/LoRA/alpaca_lora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/LoRA/alpaca_lora_finetuning.py @@ -53,7 +53,7 @@ from common.utils import Prompter, get_int_from_env, wandb_check, get_train_val_ from transformers import BitsAndBytesConfig from ipex_llm.transformers import AutoModelForCausalLM -# import them from ipex_llm.transformers.qlora to get a BigDL-LLM compatible Peft model +# import them from ipex_llm.transformers.qlora to get a IPEX-LLM compatible Peft model from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\ LoraConfig from ipex_llm.utils.common import invalidInputError @@ -69,7 +69,7 @@ os.environ["MASTER_PORT"] = str(port) def train( # model/data params base_model: str = "meta-llama/Llama-2-7b-hf", # the only required argument, default to be "meta-llama/Llama-2-7b-hf" - saved_low_bit_model: str = None, # optional, the path to the saved model with bigdl-llm low-bit optimization + saved_low_bit_model: str = None, # optional, the path to the saved model with ipex-llm low-bit optimization data_path: str = "yahma/alpaca-cleaned", output_dir: str = "./bigdl-qlora-alpaca", # training hyperparams @@ -187,7 +187,7 @@ def train( print(model) - # Prepare a BigDL-LLM 
compatible Peft model + # Prepare a IPEX-LLM compatible Peft model model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing) config = LoraConfig( diff --git a/python/llm/example/GPU/LLM-Finetuning/QA-LoRA/alpaca_qalora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/QA-LoRA/alpaca_qalora_finetuning.py index 647cf9e9..91271a78 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QA-LoRA/alpaca_qalora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/QA-LoRA/alpaca_qalora_finetuning.py @@ -53,7 +53,7 @@ from common.utils import Prompter, get_int_from_env, wandb_check, get_train_val_ from transformers import BitsAndBytesConfig from ipex_llm.transformers import AutoModelForCausalLM -# import them from ipex_llm.transformers.qlora to get a BigDL-LLM compatible Peft model +# import them from ipex_llm.transformers.qlora to get a IPEX-LLM compatible Peft model from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\ LoraConfig from ipex_llm.utils.common import invalidInputError @@ -69,7 +69,7 @@ os.environ["MASTER_PORT"] = str(port) def train( # model/data params base_model: str = "meta-llama/Llama-2-7b-hf", # the only required argument, default to be "meta-llama/Llama-2-7b-hf" - saved_low_bit_model: str = None, # optional, the path to the saved model with bigdl-llm low-bit optimization + saved_low_bit_model: str = None, # optional, the path to the saved model with ipex-llm low-bit optimization data_path: str = "yahma/alpaca-cleaned", output_dir: str = "./bigdl-qlora-alpaca", # training hyperparams @@ -199,7 +199,7 @@ def train( print(model) - # Prepare a BigDL-LLM compatible Peft model + # Prepare a IPEX-LLM compatible Peft model model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing) config = LoraConfig( diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py index 3ffd6727..9156462a 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py @@ -53,7 +53,7 @@ from common.utils import Prompter, get_int_from_env, wandb_check, get_train_val_ from transformers import BitsAndBytesConfig from ipex_llm.transformers import AutoModelForCausalLM -# import them from ipex_llm.transformers.qlora to get a BigDL-LLM compatible Peft model +# import them from ipex_llm.transformers.qlora to get a IPEX-LLM compatible Peft model from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\ LoraConfig from ipex_llm.utils.common import invalidInputError @@ -69,7 +69,7 @@ os.environ["MASTER_PORT"] = str(port) def train( # model/data params base_model: str = "meta-llama/Llama-2-7b-hf", # the only required argument, default to be "meta-llama/Llama-2-7b-hf" - saved_low_bit_model: str = None, # optional, the path to the saved model with bigdl-llm low-bit optimization + saved_low_bit_model: str = None, # optional, the path to the saved model with IPEX-LLM low-bit optimization data_path: str = "yahma/alpaca-cleaned", output_dir: str = "./bigdl-qlora-alpaca", # training hyperparams @@ -199,7 +199,7 @@ def train( print(model) - # Prepare a BigDL-LLM compatible Peft model + # Prepare a IPEX-LLM compatible Peft model model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing) config = 
LoraConfig( diff --git a/python/llm/example/GPU/LLM-Finetuning/ReLora/alpaca_relora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/ReLora/alpaca_relora_finetuning.py index 2a2ff947..2717f7ad 100644 --- a/python/llm/example/GPU/LLM-Finetuning/ReLora/alpaca_relora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/ReLora/alpaca_relora_finetuning.py @@ -54,7 +54,7 @@ from common.utils import Prompter, get_int_from_env, wandb_check, get_train_val_ from transformers import BitsAndBytesConfig from ipex_llm.transformers import AutoModelForCausalLM from ipex_llm.transformers.relora import ReLoRATrainer -# import them from ipex_llm.transformers.qlora to get a BigDL-LLM compatible Peft model +# import them from ipex_llm.transformers.qlora to get a IPEX-LLM compatible Peft model from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\ LoraConfig from ipex_llm.utils.common import invalidInputError @@ -70,7 +70,7 @@ os.environ["MASTER_PORT"] = str(port) def train( # model/data params base_model: str = "meta-llama/Llama-2-7b-hf", # the only required argument, default to be "meta-llama/Llama-2-7b-hf" - saved_low_bit_model: str = None, # optional, the path to the saved model with bigdl-llm low-bit optimization + saved_low_bit_model: str = None, # optional, the path to the saved model with IPEX-LLM low-bit optimization data_path: str = "yahma/alpaca-cleaned", output_dir: str = "./bigdl-qlora-alpaca", # training hyperparams @@ -211,7 +211,7 @@ def train( print(model) - # Prepare a BigDL-LLM compatible Peft model + # Prepare a IPEX-LLM compatible Peft model model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing) config = LoraConfig( diff --git a/python/llm/example/GPU/LlamaIndex/rag.py b/python/llm/example/GPU/LlamaIndex/rag.py index 97dc3ae7..87838715 100644 --- a/python/llm/example/GPU/LlamaIndex/rag.py +++ b/python/llm/example/GPU/LlamaIndex/rag.py @@ -162,7 +162,7 @@ def messages_to_prompt(messages): def main(args): embed_model = HuggingFaceEmbedding(model_name=args.embedding_model_path) - # Use custom LLM in BigDL + # Use custom LLM in IPEX-LLM from ipex_llm.llamaindex.llms import BigdlLLM llm = BigdlLLM( model_name=args.model_path, diff --git a/python/llm/example/GPU/ModelScope-Models/generate.py b/python/llm/example/GPU/ModelScope-Models/generate.py index 0df692e1..47069970 100644 --- a/python/llm/example/GPU/ModelScope-Models/generate.py +++ b/python/llm/example/GPU/ModelScope-Models/generate.py @@ -69,7 +69,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py b/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py index f9454e78..e54cf881 100644 --- a/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py +++ b/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py @@ -101,7 +101,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is 
important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/PyTorch-Models/Model/aquila2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/aquila2/generate.py index 6258dd2e..7bdb63ff 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/aquila2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/aquila2/generate.py @@ -44,7 +44,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/baichuan/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/baichuan/generate.py index 9fcddbf1..ccc8b24f 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/baichuan/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/baichuan/generate.py @@ -43,7 +43,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/baichuan2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/baichuan2/generate.py index a4592679..ae9e4c7f 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/baichuan2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/baichuan2/generate.py @@ -45,7 +45,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/bark/synthesize_speech.py b/python/llm/example/GPU/PyTorch-Models/Model/bark/synthesize_speech.py index 9bff3517..82763b4c 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/bark/synthesize_speech.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/bark/synthesize_speech.py @@ -43,7 +43,7 @@ if __name__ == '__main__': processor = AutoProcessor.from_pretrained(model_path) model = BarkModel.from_pretrained(model_path) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. 
model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/bluelm/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/bluelm/generate.py index cdb9567f..e308820c 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/bluelm/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/bluelm/generate.py @@ -43,7 +43,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/generate.py index f7cc3938..e968bddb 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/generate.py @@ -44,7 +44,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/streamchat.py b/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/streamchat.py index 4c9dbe77..8dfceef6 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/streamchat.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/streamchat.py @@ -43,7 +43,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/generate.py index ab6ad290..53ccd356 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/generate.py @@ -44,7 +44,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. 
model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/streamchat.py b/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/streamchat.py index 89c93edb..42115e9d 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/streamchat.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/streamchat.py @@ -43,7 +43,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py index a9eaaace..d676666b 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py @@ -45,7 +45,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/deciLM-7b/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/deciLM-7b/generate.py index 3a4c7e52..4eeabcee 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/deciLM-7b/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/deciLM-7b/generate.py @@ -50,7 +50,7 @@ if __name__ == '__main__': trust_remote_code=True, ) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model( diff --git a/python/llm/example/GPU/PyTorch-Models/Model/deepseek/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/deepseek/generate.py index be92fd95..9732ca9d 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/deepseek/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/deepseek/generate.py @@ -54,7 +54,7 @@ if __name__ == '__main__': tokenizer = AutoTokenizer.from_pretrained(model_path) tokenizer.pad_token = tokenizer.eos_token - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. 
model = optimize_model(model, diff --git a/python/llm/example/GPU/PyTorch-Models/Model/dolly-v1/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/dolly-v1/generate.py index 9168b46a..1b76b040 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/dolly-v1/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/dolly-v1/generate.py @@ -50,7 +50,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/dolly-v2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/dolly-v2/generate.py index 82787ec9..4756307a 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/dolly-v2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/dolly-v2/generate.py @@ -50,7 +50,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/flan-t5/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/flan-t5/generate.py index 8c216ec4..a1e6976e 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/flan-t5/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/flan-t5/generate.py @@ -43,7 +43,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # "wo" module is not converted due to some issues of T5 model # (https://github.com/huggingface/transformers/issues/20287), # "lm_head" module is not converted to generate outputs with better quality diff --git a/python/llm/example/GPU/PyTorch-Models/Model/llama2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/llama2/generate.py index 4add8dfb..7593d549 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/llama2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/llama2/generate.py @@ -48,7 +48,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. 
model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/llava/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/llava/generate.py index 6a6e5a4a..ce3275df 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/llava/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/llava/generate.py @@ -291,7 +291,7 @@ if __name__ == '__main__': model_base=None, model_name=model_name) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model).to('xpu') diff --git a/python/llm/example/GPU/PyTorch-Models/Model/mamba/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/mamba/generate.py index ac8a8dd9..6ec314b2 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/mamba/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/mamba/generate.py @@ -43,7 +43,7 @@ if __name__ == '__main__': # Load model model = MambaLMHeadModel.from_pretrained(model_path) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model, low_bit='asym_int4', modules_to_not_convert=["dt_proj", "x_proj"]) model = model.to('xpu') diff --git a/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py index 80e1fc52..0e717730 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py @@ -44,7 +44,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/mixtral/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/mixtral/generate.py index ec8c3711..a58b1628 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/mixtral/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/mixtral/generate.py @@ -44,7 +44,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. 
model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/phi-1_5/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/phi-1_5/generate.py index fbdef847..6bf99191 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/phi-1_5/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/phi-1_5/generate.py @@ -42,7 +42,7 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py index ca7499bd..b9c64ab9 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py @@ -42,7 +42,7 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/phixtral/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/phixtral/generate.py index d1e7e7fa..0c8a278a 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/phixtral/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/phixtral/generate.py @@ -43,7 +43,7 @@ if __name__ == '__main__': model_path = args.repo_id_or_model_path - # Load huggingface model with optimize_model in BigDL + # Load huggingface model with optimize_model in IPEX-LLM from transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/qwen-vl/chat.py b/python/llm/example/GPU/PyTorch-Models/Model/qwen-vl/chat.py index 783689cb..358c1304 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/qwen-vl/chat.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/qwen-vl/chat.py @@ -39,8 +39,8 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, device_map="cpu", trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model - # For successful BigDL-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization + # With only one line to enable IPEX-LLM optimization on model + # For successful IPEX-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. 
model = optimize_model(model, diff --git a/python/llm/example/GPU/PyTorch-Models/Model/replit/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/replit/generate.py index 3edd21b7..4a92962d 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/replit/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/replit/generate.py @@ -43,7 +43,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py index 95388061..ee522a8f 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py @@ -46,7 +46,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/speech-t5/synthesize_speech.py b/python/llm/example/GPU/PyTorch-Models/Model/speech-t5/synthesize_speech.py index 58e57c8d..2145b361 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/speech-t5/synthesize_speech.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/speech-t5/synthesize_speech.py @@ -73,7 +73,7 @@ if __name__ == '__main__': model = SpeechT5ForTextToSpeech.from_pretrained(model_path) vocoder = SpeechT5HifiGan.from_pretrained(vocoder_path) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # Skip optimizing these two modules to get higher audio quality # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. diff --git a/python/llm/example/GPU/PyTorch-Models/Model/starcoder/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/starcoder/generate.py index 1092fe43..6c50838f 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/starcoder/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/starcoder/generate.py @@ -43,7 +43,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. 
model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/yi/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/yi/generate.py index 61fe372e..31256cda 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/yi/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/yi/generate.py @@ -41,7 +41,7 @@ if __name__ == '__main__': trust_remote_code=True, use_cache=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) @@ -64,7 +64,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/PyTorch-Models/Model/yuan2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/yuan2/generate.py index 31179c8c..9a887c09 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/yuan2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/yuan2/generate.py @@ -49,7 +49,7 @@ if __name__ == '__main__': print("Creating model...") model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype='auto', low_cpu_mem_usage=True).eval() - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) @@ -70,7 +70,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations outputs = model.generate(inputs, do_sample=True, top_k=5, max_length=args.n_predict) end_time = time.time() diff --git a/python/llm/example/GPU/PyTorch-Models/More-Data-Types/generate.py b/python/llm/example/GPU/PyTorch-Models/More-Data-Types/generate.py index d3a94b07..f615619e 100644 --- a/python/llm/example/GPU/PyTorch-Models/More-Data-Types/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/More-Data-Types/generate.py @@ -49,7 +49,7 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # `low_bit` param support `sym_int4`, `asym_int4`, `sym_int5`, `asym_int5` and `sym_int8` # By specifying `low_bit` param, relevant low bit optimizations will be applied to the model model = optimize_model(model, low_bit=low_bit)
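
The comments updated throughout the GPU PyTorch-Models examples above all describe the same inference pattern: load the model with Hugging Face transformers, call optimize_model (with cpu_embedding=True recommended for Intel iGPUs on Windows), move the model to the 'xpu' device, and pass use_cache=True to generate for best decoding speed with INT4 optimizations. The sketch below restates that flow in one place; it assumes `from ipex_llm import optimize_model` as the import path, and the model id, prompt, and token count are placeholders rather than values taken from any single example.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from ipex_llm import optimize_model   # assumed import path for optimize_model

model_path = "meta-llama/Llama-2-7b-chat-hf"   # placeholder model id

# Load the model with Hugging Face transformers as usual
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             torch_dtype='auto',
                                             low_cpu_mem_usage=True)

# One line to enable IPEX-LLM low-bit optimization on the model.
# cpu_embedding=True keeps the memory-intensive embedding layer on the CPU,
# as the examples recommend for Intel iGPUs on Windows.
model = optimize_model(model, cpu_embedding=True)
model = model.to('xpu')

tokenizer = AutoTokenizer.from_pretrained(model_path)
input_ids = tokenizer("What is AI?", return_tensors="pt").input_ids.to('xpu')

with torch.inference_mode():
    # If the model config has "use_cache": false, pass use_cache=True explicitly
    # so previous key/value attentions are reused during decoding.
    output = model.generate(input_ids, use_cache=True, max_new_tokens=32)
    torch.xpu.synchronize()

print(tokenizer.decode(output[0], skip_special_tokens=True))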
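
The Mamba and More-Data-Types hunks also show two optional optimize_model parameters: a low_bit string ('sym_int4', 'asym_int4', 'sym_int5', 'asym_int5', or 'sym_int8') and a modules_to_not_convert list for submodules that should keep their original precision. A small sketch follows, with the model id and skipped module name as illustrative placeholders:

from transformers import AutoModelForCausalLM
from ipex_llm import optimize_model   # assumed import path

model = AutoModelForCausalLM.from_pretrained("openlm-research/open_llama_3b",   # placeholder model id
                                             trust_remote_code=True)

# `low_bit` supports 'sym_int4', 'asym_int4', 'sym_int5', 'asym_int5' and 'sym_int8'.
# `modules_to_not_convert` leaves the named submodules in their original precision;
# the Mamba example skips "dt_proj" and "x_proj", for instance.
model = optimize_model(model,
                       low_bit='sym_int8',
                       modules_to_not_convert=["lm_head"])   # module name is illustrative
model = model.to('xpu')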
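
For the finetuning scripts touched above (LoRA, QA-LoRA, QLoRA, ReLoRA), the shared preparation step is to import get_peft_model, prepare_model_for_kbit_training, and LoraConfig from ipex_llm.transformers.qlora rather than from peft, then wrap the low-bit base model before training. A hedged sketch of that step follows; the load_in_low_bit value, LoRA rank, target modules, and other hyperparameters are assumptions for illustration, not the values used by the scripts.

import torch
from ipex_llm.transformers import AutoModelForCausalLM
# Import these from ipex_llm.transformers.qlora (not from peft) to get an
# IPEX-LLM compatible Peft model, as the scripts' comments note.
from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training, \
    LoraConfig

base_model = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(base_model,
                                             load_in_low_bit="nf4",   # assumed low-bit format for QLoRA-style training
                                             torch_dtype=torch.bfloat16,
                                             trust_remote_code=True)
model = model.to('xpu')

# Prepare an IPEX-LLM compatible Peft model
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

config = LoraConfig(
    r=8,                                             # illustrative LoRA rank
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj"],   # illustrative target modules
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)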