From 10ee786920e8d9a1deab3e2bd7c20e00ebdf6a84 Mon Sep 17 00:00:00 2001
From: Jin Qiao <89779290+JinBridger@users.noreply.github.com>
Date: Sun, 7 Apr 2024 13:29:51 +0800
Subject: [PATCH] Replace with IPEX-LLM in example comments (#10671)

* Replace with IPEX-LLM in example comments
* More replacement
* revert some changes
---
 .../Applications/streaming-llm/streaming_llm/utils.py | 2 +-
 .../example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py | 4 ++--
 .../Advanced-Quantizations/AWQ/generate.py | 2 +-
 .../Advanced-Quantizations/GGUF/generate.py | 2 +-
 .../Advanced-Quantizations/GPTQ/generate.py | 2 +-
 .../Model/aquila/generate.py | 2 +-
 .../Model/aquila2/generate.py | 2 +-
 .../Model/baichuan/generate.py | 2 +-
 .../Model/baichuan2/generate.py | 2 +-
 .../Model/bluelm/generate.py | 2 +-
 .../Model/chatglm/generate.py | 2 +-
 .../Model/chatglm2/generate.py | 2 +-
 .../Model/chatglm3/generate.py | 2 +-
 .../Model/codellama/generate.py | 2 +-
 .../Model/codeshell/generate.py | 2 +-
 .../Model/deepseek/generate.py | 2 +-
 .../Model/dolly_v1/generate.py | 2 +-
 .../Model/dolly_v2/generate.py | 2 +-
 .../Model/falcon/generate.py | 2 +-
 .../Model/flan-t5/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/fuyu/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/gemma/generate.py | 2 +-
 .../Model/internlm-xcomposer/chat.py | 2 +-
 .../Model/internlm/generate.py | 2 +-
 .../Model/internlm2/generate.py | 2 +-
 .../Model/llama2/generate.py | 2 +-
 .../Model/mistral/generate.py | 2 +-
 .../Model/mixtral/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/moss/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/mpt/generate.py | 2 +-
 .../Model/phi-1_5/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/phi-2/generate.py | 2 +-
 .../Model/phixtral/generate.py | 2 +-
 .../Model/phoenix/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/qwen-vl/chat.py | 2 +-
 .../HF-Transformers-AutoModels/Model/qwen/generate.py | 2 +-
 .../Model/redpajama/generate.py | 2 +-
 .../Model/replit/generate.py | 2 +-
 .../Model/skywork/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/solar/generate.py | 2 +-
 .../Model/starcoder/generate.py | 2 +-
 .../Model/vicuna/generate.py | 2 +-
 .../Model/whisper/recognize.py | 2 +-
 .../Model/wizardcoder-python/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/yi/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/yuan2/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/ziya/generate.py | 2 +-
 python/llm/example/CPU/LlamaIndex/rag.py | 2 +-
 python/llm/example/CPU/ModelScope-Models/generate.py | 2 +-
 .../example/CPU/Native-Models/native_int4_pipeline.py | 10 +++++-----
 .../CPU/PyTorch-Models/Model/aquila2/generate.py | 2 +-
 .../CPU/PyTorch-Models/Model/bark/synthesize_speech.py | 2 +-
 .../CPU/PyTorch-Models/Model/bert/extract_feature.py | 2 +-
 .../CPU/PyTorch-Models/Model/bluelm/generate.py | 2 +-
 .../CPU/PyTorch-Models/Model/chatglm/generate.py | 2 +-
 .../CPU/PyTorch-Models/Model/chatglm3/generate.py | 2 +-
 .../CPU/PyTorch-Models/Model/codellama/generate.py | 2 +-
 .../CPU/PyTorch-Models/Model/codeshell/generate.py | 2 +-
 .../CPU/PyTorch-Models/Model/deciLM-7b/generate.py | 2 +-
 .../CPU/PyTorch-Models/Model/deepseek/generate.py | 2 +-
 .../CPU/PyTorch-Models/Model/flan-t5/generate.py | 2 +-
 .../example/CPU/PyTorch-Models/Model/fuyu/generate.py | 4 ++--
 .../PyTorch-Models/Model/internlm-xcomposer/chat.py | 4 ++--
 .../CPU/PyTorch-Models/Model/llama2/generate.py | 2 +-
 .../example/CPU/PyTorch-Models/Model/llava/generate.py | 2 +-
 .../example/CPU/PyTorch-Models/Model/mamba/generate.py | 2 +-
 .../CPU/PyTorch-Models/Model/mistral/generate.py | 2 +-
 .../CPU/PyTorch-Models/Model/mixtral/generate.py | 2 +-
 .../PyTorch-Models/Model/openai-whisper/recognize.py | 2 +-
 .../CPU/PyTorch-Models/Model/phi-1_5/generate.py | 2 +-
 .../example/CPU/PyTorch-Models/Model/phi-2/generate.py | 2 +-
 .../example/CPU/PyTorch-Models/Model/qwen-vl/chat.py | 4 ++--
 .../CPU/PyTorch-Models/Model/skywork/generate.py | 2 +-
 .../example/CPU/PyTorch-Models/Model/solar/generate.py | 2 +-
 .../Model/wizardcoder-python/generate.py | 2 +-
 .../example/CPU/PyTorch-Models/Model/yi/generate.py | 2 +-
 .../example/CPU/PyTorch-Models/Model/yuan2/generate.py | 4 ++--
 .../example/CPU/PyTorch-Models/Model/ziya/generate.py | 2 +-
 .../CPU/PyTorch-Models/More-Data-Types/generate.py | 2 +-
 .../alpaca-qlora/alpaca_qlora_finetuning_cpu.py | 6 +++---
 .../example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py | 6 +++---
 .../Advanced-Quantizations/AWQ/generate.py | 2 +-
 .../Advanced-Quantizations/GGUF-IQ2/generate.py | 2 +-
 .../Advanced-Quantizations/GGUF/generate.py | 2 +-
 .../Advanced-Quantizations/GPTQ/generate.py | 2 +-
 .../Model/aquila/generate.py | 2 +-
 .../Model/aquila2/generate.py | 2 +-
 .../Model/baichuan/generate.py | 2 +-
 .../Model/baichuan2/generate.py | 2 +-
 .../Model/bluelm/generate.py | 2 +-
 .../Model/chatglm2/generate.py | 2 +-
 .../Model/chatglm3/generate.py | 2 +-
 .../Model/chinese-llama2/generate.py | 2 +-
 .../Model/codellama/generate.py | 2 +-
 .../Model/deciLM-7b/generate.py | 2 +-
 .../Model/dolly-v1/generate.py | 2 +-
 .../Model/dolly-v2/generate.py | 2 +-
 .../Model/falcon/generate.py | 2 +-
 .../Model/flan-t5/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/gemma/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/gpt-j/generate.py | 2 +-
 .../Model/internlm/generate.py | 2 +-
 .../Model/internlm2/generate.py | 2 +-
 .../Model/llama2/generate.py | 2 +-
 .../Model/mistral/generate.py | 2 +-
 .../Model/mixtral/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/mpt/generate.py | 2 +-
 .../Model/phi-1_5/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/phi-2/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/qwen-vl/chat.py | 2 +-
 .../HF-Transformers-AutoModels/Model/qwen/generate.py | 2 +-
 .../Model/redpajama/generate.py | 2 +-
 .../Model/replit/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/solar/generate.py | 2 +-
 .../Model/starcoder/generate.py | 2 +-
 .../Model/vicuna/generate.py | 2 +-
 .../Model/whisper/recognize.py | 2 +-
 .../HF-Transformers-AutoModels/Model/yi/generate.py | 2 +-
 .../HF-Transformers-AutoModels/Model/yuan2/generate.py | 2 +-
 .../example/GPU/LLM-Finetuning/DPO/dpo_finetuning.py | 2 +-
 .../GPU/LLM-Finetuning/HF-PEFT/alpaca-lora/finetune.py | 2 +-
 .../GPU/LLM-Finetuning/LoRA/alpaca_lora_finetuning.py | 6 +++---
 .../LLM-Finetuning/QA-LoRA/alpaca_qalora_finetuning.py | 6 +++---
 .../QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py | 6 +++---
 .../LLM-Finetuning/ReLora/alpaca_relora_finetuning.py | 6 +++---
 python/llm/example/GPU/LlamaIndex/rag.py | 2 +-
 python/llm/example/GPU/ModelScope-Models/generate.py | 2 +-
 .../GPU/Pipeline-Parallel-Inference/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/aquila2/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/baichuan/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/baichuan2/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/bark/synthesize_speech.py | 2 +-
 .../GPU/PyTorch-Models/Model/bluelm/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/chatglm2/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/chatglm2/streamchat.py | 2 +-
 .../GPU/PyTorch-Models/Model/chatglm3/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/chatglm3/streamchat.py | 2 +-
 .../GPU/PyTorch-Models/Model/codellama/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/deciLM-7b/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/deepseek/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/dolly-v1/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/dolly-v2/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/flan-t5/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/llama2/generate.py | 2 +-
 .../example/GPU/PyTorch-Models/Model/llava/generate.py | 2 +-
 .../example/GPU/PyTorch-Models/Model/mamba/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/mistral/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/mixtral/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/phi-1_5/generate.py | 2 +-
 .../example/GPU/PyTorch-Models/Model/phi-2/generate.py | 2 +-
 .../GPU/PyTorch-Models/Model/phixtral/generate.py | 2 +-
 .../example/GPU/PyTorch-Models/Model/qwen-vl/chat.py | 4 ++--
 .../GPU/PyTorch-Models/Model/replit/generate.py | 2 +-
 .../example/GPU/PyTorch-Models/Model/solar/generate.py | 2 +-
 .../Model/speech-t5/synthesize_speech.py | 2 +-
 .../GPU/PyTorch-Models/Model/starcoder/generate.py | 2 +-
 .../example/GPU/PyTorch-Models/Model/yi/generate.py | 4 ++--
 .../example/GPU/PyTorch-Models/Model/yuan2/generate.py | 4 ++--
 .../GPU/PyTorch-Models/More-Data-Types/generate.py | 2 +-
 159 files changed, 183 insertions(+), 183 deletions(-)

diff --git a/python/llm/example/CPU/Applications/streaming-llm/streaming_llm/utils.py b/python/llm/example/CPU/Applications/streaming-llm/streaming_llm/utils.py
index 163ccc71..61ca1b4a 100644
--- a/python/llm/example/CPU/Applications/streaming-llm/streaming_llm/utils.py
+++ b/python/llm/example/CPU/Applications/streaming-llm/streaming_llm/utils.py
@@ -48,7 +48,7 @@ import ssl
 import urllib.request
 import os
 import json
-# code change to import from bigdl-llm API instead of using transformers API
+# code change to import from IPEX-LLM API instead of using transformers API
 from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import LlamaTokenizer
 import intel_extension_for_pytorch as ipex
diff --git a/python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py b/python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py
index 63f5660a..77383dd8 100644
--- a/python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py
+++ b/python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py
@@ -87,7 +87,7 @@ if __name__ == '__main__':
         replace_method="auto"
     )
-    # Apply BigDL-LLM INT4 optimizations on transformers
+    # Apply IPEX-LLM INT4 optimizations on transformers
     model = optimize_model(model.module.to(f'cpu'), low_bit='sym_int4')
     model = model.to(f'cpu:{local_rank}')
@@ -111,7 +111,7 @@ if __name__ == '__main__':
     # if your selected model is capable of utilizing previous key/value attentions
     # to enhance decoding speed, but has `"use_cache": false` in its model config,
     # it is important to set `use_cache=True` explicitly in the `generate` function
-    # to obtain optimal performance with BigDL-LLM INT4 optimizations
+    # to obtain optimal performance with IPEX-LLM INT4 optimizations
     output = model.generate(input_ids,
                             do_sample=False,
                             max_new_tokens=args.n_predict)
diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py
index 37843751..0a975d60 100644
---
a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py @@ -59,7 +59,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py index 4acad805..c16cfb63 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py @@ -44,7 +44,7 @@ if __name__ == '__main__': model_path = args.model - # Load gguf model and vocab, then convert them to bigdl-llm model and huggingface tokenizer + # Load gguf model and vocab, then convert them to IPEX-LLM model and huggingface tokenizer model, tokenizer = AutoModelForCausalLM.from_gguf(model_path, low_bit = args.low_bit,) # Generate predicted tokens diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py index 1f5852b6..2929194b 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py @@ -60,7 +60,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/generate.py index 69d9045f..40133167 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/generate.py @@ -56,7 +56,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila2/generate.py index b9bc0ee2..45e6e001 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila2/generate.py +++ 
b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila2/generate.py @@ -56,7 +56,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan/generate.py index df64f80e..0961f679 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan/generate.py @@ -56,7 +56,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py index 59dccfe8..7a3adcfb 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py @@ -45,7 +45,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/bluelm/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/bluelm/generate.py index 07a4359e..be839ca3 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/bluelm/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/bluelm/generate.py @@ -56,7 +56,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/generate.py index e38f56c4..5f7ed211 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/generate.py +++ 
b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/generate.py @@ -57,7 +57,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py index fb1423fa..605bcf0d 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py @@ -57,7 +57,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py index d3d8daae..a9386ba5 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py @@ -57,7 +57,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codellama/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codellama/generate.py index b8329d61..332aba75 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codellama/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codellama/generate.py @@ -55,7 +55,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codeshell/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codeshell/generate.py index adc79339..0a8cc749 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codeshell/generate.py +++ 
b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codeshell/generate.py @@ -56,7 +56,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek/generate.py index 679fe2e6..5ec2eaea 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek/generate.py @@ -61,7 +61,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v1/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v1/generate.py index ee043e0b..3dd419a1 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v1/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v1/generate.py @@ -61,7 +61,7 @@ if __name__ == '__main__': st = time.time() # enabling `use_cache=True` allows the model to utilize the previous # key/values attentions to speed up decoding; - # to obtain optimal performance with BigDL-LLM INT4 optimizations, + # to obtain optimal performance with IPEX-LLM INT4 optimizations, # it is important to set use_cache=True for Dolly v1 models output = model.generate(input_ids, use_cache=True, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v2/generate.py index 18867636..6f119c0f 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v2/generate.py @@ -64,7 +64,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict, pad_token_id=tokenizer.pad_token_id, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/generate.py index 6419aa5a..4f66c335 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/generate.py @@ -57,7 +57,7 @@ if __name__ == '__main__': # if your selected model is capable 
of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py index 91b8addc..54dae699 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py @@ -60,7 +60,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/fuyu/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/fuyu/generate.py index 271b1d4f..6968b408 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/fuyu/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/fuyu/generate.py @@ -38,7 +38,7 @@ if __name__ == '__main__': image = Image.open(args.image_path) # Load model - # For successful BigDL-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization + # For successful IPEX-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cpu', load_in_4bit = True, trust_remote_code=True, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/gemma/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/gemma/generate.py index 4606e2b5..20b894f2 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/gemma/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/gemma/generate.py @@ -61,7 +61,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm-xcomposer/chat.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm-xcomposer/chat.py index 6834f582..9868bb4e 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm-xcomposer/chat.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm-xcomposer/chat.py @@ -37,7 +37,7 @@ if __name__ == '__main__': image = args.image_path # Load model - # For successful BigDL-LLM optimization on InternLM-XComposer, skip the 'qkv' module during optimization + # For successful IPEX-LLM optimization on 
InternLM-XComposer, skip the 'qkv' module during optimization model = AutoModelForCausalLM.from_pretrained(model_path, device='cpu', load_in_4bit=True, trust_remote_code=True, modules_to_not_convert=['qkv']) diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm/generate.py index 1c33a1fe..87bb5653 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm/generate.py @@ -57,7 +57,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm2/generate.py index 7e05e153..cd0bc62c 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm2/generate.py @@ -56,7 +56,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama2/generate.py index ed5c93b0..dd5080ef 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama2/generate.py @@ -60,7 +60,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mistral/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mistral/generate.py index 94e6ab48..ac90570a 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mistral/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mistral/generate.py @@ -55,7 +55,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal 
performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mixtral/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mixtral/generate.py index cd8b9f60..361ed02e 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mixtral/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mixtral/generate.py @@ -61,7 +61,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/moss/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/moss/generate.py index 786f7f0e..cc56bd84 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/moss/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/moss/generate.py @@ -56,7 +56,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mpt/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mpt/generate.py index e4caa938..7e54712a 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mpt/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mpt/generate.py @@ -55,7 +55,7 @@ if __name__ == '__main__': input_ids = tokenizer.encode(prompt, return_tensors="pt") # enabling `use_cache=True` allows the model to utilize the previous # key/values attentions to speed up decoding; - # to obtain optimal performance with BigDL-LLM INT4 optimizations, + # to obtain optimal performance with IPEX-LLM INT4 optimizations, # it is important to set use_cache=True for MPT models mpt_generation_config = GenerationConfig( max_new_tokens=args.n_predict, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py index 710d2a39..07ae8dc5 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py @@ -58,7 +58,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations # Note that phi-1_5 uses GenerationConfig to enable 'use_cache' output = 
model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict, generation_config = generation_config) diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-2/generate.py index 91930b72..882aef9f 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-2/generate.py @@ -58,7 +58,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations model.generation_config.pad_token_id = model.generation_config.eos_token_id # Note that phi-2 uses GenerationConfig to enable 'use_cache' diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phixtral/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phixtral/generate.py index 395481ae..50e2a76a 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phixtral/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phixtral/generate.py @@ -58,7 +58,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations # Note that phixtral uses GenerationConfig to enable 'use_cache' output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict, generation_config = generation_config) diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phoenix/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phoenix/generate.py index 264f27e2..83026fd5 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phoenix/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phoenix/generate.py @@ -55,7 +55,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict) diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py index 5796177f..af5d3e04 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py @@ -36,7 +36,7 @@ if __name__ == '__main__': model_path = args.repo_id_or_model_path # Load model - # For successful BigDL-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization + # For successful IPEX-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization model = 
AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, device_map="cpu", diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/generate.py index 4f260181..3e43b632 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/generate.py @@ -64,7 +64,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/redpajama/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/redpajama/generate.py index da5f69ee..7eaec908 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/redpajama/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/redpajama/generate.py @@ -56,7 +56,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(**inputs, max_new_tokens=args.n_predict, do_sample=True, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/replit/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/replit/generate.py index 0599df2c..183e0bc4 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/replit/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/replit/generate.py @@ -55,7 +55,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/skywork/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/skywork/generate.py index 47bc1c79..e288f914 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/skywork/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/skywork/generate.py @@ -55,7 +55,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git 
a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/solar/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/solar/generate.py index b84d7b61..36fbfa76 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/solar/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/solar/generate.py @@ -59,7 +59,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/starcoder/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/starcoder/generate.py index e6a80a71..1b654a26 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/starcoder/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/starcoder/generate.py @@ -55,7 +55,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/vicuna/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/vicuna/generate.py index 118b6084..074f9552 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/vicuna/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/vicuna/generate.py @@ -53,7 +53,7 @@ if __name__ == '__main__': st = time.time() # enabling `use_cache=True` allows the model to utilize the previous # key/values attentions to speed up decoding; - # to obtain optimal performance with BigDL-LLM INT4 optimizations, + # to obtain optimal performance with IPEX-LLM INT4 optimizations, # it is important to set use_cache=True for vicuna-v1.3 models output = model.generate(input_ids, use_cache=True, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/recognize.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/recognize.py index 60de9751..a5bbb759 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/recognize.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/recognize.py @@ -63,7 +63,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/wizardcoder-python/generate.py 
b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/wizardcoder-python/generate.py index 72d5dd97..8a41149f 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/wizardcoder-python/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/wizardcoder-python/generate.py @@ -58,7 +58,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yi/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yi/generate.py index f809c44b..a28a5a88 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yi/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yi/generate.py @@ -61,7 +61,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yuan2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yuan2/generate.py index 46115bc0..17e918d8 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yuan2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yuan2/generate.py @@ -57,7 +57,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations outputs = model.generate(inputs, do_sample=True, top_k=5, max_length=args.n_predict) end_time = time.time() diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/ziya/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/ziya/generate.py index cf4914c2..5980ac60 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/ziya/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/ziya/generate.py @@ -42,7 +42,7 @@ if __name__ == '__main__': from ipex_llm.transformers import AutoModelForCausalLM # enabling `use_cache=True` allows the model to utilize the previous # key/values attentions to speed up decoding; - # to obtain optimal performance with BigDL-LLM INT4 optimizations, + # to obtain optimal performance with IPEX-LLM INT4 optimizations, # it is important to set use_cache=True for Ziya models model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, diff --git a/python/llm/example/CPU/LlamaIndex/rag.py b/python/llm/example/CPU/LlamaIndex/rag.py index c4c4c8f8..9f26e55d 100644 --- a/python/llm/example/CPU/LlamaIndex/rag.py +++ 
b/python/llm/example/CPU/LlamaIndex/rag.py @@ -163,7 +163,7 @@ def messages_to_prompt(messages): def main(args): embed_model = HuggingFaceEmbedding(model_name=args.embedding_model_path) - # Use custom LLM in BigDL + # Use custom LLM in IPEX-LLM from ipex_llm.llamaindex.llms import BigdlLLM llm = BigdlLLM( model_name=args.model_path, diff --git a/python/llm/example/CPU/ModelScope-Models/generate.py b/python/llm/example/CPU/ModelScope-Models/generate.py index 274566f3..0e770065 100644 --- a/python/llm/example/CPU/ModelScope-Models/generate.py +++ b/python/llm/example/CPU/ModelScope-Models/generate.py @@ -59,7 +59,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/Native-Models/native_int4_pipeline.py b/python/llm/example/CPU/Native-Models/native_int4_pipeline.py index aa349c29..e576cfd1 100644 --- a/python/llm/example/CPU/Native-Models/native_int4_pipeline.py +++ b/python/llm/example/CPU/Native-Models/native_int4_pipeline.py @@ -56,8 +56,8 @@ def load(model_path, model_family, n_threads): def inference(llm, repo_id_or_model_path, model_family, prompt): if model_family in ['llama', 'gptneox', 'bloom', 'starcoder', 'chatglm']: - # ------ Option 1: Use bigdl-llm based tokenizer - print('-'*20, ' bigdl-llm based tokenizer ', '-'*20) + # ------ Option 1: Use IPEX-LLM based tokenizer + print('-'*20, ' IPEX-LLM based tokenizer ', '-'*20) st = time.time() # please note that the prompt here can either be a string or a list of string @@ -126,13 +126,13 @@ def main(): if args.model_family == 'llama2': args.model_family = 'llama' - # Step 1: convert original model to BigDL llm model - bigdl_llm_path = convert(repo_id_or_model_path=repo_id_or_model_path, + # Step 1: convert original model to IPEX-LLM model + ipex_llm_path = convert(repo_id_or_model_path=repo_id_or_model_path, model_family=args.model_family, tmp_path=args.tmp_path) # Step 2: load int4 model - llm = load(model_path=bigdl_llm_path, + llm = load(model_path=ipex_llm_path, model_family=args.model_family, n_threads=args.thread_num) diff --git a/python/llm/example/CPU/PyTorch-Models/Model/aquila2/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/aquila2/generate.py index 9b219452..fe9186d8 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/aquila2/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/aquila2/generate.py @@ -44,7 +44,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/bark/synthesize_speech.py b/python/llm/example/CPU/PyTorch-Models/Model/bark/synthesize_speech.py index 1811c36b..7229adbe 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/bark/synthesize_speech.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/bark/synthesize_speech.py @@ -38,7 +38,7 @@ if __name__ == '__main__': model = Bark.init_from_config(config) model.load_checkpoint(config, checkpoint_dir=model_path, 
eval=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Synthesize speech with the given input diff --git a/python/llm/example/CPU/PyTorch-Models/Model/bert/extract_feature.py b/python/llm/example/CPU/PyTorch-Models/Model/bert/extract_feature.py index cd0c73b7..25f66521 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/bert/extract_feature.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/bert/extract_feature.py @@ -38,7 +38,7 @@ if __name__ == '__main__': torch_dtype="auto", low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/bluelm/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/bluelm/generate.py index d16d2331..00db9920 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/bluelm/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/bluelm/generate.py @@ -40,7 +40,7 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/chatglm/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/chatglm/generate.py index 89d26761..76f152bc 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/chatglm/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/chatglm/generate.py @@ -41,7 +41,7 @@ if __name__ == '__main__': # Load model model = AutoModel.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/chatglm3/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/chatglm3/generate.py index 22fdeaad..ce2e7d4b 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/chatglm3/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/chatglm3/generate.py @@ -41,7 +41,7 @@ if __name__ == '__main__': # Load model model = AutoModel.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/codellama/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/codellama/generate.py index 12266e99..e7e93dc2 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/codellama/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/codellama/generate.py @@ -41,7 +41,7 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/codeshell/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/codeshell/generate.py index a0610bc6..4963051c 100644 --- 
a/python/llm/example/CPU/PyTorch-Models/Model/codeshell/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/codeshell/generate.py @@ -40,7 +40,7 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/deciLM-7b/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/deciLM-7b/generate.py index 8714b419..cb713533 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/deciLM-7b/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/deciLM-7b/generate.py @@ -50,7 +50,7 @@ if __name__ == '__main__': trust_remote_code=True, ) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/deepseek/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/deepseek/generate.py index 1a2cbaec..5bd9740b 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/deepseek/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/deepseek/generate.py @@ -46,7 +46,7 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/flan-t5/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/flan-t5/generate.py index 35e1b25d..c4a3e220 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/flan-t5/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/flan-t5/generate.py @@ -40,7 +40,7 @@ if __name__ == '__main__': # Load model model = AutoModelForSeq2SeqLM.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # "wo" module is not converted due to some issues of T5 model # (https://github.com/huggingface/transformers/issues/20287), # "lm_head" module is not converted to generate outputs with better quality diff --git a/python/llm/example/CPU/PyTorch-Models/Model/fuyu/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/fuyu/generate.py index 8e2397ba..8503357e 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/fuyu/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/fuyu/generate.py @@ -40,8 +40,8 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cpu', trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model - # For successful BigDL-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization + # With only one line to enable IPEX-LLM optimization on model + # For successful IPEX-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization model = optimize_model(model, low_bit='sym_int4', modules_to_not_convert=['vision_embed_tokens']) diff --git a/python/llm/example/CPU/PyTorch-Models/Model/internlm-xcomposer/chat.py b/python/llm/example/CPU/PyTorch-Models/Model/internlm-xcomposer/chat.py index dc664493..13b1c1ca 100644 --- 
a/python/llm/example/CPU/PyTorch-Models/Model/internlm-xcomposer/chat.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/internlm-xcomposer/chat.py @@ -38,8 +38,8 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, device='cpu', trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model - # For successful BigDL-LLM optimization on InternLM-XComposer, skip the 'qkv' module during optimization + # With only one line to enable IPEX-LLM optimization on model + # For successful IPEX-LLM optimization on InternLM-XComposer, skip the 'qkv' module during optimization model = optimize_model(model, low_bit='sym_int4', modules_to_not_convert=['qkv']) diff --git a/python/llm/example/CPU/PyTorch-Models/Model/llama2/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/llama2/generate.py index 6c4ab17a..5888e896 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/llama2/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/llama2/generate.py @@ -45,7 +45,7 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/llava/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/llava/generate.py index c27d1a50..5f3316e3 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/llava/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/llava/generate.py @@ -292,7 +292,7 @@ if __name__ == '__main__': model_base=None, model_name=model_name) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Generate image tensor diff --git a/python/llm/example/CPU/PyTorch-Models/Model/mamba/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/mamba/generate.py index 9462474a..77c9d1ec 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/mamba/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/mamba/generate.py @@ -42,7 +42,7 @@ if __name__ == '__main__': # Load model model = MambaLMHeadModel.from_pretrained(model_path) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model, low_bit='asym_int4', modules_to_not_convert=["dt_proj", "x_proj", "out_proj"]) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/mistral/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/mistral/generate.py index 37958b67..98f3c3e2 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/mistral/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/mistral/generate.py @@ -44,7 +44,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/mixtral/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/mixtral/generate.py index 330b4349..ebb7bf92 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/mixtral/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/mixtral/generate.py @@ -44,7 +44,7 @@ if __name__ == '__main__': 
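For readers skimming this patch, the renamed comments in the CPU PyTorch-Models hunks above all describe one pattern: load a checkpoint with stock Hugging Face transformers, then apply IPEX-LLM optimization with a single optimize_model call. The sketch below is illustrative only and is not part of the patch; the checkpoint path is a placeholder, and the low_bit / modules_to_not_convert values mirror the Fuyu and InternLM-XComposer hunks above.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from ipex_llm import optimize_model

model_path = "meta-llama/Llama-2-7b-chat-hf"  # placeholder checkpoint

model = AutoModelForCausalLM.from_pretrained(model_path,
                                             trust_remote_code=True,
                                             torch_dtype="auto",
                                             low_cpu_mem_usage=True)

# One line applies IPEX-LLM low-bit (INT4 by default) optimization on the model
model = optimize_model(model)

# Multimodal variant (e.g. Fuyu): choose the low-bit format explicitly and skip
# modules that should stay unconverted, as in the hunks above
# model = optimize_model(model, low_bit='sym_int4',
#                        modules_to_not_convert=['vision_embed_tokens'])

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
inputs = tokenizer("What is AI?", return_tensors="pt")
with torch.inference_mode():
    output = model.generate(inputs.input_ids, use_cache=True, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))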
torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/openai-whisper/recognize.py b/python/llm/example/CPU/PyTorch-Models/Model/openai-whisper/recognize.py index 1b071d57..3e202310 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/openai-whisper/recognize.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/openai-whisper/recognize.py @@ -46,7 +46,7 @@ if __name__ == '__main__': # Load whisper model under pytorch framework model = whisper.load_model(args.model_name) - # With only one line to enable bigdl optimize on a pytorch model + # With only one line to enable IPEX-LLM optimize on a pytorch model model = optimize_model(model) st = time.time() diff --git a/python/llm/example/CPU/PyTorch-Models/Model/phi-1_5/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/phi-1_5/generate.py index f70da15d..6b69ca07 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/phi-1_5/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/phi-1_5/generate.py @@ -42,7 +42,7 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/phi-2/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/phi-2/generate.py index 319c009f..745ed4de 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/phi-2/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/phi-2/generate.py @@ -42,7 +42,7 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/qwen-vl/chat.py b/python/llm/example/CPU/PyTorch-Models/Model/qwen-vl/chat.py index d92cabbc..0bd286d9 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/qwen-vl/chat.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/qwen-vl/chat.py @@ -37,8 +37,8 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, device_map="cpu", trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model - # For successful BigDL-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization + # With only one line to enable IPEX-LLM optimization on model + # For successful IPEX-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization model = optimize_model(model, low_bit='sym_int4', modules_to_not_convert=['c_fc', 'out_proj']) diff --git a/python/llm/example/CPU/PyTorch-Models/Model/skywork/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/skywork/generate.py index fa52a2e9..0ee0f7e9 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/skywork/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/skywork/generate.py @@ -41,7 +41,7 @@ if __name__ == '__main__': model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on 
model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/solar/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/solar/generate.py index 2ddd48af..ed566894 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/solar/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/solar/generate.py @@ -45,7 +45,7 @@ if __name__ == '__main__': torch_dtype=torch.float16, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/wizardcoder-python/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/wizardcoder-python/generate.py index 832f7623..e6760953 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/wizardcoder-python/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/wizardcoder-python/generate.py @@ -44,7 +44,7 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) # Load tokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/yi/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/yi/generate.py index bf6af053..a67f8361 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/yi/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/yi/generate.py @@ -54,7 +54,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/CPU/PyTorch-Models/Model/yuan2/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/yuan2/generate.py index ea71ad76..3e011231 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/yuan2/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/yuan2/generate.py @@ -48,7 +48,7 @@ if __name__ == '__main__': print("Creating model...") model = AutoModelForCausalLM.from_pretrained(model_path, device_map="cpu", trust_remote_code=True, torch_dtype=torch.float16).eval() - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model) prompt = YUAN2_PROMPT_FORMAT.format(prompt=args.prompt) @@ -59,7 +59,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations outputs = model.generate(inputs, do_sample=True, top_k=5, max_length=args.n_predict) end_time = time.time() diff --git a/python/llm/example/CPU/PyTorch-Models/Model/ziya/generate.py 
b/python/llm/example/CPU/PyTorch-Models/Model/ziya/generate.py index e6f2c02d..580e198d 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/ziya/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/ziya/generate.py @@ -42,7 +42,7 @@ if __name__ == '__main__': from ipex_llm import optimize_model # enabling `use_cache=True` allows the model to utilize the previous # key/values attentions to speed up decoding; - # to obtain optimal performance with BigDL-LLM `optimization_model` API optimizations, + # to obtain optimal performance with IPEX-LLM `optimization_model` API optimizations, # it is important to set use_cache=True for Ziya models model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, diff --git a/python/llm/example/CPU/PyTorch-Models/More-Data-Types/generate.py b/python/llm/example/CPU/PyTorch-Models/More-Data-Types/generate.py index 59c01f63..e4e10129 100644 --- a/python/llm/example/CPU/PyTorch-Models/More-Data-Types/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/More-Data-Types/generate.py @@ -49,7 +49,7 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # `low_bit` param support `sym_int4`, `asym_int4`, `sym_int5`, `asym_int5` and `sym_int8` # By specifying `low_bit` param, relevant low bit optimizations will be applied to the model model = optimize_model(model, low_bit=low_bit) diff --git a/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning_cpu.py b/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning_cpu.py index cdf3196c..c2362267 100644 --- a/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning_cpu.py +++ b/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning_cpu.py @@ -49,7 +49,7 @@ from utils.prompter import Prompter from transformers import BitsAndBytesConfig from ipex_llm.transformers import AutoModelForCausalLM -# import them from ipex_llm.transformers.qlora to get a BigDL-LLM compatible Peft model +# import them from ipex_llm.transformers.qlora to get a IPEX-LLM compatible Peft model from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training, LoraConfig from ipex_llm.utils.isa_checker import ISAChecker @@ -64,7 +64,7 @@ def get_int_from_env(env_keys, default): def train( # model/data params base_model: str = "meta-llama/Llama-2-7b-hf", # the only required argument, default to be "meta-llama/Llama-2-7b-hf" - saved_low_bit_model: str = None, # optional, the path to the saved model with bigdl-llm low-bit optimization + saved_low_bit_model: str = None, # optional, the path to the saved model with ipex-llm low-bit optimization data_path: str = "yahma/alpaca-cleaned", output_dir: str = "./bigdl-qlora-alpaca", # training hyperparams @@ -256,7 +256,7 @@ def train( ] # could be sped up, probably return tokenized_full_prompt - # Prepare a BigDL-LLM compatible Peft model + # Prepare a IPEX-LLM compatible Peft model model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing) config = LoraConfig( diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py index 3f221425..b2fa7aaf 100644 --- a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py +++ 
b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py @@ -63,7 +63,7 @@ if __name__ == '__main__': low_bit = args.low_bit # First use CPU as accelerator - # Convert to deepspeed model and apply bigdl-llm optimization on CPU to decrease GPU memory usage + # Convert to deepspeed model and apply IPEX-LLM optimization on CPU to decrease GPU memory usage current_accel = CPU_Accelerator() set_accelerator(current_accel) model = AutoModelForCausalLM.from_pretrained(args.repo_id_or_model_path, @@ -80,7 +80,7 @@ if __name__ == '__main__': replace_method="auto", ) - # Use bigdl-llm `optimize_model` to convert the model into optimized low bit format + # Use IPEX-LLM `optimize_model` to convert the model into optimized low bit format # Convert the rest of the model into float16 to reduce allreduce traffic model = optimize_model(model.module.to(f'cpu'), low_bit=low_bit).to(torch.float16) @@ -119,7 +119,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py index c6ff5241..66ef8ff9 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py @@ -58,7 +58,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/generate.py index 6834bf88..b9353a09 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/generate.py @@ -71,7 +71,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM Low Bit optimizations + # to obtain optimal performance with IPEX-LLM Low Bit optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict, repetition_penalty=1.1) diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py index 9272b727..e43afcbb 100644 --- 
a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py @@ -44,7 +44,7 @@ if __name__ == '__main__': model_path = args.model - # Load gguf model and vocab, then convert them to bigdl-llm model and huggingface tokenizer + # Load gguf model and vocab, then convert them to IPEX-LLM model and huggingface tokenizer model, tokenizer = AutoModelForCausalLM.from_gguf(model_path) model = model.to('xpu') diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py index 4317730f..6a39250d 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py @@ -59,7 +59,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) end = time.time() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila/generate.py index ec729d12..8715c38d 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila/generate.py @@ -60,7 +60,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila2/generate.py index cc75bba7..3037a392 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila2/generate.py @@ -60,7 +60,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan/generate.py index 8524fa65..1237bca7 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan/generate.py +++ 
b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan/generate.py @@ -64,7 +64,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py index d4f53a89..476d0946 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py @@ -44,7 +44,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = AutoModelForCausalLM.from_pretrained(model_path, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/bluelm/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/bluelm/generate.py index 940ac45a..30a7cc1a 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/bluelm/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/bluelm/generate.py @@ -64,7 +64,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py index a580b92e..53272834 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py @@ -67,7 +67,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py 
b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py index 9930441e..92190bca 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py @@ -67,7 +67,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chinese-llama2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chinese-llama2/generate.py index cedf26dd..b9584fa0 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chinese-llama2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chinese-llama2/generate.py @@ -57,7 +57,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly to obtain optimal - # performance with BigDL-LLM INT4 optimizations + # performance with IPEX-LLM INT4 optimizations # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = AutoModelForCausalLM.from_pretrained(model_path, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codellama/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codellama/generate.py index b4833107..c772d8c2 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codellama/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codellama/generate.py @@ -67,7 +67,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/deciLM-7b/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/deciLM-7b/generate.py index 728ae71f..eb41e090 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/deciLM-7b/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/deciLM-7b/generate.py @@ -55,7 +55,7 @@ if __name__ == '__main__': cpu_embedding=True ) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = model.to('xpu') # Load tokenizer tokenizer = AutoTokenizer.from_pretrained(model_path) diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v1/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v1/generate.py index 
21d2a43b..7e5b1f62 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v1/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v1/generate.py @@ -64,7 +64,7 @@ if __name__ == '__main__': st = time.time() # enabling `use_cache=True` allows the model to utilize the previous # key/values attentions to speed up decoding; - # to obtain optimal performance with BigDL-LLM INT4 optimizations, + # to obtain optimal performance with IPEX-LLM INT4 optimizations, # it is important to set use_cache=True for Dolly v1 models output = model.generate(input_ids, use_cache=True, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v2/generate.py index b5182e39..fbec6f3f 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v2/generate.py @@ -67,7 +67,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict, pad_token_id=tokenizer.pad_token_id, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/generate.py index 3326a6b7..88f739bb 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/generate.py @@ -67,7 +67,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py index e28efcaf..c6158dde 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py @@ -69,7 +69,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gemma/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gemma/generate.py index 1b69e57f..bbe4f68b 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gemma/generate.py +++ 
b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gemma/generate.py @@ -68,7 +68,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gpt-j/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gpt-j/generate.py index baa777a4..4fd3a37d 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gpt-j/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gpt-j/generate.py @@ -65,7 +65,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm/generate.py index c1cd1425..34ee285c 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm/generate.py @@ -66,7 +66,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2/generate.py index 33e81f77..ecd12b6b 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2/generate.py @@ -72,7 +72,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/generate.py index d724e899..09f389ad 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/generate.py +++ 
b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/generate.py @@ -79,7 +79,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mistral/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mistral/generate.py index 390c7129..3734724a 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mistral/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mistral/generate.py @@ -65,7 +65,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mixtral/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mixtral/generate.py index 3a9e2da0..2795883b 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mixtral/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mixtral/generate.py @@ -65,7 +65,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mpt/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mpt/generate.py index 93448b67..7ff3326a 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mpt/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mpt/generate.py @@ -65,7 +65,7 @@ if __name__ == '__main__': # start inference # enabling `use_cache=True` allows the model to utilize the previous # key/values attentions to speed up decoding; - # to obtain optimal performance with BigDL-LLM INT4 optimizations, + # to obtain optimal performance with IPEX-LLM INT4 optimizations, # it is important to set use_cache=True for MPT models mpt_generation_config = GenerationConfig( max_new_tokens=args.n_predict, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py index 165e9e63..f3b766c7 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py @@ -68,7 +68,7 @@ if __name__ == '__main__': # if your selected model is 
capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations # Note that phi-1_5 uses GenerationConfig to enable 'use_cache' output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict, generation_config = generation_config) diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-2/generate.py index 6ca9a192..09cc8e95 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-2/generate.py @@ -69,7 +69,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations # Note that phi-2 uses GenerationConfig to enable 'use_cache' output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict, generation_config = generation_config) diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py index 03127136..668cbe9f 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py @@ -37,7 +37,7 @@ if __name__ == '__main__': model_path = args.repo_id_or_model_path # Load model - # For successful BigDL-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization + # For successful IPEX-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. 
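The use_cache note that this patch touches in nearly every GPU example reduces to the generate-time pattern sketched below. The sketch is illustrative only, not part of the patch: it assumes an XPU-enabled PyTorch stack (intel-extension-for-pytorch) is installed, and the checkpoint path and token count are placeholders; load_in_4bit and cpu_embedding follow the from_pretrained guidance quoted in the hunks above.

import torch
from transformers import AutoTokenizer
from ipex_llm.transformers import AutoModelForCausalLM

model_path = "meta-llama/Llama-2-7b-chat-hf"  # placeholder checkpoint

# `load_in_4bit=True` applies IPEX-LLM INT4 optimization at load time;
# `cpu_embedding=True` keeps the memory-intensive embedding layer on the CPU,
# which the comments above recommend for Intel iGPUs on Windows.
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True,
                                             cpu_embedding=True)
model = model.to('xpu')

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
input_ids = tokenizer("What is AI?", return_tensors="pt").input_ids.to('xpu')

with torch.inference_mode():
    # If the model config sets `"use_cache": false`, pass `use_cache=True`
    # explicitly so INT4 decoding reuses previous key/value attentions.
    output = model.generate(input_ids, use_cache=True, max_new_tokens=32)
    torch.xpu.synchronize()
print(tokenizer.decode(output[0], skip_special_tokens=True))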
model = AutoModelForCausalLM.from_pretrained(model_path, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/generate.py index 6035eb1d..2d0a5f8a 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/generate.py @@ -73,7 +73,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/redpajama/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/redpajama/generate.py index fa55600f..2f387d6b 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/redpajama/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/redpajama/generate.py @@ -70,7 +70,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(**inputs, max_new_tokens=args.n_predict, do_sample=True, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/replit/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/replit/generate.py index b5001366..3cf6e51c 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/replit/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/replit/generate.py @@ -66,7 +66,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/solar/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/solar/generate.py index 93c67f11..c2d90c91 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/solar/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/solar/generate.py @@ -67,7 +67,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git 
a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/starcoder/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/starcoder/generate.py index 8e61fd76..dcc558f9 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/starcoder/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/starcoder/generate.py @@ -66,7 +66,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/vicuna/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/vicuna/generate.py index df9686f8..1cf63a2c 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/vicuna/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/vicuna/generate.py @@ -56,7 +56,7 @@ if __name__ == '__main__': st = time.time() # enabling `use_cache=True` allows the model to utilize the previous # key/values attentions to speed up decoding; - # to obtain optimal performance with BigDL-LLM INT4 optimizations, + # to obtain optimal performance with IPEX-LLM INT4 optimizations, # it is important to set use_cache=True for vicuna-v1.3 models output = model.generate(input_ids, use_cache=True, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/whisper/recognize.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/whisper/recognize.py index 4a0ca795..fbdb8feb 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/whisper/recognize.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/whisper/recognize.py @@ -67,7 +67,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids) end = time.time() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yi/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yi/generate.py index 60ef19fc..f9a0e544 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yi/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yi/generate.py @@ -71,7 +71,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yuan2/generate.py 
b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yuan2/generate.py index e84e0e46..8b7a358e 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yuan2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yuan2/generate.py @@ -68,7 +68,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations outputs = model.generate(inputs, do_sample=True, top_k=5, max_length=args.n_predict) end_time = time.time() diff --git a/python/llm/example/GPU/LLM-Finetuning/DPO/dpo_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/DPO/dpo_finetuning.py index cb4a35f6..54e05c9a 100644 --- a/python/llm/example/GPU/LLM-Finetuning/DPO/dpo_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/DPO/dpo_finetuning.py @@ -128,7 +128,7 @@ if __name__ == "__main__": # modules_to_not_convert=["lm_head"],) model = model.to('xpu') - # Prepare a BigDL-LLM compatible Peft model + # Prepare a IPEX-LLM compatible Peft model model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing) model = get_peft_model(model, peft_config) model.config.use_cache = False diff --git a/python/llm/example/GPU/LLM-Finetuning/HF-PEFT/alpaca-lora/finetune.py b/python/llm/example/GPU/LLM-Finetuning/HF-PEFT/alpaca-lora/finetune.py index a829cd40..7471d6a6 100644 --- a/python/llm/example/GPU/LLM-Finetuning/HF-PEFT/alpaca-lora/finetune.py +++ b/python/llm/example/GPU/LLM-Finetuning/HF-PEFT/alpaca-lora/finetune.py @@ -33,7 +33,7 @@ from ipex_llm import llm_patch llm_patch(train=True) -# The following is the original LLM finetuning code using PEFT (without BigDL-LLM) +# The following is the original LLM finetuning code using PEFT (without IPEX-LLM) import os import sys from typing import List diff --git a/python/llm/example/GPU/LLM-Finetuning/LoRA/alpaca_lora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/LoRA/alpaca_lora_finetuning.py index 4af84ed6..d4b593d1 100644 --- a/python/llm/example/GPU/LLM-Finetuning/LoRA/alpaca_lora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/LoRA/alpaca_lora_finetuning.py @@ -53,7 +53,7 @@ from common.utils import Prompter, get_int_from_env, wandb_check, get_train_val_ from transformers import BitsAndBytesConfig from ipex_llm.transformers import AutoModelForCausalLM -# import them from ipex_llm.transformers.qlora to get a BigDL-LLM compatible Peft model +# import them from ipex_llm.transformers.qlora to get a IPEX-LLM compatible Peft model from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\ LoraConfig from ipex_llm.utils.common import invalidInputError @@ -69,7 +69,7 @@ os.environ["MASTER_PORT"] = str(port) def train( # model/data params base_model: str = "meta-llama/Llama-2-7b-hf", # the only required argument, default to be "meta-llama/Llama-2-7b-hf" - saved_low_bit_model: str = None, # optional, the path to the saved model with bigdl-llm low-bit optimization + saved_low_bit_model: str = None, # optional, the path to the saved model with ipex-llm low-bit optimization data_path: str = "yahma/alpaca-cleaned", output_dir: str = "./bigdl-qlora-alpaca", # training hyperparams @@ -187,7 +187,7 @@ def train( print(model) - # Prepare a BigDL-LLM 
compatible Peft model + # Prepare a IPEX-LLM compatible Peft model model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing) config = LoraConfig( diff --git a/python/llm/example/GPU/LLM-Finetuning/QA-LoRA/alpaca_qalora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/QA-LoRA/alpaca_qalora_finetuning.py index 647cf9e9..91271a78 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QA-LoRA/alpaca_qalora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/QA-LoRA/alpaca_qalora_finetuning.py @@ -53,7 +53,7 @@ from common.utils import Prompter, get_int_from_env, wandb_check, get_train_val_ from transformers import BitsAndBytesConfig from ipex_llm.transformers import AutoModelForCausalLM -# import them from ipex_llm.transformers.qlora to get a BigDL-LLM compatible Peft model +# import them from ipex_llm.transformers.qlora to get a IPEX-LLM compatible Peft model from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\ LoraConfig from ipex_llm.utils.common import invalidInputError @@ -69,7 +69,7 @@ os.environ["MASTER_PORT"] = str(port) def train( # model/data params base_model: str = "meta-llama/Llama-2-7b-hf", # the only required argument, default to be "meta-llama/Llama-2-7b-hf" - saved_low_bit_model: str = None, # optional, the path to the saved model with bigdl-llm low-bit optimization + saved_low_bit_model: str = None, # optional, the path to the saved model with ipex-llm low-bit optimization data_path: str = "yahma/alpaca-cleaned", output_dir: str = "./bigdl-qlora-alpaca", # training hyperparams @@ -199,7 +199,7 @@ def train( print(model) - # Prepare a BigDL-LLM compatible Peft model + # Prepare a IPEX-LLM compatible Peft model model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing) config = LoraConfig( diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py index 3ffd6727..9156462a 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py @@ -53,7 +53,7 @@ from common.utils import Prompter, get_int_from_env, wandb_check, get_train_val_ from transformers import BitsAndBytesConfig from ipex_llm.transformers import AutoModelForCausalLM -# import them from ipex_llm.transformers.qlora to get a BigDL-LLM compatible Peft model +# import them from ipex_llm.transformers.qlora to get a IPEX-LLM compatible Peft model from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\ LoraConfig from ipex_llm.utils.common import invalidInputError @@ -69,7 +69,7 @@ os.environ["MASTER_PORT"] = str(port) def train( # model/data params base_model: str = "meta-llama/Llama-2-7b-hf", # the only required argument, default to be "meta-llama/Llama-2-7b-hf" - saved_low_bit_model: str = None, # optional, the path to the saved model with bigdl-llm low-bit optimization + saved_low_bit_model: str = None, # optional, the path to the saved model with IPEX-LLM low-bit optimization data_path: str = "yahma/alpaca-cleaned", output_dir: str = "./bigdl-qlora-alpaca", # training hyperparams @@ -199,7 +199,7 @@ def train( print(model) - # Prepare a BigDL-LLM compatible Peft model + # Prepare a IPEX-LLM compatible Peft model model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing) config = 
LoraConfig( diff --git a/python/llm/example/GPU/LLM-Finetuning/ReLora/alpaca_relora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/ReLora/alpaca_relora_finetuning.py index 2a2ff947..2717f7ad 100644 --- a/python/llm/example/GPU/LLM-Finetuning/ReLora/alpaca_relora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/ReLora/alpaca_relora_finetuning.py @@ -54,7 +54,7 @@ from common.utils import Prompter, get_int_from_env, wandb_check, get_train_val_ from transformers import BitsAndBytesConfig from ipex_llm.transformers import AutoModelForCausalLM from ipex_llm.transformers.relora import ReLoRATrainer -# import them from ipex_llm.transformers.qlora to get a BigDL-LLM compatible Peft model +# import them from ipex_llm.transformers.qlora to get a IPEX-LLM compatible Peft model from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\ LoraConfig from ipex_llm.utils.common import invalidInputError @@ -70,7 +70,7 @@ os.environ["MASTER_PORT"] = str(port) def train( # model/data params base_model: str = "meta-llama/Llama-2-7b-hf", # the only required argument, default to be "meta-llama/Llama-2-7b-hf" - saved_low_bit_model: str = None, # optional, the path to the saved model with bigdl-llm low-bit optimization + saved_low_bit_model: str = None, # optional, the path to the saved model with IPEX-LLM low-bit optimization data_path: str = "yahma/alpaca-cleaned", output_dir: str = "./bigdl-qlora-alpaca", # training hyperparams @@ -211,7 +211,7 @@ def train( print(model) - # Prepare a BigDL-LLM compatible Peft model + # Prepare a IPEX-LLM compatible Peft model model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing) config = LoraConfig( diff --git a/python/llm/example/GPU/LlamaIndex/rag.py b/python/llm/example/GPU/LlamaIndex/rag.py index 97dc3ae7..87838715 100644 --- a/python/llm/example/GPU/LlamaIndex/rag.py +++ b/python/llm/example/GPU/LlamaIndex/rag.py @@ -162,7 +162,7 @@ def messages_to_prompt(messages): def main(args): embed_model = HuggingFaceEmbedding(model_name=args.embedding_model_path) - # Use custom LLM in BigDL + # Use custom LLM in IPEX-LLM from ipex_llm.llamaindex.llms import BigdlLLM llm = BigdlLLM( model_name=args.model_path, diff --git a/python/llm/example/GPU/ModelScope-Models/generate.py b/python/llm/example/GPU/ModelScope-Models/generate.py index 0df692e1..47069970 100644 --- a/python/llm/example/GPU/ModelScope-Models/generate.py +++ b/python/llm/example/GPU/ModelScope-Models/generate.py @@ -69,7 +69,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py b/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py index f9454e78..e54cf881 100644 --- a/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py +++ b/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py @@ -101,7 +101,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is 
important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/PyTorch-Models/Model/aquila2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/aquila2/generate.py index 6258dd2e..7bdb63ff 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/aquila2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/aquila2/generate.py @@ -44,7 +44,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/baichuan/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/baichuan/generate.py index 9fcddbf1..ccc8b24f 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/baichuan/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/baichuan/generate.py @@ -43,7 +43,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/baichuan2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/baichuan2/generate.py index a4592679..ae9e4c7f 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/baichuan2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/baichuan2/generate.py @@ -45,7 +45,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/bark/synthesize_speech.py b/python/llm/example/GPU/PyTorch-Models/Model/bark/synthesize_speech.py index 9bff3517..82763b4c 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/bark/synthesize_speech.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/bark/synthesize_speech.py @@ -43,7 +43,7 @@ if __name__ == '__main__': processor = AutoProcessor.from_pretrained(model_path) model = BarkModel.from_pretrained(model_path) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. 
model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/bluelm/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/bluelm/generate.py index cdb9567f..e308820c 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/bluelm/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/bluelm/generate.py @@ -43,7 +43,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/generate.py index f7cc3938..e968bddb 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/generate.py @@ -44,7 +44,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/streamchat.py b/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/streamchat.py index 4c9dbe77..8dfceef6 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/streamchat.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/streamchat.py @@ -43,7 +43,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/generate.py index ab6ad290..53ccd356 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/generate.py @@ -44,7 +44,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. 
model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/streamchat.py b/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/streamchat.py index 89c93edb..42115e9d 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/streamchat.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/streamchat.py @@ -43,7 +43,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py index a9eaaace..d676666b 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py @@ -45,7 +45,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/deciLM-7b/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/deciLM-7b/generate.py index 3a4c7e52..4eeabcee 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/deciLM-7b/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/deciLM-7b/generate.py @@ -50,7 +50,7 @@ if __name__ == '__main__': trust_remote_code=True, ) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model( diff --git a/python/llm/example/GPU/PyTorch-Models/Model/deepseek/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/deepseek/generate.py index be92fd95..9732ca9d 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/deepseek/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/deepseek/generate.py @@ -54,7 +54,7 @@ if __name__ == '__main__': tokenizer = AutoTokenizer.from_pretrained(model_path) tokenizer.pad_token = tokenizer.eos_token - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. 
model = optimize_model(model, diff --git a/python/llm/example/GPU/PyTorch-Models/Model/dolly-v1/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/dolly-v1/generate.py index 9168b46a..1b76b040 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/dolly-v1/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/dolly-v1/generate.py @@ -50,7 +50,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/dolly-v2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/dolly-v2/generate.py index 82787ec9..4756307a 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/dolly-v2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/dolly-v2/generate.py @@ -50,7 +50,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/flan-t5/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/flan-t5/generate.py index 8c216ec4..a1e6976e 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/flan-t5/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/flan-t5/generate.py @@ -43,7 +43,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # "wo" module is not converted due to some issues of T5 model # (https://github.com/huggingface/transformers/issues/20287), # "lm_head" module is not converted to generate outputs with better quality diff --git a/python/llm/example/GPU/PyTorch-Models/Model/llama2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/llama2/generate.py index 4add8dfb..7593d549 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/llama2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/llama2/generate.py @@ -48,7 +48,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. 
model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/llava/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/llava/generate.py index 6a6e5a4a..ce3275df 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/llava/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/llava/generate.py @@ -291,7 +291,7 @@ if __name__ == '__main__': model_base=None, model_name=model_name) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model).to('xpu') diff --git a/python/llm/example/GPU/PyTorch-Models/Model/mamba/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/mamba/generate.py index ac8a8dd9..6ec314b2 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/mamba/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/mamba/generate.py @@ -43,7 +43,7 @@ if __name__ == '__main__': # Load model model = MambaLMHeadModel.from_pretrained(model_path) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model model = optimize_model(model, low_bit='asym_int4', modules_to_not_convert=["dt_proj", "x_proj"]) model = model.to('xpu') diff --git a/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py index 80e1fc52..0e717730 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py @@ -44,7 +44,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/mixtral/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/mixtral/generate.py index ec8c3711..a58b1628 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/mixtral/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/mixtral/generate.py @@ -44,7 +44,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. 
model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/phi-1_5/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/phi-1_5/generate.py index fbdef847..6bf99191 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/phi-1_5/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/phi-1_5/generate.py @@ -42,7 +42,7 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py index ca7499bd..b9c64ab9 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py @@ -42,7 +42,7 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/phixtral/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/phixtral/generate.py index d1e7e7fa..0c8a278a 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/phixtral/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/phixtral/generate.py @@ -43,7 +43,7 @@ if __name__ == '__main__': model_path = args.repo_id_or_model_path - # Load huggingface model with optimize_model in BigDL + # Load huggingface model with optimize_model in IPEX-LLM from transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/qwen-vl/chat.py b/python/llm/example/GPU/PyTorch-Models/Model/qwen-vl/chat.py index 783689cb..358c1304 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/qwen-vl/chat.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/qwen-vl/chat.py @@ -39,8 +39,8 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, device_map="cpu", trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model - # For successful BigDL-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization + # With only one line to enable IPEX-LLM optimization on model + # For successful IPEX-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. 
model = optimize_model(model, diff --git a/python/llm/example/GPU/PyTorch-Models/Model/replit/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/replit/generate.py index 3edd21b7..4a92962d 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/replit/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/replit/generate.py @@ -43,7 +43,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py index 95388061..ee522a8f 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py @@ -46,7 +46,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/speech-t5/synthesize_speech.py b/python/llm/example/GPU/PyTorch-Models/Model/speech-t5/synthesize_speech.py index 58e57c8d..2145b361 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/speech-t5/synthesize_speech.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/speech-t5/synthesize_speech.py @@ -73,7 +73,7 @@ if __name__ == '__main__': model = SpeechT5ForTextToSpeech.from_pretrained(model_path) vocoder = SpeechT5HifiGan.from_pretrained(vocoder_path) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # Skip optimizing these two modules to get higher audio quality # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. diff --git a/python/llm/example/GPU/PyTorch-Models/Model/starcoder/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/starcoder/generate.py index 1092fe43..6c50838f 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/starcoder/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/starcoder/generate.py @@ -43,7 +43,7 @@ if __name__ == '__main__': torch_dtype='auto', low_cpu_mem_usage=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. 
model = optimize_model(model) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/yi/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/yi/generate.py index 61fe372e..31256cda 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/yi/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/yi/generate.py @@ -41,7 +41,7 @@ if __name__ == '__main__': trust_remote_code=True, use_cache=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) @@ -64,7 +64,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations output = model.generate(input_ids, max_new_tokens=args.n_predict) torch.xpu.synchronize() diff --git a/python/llm/example/GPU/PyTorch-Models/Model/yuan2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/yuan2/generate.py index 31179c8c..9a887c09 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/yuan2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/yuan2/generate.py @@ -49,7 +49,7 @@ if __name__ == '__main__': print("Creating model...") model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype='auto', low_cpu_mem_usage=True).eval() - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) @@ -70,7 +70,7 @@ if __name__ == '__main__': # if your selected model is capable of utilizing previous key/value attentions # to enhance decoding speed, but has `"use_cache": false` in its model config, # it is important to set `use_cache=True` explicitly in the `generate` function - # to obtain optimal performance with BigDL-LLM INT4 optimizations + # to obtain optimal performance with IPEX-LLM INT4 optimizations outputs = model.generate(inputs, do_sample=True, top_k=5, max_length=args.n_predict) end_time = time.time() diff --git a/python/llm/example/GPU/PyTorch-Models/More-Data-Types/generate.py b/python/llm/example/GPU/PyTorch-Models/More-Data-Types/generate.py index d3a94b07..f615619e 100644 --- a/python/llm/example/GPU/PyTorch-Models/More-Data-Types/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/More-Data-Types/generate.py @@ -49,7 +49,7 @@ if __name__ == '__main__': # Load model model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) - # With only one line to enable BigDL-LLM optimization on model + # With only one line to enable IPEX-LLM optimization on model # `low_bit` param support `sym_int4`, `asym_int4`, `sym_int5`, `asym_int5` and `sym_int8` # By specifying `low_bit` param, relevant low bit optimizations will be applied to the model model = optimize_model(model, low_bit=low_bit)
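
The comments updated throughout the GPU PyTorch-Models examples above all describe the same inference pattern: load the model with Hugging Face transformers, call optimize_model (with cpu_embedding=True recommended for Intel iGPUs on Windows), move the model to the 'xpu' device, and pass use_cache=True to generate for best decoding speed with INT4 optimizations. The sketch below restates that flow in one place; it assumes `from ipex_llm import optimize_model` as the import path, and the model id, prompt, and token count are placeholders rather than values taken from any single example.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from ipex_llm import optimize_model   # assumed import path for optimize_model

model_path = "meta-llama/Llama-2-7b-chat-hf"   # placeholder model id

# Load the model with Hugging Face transformers as usual
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             torch_dtype='auto',
                                             low_cpu_mem_usage=True)

# One line to enable IPEX-LLM low-bit optimization on the model.
# cpu_embedding=True keeps the memory-intensive embedding layer on the CPU,
# as the examples recommend for Intel iGPUs on Windows.
model = optimize_model(model, cpu_embedding=True)
model = model.to('xpu')

tokenizer = AutoTokenizer.from_pretrained(model_path)
input_ids = tokenizer("What is AI?", return_tensors="pt").input_ids.to('xpu')

with torch.inference_mode():
    # If the model config has "use_cache": false, pass use_cache=True explicitly
    # so previous key/value attentions are reused during decoding.
    output = model.generate(input_ids, use_cache=True, max_new_tokens=32)
    torch.xpu.synchronize()

print(tokenizer.decode(output[0], skip_special_tokens=True))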
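
The Mamba and More-Data-Types hunks also show two optional optimize_model parameters: a low_bit string ('sym_int4', 'asym_int4', 'sym_int5', 'asym_int5', or 'sym_int8') and a modules_to_not_convert list for submodules that should keep their original precision. A small sketch follows, with the model id and skipped module name as illustrative placeholders:

from transformers import AutoModelForCausalLM
from ipex_llm import optimize_model   # assumed import path

model = AutoModelForCausalLM.from_pretrained("openlm-research/open_llama_3b",   # placeholder model id
                                             trust_remote_code=True)

# `low_bit` supports 'sym_int4', 'asym_int4', 'sym_int5', 'asym_int5' and 'sym_int8'.
# `modules_to_not_convert` leaves the named submodules in their original precision;
# the Mamba example skips "dt_proj" and "x_proj", for instance.
model = optimize_model(model,
                       low_bit='sym_int8',
                       modules_to_not_convert=["lm_head"])   # module name is illustrative
model = model.to('xpu')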
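
For the finetuning scripts touched above (LoRA, QA-LoRA, QLoRA, ReLoRA), the shared preparation step is to import get_peft_model, prepare_model_for_kbit_training, and LoraConfig from ipex_llm.transformers.qlora rather than from peft, then wrap the low-bit base model before training. A hedged sketch of that step follows; the load_in_low_bit value, LoRA rank, target modules, and other hyperparameters are assumptions for illustration, not the values used by the scripts.

import torch
from ipex_llm.transformers import AutoModelForCausalLM
# Import these from ipex_llm.transformers.qlora (not from peft) to get an
# IPEX-LLM compatible Peft model, as the scripts' comments note.
from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training, \
    LoraConfig

base_model = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(base_model,
                                             load_in_low_bit="nf4",   # assumed low-bit format for QLoRA-style training
                                             torch_dtype=torch.bfloat16,
                                             trust_remote_code=True)
model = model.to('xpu')

# Prepare an IPEX-LLM compatible Peft model
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

config = LoraConfig(
    r=8,                                             # illustrative LoRA rank
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj"],   # illustrative target modules
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)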