Replace with IPEX-LLM in example comments (#10671)
* Replace with IPEX-LLM in example comments
* More replacement
* revert some changes
parent 08018a18df
commit 10ee786920
159 changed files with 183 additions and 183 deletions
@@ -48,7 +48,7 @@ import ssl
 import urllib.request
 import os
 import json
-# code change to import from bigdl-llm API instead of using transformers API
+# code change to import from IPEX-LLM API instead of using transformers API
 from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import LlamaTokenizer
 import intel_extension_for_pytorch as ipex
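For reference, a minimal sketch of the transformers-style loading pattern these renamed examples use; the checkpoint name and prompt below are placeholder assumptions, not taken from the patch.

from ipex_llm.transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer

model_path = "meta-llama/Llama-2-7b-chat-hf"   # assumed example checkpoint

# load_in_4bit=True applies the INT4 optimization the example comments refer to
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True)
tokenizer = LlamaTokenizer.from_pretrained(model_path)

input_ids = tokenizer.encode("What is AI?", return_tensors="pt")
output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))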
@@ -87,7 +87,7 @@ if __name__ == '__main__':
 replace_method="auto"
 )

-# Apply BigDL-LLM INT4 optimizations on transformers
+# Apply IPEX-LLM INT4 optimizations on transformers
 model = optimize_model(model.module.to(f'cpu'), low_bit='sym_int4')

 model = model.to(f'cpu:{local_rank}')

@@ -111,7 +111,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 do_sample=False,
 max_new_tokens=args.n_predict)

@@ -59,7 +59,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -44,7 +44,7 @@ if __name__ == '__main__':

 model_path = args.model

-# Load gguf model and vocab, then convert them to bigdl-llm model and huggingface tokenizer
+# Load gguf model and vocab, then convert them to IPEX-LLM model and huggingface tokenizer
 model, tokenizer = AutoModelForCausalLM.from_gguf(model_path, low_bit = args.low_bit,)

 # Generate predicted tokens
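As a side note, a minimal sketch of the GGUF flow this hunk touches; the local file path is a placeholder assumption.

from ipex_llm.transformers import AutoModelForCausalLM

gguf_path = "./llama-2-7b-chat.Q4_0.gguf"   # assumed local GGUF file

# from_gguf returns both an IPEX-LLM low-bit model and a Hugging Face tokenizer
model, tokenizer = AutoModelForCausalLM.from_gguf(gguf_path, low_bit="sym_int4")

input_ids = tokenizer.encode("What is AI?", return_tensors="pt")
output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))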
@@ -60,7 +60,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -56,7 +56,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -56,7 +56,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -56,7 +56,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -45,7 +45,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 model = AutoModelForCausalLM.from_pretrained(model_path,
 load_in_4bit=True,
 trust_remote_code=True,
@@ -56,7 +56,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -57,7 +57,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -57,7 +57,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -57,7 +57,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -55,7 +55,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -56,7 +56,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations

 output = model.generate(input_ids, max_new_tokens=args.n_predict)
@@ -61,7 +61,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -61,7 +61,7 @@ if __name__ == '__main__':
 st = time.time()
 # enabling `use_cache=True` allows the model to utilize the previous
 # key/values attentions to speed up decoding;
-# to obtain optimal performance with BigDL-LLM INT4 optimizations,
+# to obtain optimal performance with IPEX-LLM INT4 optimizations,
 # it is important to set use_cache=True for Dolly v1 models
 output = model.generate(input_ids,
 use_cache=True,

@@ -64,7 +64,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict,
 pad_token_id=tokenizer.pad_token_id,
@@ -57,7 +57,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -60,7 +60,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -38,7 +38,7 @@ if __name__ == '__main__':
 image = Image.open(args.image_path)

 # Load model
-# For successful BigDL-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization
+# For successful IPEX-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization
 model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cpu',
 load_in_4bit = True,
 trust_remote_code=True,
@@ -61,7 +61,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -37,7 +37,7 @@ if __name__ == '__main__':
 image = args.image_path

 # Load model
-# For successful BigDL-LLM optimization on InternLM-XComposer, skip the 'qkv' module during optimization
+# For successful IPEX-LLM optimization on InternLM-XComposer, skip the 'qkv' module during optimization
 model = AutoModelForCausalLM.from_pretrained(model_path, device='cpu', load_in_4bit=True,
 trust_remote_code=True, modules_to_not_convert=['qkv'])
@@ -57,7 +57,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -56,7 +56,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -60,7 +60,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -55,7 +55,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -61,7 +61,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -56,7 +56,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -55,7 +55,7 @@ if __name__ == '__main__':
 input_ids = tokenizer.encode(prompt, return_tensors="pt")
 # enabling `use_cache=True` allows the model to utilize the previous
 # key/values attentions to speed up decoding;
-# to obtain optimal performance with BigDL-LLM INT4 optimizations,
+# to obtain optimal performance with IPEX-LLM INT4 optimizations,
 # it is important to set use_cache=True for MPT models
 mpt_generation_config = GenerationConfig(
 max_new_tokens=args.n_predict,

@@ -58,7 +58,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations

 # Note that phi-1_5 uses GenerationConfig to enable 'use_cache'
 output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict, generation_config = generation_config)

@@ -58,7 +58,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations

 model.generation_config.pad_token_id = model.generation_config.eos_token_id
 # Note that phi-2 uses GenerationConfig to enable 'use_cache'

@@ -58,7 +58,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations

 # Note that phixtral uses GenerationConfig to enable 'use_cache'
 output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict, generation_config = generation_config)
@@ -55,7 +55,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 do_sample=False,
 max_new_tokens=args.n_predict)

@@ -36,7 +36,7 @@ if __name__ == '__main__':
 model_path = args.repo_id_or_model_path

 # Load model
-# For successful BigDL-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization
+# For successful IPEX-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization
 model = AutoModelForCausalLM.from_pretrained(model_path,
 load_in_4bit=True,
 device_map="cpu",
@@ -64,7 +64,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -56,7 +56,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(**inputs,
 max_new_tokens=args.n_predict,
 do_sample=True,
@@ -55,7 +55,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -55,7 +55,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -59,7 +59,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -55,7 +55,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -53,7 +53,7 @@ if __name__ == '__main__':
 st = time.time()
 # enabling `use_cache=True` allows the model to utilize the previous
 # key/values attentions to speed up decoding;
-# to obtain optimal performance with BigDL-LLM INT4 optimizations,
+# to obtain optimal performance with IPEX-LLM INT4 optimizations,
 # it is important to set use_cache=True for vicuna-v1.3 models
 output = model.generate(input_ids,
 use_cache=True,
@@ -63,7 +63,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 predicted_ids = model.generate(input_features,
 forced_decoder_ids=forced_decoder_ids)
 end = time.time()

@@ -58,7 +58,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -61,7 +61,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -57,7 +57,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 outputs = model.generate(inputs, do_sample=True, top_k=5, max_length=args.n_predict)
 end_time = time.time()
@@ -42,7 +42,7 @@ if __name__ == '__main__':
 from ipex_llm.transformers import AutoModelForCausalLM
 # enabling `use_cache=True` allows the model to utilize the previous
 # key/values attentions to speed up decoding;
-# to obtain optimal performance with BigDL-LLM INT4 optimizations,
+# to obtain optimal performance with IPEX-LLM INT4 optimizations,
 # it is important to set use_cache=True for Ziya models
 model = AutoModelForCausalLM.from_pretrained(model_path,
 load_in_4bit=True,

@@ -163,7 +163,7 @@ def messages_to_prompt(messages):
 def main(args):
 embed_model = HuggingFaceEmbedding(model_name=args.embedding_model_path)

-# Use custom LLM in BigDL
+# Use custom LLM in IPEX-LLM
 from ipex_llm.llamaindex.llms import BigdlLLM
 llm = BigdlLLM(
 model_name=args.model_path,
@@ -59,7 +59,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -56,8 +56,8 @@ def load(model_path, model_family, n_threads):
 def inference(llm, repo_id_or_model_path, model_family, prompt):

 if model_family in ['llama', 'gptneox', 'bloom', 'starcoder', 'chatglm']:
-# ------ Option 1: Use bigdl-llm based tokenizer
-print('-'*20, ' bigdl-llm based tokenizer ', '-'*20)
+# ------ Option 1: Use IPEX-LLM based tokenizer
+print('-'*20, ' IPEX-LLM based tokenizer ', '-'*20)
 st = time.time()

 # please note that the prompt here can either be a string or a list of string
@@ -126,13 +126,13 @@ def main():
 if args.model_family == 'llama2':
 args.model_family = 'llama'

-# Step 1: convert original model to BigDL llm model
-bigdl_llm_path = convert(repo_id_or_model_path=repo_id_or_model_path,
+# Step 1: convert original model to IPEX-LLM model
+ipex_llm_path = convert(repo_id_or_model_path=repo_id_or_model_path,
 model_family=args.model_family,
 tmp_path=args.tmp_path)

 # Step 2: load int4 model
-llm = load(model_path=bigdl_llm_path,
+llm = load(model_path=ipex_llm_path,
 model_family=args.model_family,
 n_threads=args.thread_num)

@@ -44,7 +44,7 @@ if __name__ == '__main__':
 torch_dtype='auto',
 low_cpu_mem_usage=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer
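The `optimize_model` hunks that follow all share the same one-line pattern. A minimal sketch of that pattern is below; the checkpoint name is a placeholder assumption, not part of the patch.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from ipex_llm import optimize_model

model_path = "tiiuae/falcon-7b"   # placeholder; any Hugging Face causal LM

model = AutoModelForCausalLM.from_pretrained(model_path,
                                             torch_dtype='auto',
                                             low_cpu_mem_usage=True,
                                             trust_remote_code=True)

# With only one line to enable IPEX-LLM (INT4 by default) optimization on the model
model = optimize_model(model)

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
input_ids = tokenizer.encode("What is AI?", return_tensors="pt")
with torch.inference_mode():
    output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))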
@@ -38,7 +38,7 @@ if __name__ == '__main__':
 model = Bark.init_from_config(config)
 model.load_checkpoint(config, checkpoint_dir=model_path, eval=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Synthesize speech with the given input

@@ -38,7 +38,7 @@ if __name__ == '__main__':
 torch_dtype="auto",
 low_cpu_mem_usage=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -40,7 +40,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer
@@ -41,7 +41,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModel.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -41,7 +41,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModel.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -41,7 +41,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -40,7 +40,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer
@@ -50,7 +50,7 @@ if __name__ == '__main__':
 trust_remote_code=True,
 )

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -46,7 +46,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -40,7 +40,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForSeq2SeqLM.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 # "wo" module is not converted due to some issues of T5 model
 # (https://github.com/huggingface/transformers/issues/20287),
 # "lm_head" module is not converted to generate outputs with better quality
@@ -40,8 +40,8 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cpu', trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
-# For successful BigDL-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization
+# With only one line to enable IPEX-LLM optimization on model
+# For successful IPEX-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization
 model = optimize_model(model,
 low_bit='sym_int4',
 modules_to_not_convert=['vision_embed_tokens'])
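For the multimodal examples (Fuyu here; Qwen-VL-Chat and InternLM-XComposer below), the optimization skips certain modules. A hedged sketch of that call, with an assumed checkpoint id:

from transformers import AutoModelForCausalLM
from ipex_llm import optimize_model

model_path = "adept/fuyu-8b"   # assumed checkpoint for illustration
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             device_map='cpu',
                                             trust_remote_code=True)

# Skip the vision embedding module so the INT4 conversion leaves it untouched
model = optimize_model(model,
                       low_bit='sym_int4',
                       modules_to_not_convert=['vision_embed_tokens'])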
@@ -38,8 +38,8 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, device='cpu', trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
-# For successful BigDL-LLM optimization on InternLM-XComposer, skip the 'qkv' module during optimization
+# With only one line to enable IPEX-LLM optimization on model
+# For successful IPEX-LLM optimization on InternLM-XComposer, skip the 'qkv' module during optimization
 model = optimize_model(model,
 low_bit='sym_int4',
 modules_to_not_convert=['qkv'])

@@ -45,7 +45,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -292,7 +292,7 @@ if __name__ == '__main__':
 model_base=None,
 model_name=model_name)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Generate image tensor
@@ -42,7 +42,7 @@ if __name__ == '__main__':
 # Load model
 model = MambaLMHeadModel.from_pretrained(model_path)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model, low_bit='asym_int4', modules_to_not_convert=["dt_proj", "x_proj", "out_proj"])

 # Load tokenizer

@@ -44,7 +44,7 @@ if __name__ == '__main__':
 torch_dtype='auto',
 low_cpu_mem_usage=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -44,7 +44,7 @@ if __name__ == '__main__':
 torch_dtype='auto',
 low_cpu_mem_usage=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -46,7 +46,7 @@ if __name__ == '__main__':
 # Load whisper model under pytorch framework
 model = whisper.load_model(args.model_name)

-# With only one line to enable bigdl optimize on a pytorch model
+# With only one line to enable IPEX-LLM optimize on a pytorch model
 model = optimize_model(model)

 st = time.time()
@@ -42,7 +42,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -42,7 +42,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -37,8 +37,8 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, device_map="cpu", trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
-# For successful BigDL-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization
+# With only one line to enable IPEX-LLM optimization on model
+# For successful IPEX-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization
 model = optimize_model(model,
 low_bit='sym_int4',
 modules_to_not_convert=['c_fc', 'out_proj'])
@@ -41,7 +41,7 @@ if __name__ == '__main__':
 model = AutoModelForCausalLM.from_pretrained(model_path,
 trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -45,7 +45,7 @@ if __name__ == '__main__':
 torch_dtype=torch.float16,
 trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -44,7 +44,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 # Load tokenizer

@@ -54,7 +54,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()
@@ -48,7 +48,7 @@ if __name__ == '__main__':
 print("Creating model...")
 model = AutoModelForCausalLM.from_pretrained(model_path, device_map="cpu", trust_remote_code=True, torch_dtype=torch.float16).eval()

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = optimize_model(model)

 prompt = YUAN2_PROMPT_FORMAT.format(prompt=args.prompt)

@@ -59,7 +59,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 outputs = model.generate(inputs, do_sample=True, top_k=5, max_length=args.n_predict)
 end_time = time.time()

@@ -42,7 +42,7 @@ if __name__ == '__main__':
 from ipex_llm import optimize_model
 # enabling `use_cache=True` allows the model to utilize the previous
 # key/values attentions to speed up decoding;
-# to obtain optimal performance with BigDL-LLM `optimization_model` API optimizations,
+# to obtain optimal performance with IPEX-LLM `optimization_model` API optimizations,
 # it is important to set use_cache=True for Ziya models
 model = AutoModelForCausalLM.from_pretrained(model_path,
 trust_remote_code=True,
@@ -49,7 +49,7 @@ if __name__ == '__main__':
 # Load model
 model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 # `low_bit` param support `sym_int4`, `asym_int4`, `sym_int5`, `asym_int5` and `sym_int8`
 # By specifying `low_bit` param, relevant low bit optimizations will be applied to the model
 model = optimize_model(model, low_bit=low_bit)
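The comment in this hunk lists the supported `low_bit` values; a minimal sketch of picking one follows (the model id and the choice of 'sym_int8' are illustrative assumptions):

from transformers import AutoModelForCausalLM
from ipex_llm import optimize_model

model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b",   # assumed
                                             trust_remote_code=True)

# `low_bit` accepts 'sym_int4', 'asym_int4', 'sym_int5', 'asym_int5' and 'sym_int8'
model = optimize_model(model, low_bit='sym_int8')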
@@ -49,7 +49,7 @@ from utils.prompter import Prompter
 from transformers import BitsAndBytesConfig
 from ipex_llm.transformers import AutoModelForCausalLM

-# import them from ipex_llm.transformers.qlora to get a BigDL-LLM compatible Peft model
+# import them from ipex_llm.transformers.qlora to get a IPEX-LLM compatible Peft model
 from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training, LoraConfig
 from ipex_llm.utils.isa_checker import ISAChecker

@@ -64,7 +64,7 @@ def get_int_from_env(env_keys, default):
 def train(
 # model/data params
 base_model: str = "meta-llama/Llama-2-7b-hf", # the only required argument, default to be "meta-llama/Llama-2-7b-hf"
-saved_low_bit_model: str = None, # optional, the path to the saved model with bigdl-llm low-bit optimization
+saved_low_bit_model: str = None, # optional, the path to the saved model with ipex-llm low-bit optimization
 data_path: str = "yahma/alpaca-cleaned",
 output_dir: str = "./bigdl-qlora-alpaca",
 # training hyperparams

@@ -256,7 +256,7 @@ def train(
 ] # could be sped up, probably
 return tokenized_full_prompt

-# Prepare a BigDL-LLM compatible Peft model
+# Prepare a IPEX-LLM compatible Peft model
 model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing)

 config = LoraConfig(
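A rough sketch of how the imports and calls in these QLoRA hunks fit together; the LoRA hyperparameters and the use of load_in_4bit here are illustrative assumptions, not values from this repository.

from ipex_llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training, LoraConfig

base_model = "meta-llama/Llama-2-7b-hf"   # default named in the diff
model = AutoModelForCausalLM.from_pretrained(base_model,
                                             load_in_4bit=True,   # the real example may load a different low-bit format
                                             trust_remote_code=True)
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

config = LoraConfig(r=8, lora_alpha=32,                   # assumed values
                    target_modules=["q_proj", "v_proj"],  # assumed target modules
                    lora_dropout=0.05, bias="none",
                    task_type="CAUSAL_LM")
# Prepare an IPEX-LLM compatible Peft model
model = get_peft_model(model, config)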
@@ -63,7 +63,7 @@ if __name__ == '__main__':
 low_bit = args.low_bit

 # First use CPU as accelerator
-# Convert to deepspeed model and apply bigdl-llm optimization on CPU to decrease GPU memory usage
+# Convert to deepspeed model and apply IPEX-LLM optimization on CPU to decrease GPU memory usage
 current_accel = CPU_Accelerator()
 set_accelerator(current_accel)
 model = AutoModelForCausalLM.from_pretrained(args.repo_id_or_model_path,

@@ -80,7 +80,7 @@ if __name__ == '__main__':
 replace_method="auto",
 )

-# Use bigdl-llm `optimize_model` to convert the model into optimized low bit format
+# Use IPEX-LLM `optimize_model` to convert the model into optimized low bit format
 # Convert the rest of the model into float16 to reduce allreduce traffic
 model = optimize_model(model.module.to(f'cpu'), low_bit=low_bit).to(torch.float16)

@@ -119,7 +119,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 do_sample=False,
 max_new_tokens=args.n_predict)
@@ -58,7 +58,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -71,7 +71,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM Low Bit optimizations
+# to obtain optimal performance with IPEX-LLM Low Bit optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict,
 repetition_penalty=1.1)

@@ -44,7 +44,7 @@ if __name__ == '__main__':

 model_path = args.model

-# Load gguf model and vocab, then convert them to bigdl-llm model and huggingface tokenizer
+# Load gguf model and vocab, then convert them to IPEX-LLM model and huggingface tokenizer
 model, tokenizer = AutoModelForCausalLM.from_gguf(model_path)
 model = model.to('xpu')
@@ -59,7 +59,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 end = time.time()

@@ -60,7 +60,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 torch.xpu.synchronize()

@@ -60,7 +60,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 torch.xpu.synchronize()

@@ -64,7 +64,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 torch.xpu.synchronize()
@@ -44,7 +44,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function.
 # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.
 model = AutoModelForCausalLM.from_pretrained(model_path,
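The GPU-side hunks repeat the same flow; a minimal sketch under the assumption of an Intel GPU (XPU) environment, with a placeholder model id:

import torch
import intel_extension_for_pytorch as ipex
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_path = "meta-llama/Llama-2-7b-chat-hf"   # assumed checkpoint
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True,
                                             cpu_embedding=True)   # recommended for Intel iGPUs
model = model.to('xpu')

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
input_ids = tokenizer.encode("What is AI?", return_tensors="pt").to('xpu')

output = model.generate(input_ids, max_new_tokens=32)
torch.xpu.synchronize()   # wait for XPU kernels before reading the result
print(tokenizer.decode(output[0].cpu(), skip_special_tokens=True))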
@@ -64,7 +64,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 torch.xpu.synchronize()

@@ -67,7 +67,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 torch.xpu.synchronize()

@@ -67,7 +67,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 torch.xpu.synchronize()
@@ -57,7 +57,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly to obtain optimal
-# performance with BigDL-LLM INT4 optimizations
+# performance with IPEX-LLM INT4 optimizations
 # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function.
 # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.
 model = AutoModelForCausalLM.from_pretrained(model_path,

@@ -67,7 +67,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 torch.xpu.synchronize()
@@ -55,7 +55,7 @@ if __name__ == '__main__':
 cpu_embedding=True
 )

-# With only one line to enable BigDL-LLM optimization on model
+# With only one line to enable IPEX-LLM optimization on model
 model = model.to('xpu')
 # Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_path)

@@ -64,7 +64,7 @@ if __name__ == '__main__':
 st = time.time()
 # enabling `use_cache=True` allows the model to utilize the previous
 # key/values attentions to speed up decoding;
-# to obtain optimal performance with BigDL-LLM INT4 optimizations,
+# to obtain optimal performance with IPEX-LLM INT4 optimizations,
 # it is important to set use_cache=True for Dolly v1 models
 output = model.generate(input_ids,
 use_cache=True,

@@ -67,7 +67,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict,
 pad_token_id=tokenizer.pad_token_id,
@@ -67,7 +67,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 torch.xpu.synchronize()

@@ -69,7 +69,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 torch.xpu.synchronize()

@@ -68,7 +68,7 @@ if __name__ == '__main__':
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
 max_new_tokens=args.n_predict)
 torch.xpu.synchronize()
Some files were not shown because too many files have changed in this diff.