From 52a2135d83707899e2e4eddbf1b81cdd140876c8 Mon Sep 17 00:00:00 2001 From: ZehuaCao <47251317+Romanticoseu@users.noreply.github.com> Date: Thu, 28 Mar 2024 13:54:40 +0800 Subject: [PATCH] Replace ipex with ipex-llm (#10554) * fix ipex with ipex_llm * fix ipex with ipex_llm * update * update * update * update * update * update * update * update --- docker/llm/README.md | 2 +- .../llm/finetune/qlora/cpu/kubernetes/Chart.yaml | 2 +- docker/llm/serving/cpu/docker/README.md | 2 +- docker/llm/serving/cpu/kubernetes/README.md | 2 +- .../CPU/Deepspeed-AutoTP/deepspeed_autotp.py | 2 +- python/llm/example/CPU/LangChain/README.md | 11 ++++++++--- .../CPU/PyTorch-Models/Model/mixtral/generate.py | 2 +- .../CPU/QLoRA-FineTuning/alpaca-qlora/README.md | 4 ++-- .../finetune_one_node_two_sockets.sh | 2 +- .../GPU/Deepspeed-AutoTP/deepspeed_autotp.py | 2 +- .../Advanced-Quantizations/GGUF-IQ2/generate.py | 2 +- .../Model/baichuan/generate.py | 2 +- .../Model/baichuan2/generate.py | 2 +- .../Model/bluelm/generate.py | 2 +- .../Model/chatglm2/generate.py | 2 +- .../Model/chatglm2/streamchat.py | 2 +- .../Model/chatglm3/generate.py | 2 +- .../Model/chatglm3/streamchat.py | 2 +- .../Model/chinese-llama2/generate.py | 2 +- .../Model/codellama/generate.py | 2 +- .../Model/falcon/generate.py | 2 +- .../Model/flan-t5/generate.py | 2 +- .../Model/gemma/generate.py | 2 +- .../Model/gpt-j/generate.py | 2 +- .../Model/internlm/generate.py | 2 +- .../Model/internlm2/generate.py | 2 +- .../Model/llama2/generate.py | 2 +- .../Model/mistral/generate.py | 2 +- .../Model/mixtral/generate.py | 2 +- .../Model/mpt/generate.py | 2 +- .../Model/phi-1_5/generate.py | 2 +- .../Model/phi-2/generate.py | 2 +- .../Model/phixtral/generate.py | 2 +- .../Model/qwen/generate.py | 2 +- .../Model/redpajama/generate.py | 2 +- .../Model/replit/generate.py | 2 +- .../Model/rwkv4/generate.py | 2 +- .../Model/rwkv5/generate.py | 2 +- .../Model/solar/generate.py | 2 +- .../Model/starcoder/generate.py | 2 +- .../Model/yi/generate.py | 2 +- .../Save-Load/generate.py | 2 +- .../example/GPU/LLM-Finetuning/LoRA/README.md | 4 ++-- .../LoRA/lora_finetune_llama2_7b_arc_1_card.sh | 2 +- .../lora_finetune_llama2_7b_pvc_1110_4_card.sh | 2 +- .../lora_finetune_llama2_7b_pvc_1550_1_tile.sh | 2 +- .../lora_finetune_llama2_7b_pvc_1550_4_card.sh | 2 +- .../example/GPU/LLM-Finetuning/QA-LoRA/README.md | 4 ++-- .../qalora_finetune_llama2_7b_arc_1_card.sh | 2 +- .../qalora_finetune_llama2_7b_arc_2_card.sh | 2 +- .../qalora_finetune_llama2_7b_pvc_1550_1_card.sh | 2 +- .../qalora_finetune_llama2_7b_pvc_1550_1_tile.sh | 2 +- .../LLM-Finetuning/QLoRA/alpaca-qlora/README.md | 4 ++-- .../qlora_finetune_llama2_13b_pvc_1550_1_card.sh | 2 +- .../qlora_finetune_llama2_13b_pvc_1550_1_tile.sh | 2 +- .../qlora_finetune_llama2_13b_pvc_1550_4_card.sh | 2 +- .../qlora_finetune_llama2_70b_pvc_1550_1_card.sh | 2 +- .../qlora_finetune_llama2_70b_pvc_1550_4_card.sh | 2 +- .../qlora_finetune_llama2_7b_arc_1_card.sh | 2 +- .../qlora_finetune_llama2_7b_arc_2_card.sh | 2 +- .../qlora_finetune_llama2_7b_flex_170_1_card.sh | 2 +- .../qlora_finetune_llama2_7b_flex_170_3_card.sh | 2 +- .../qlora_finetune_llama2_7b_pvc_1100_1_card.sh | 2 +- .../qlora_finetune_llama2_7b_pvc_1100_4_card.sh | 2 +- .../qlora_finetune_llama2_7b_pvc_1550_1_card.sh | 2 +- .../qlora_finetune_llama2_7b_pvc_1550_4_card.sh | 2 +- .../example/GPU/LLM-Finetuning/ReLora/README.md | 4 ++-- .../relora_finetune_llama2_7b_arc_1_card.sh | 2 +- .../relora_finetune_llama2_7b_arc_2_card.sh | 2 +- 
.../relora_finetune_llama2_7b_pvc_1550_1_card.sh | 2 +- .../relora_finetune_llama2_7b_pvc_1550_4_card.sh | 2 +- .../GPU/ModelScope-Models/Save-Load/generate.py | 2 +- .../example/GPU/ModelScope-Models/generate.py | 2 +- .../GPU/Pipeline-Parallel-Inference/generate.py | 2 +- .../GPU/PyTorch-Models/Model/aquila2/generate.py | 2 +- .../PyTorch-Models/Model/baichuan/generate.py | 2 +- .../PyTorch-Models/Model/baichuan2/generate.py | 2 +- .../Model/bark/synthesize_speech.py | 2 +- .../GPU/PyTorch-Models/Model/bluelm/generate.py | 2 +- .../PyTorch-Models/Model/chatglm2/generate.py | 2 +- .../PyTorch-Models/Model/chatglm2/streamchat.py | 2 +- .../PyTorch-Models/Model/chatglm3/generate.py | 2 +- .../PyTorch-Models/Model/chatglm3/streamchat.py | 2 +- .../PyTorch-Models/Model/codellama/generate.py | 2 +- .../PyTorch-Models/Model/dolly-v1/generate.py | 2 +- .../PyTorch-Models/Model/dolly-v2/generate.py | 2 +- .../GPU/PyTorch-Models/Model/flan-t5/generate.py | 2 +- .../PyTorch-Models/Model/internlm2/generate.py | 2 +- .../GPU/PyTorch-Models/Model/llama2/generate.py | 2 +- .../GPU/PyTorch-Models/Model/mamba/generate.py | 2 +- .../GPU/PyTorch-Models/Model/mistral/generate.py | 2 +- .../GPU/PyTorch-Models/Model/mixtral/generate.py | 2 +- .../GPU/PyTorch-Models/Model/phi-1_5/generate.py | 2 +- .../GPU/PyTorch-Models/Model/phi-2/generate.py | 2 +- .../PyTorch-Models/Model/phixtral/generate.py | 2 +- .../GPU/PyTorch-Models/Model/replit/generate.py | 2 +- .../Model/speech-t5/synthesize_speech.py | 2 +- .../PyTorch-Models/Model/starcoder/generate.py | 2 +- .../GPU/PyTorch-Models/Model/yi/generate.py | 2 +- .../PyTorch-Models/More-Data-Types/generate.py | 2 +- .../GPU/PyTorch-Models/Save-Load/generate.py | 2 +- .../llm/src/ipex_llm/serving/fastchat/README.md | 16 ++++++++-------- python/llm/src/ipex_llm/transformers/convert.py | 2 +- .../src/ipex_llm/transformers/low_bit_linear.py | 2 +- .../src/ipex_llm/transformers/models/chatglm2.py | 4 ++-- .../llm/src/ipex_llm/transformers/speculative.py | 4 ++-- 106 files changed, 127 insertions(+), 122 deletions(-) diff --git a/docker/llm/README.md b/docker/llm/README.md index eba61568..8c58287d 100644 --- a/docker/llm/README.md +++ b/docker/llm/README.md @@ -62,7 +62,7 @@ After the container is booted, you could get into the container through `docker docker exec -it my_container bash ``` -To run inference using `IPEX-LLM` using cpu, you could refer to this [documentation](https://github.com/intel-analytics/IPEX/tree/main/python/llm#cpu-int4). +To run inference using `IPEX-LLM` using cpu, you could refer to this [documentation](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm#cpu-int4). #### Getting started with chat diff --git a/docker/llm/finetune/qlora/cpu/kubernetes/Chart.yaml b/docker/llm/finetune/qlora/cpu/kubernetes/Chart.yaml index 2c750b4d..b5d12a1d 100644 --- a/docker/llm/finetune/qlora/cpu/kubernetes/Chart.yaml +++ b/docker/llm/finetune/qlora/cpu/kubernetes/Chart.yaml @@ -1,5 +1,5 @@ apiVersion: v2 -name: ipex-fintune-service +name: ipex_llm-fintune-service description: A Helm chart for IPEX-LLM Finetune Service on Kubernetes type: application version: 1.1.27 diff --git a/docker/llm/serving/cpu/docker/README.md b/docker/llm/serving/cpu/docker/README.md index ec1b011d..f56b93a4 100644 --- a/docker/llm/serving/cpu/docker/README.md +++ b/docker/llm/serving/cpu/docker/README.md @@ -30,7 +30,7 @@ sudo docker run -itd \ After the container is booted, you could get into the container through `docker exec`. 
-To run model-serving using `IPEX-LLM` as backend, you can refer to this [document](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/src/ipex/llm/serving). +To run model-serving using `IPEX-LLM` as the backend, you can refer to this [document](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/src/ipex_llm/serving/fastchat). Also you can set environment variables and start arguments while running a container to get serving started initially. You may need to boot several containers to support. One controller container and at least one worker container are needed. The api server address(host and port) and controller address are set in controller container, and you need to set the same controller address as above, model path on your machine and worker address in worker container. To start a controller container: diff --git a/docker/llm/serving/cpu/kubernetes/README.md b/docker/llm/serving/cpu/kubernetes/README.md index f8d745d3..5a337b45 100644 --- a/docker/llm/serving/cpu/kubernetes/README.md +++ b/docker/llm/serving/cpu/kubernetes/README.md @@ -10,7 +10,7 @@ To deploy IPEX-LLM-serving cpu in Kubernetes environment, please use this image: In this document, we will use `vicuna-7b-v1.5` as the deployment model. -After downloading the model, please change name from `vicuna-7b-v1.5` to `vicuna-7b-v1.5-ipex` to use `ipex-llm` as the backend. The `ipex-llm` backend will be used if model path contains `ipex-llm`. Otherwise, the original transformer-backend will be used. +After downloading the model, please change the name from `vicuna-7b-v1.5` to `vicuna-7b-v1.5-ipex-llm` to use `ipex-llm` as the backend. The `ipex-llm` backend will be used if the model path contains `ipex-llm`. Otherwise, the original transformer backend will be used. You can download the model from [here](https://huggingface.co/lmsys/vicuna-7b-v1.5). diff --git a/python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py b/python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py index d42f3887..63f5660a 100644 --- a/python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py +++ b/python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py @@ -102,7 +102,7 @@ if __name__ == '__main__': # Batch tokenizing prompt = args.prompt input_ids = tokenizer.encode(prompt, return_tensors="pt").to(f'cpu:{local_rank}') - # ipex model needs a warmup, then inference time can be accurate + # ipex-llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict, use_cache=True) diff --git a/python/llm/example/CPU/LangChain/README.md b/python/llm/example/CPU/LangChain/README.md index 104440e4..0d5be536 100644 --- a/python/llm/example/CPU/LangChain/README.md +++ b/python/llm/example/CPU/LangChain/README.md @@ -1,8 +1,8 @@ ## Langchain Examples -This folder contains examples showcasing how to use `langchain` with `ipex`. +This folder contains examples showcasing how to use `langchain` with `ipex-llm`. -### Install IPEX +### Install IPEX-LLM Ensure `ipex-llm` is installed by following the [IPEX-LLM Installation Guide](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm#install). @@ -36,7 +36,7 @@ To run the example, execute the following command in the current directory: ```bash python transformers_int4/rag.py -m [-q ] [-i ] ``` -> Note: If `-i` is not specified, it will use a short introduction to Big-DL as input by default. if `-q` is not specified, `What is IPEX?` will be used by default.
+> Note: If `-i` is not specified, it will use a short introduction to Big-DL as input by default. if `-q` is not specified, `What is IPEX LLM?` will be used by default. ### Example: Math @@ -66,3 +66,8 @@ python transformers_int4/voiceassistant.py -m [-q training.log diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_pvc_1550_1_tile.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_pvc_1550_1_tile.sh index 5a5f8c1d..d017f2da 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_pvc_1550_1_tile.sh +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_pvc_1550_1_tile.sh @@ -18,6 +18,6 @@ python ./alpaca_qlora_finetuning.py \ --base_model "meta-llama/Llama-2-13b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-qlora-alpaca" \ + --output_dir "./ipex-llm-qlora-alpaca" \ --micro_batch_size 8 \ --batch_size 128 diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_pvc_1550_4_card.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_pvc_1550_4_card.sh index f03716d2..78af398b 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_pvc_1550_4_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_pvc_1550_4_card.sh @@ -23,6 +23,6 @@ mpirun -n 8 \ python -u ./alpaca_qlora_finetuning.py \ --base_model "meta-llama/Llama-2-13b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-qlora-alpaca" \ + --output_dir "./ipex-llm-qlora-alpaca" \ --micro_batch_size 8 \ --batch_size 128 > training.log diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_70b_pvc_1550_1_card.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_70b_pvc_1550_1_card.sh index 9f7bf380..ee6958cf 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_70b_pvc_1550_1_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_70b_pvc_1550_1_card.sh @@ -27,7 +27,7 @@ mpirun -n 2 \ python -u ./alpaca_qlora_finetuning.py \ --base_model "meta-llama/Llama-2-70b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-qlora-alpaca" \ + --output_dir "./ipex-llm-qlora-alpaca" \ --gradient_checkpointing True \ --micro_batch_size 8 \ --batch_size 128 \ diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_70b_pvc_1550_4_card.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_70b_pvc_1550_4_card.sh index 9dead743..baa98b9e 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_70b_pvc_1550_4_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_70b_pvc_1550_4_card.sh @@ -27,7 +27,7 @@ mpirun -n 8 \ python -u ./alpaca_qlora_finetuning.py \ --base_model "meta-llama/Llama-2-70b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-qlora-alpaca" \ + --output_dir "./ipex-llm-qlora-alpaca" \ --gradient_checkpointing True \ --micro_batch_size 8 \ --batch_size 128 \ diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_arc_1_card.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_arc_1_card.sh index 12056c20..5066645b 100644 --- 
a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_arc_1_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_arc_1_card.sh @@ -18,4 +18,4 @@ python ./alpaca_qlora_finetuning.py \ --base_model "meta-llama/Llama-2-7b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-qlora-alpaca" + --output_dir "./ipex-llm-qlora-alpaca" diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_arc_2_card.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_arc_2_card.sh index cb10a142..b8431eb6 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_arc_2_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_arc_2_card.sh @@ -23,4 +23,4 @@ mpirun -n 2 \ python -u ./alpaca_qlora_finetuning.py \ --base_model "meta-llama/Llama-2-7b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-qlora-alpaca" > training.log + --output_dir "./ipex-llm-qlora-alpaca" > training.log diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_flex_170_1_card.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_flex_170_1_card.sh index 316d4cc5..3c61b6a5 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_flex_170_1_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_flex_170_1_card.sh @@ -20,4 +20,4 @@ python ./alpaca_qlora_finetuning.py \ --batch_size 128 \ --base_model "meta-llama/Llama-2-7b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-qlora-alpaca" + --output_dir "./ipex-llm-qlora-alpaca" diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_flex_170_3_card.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_flex_170_3_card.sh index bc9b4dcf..4b911a9f 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_flex_170_3_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_flex_170_3_card.sh @@ -23,7 +23,7 @@ mpirun -n 3 \ python -u ./alpaca_qlora_finetuning.py \ --base_model "meta-llama/Llama-2-7b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-qlora-alpaca" \ + --output_dir "./ipex-llm-qlora-alpaca" \ --gradient_checkpointing False \ --micro_batch_size 2 \ --batch_size 128 > training.log diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1100_1_card.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1100_1_card.sh index 52e1a304..7deeed6a 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1100_1_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1100_1_card.sh @@ -20,4 +20,4 @@ python ./alpaca_qlora_finetuning.py \ --batch_size 128 \ --base_model "meta-llama/Llama-2-7b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-qlora-alpaca" + --output_dir "./ipex-llm-qlora-alpaca" diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1100_4_card.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1100_4_card.sh index 8ee4dbbd..4f465b56 100644 --- 
a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1100_4_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1100_4_card.sh @@ -23,6 +23,6 @@ mpirun -n 4 \ python -u ./alpaca_qlora_finetuning.py \ --base_model "meta-llama/Llama-2-7b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-qlora-alpaca" \ + --output_dir "./ipex-llm-qlora-alpaca" \ --micro_batch_size 8 \ --batch_size 128 > training.log diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1550_1_card.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1550_1_card.sh index 272870e9..89baa62d 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1550_1_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1550_1_card.sh @@ -23,6 +23,6 @@ mpirun -n 2 \ python -u ./alpaca_qlora_finetuning.py \ --base_model "meta-llama/Llama-2-7b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-qlora-alpaca" \ + --output_dir "./ipex-llm-qlora-alpaca" \ --micro_batch_size 8 \ --batch_size 128 > training.log diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1550_4_card.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1550_4_card.sh index 801d88fc..16479a8c 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1550_4_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_7b_pvc_1550_4_card.sh @@ -23,6 +23,6 @@ mpirun -n 8 \ python -u ./alpaca_qlora_finetuning.py \ --base_model "meta-llama/Llama-2-7b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-qlora-alpaca" \ + --output_dir "./ipex-llm-qlora-alpaca" \ --micro_batch_size 8 \ --batch_size 128 > training.log diff --git a/python/llm/example/GPU/LLM-Finetuning/ReLora/README.md b/python/llm/example/GPU/LLM-Finetuning/ReLora/README.md index 4a03e162..3218948b 100644 --- a/python/llm/example/GPU/LLM-Finetuning/ReLora/README.md +++ b/python/llm/example/GPU/LLM-Finetuning/ReLora/README.md @@ -58,8 +58,8 @@ bash relora_finetune_llama2_7b_pvc_1550_4_card.sh python ./alpaca_relora_finetuning.py \ --base_model "meta-llama/Llama-2-7b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-qlora-alpaca" \ - --resume_from_checkpoint "./ipex-qlora-alpaca/checkpoint-1100" + --output_dir "./ipex-llm-qlora-alpaca" \ + --resume_from_checkpoint "./ipex-llm-qlora-alpaca/checkpoint-1100" ``` ### 5. 
Sample Output diff --git a/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_arc_1_card.sh b/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_arc_1_card.sh index 4bb00965..dee521e3 100644 --- a/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_arc_1_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_arc_1_card.sh @@ -18,6 +18,6 @@ python ./alpaca_relora_finetuning.py \ --base_model "meta-llama/Llama-2-7b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-relora-alpaca" \ + --output_dir "./ipex-llm-relora-alpaca" \ --relora_steps 300 \ --relora_warmup_steps 10 diff --git a/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_arc_2_card.sh b/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_arc_2_card.sh index 3d66fd41..28bce7be 100644 --- a/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_arc_2_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_arc_2_card.sh @@ -23,6 +23,6 @@ mpirun -n 2 \ python -u ./alpaca_relora_finetuning.py \ --base_model "meta-llama/Llama-2-7b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-relora-alpaca" \ + --output_dir "./ipex-llm-relora-alpaca" \ --relora_steps 300 \ --relora_warmup_steps 10 > training.log diff --git a/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_pvc_1550_1_card.sh b/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_pvc_1550_1_card.sh index 042c25d5..3b2fbb38 100644 --- a/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_pvc_1550_1_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_pvc_1550_1_card.sh @@ -23,7 +23,7 @@ mpirun -n 2 \ python -u ./alpaca_relora_finetuning.py \ --base_model "meta-llama/Llama-2-7b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-relora-alpaca" \ + --output_dir "./ipex-llm-relora-alpaca" \ --micro_batch_size 8 \ --relora_steps 300 \ --relora_warmup_steps 10 \ diff --git a/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_pvc_1550_4_card.sh b/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_pvc_1550_4_card.sh index c2f12c90..9c152a41 100644 --- a/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_pvc_1550_4_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/ReLora/relora_finetune_llama2_7b_pvc_1550_4_card.sh @@ -23,7 +23,7 @@ mpirun -n 8 \ python -u ./alpaca_relora_finetuning.py \ --base_model "meta-llama/Llama-2-7b-hf" \ --data_path "yahma/alpaca-cleaned" \ - --output_dir "./ipex-relora-alpaca" \ + --output_dir "./ipex-llm-relora-alpaca" \ --micro_batch_size 8 \ --relora_steps 300 \ --relora_warmup_steps 10 \ diff --git a/python/llm/example/GPU/ModelScope-Models/Save-Load/generate.py b/python/llm/example/GPU/ModelScope-Models/Save-Load/generate.py index 8c7070ba..ee9e7193 100644 --- a/python/llm/example/GPU/ModelScope-Models/Save-Load/generate.py +++ b/python/llm/example/GPU/ModelScope-Models/Save-Load/generate.py @@ -62,7 +62,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = BAICHUAN_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff 
--git a/python/llm/example/GPU/ModelScope-Models/generate.py b/python/llm/example/GPU/ModelScope-Models/generate.py index b4fd6637..0df692e1 100644 --- a/python/llm/example/GPU/ModelScope-Models/generate.py +++ b/python/llm/example/GPU/ModelScope-Models/generate.py @@ -60,7 +60,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = CHATGLM_V3_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py b/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py index cc6c3c48..f9454e78 100644 --- a/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py +++ b/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py @@ -90,7 +90,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = get_prompt(args.prompt, [], system_prompt=DEFAULT_SYSTEM_PROMPT) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu:0') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) output = model.generate(input_ids, diff --git a/python/llm/example/GPU/PyTorch-Models/Model/aquila2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/aquila2/generate.py index 98491948..6258dd2e 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/aquila2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/aquila2/generate.py @@ -58,7 +58,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = AQUILA2_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/baichuan/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/baichuan/generate.py index 52f8adf0..9fcddbf1 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/baichuan/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/baichuan/generate.py @@ -57,7 +57,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = BAICHUAN_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/baichuan2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/baichuan2/generate.py index 215370b4..9ebb56be 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/baichuan2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/baichuan2/generate.py @@ -59,7 +59,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = BAICHUAN2_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output 
= model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/bark/synthesize_speech.py b/python/llm/example/GPU/PyTorch-Models/Model/bark/synthesize_speech.py index 1e830107..9bff3517 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/bark/synthesize_speech.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/bark/synthesize_speech.py @@ -52,7 +52,7 @@ if __name__ == '__main__': inputs = processor(text, voice_preset=voice_preset).to('xpu') with torch.inference_mode(): - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate audio_array = model.generate(**inputs) st = time.time() diff --git a/python/llm/example/GPU/PyTorch-Models/Model/bluelm/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/bluelm/generate.py index ac6e0842..cdb9567f 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/bluelm/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/bluelm/generate.py @@ -57,7 +57,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = BLUELM_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/generate.py index 71b6ceea..f7cc3938 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/generate.py @@ -58,7 +58,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = CHATGLM_V2_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/streamchat.py b/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/streamchat.py index 1e860e80..4c9dbe77 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/streamchat.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/streamchat.py @@ -57,7 +57,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = args.question input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=32) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/generate.py index 1568e085..ab6ad290 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/generate.py @@ -58,7 +58,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = CHATGLM_V3_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, 
max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/streamchat.py b/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/streamchat.py index 20f8b33c..89c93edb 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/streamchat.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/streamchat.py @@ -57,7 +57,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = args.question input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=32) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py index 9d09c857..a9eaaace 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py @@ -59,7 +59,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = CODELLAMA_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/dolly-v1/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/dolly-v1/generate.py index a084b615..9168b46a 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/dolly-v1/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/dolly-v1/generate.py @@ -65,7 +65,7 @@ if __name__ == '__main__': prompt = DOLLY_V1_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') end_key_token_id=tokenizer.encode("### End")[0] - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, use_cache=True, max_new_tokens=args.n_predict, diff --git a/python/llm/example/GPU/PyTorch-Models/Model/dolly-v2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/dolly-v2/generate.py index 9445f406..82787ec9 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/dolly-v2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/dolly-v2/generate.py @@ -65,7 +65,7 @@ if __name__ == '__main__': prompt = DOLLY_V2_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') end_key_token_id=tokenizer.encode("### End")[0] - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict, pad_token_id=tokenizer.pad_token_id, diff --git a/python/llm/example/GPU/PyTorch-Models/Model/flan-t5/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/flan-t5/generate.py index 11eedd25..8c216ec4 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/flan-t5/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/flan-t5/generate.py @@ -60,7 +60,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = FLAN_T5_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can 
be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py index 799bb62c..2c9ffa6c 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py @@ -57,7 +57,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = INTERNLM_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/llama2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/llama2/generate.py index 3fe07715..4add8dfb 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/llama2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/llama2/generate.py @@ -62,7 +62,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = LLAMA2_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/mamba/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/mamba/generate.py index e1b392b2..ac8a8dd9 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/mamba/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/mamba/generate.py @@ -54,7 +54,7 @@ if __name__ == '__main__': # Generate predicted tokens with torch.inference_mode(): input_ids = tokenizer.encode(args.prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) st = time.time() diff --git a/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py index 459d23e5..80e1fc52 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py @@ -58,7 +58,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = MISTRAL_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/mixtral/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/mixtral/generate.py index ae8f0a97..ec8c3711 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/mixtral/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/mixtral/generate.py @@ -58,7 +58,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = MIXTRAL_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a 
warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/phi-1_5/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/phi-1_5/generate.py index 827b59bd..fbdef847 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/phi-1_5/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/phi-1_5/generate.py @@ -56,7 +56,7 @@ if __name__ == '__main__': prompt = PHI_1_5_V1_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict, generation_config = generation_config) # start inference st = time.time() diff --git a/python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py index 3c629f84..ca7499bd 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py @@ -57,7 +57,7 @@ if __name__ == '__main__': input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') model.generation_config.pad_token_id = model.generation_config.eos_token_id - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict, generation_config = generation_config) # start inference st = time.time() diff --git a/python/llm/example/GPU/PyTorch-Models/Model/phixtral/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/phixtral/generate.py index 8287a37b..d1e7e7fa 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/phixtral/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/phixtral/generate.py @@ -61,7 +61,7 @@ if __name__ == '__main__': prompt = PHI1_5_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict, generation_config = generation_config) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/replit/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/replit/generate.py index 73eecb01..3edd21b7 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/replit/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/replit/generate.py @@ -57,7 +57,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = REPLIT_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/speech-t5/synthesize_speech.py b/python/llm/example/GPU/PyTorch-Models/Model/speech-t5/synthesize_speech.py index 9776a039..58e57c8d 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/speech-t5/synthesize_speech.py +++ 
b/python/llm/example/GPU/PyTorch-Models/Model/speech-t5/synthesize_speech.py @@ -89,7 +89,7 @@ if __name__ == '__main__': speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to('xpu') with torch.inference_mode(): - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder) st = time.time() diff --git a/python/llm/example/GPU/PyTorch-Models/Model/starcoder/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/starcoder/generate.py index 380d63c3..1092fe43 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/starcoder/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/starcoder/generate.py @@ -57,7 +57,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = STARCODER_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/yi/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/yi/generate.py index d08d6087..61fe372e 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/yi/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/yi/generate.py @@ -55,7 +55,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = YI_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/More-Data-Types/generate.py b/python/llm/example/GPU/PyTorch-Models/More-Data-Types/generate.py index 6caec894..d3a94b07 100644 --- a/python/llm/example/GPU/PyTorch-Models/More-Data-Types/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/More-Data-Types/generate.py @@ -62,7 +62,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = LLAMA2_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/PyTorch-Models/Save-Load/generate.py b/python/llm/example/GPU/PyTorch-Models/Save-Load/generate.py index 9a289568..fe9fff65 100644 --- a/python/llm/example/GPU/PyTorch-Models/Save-Load/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Save-Load/generate.py @@ -73,7 +73,7 @@ if __name__ == '__main__': with torch.inference_mode(): prompt = LLAMA2_PROMPT_FORMAT.format(prompt=args.prompt) input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') - # ipex model needs a warmup, then inference time can be accurate + # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, max_new_tokens=args.n_predict) diff --git a/python/llm/src/ipex_llm/serving/fastchat/README.md b/python/llm/src/ipex_llm/serving/fastchat/README.md index 4f741792..ec905219 100644 --- a/python/llm/src/ipex_llm/serving/fastchat/README.md +++ 
b/python/llm/src/ipex_llm/serving/fastchat/README.md @@ -52,16 +52,14 @@ python3 -m fastchat.serve.controller Using IPEX-LLM in FastChat does not impose any new limitations on model usage. Therefore, all Hugging Face Transformer models can be utilized in FastChat. #### IPEX-LLM model worker (deprecated) -
-<details> > Warning: This method has been deprecated, please change to use `IPEX-LLM` [worker](#ipex-llm-worker) instead. FastChat determines the Model adapter to use through path matching. Therefore, in order to load models using IPEX-LLM, you need to make some modifications to the model's name. -For instance, assuming you have downloaded the `llama-7b-hf` from [HuggingFace](https://huggingface.co/decapoda-research/llama-7b-hf). Then, to use the `IPEX-LLM` as backend, you need to change name from `llama-7b-hf` to `ipex-7b`.The key point here is that the model's path should include "ipex" and **should not include paths matched by other model adapters**. +For instance, assume you have downloaded `llama-7b-hf` from [HuggingFace](https://huggingface.co/decapoda-research/llama-7b-hf). Then, to use `IPEX-LLM` as the backend, you need to change the name from `llama-7b-hf` to `ipex-llm-7b`. The key point here is that the model's path should include "ipex" and **should not include paths matched by other model adapters**. -Then we will use `ipex-7b` as model-path. +Then we will use `ipex-llm-7b` as the model path. > note: This is caused by the priority of name matching list. The new added `IPEX-LLM` adapter is at the tail of the name-matching list so that it has the lowest priority. If model path contains other keywords like `vicuna` which matches to another adapter with higher priority, then the `IPEX-LLM` adapter will not work. @@ -71,13 +69,13 @@ Then we can run model workers ```bash # On CPU -python3 -m ipex_llm.serving.fastchat.model_worker --model-path PATH/TO/ipex-7b --device cpu +python3 -m ipex_llm.serving.fastchat.model_worker --model-path PATH/TO/ipex-llm-7b --device cpu # On GPU -python3 -m ipex_llm.serving.fastchat.model_worker --model-path PATH/TO/ipex-7b --device xpu +python3 -m ipex_llm.serving.fastchat.model_worker --model-path PATH/TO/ipex-llm-7b --device xpu ``` -If you run successfully using `IPEX` backend, you can see the output in log like this: +If you run successfully using the `ipex_llm` backend, you will see output in the log like this: ```bash INFO - Converting the current model to sym_int4 format...... @@ -87,9 +85,11 @@ INFO - Converting the current model to sym_int4 format......
#### IPEX-LLM worker + To integrate IPEX-LLM with `FastChat` efficiently, we have provided a new model_worker implementation named `ipex_llm_worker.py`. To run the `ipex_llm_worker` on CPU, using the following code: + ```bash source ipex-llm-init -t @@ -97,8 +97,8 @@ source ipex-llm-init -t python3 -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "sym_int4" --trust-remote-code --device "cpu" ``` - For GPU example: + ```bash # Available low_bit format including sym_int4, sym_int8, fp16 etc. python3 -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "sym_int4" --trust-remote-code --device "xpu" diff --git a/python/llm/src/ipex_llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py index 820dbecc..4e568e7f 100644 --- a/python/llm/src/ipex_llm/transformers/convert.py +++ b/python/llm/src/ipex_llm/transformers/convert.py @@ -644,7 +644,7 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True, f"format......") modules_to_not_convert = [] if modules_to_not_convert is None else modules_to_not_convert - # using ipex optimizer before changing to bigdl linear + # using ipex_llm optimizer before changing to bigdl linear _enable_ipex = get_enable_ipex() if _enable_ipex: diff --git a/python/llm/src/ipex_llm/transformers/low_bit_linear.py b/python/llm/src/ipex_llm/transformers/low_bit_linear.py index 35093106..d5e35d3e 100644 --- a/python/llm/src/ipex_llm/transformers/low_bit_linear.py +++ b/python/llm/src/ipex_llm/transformers/low_bit_linear.py @@ -85,7 +85,7 @@ Q2_K = ggml_tensor_qtype["q2_k"] # new_weight_tile is directly VNNI packed, but I did not find significant # performance improvement. # -# Note this format cannot be used directly in IPEX's mm_int4, which expects +# Note this format cannot be used directly in IPEX-LLM's mm_int4, which expects # row major but packing two consecutive columns. def q4_0_xpu_transpose(ggml_weight, weight_shape): from ipex_llm.transformers.low_bit_linear import get_block_size diff --git a/python/llm/src/ipex_llm/transformers/models/chatglm2.py b/python/llm/src/ipex_llm/transformers/models/chatglm2.py index 3308e93f..31a86245 100644 --- a/python/llm/src/ipex_llm/transformers/models/chatglm2.py +++ b/python/llm/src/ipex_llm/transformers/models/chatglm2.py @@ -227,7 +227,7 @@ def chatglm2_quantized_attention_forward_8eb45c( key_layer = key_layer.transpose(0, 1) query_layer_cur = query_layer[..., :rot_dim] key_layer_cur = key_layer[..., :rot_dim] - # ipex's apply_rotary_embedding can change the origin storage, so query_layer will get + # ipex_llm's apply_rotary_embedding can change the origin storage, so query_layer will get # the result directly. torch.ops.torch_ipex.apply_rotary_embedding(query_layer_cur, sin, cos, query_layer_cur) torch.ops.torch_ipex.apply_rotary_embedding(key_layer_cur, sin, cos, key_layer_cur) @@ -367,7 +367,7 @@ def chatglm2_attention_forward_8eb45c( key_layer = key_layer.transpose(0, 1) query_layer_cur = query_layer[..., :rot_dim] key_layer_cur = key_layer[..., :rot_dim] - # ipex's apply_rotary_embedding can change the origin storage, so query_layer will get + # ipex_llm's apply_rotary_embedding can change the origin storage, so query_layer will get # the result directly. 
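    # Illustrative note (not part of the original patch): torch.ops.torch_ipex.apply_rotary_embedding
    # appears to take the tensor passed as its last argument as the output, and here query_layer_cur
    # is passed as both input and output, so RoPE is applied in place on that slice; since
    # query_layer_cur is a view of query_layer[..., :rot_dim], query_layer is updated directly
    # without any reassignment.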
torch.ops.torch_ipex.apply_rotary_embedding(query_layer_cur, sin, cos, query_layer_cur) torch.ops.torch_ipex.apply_rotary_embedding(key_layer_cur, sin, cos, key_layer_cur) diff --git a/python/llm/src/ipex_llm/transformers/speculative.py b/python/llm/src/ipex_llm/transformers/speculative.py index 1c781e66..2585890d 100644 --- a/python/llm/src/ipex_llm/transformers/speculative.py +++ b/python/llm/src/ipex_llm/transformers/speculative.py @@ -565,7 +565,7 @@ def speculative_generate(self, ("mistral" in self.config.model_type) or ("qwen" in self.config.model_type) or ("chatglm" in self.config.model_type)): - invalidInputError(False, "BigDL Speculative Decoding with IPEX only supports \ + invalidInputError(False, "BigDL Speculative Decoding with IPEX-LLM only supports \ Llama, Baichuan2, Mistral, ChatGLM and Qwen models currently.") if "chatglm" in self.config.model_type: global query_group_size @@ -726,7 +726,7 @@ def speculative_generate(self, past_key_values=draft_past_key_values, ) else: - invalidInputError(False, "BigDL Speculative Decoding with IPEX only supports \ + invalidInputError(False, "BigDL Speculative Decoding with IPEX-LLM only supports \ Llama, Baichuan2, Mistral, ChatGLM and Qwen models currently.") draft_output = CausalLMOutputWithPast(
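For reference, the warmup comment renamed throughout the example scripts above refers to the pattern sketched below. This is a minimal illustration rather than a copy of any single example; the model id, prompt, and token budget are placeholders, and it assumes `ipex-llm` is installed with its HuggingFace-`transformers`-style API (`ipex_llm.transformers.AutoModelForCausalLM` with `load_in_4bit=True`).

```python
# Minimal sketch of the warmup-then-measure pattern used across the IPEX-LLM examples.
# The first generate() call warms the model up (weight packing, kernel setup), so only
# the second call is timed.
import time

import torch
from transformers import AutoTokenizer
from ipex_llm.transformers import AutoModelForCausalLM

model_path = "meta-llama/Llama-2-7b-chat-hf"  # placeholder model path
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

with torch.inference_mode():
    input_ids = tokenizer.encode("What is AI?", return_tensors="pt")
    # ipex-llm model needs a warmup, then inference time can be accurate
    model.generate(input_ids, max_new_tokens=32)

    st = time.time()
    output = model.generate(input_ids, max_new_tokens=32)
    end = time.time()
    print(f"Inference time: {end - st:.2f} s")
    print(tokenizer.decode(output[0], skip_special_tokens=True))
```

The GPU examples follow the same pattern after moving the model and `input_ids` to `'xpu'`, as shown in the diffs above.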