diff --git a/README.md b/README.md index efa55382..4c6110fe 100644 --- a/README.md +++ b/README.md @@ -86,7 +86,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* models as fo ```python #load Hugging Face Transformers model with INT4 optimizations -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True) #run the optimized model on CPU @@ -113,7 +113,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* models as fo ```python #load Hugging Face Transformers model with INT4 optimizations -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True) #run the optimized model on Intel GPU diff --git a/docker/llm/README.md b/docker/llm/README.md index 2eb47b61..1f418bb9 100644 --- a/docker/llm/README.md +++ b/docker/llm/README.md @@ -223,7 +223,7 @@ This controller manages the distributed workers. ##### Launch the model worker(s) ```bash -python3 -m bigdl.llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu +python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu ``` Wait until the process finishes loading the model and you see "Uvicorn running on ...". The model worker will register itself to the controller. @@ -252,7 +252,7 @@ python3 -m fastchat.serve.controller Then, launch the model worker(s): ```bash -python3 -m bigdl.llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu +python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu ``` Finally, launch the RESTful API server @@ -319,7 +319,7 @@ This controller manages the distributed workers. ##### Launch the model worker(s) ```bash -python3 -m bigdl.llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device xpu +python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device xpu ``` Wait until the process finishes loading the model and you see "Uvicorn running on ...". The model worker will register itself to the controller. @@ -346,7 +346,7 @@ python3 -m fastchat.serve.controller Then, launch the model worker(s): ```bash -python3 -m bigdl.llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device xpu +python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device xpu ``` Finally, launch the RESTful API server diff --git a/docker/llm/inference/xpu/docker/chat.py b/docker/llm/inference/xpu/docker/chat.py index b40c5f42..6401a86b 100644 --- a/docker/llm/inference/xpu/docker/chat.py +++ b/docker/llm/inference/xpu/docker/chat.py @@ -23,7 +23,7 @@ from transformers import TextIteratorStreamer from transformers.tools.agents import StopSequenceCriteria from transformers.generation.stopping_criteria import StoppingCriteriaList from colorama import Fore -from bigdl.llm import optimize_model +from ipex_llm import optimize_model SYSTEM_PROMPT = "A chat between a curious human and an artificial intelligence assistant .\ The assistant gives helpful, detailed, and polite answers to the human's questions." 
HUMAN_ID = "" diff --git a/docker/llm/serving/cpu/docker/entrypoint.sh b/docker/llm/serving/cpu/docker/entrypoint.sh index 87a691c7..36217dd2 100644 --- a/docker/llm/serving/cpu/docker/entrypoint.sh +++ b/docker/llm/serving/cpu/docker/entrypoint.sh @@ -135,9 +135,9 @@ else done if [ "$worker_type" == "model_worker" ]; then - worker_type="bigdl.llm.serving.model_worker" + worker_type="ipex_llm.serving.model_worker" elif [ "$worker_type" == "vllm_worker" ]; then - worker_type="bigdl.llm.serving.vllm_worker" + worker_type="ipex_llm.serving.vllm_worker" fi if [[ -n $CONTROLLER_HOST ]]; then @@ -220,9 +220,9 @@ else echo "Worker type: $worker_type" echo "Worker address: $worker_address" echo "Controller address: $controller_address" - if [ "$worker_type" == "bigdl.llm.serving.model_worker" ]; then + if [ "$worker_type" == "ipex_llm.serving.model_worker" ]; then python3 -m "$worker_type" --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address --stream-interval $stream_interval - elif [ "$worker_type" == "bigdl.llm.serving.vllm_worker" ]; then + elif [ "$worker_type" == "ipex_llm.serving.vllm_worker" ]; then python3 -m "$worker_type" --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address fi fi diff --git a/docker/llm/serving/cpu/docker/model_adapter.py.patch b/docker/llm/serving/cpu/docker/model_adapter.py.patch index b9a68a3a..6bd43d5a 100644 --- a/docker/llm/serving/cpu/docker/model_adapter.py.patch +++ b/docker/llm/serving/cpu/docker/model_adapter.py.patch @@ -9,7 +9,7 @@ generation_config = GenerationConfig.from_pretrained( model_path, trust_remote_code=True ) -+ from bigdl.llm.transformers import AutoModelForCausalLM ++ from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained( model_path, config=config, diff --git a/docker/llm/serving/xpu/docker/entrypoint.sh b/docker/llm/serving/xpu/docker/entrypoint.sh index 705797f3..c5f0e92c 100644 --- a/docker/llm/serving/xpu/docker/entrypoint.sh +++ b/docker/llm/serving/xpu/docker/entrypoint.sh @@ -66,9 +66,9 @@ else done if [ "$worker_type" == "model_worker" ]; then - worker_type="bigdl.llm.serving.model_worker" + worker_type="ipex_llm.serving.model_worker" elif [ "$worker_type" == "vllm_worker" ]; then - worker_type="bigdl.llm.serving.vllm_worker" + worker_type="ipex_llm.serving.vllm_worker" fi if [[ -n $CONTROLLER_HOST ]]; then @@ -127,9 +127,9 @@ else echo "Worker address: $worker_address" echo "Controller address: $controller_address" - if [ "$worker_type" == "bigdl.llm.serving.model_worker" ]; then + if [ "$worker_type" == "ipex_llm.serving.model_worker" ]; then python3 -m "$worker_type" --model-path $model_path --device xpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address --stream-interval $stream_interval - elif [ "$worker_type" == "bigdl.llm.serving.vllm_worker" ]; then + elif [ "$worker_type" == "ipex_llm.serving.vllm_worker" ]; then python3 -m "$worker_type" --model-path $model_path --device xpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address fi fi diff --git a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/finetune.md b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/finetune.md index 70157907..e4cf8700 100644 --- 
a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/finetune.md +++ b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/finetune.md @@ -21,7 +21,7 @@ To help you better understand the finetuning process, here we use model [Llama-2 First, load model using `transformers`-style API and **set it to `to('xpu')`**. We specify `load_in_low_bit="nf4"` here to apply 4-bit NormalFloat optimization. According to the [QLoRA paper](https://arxiv.org/pdf/2305.14314.pdf), using `"nf4"` could yield better model quality than `"int4"`. ```python -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", load_in_low_bit="nf4", @@ -33,14 +33,14 @@ model = model.to('xpu') Then, we have to apply some preprocessing to the model to prepare it for training. ```python -from bigdl.llm.transformers.qlora import prepare_model_for_kbit_training +from ipex_llm.transformers.qlora import prepare_model_for_kbit_training model.gradient_checkpointing_enable() model = prepare_model_for_kbit_training(model) ``` Next, we can obtain a Peft model from the optimized model and a configuration object containing the parameters as follows: ```python -from bigdl.llm.transformers.qlora import get_peft_model +from ipex_llm.transformers.qlora import get_peft_model from peft import LoraConfig config = LoraConfig(r=8, lora_alpha=32, @@ -54,7 +54,7 @@ model = get_peft_model(model, config) ```eval_rst .. important:: - Instead of ``from peft import prepare_model_for_kbit_training, get_peft_model`` as we did for regular QLoRA using bitandbytes and cuda, we import them from ``bigdl.llm.transformers.qlora`` here to get a BigDL-LLM compatible Peft model. And the rest is just the same as regular LoRA finetuning process using ``peft``. + Instead of ``from peft import prepare_model_for_kbit_training, get_peft_model`` as we did for regular QLoRA using bitandbytes and cuda, we import them from ``ipex_llm.transformers.qlora`` here to get a BigDL-LLM compatible Peft model. And the rest is just the same as regular LoRA finetuning process using ``peft``. 
``` ```eval_rst diff --git a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/hugging_face_format.md b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/hugging_face_format.md index ef3c6238..387d14d0 100644 --- a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/hugging_face_format.md +++ b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/hugging_face_format.md @@ -5,7 +5,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* models as fo ```python # load Hugging Face Transformers model with INT4 optimizations -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True) ``` diff --git a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/inference_on_gpu.md b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/inference_on_gpu.md index e76a0f73..332a5c1f 100644 --- a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/inference_on_gpu.md +++ b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/inference_on_gpu.md @@ -29,7 +29,7 @@ You could choose to use [PyTorch API](./optimize_model.html) or [`transformers`- # Take Llama-2-7b-chat-hf as an example from transformers import LlamaForCausalLM - from bigdl.llm import optimize_model + from ipex_llm import optimize_model model = LlamaForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', torch_dtype='auto', low_cpu_mem_usage=True) model = optimize_model(model) # With only one line to enable BigDL-LLM INT4 optimization @@ -40,14 +40,14 @@ You could choose to use [PyTorch API](./optimize_model.html) or [`transformers`- When running LLMs on Intel iGPUs for Windows users, we recommend setting ``cpu_embedding=True`` in the ``optimize_model`` function. This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. - See the `API doc <../../../PythonAPI/LLM/optimize.html#bigdl.llm.optimize_model>`_ for ``optimize_model`` to find more information. + See the `API doc <../../../PythonAPI/LLM/optimize.html#ipex_llm.optimize_model>`_ for ``optimize_model`` to find more information. Especially, if you have saved the optimized model following setps `here <./optimize_model.html#save>`_, the loading process on Intel GPUs maybe as follows: .. code-block:: python from transformers import LlamaForCausalLM - from bigdl.llm.optimize import low_memory_init, load_low_bit + from ipex_llm.optimize import low_memory_init, load_low_bit saved_dir='./llama-2-bigdl-llm-4-bit' with low_memory_init(): # Fast and low cost by loading model on meta device @@ -65,7 +65,7 @@ You could choose to use [PyTorch API](./optimize_model.html) or [`transformers`- .. code-block:: python # Take Llama-2-7b-chat-hf as an example - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM # Load model in 4 bit, which convert the relevant layers in the model into INT4 format model = AutoModelForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', load_in_4bit=True) @@ -82,7 +82,7 @@ You could choose to use [PyTorch API](./optimize_model.html) or [`transformers`- .. 
code-block:: python - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM saved_dir='./llama-2-bigdl-llm-4-bit' model = AutoModelForCausalLM.load_low_bit(saved_dir) # Load the optimized model diff --git a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/langchain_api.md b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/langchain_api.md index 8ec3f433..962f4b2a 100644 --- a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/langchain_api.md +++ b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/langchain_api.md @@ -7,8 +7,8 @@ You may run the models using the LangChain API in `bigdl-llm`. You may run any Hugging Face *Transformers* model (with INT4 optimiztions applied) using the LangChain API as follows: ```python -from bigdl.llm.langchain.llms import TransformersLLM -from bigdl.llm.langchain.embeddings import TransformersEmbeddings +from ipex_llm.langchain.llms import TransformersLLM +from ipex_llm.langchain.embeddings import TransformersEmbeddings from langchain.chains.question_answering import load_qa_chain embeddings = TransformersEmbeddings.from_model_id(model_id=model_path) @@ -37,8 +37,8 @@ You may also convert Hugging Face *Transformers* models into native INT4 format, ``` ```python -from bigdl.llm.langchain.llms import LlamaLLM -from bigdl.llm.langchain.embeddings import LlamaEmbeddings +from ipex_llm.langchain.llms import LlamaLLM +from ipex_llm.langchain.embeddings import LlamaEmbeddings from langchain.chains.question_answering import load_qa_chain # switch to ChatGLMEmbeddings/GptneoxEmbeddings/BloomEmbeddings/StarcoderEmbeddings to load other models diff --git a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/native_format.md b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/native_format.md index e66d68fd..49184835 100644 --- a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/native_format.md +++ b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/native_format.md @@ -10,13 +10,13 @@ You may also convert Hugging Face *Transformers* models into native INT4 format ```python # convert the model -from bigdl.llm import llm_convert +from ipex_llm import llm_convert bigdl_llm_path = llm_convert(model='/path/to/model/', outfile='/path/to/output/', outtype='int4', model_family="llama") # load the converted model # switch to ChatGLMForCausalLM/GptneoxForCausalLM/BloomForCausalLM/StarcoderForCausalLM to load other models -from bigdl.llm.transformers import LlamaForCausalLM +from ipex_llm.transformers import LlamaForCausalLM llm = LlamaForCausalLM.from_pretrained("/path/to/output/model.bin", native=True, ...) 
# run the converted model diff --git a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/optimize_model.md b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/optimize_model.md index e997d32c..9c640e8d 100644 --- a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/optimize_model.md +++ b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/optimize_model.md @@ -14,7 +14,7 @@ model = LlamaForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', torch_ Then, just need to call `optimize_model` to optimize the loaded model and INT4 optimization is applied on model by default: ```python -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # With only one line to enable BigDL-LLM INT4 optimization model = optimize_model(model) @@ -31,7 +31,7 @@ Currently, ``low_bit`` supports options 'sym_int4', 'asym_int4', 'sym_int5', 'as You may apply symmetric INT8 optimization as follows: ```python -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # Apply symmetric INT8 optimization model = optimize_model(model, low_bit="sym_int8") @@ -51,7 +51,7 @@ model.save_low_bit(saved_dir) We recommend to use the context manager `low_memory_init` to quickly initiate a model instance with low cost, and then use `load_low_bit` to load the optimized low-bit model as follows: ```python -from bigdl.llm.optimize import low_memory_init, load_low_bit +from ipex_llm.optimize import low_memory_init, load_low_bit with low_memory_init(): # Fast and low cost by loading model on meta device model = LlamaForCausalLM.from_pretrained(saved_dir, torch_dtype="auto", diff --git a/docs/readthedocs/source/doc/LLM/Overview/llm.md b/docs/readthedocs/source/doc/LLM/Overview/llm.md index a13605a5..7f7d4194 100644 --- a/docs/readthedocs/source/doc/LLM/Overview/llm.md +++ b/docs/readthedocs/source/doc/LLM/Overview/llm.md @@ -11,7 +11,7 @@ Here, let's take a relatively small LLM model, i.e [open_llama_3b_v2](https://hu Simply use one-line `transformers`-style API in `bigdl-llm` to load `open_llama_3b_v2` with INT4 optimization (by specifying `load_in_4bit=True`) as follows: ```python -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path="openlm-research/open_llama_3b_v2", load_in_4bit=True) diff --git a/docs/readthedocs/source/doc/LLM/Quickstart/install_linux_gpu.md b/docs/readthedocs/source/doc/LLM/Quickstart/install_linux_gpu.md index 96a5b4b3..efdf7d10 100644 --- a/docs/readthedocs/source/doc/LLM/Quickstart/install_linux_gpu.md +++ b/docs/readthedocs/source/doc/LLM/Quickstart/install_linux_gpu.md @@ -112,7 +112,7 @@ Install the Miniconda as follows if you don't have conda installed on your machi python - > from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + > from ipex_llm.transformers import AutoModel, AutoModelForCausalLM ``` > image-20240221102252562 @@ -170,7 +170,7 @@ Now let's play with a real LLM. 
We'll be using the [phi-1.5](https://huggingface ```python # Copy/Paste the contents to a new file demo.py import torch - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer, GenerationConfig generation_config = GenerationConfig(use_cache = True) diff --git a/docs/readthedocs/source/doc/LLM/Quickstart/install_windows_gpu.md b/docs/readthedocs/source/doc/LLM/Quickstart/install_windows_gpu.md index 88baeb9b..370422d1 100644 --- a/docs/readthedocs/source/doc/LLM/Quickstart/install_windows_gpu.md +++ b/docs/readthedocs/source/doc/LLM/Quickstart/install_windows_gpu.md @@ -130,7 +130,7 @@ You can verify if `bigdl-llm` is successfully installed by simply running a few * Step 5: Copy following code to Anaconda prompt **line by line** and press Enter **after copying each line**. ```python import torch - from bigdl.llm.transformers import AutoModel,AutoModelForCausalLM + from ipex_llm.transformers import AutoModel,AutoModelForCausalLM tensor_1 = torch.randn(1, 1, 40, 128).to('xpu') tensor_2 = torch.randn(1, 1, 128, 40).to('xpu') print(torch.matmul(tensor_1, tensor_2).size()) @@ -200,7 +200,7 @@ Now let's play with a real LLM. We'll be using the [Qwen-1.8B-Chat](https://hugg # Copy/Paste the contents to a new file demo.py import torch - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer, GenerationConfig generation_config = GenerationConfig(use_cache=True) @@ -260,7 +260,7 @@ Now let's play with a real LLM. We'll be using the [Qwen-1.8B-Chat](https://hugg # Copy/Paste the contents to a new file demo.py import torch - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM from transformers import GenerationConfig from modelscope import AutoTokenizer generation_config = GenerationConfig(use_cache=True) diff --git a/docs/readthedocs/source/doc/PythonAPI/LLM/langchain.rst b/docs/readthedocs/source/doc/PythonAPI/LLM/langchain.rst index 445e71f8..bf0fa88d 100644 --- a/docs/readthedocs/source/doc/PythonAPI/LLM/langchain.rst +++ b/docs/readthedocs/source/doc/PythonAPI/LLM/langchain.rst @@ -13,7 +13,7 @@ BigDL-LLM provides ``TransformersLLM`` and ``TransformersPipelineLLM``, which im .. tab:: AutoModel - .. automodule:: bigdl.llm.langchain.llms.transformersllm + .. automodule:: ipex_llm.langchain.llms.transformersllm :members: :undoc-members: :show-inheritance: @@ -21,7 +21,7 @@ BigDL-LLM provides ``TransformersLLM`` and ``TransformersPipelineLLM``, which im .. tab:: pipeline - .. automodule:: bigdl.llm.langchain.llms.transformerspipelinellm + .. automodule:: ipex_llm.langchain.llms.transformerspipelinellm :members: :undoc-members: :show-inheritance: @@ -37,7 +37,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo .. tab:: Llama - .. autoclass:: bigdl.llm.langchain.llms.bigdlllm.LlamaLLM + .. autoclass:: ipex_llm.langchain.llms.bigdlllm.LlamaLLM :members: :undoc-members: :show-inheritance: @@ -49,7 +49,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo .. tab:: ChatGLM - .. autoclass:: bigdl.llm.langchain.llms.bigdlllm.ChatGLMLLM + .. autoclass:: ipex_llm.langchain.llms.bigdlllm.ChatGLMLLM :members: :undoc-members: :show-inheritance: @@ -61,7 +61,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo .. tab:: Bloom - .. 
autoclass:: bigdl.llm.langchain.llms.bigdlllm.BloomLLM + .. autoclass:: ipex_llm.langchain.llms.bigdlllm.BloomLLM :members: :undoc-members: :show-inheritance: @@ -73,7 +73,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo .. tab:: Gptneox - .. autoclass:: bigdl.llm.langchain.llms.bigdlllm.GptneoxLLM + .. autoclass:: ipex_llm.langchain.llms.bigdlllm.GptneoxLLM :members: :undoc-members: :show-inheritance: @@ -85,7 +85,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo .. tab:: Starcoder - .. autoclass:: bigdl.llm.langchain.llms.bigdlllm.StarcoderLLM + .. autoclass:: ipex_llm.langchain.llms.bigdlllm.StarcoderLLM :members: :undoc-members: :show-inheritance: @@ -102,7 +102,7 @@ Embeddings Wrapper of LangChain Hugging Face ``transformers`` AutoModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. automodule:: bigdl.llm.langchain.embeddings.transformersembeddings +.. automodule:: ipex_llm.langchain.embeddings.transformersembeddings :members: :undoc-members: :show-inheritance: @@ -117,7 +117,7 @@ For ``llama``/``bloom``/``gptneox``/``starcoder`` model families, you could also .. tab:: Llama - .. autoclass:: bigdl.llm.langchain.embeddings.bigdlllm.LlamaEmbeddings + .. autoclass:: ipex_llm.langchain.embeddings.bigdlllm.LlamaEmbeddings :members: :undoc-members: :show-inheritance: @@ -129,7 +129,7 @@ For ``llama``/``bloom``/``gptneox``/``starcoder`` model families, you could also .. tab:: Bloom - .. autoclass:: bigdl.llm.langchain.embeddings.bigdlllm.BloomEmbeddings + .. autoclass:: ipex_llm.langchain.embeddings.bigdlllm.BloomEmbeddings :members: :undoc-members: :show-inheritance: @@ -141,7 +141,7 @@ For ``llama``/``bloom``/``gptneox``/``starcoder`` model families, you could also .. tab:: Gptneox - .. autoclass:: bigdl.llm.langchain.embeddings.bigdlllm.GptneoxEmbeddings + .. autoclass:: ipex_llm.langchain.embeddings.bigdlllm.GptneoxEmbeddings :members: :undoc-members: :show-inheritance: @@ -153,7 +153,7 @@ For ``llama``/``bloom``/``gptneox``/``starcoder`` model families, you could also .. tab:: Starcoder - .. autoclass:: bigdl.llm.langchain.embeddings.bigdlllm.StarcoderEmbeddings + .. autoclass:: ipex_llm.langchain.embeddings.bigdlllm.StarcoderEmbeddings :members: :undoc-members: :show-inheritance: diff --git a/docs/readthedocs/source/doc/PythonAPI/LLM/optimize.rst b/docs/readthedocs/source/doc/PythonAPI/LLM/optimize.rst index f28211ca..d979376e 100644 --- a/docs/readthedocs/source/doc/PythonAPI/LLM/optimize.rst +++ b/docs/readthedocs/source/doc/PythonAPI/LLM/optimize.rst @@ -6,7 +6,7 @@ Optimize Model You can run any PyTorch model with ``optimize_model`` through only one-line code change to benefit from BigDL-LLM optimization, regardless of the library or API you are using. -.. automodule:: bigdl.llm +.. automodule:: ipex_llm :members: optimize_model :undoc-members: :show-inheritance: @@ -18,7 +18,7 @@ Load Optimized Model To avoid high resource consumption during the loading processes of the original model, we provide save/load API to support the saving of model after low-bit optimization and the loading of the saved low-bit model. Saving and loading operations are platform-independent, regardless of their operating systems. -.. automodule:: bigdl.llm.optimize +.. 
automodule:: ipex_llm.optimize :members: load_low_bit :undoc-members: :show-inheritance: diff --git a/docs/readthedocs/source/doc/PythonAPI/LLM/transformers.rst b/docs/readthedocs/source/doc/PythonAPI/LLM/transformers.rst index 23aa10a3..711f397a 100644 --- a/docs/readthedocs/source/doc/PythonAPI/LLM/transformers.rst +++ b/docs/readthedocs/source/doc/PythonAPI/LLM/transformers.rst @@ -10,7 +10,7 @@ You can apply BigDL-LLM optimizations on any Hugging Face Transformers models by AutoModelForCausalLM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: bigdl.llm.transformers.AutoModelForCausalLM +.. autoclass:: ipex_llm.transformers.AutoModelForCausalLM :members: :undoc-members: :show-inheritance: @@ -22,7 +22,7 @@ AutoModelForCausalLM AutoModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: bigdl.llm.transformers.AutoModel +.. autoclass:: ipex_llm.transformers.AutoModel :members: :undoc-members: :show-inheritance: @@ -34,7 +34,7 @@ AutoModel AutoModelForSpeechSeq2Seq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: bigdl.llm.transformers.AutoModelForSpeechSeq2Seq +.. autoclass:: ipex_llm.transformers.AutoModelForSpeechSeq2Seq :members: :undoc-members: :show-inheritance: @@ -46,7 +46,7 @@ AutoModelForSpeechSeq2Seq AutoModelForSeq2SeqLM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: bigdl.llm.transformers.AutoModelForSeq2SeqLM +.. autoclass:: ipex_llm.transformers.AutoModelForSeq2SeqLM :members: :undoc-members: :show-inheritance: @@ -67,7 +67,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo .. tab:: Llama - .. autoclass:: bigdl.llm.transformers.LlamaForCausalLM + .. autoclass:: ipex_llm.transformers.LlamaForCausalLM :members: :undoc-members: :show-inheritance: @@ -77,7 +77,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo .. tab:: ChatGLM - .. autoclass:: bigdl.llm.transformers.ChatGLMForCausalLM + .. autoclass:: ipex_llm.transformers.ChatGLMForCausalLM :members: :undoc-members: :show-inheritance: @@ -87,7 +87,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo .. tab:: Gptneox - .. autoclass:: bigdl.llm.transformers.GptneoxForCausalLM + .. autoclass:: ipex_llm.transformers.GptneoxForCausalLM :members: :undoc-members: :show-inheritance: @@ -96,7 +96,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo .. automethod:: from_pretrained .. tab:: Bloom - .. autoclass:: bigdl.llm.transformers.BloomForCausalLM + .. autoclass:: ipex_llm.transformers.BloomForCausalLM :members: :undoc-members: :show-inheritance: @@ -106,7 +106,7 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo .. tab:: Starcoder - .. autoclass:: bigdl.llm.transformers.StarcoderForCausalLM + .. autoclass:: ipex_llm.transformers.StarcoderForCausalLM :members: :undoc-members: :show-inheritance: diff --git a/docs/readthedocs/source/index.rst b/docs/readthedocs/source/index.rst index b1d29875..958cd6d9 100644 --- a/docs/readthedocs/source/index.rst +++ b/docs/readthedocs/source/index.rst @@ -112,7 +112,7 @@ You can then apply INT4 optimizations to any Hugging Face *Transformers* models .. 
code-block:: python #load Hugging Face Transformers model with INT4 optimizations - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True) #run the optimized model on Intel CPU @@ -146,7 +146,7 @@ You can then apply INT4 optimizations to any Hugging Face *Transformers* models .. code-block:: python #load Hugging Face Transformers model with INT4 optimizations - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True) #run the optimized model on Intel GPU diff --git a/python/llm/README.md b/python/llm/README.md index 4ddced57..dc7df9ff 100644 --- a/python/llm/README.md +++ b/python/llm/README.md @@ -146,7 +146,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* model on Int ```python #load Hugging Face Transformers model with INT4 optimizations -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True) #run the optimized model on Intel CPU @@ -164,7 +164,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* model on Int ```python #load Hugging Face Transformers model with INT4 optimizations -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM import intel_extension_for_pytorch model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True) @@ -206,13 +206,13 @@ You may also convert Hugging Face *Transformers* models into native INT4 model f ```python #convert the model -from bigdl.llm import llm_convert +from ipex_llm import llm_convert bigdl_llm_path = llm_convert(model='/path/to/model/', outfile='/path/to/output/', outtype='int4', model_family="llama") #load the converted model #switch to ChatGLMForCausalLM/GptneoxForCausalLM/BloomForCausalLM/StarcoderForCausalLM to load other models -from bigdl.llm.transformers import LlamaForCausalLM +from ipex_llm.transformers import LlamaForCausalLM llm = LlamaForCausalLM.from_pretrained("/path/to/output/model.bin", native=True, ...) #run the converted model @@ -231,8 +231,8 @@ You may run the models using the LangChain API in `bigdl-llm`. You may run any Hugging Face *Transformers* model (with INT4 optimiztions applied) using the LangChain API as follows: ```python - from bigdl.llm.langchain.llms import TransformersLLM - from bigdl.llm.langchain.embeddings import TransformersEmbeddings + from ipex_llm.langchain.llms import TransformersLLM + from ipex_llm.langchain.embeddings import TransformersEmbeddings from langchain.chains.question_answering import load_qa_chain embeddings = TransformersEmbeddings.from_model_id(model_id=model_path) @@ -250,8 +250,8 @@ You may run the models using the LangChain API in `bigdl-llm`. >**Notes**:* Currently only llama/bloom/gptneox/starcoder/chatglm model families are supported; for other models, you may use the Hugging Face `transformers` model format as described above). 
```python - from bigdl.llm.langchain.llms import LlamaLLM - from bigdl.llm.langchain.embeddings import LlamaEmbeddings + from ipex_llm.langchain.llms import LlamaLLM + from ipex_llm.langchain.embeddings import LlamaEmbeddings from langchain.chains.question_answering import load_qa_chain #switch to ChatGLMEmbeddings/GptneoxEmbeddings/BloomEmbeddings/StarcoderEmbeddings to load other models diff --git a/python/llm/dev/benchmark/README.md b/python/llm/dev/benchmark/README.md index bd297133..c44a7c7a 100644 --- a/python/llm/dev/benchmark/README.md +++ b/python/llm/dev/benchmark/README.md @@ -7,7 +7,7 @@ Just put this file into your benchmark directory, and then wrap your transformer Take `chatglm-6b` as an example: ```python import torch -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer from benchmark_util import BenchmarkWrapper @@ -35,7 +35,7 @@ Take `chatglm-6b` as an example: ```python import torch import intel_extension_for_pytorch as ipex -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer from benchmark_util import BenchmarkWrapper diff --git a/python/llm/dev/benchmark/all-in-one/run-stress-test.py b/python/llm/dev/benchmark/all-in-one/run-stress-test.py index 20e16c53..9de9cfaa 100644 --- a/python/llm/dev/benchmark/all-in-one/run-stress-test.py +++ b/python/llm/dev/benchmark/all-in-one/run-stress-test.py @@ -31,7 +31,7 @@ benchmark_util_path = os.path.join(current_dir, '..') import sys sys.path.append(benchmark_util_path) from benchmark_util import BenchmarkWrapper -from bigdl.llm.utils.common.log4Error import invalidInputError +from ipex_llm.utils.common.log4Error import invalidInputError LLAMA_IDS = ['meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-13b-chat-hf', 'meta-llama/Llama-2-70b-chat-hf','decapoda-research/llama-7b-hf', @@ -85,7 +85,7 @@ def run_transformer_int4(repo_id, num_trials, num_beams, low_bit): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, LlamaTokenizer model_path = get_model_path(repo_id, local_model_hub) @@ -149,7 +149,7 @@ def run_transformer_int4_gpu(repo_id, num_trials, num_beams, low_bit): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer import intel_extension_for_pytorch as ipex reserved_mem_list = [] diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py index 3a9b5347..f5c0ecfa 100644 --- a/python/llm/dev/benchmark/all-in-one/run.py +++ b/python/llm/dev/benchmark/all-in-one/run.py @@ -32,7 +32,7 @@ benchmark_util_path = os.path.join(current_dir, '..') import sys sys.path.append(benchmark_util_path) from benchmark_util import BenchmarkWrapper -from bigdl.llm.utils.common.log4Error import invalidInputError +from ipex_llm.utils.common.log4Error import invalidInputError LLAMA_IDS = ['meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-13b-chat-hf', 'meta-llama/Llama-2-70b-chat-hf','decapoda-research/llama-7b-hf', @@ -143,8 +143,8 @@ def run_native_int4(repo_id, warm_up, num_trials): model_path = get_model_path(repo_id, local_model_hub) - from bigdl.llm.transformers import BigdlNativeForCausalLM - from bigdl.llm import llm_convert + from ipex_llm.transformers import 
BigdlNativeForCausalLM + from ipex_llm import llm_convert if "chatglm" in repo_id.lower(): family = "chatglm" elif "llama" in repo_id.lower(): @@ -184,7 +184,7 @@ def run_transformer_int4(repo_id, num_beams, low_bit, batch_size): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, LlamaTokenizer model_path = get_model_path(repo_id, local_model_hub) @@ -319,7 +319,7 @@ def run_optimize_model(repo_id, low_bit, batch_size): from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer - from bigdl.llm import optimize_model + from ipex_llm import optimize_model model_path = get_model_path(repo_id, local_model_hub) # Load model in 4 bit, @@ -389,7 +389,7 @@ def run_transformer_int4_gpu(repo_id, num_beams, low_bit, batch_size): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer import intel_extension_for_pytorch as ipex model_path = get_model_path(repo_id, local_model_hub) @@ -495,7 +495,7 @@ def run_optimize_model_gpu(repo_id, low_bit, batch_size): from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM, LlamaTokenizer - from bigdl.llm import optimize_model + from ipex_llm import optimize_model import intel_extension_for_pytorch as ipex model_path = get_model_path(repo_id, local_model_hub) # Load model in 4 bit, @@ -651,7 +651,7 @@ def run_bigdl_fp16_gpu(repo_id, num_trials, num_beams, batch_size): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer import intel_extension_for_pytorch as ipex model_path = get_model_path(repo_id, local_model_hub) @@ -731,7 +731,7 @@ def run_deepspeed_transformer_int4_cpu(repo_id, batch_size): from transformers import AutoModelForCausalLM, LlamaTokenizer, AutoTokenizer import deepspeed - from bigdl.llm import optimize_model + from ipex_llm import optimize_model import argparse # parser is for deepspeed subprocesses' inline parameter parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Llama2 model') @@ -822,7 +822,7 @@ def run_transformer_int4_gpu_win(repo_id, cpu_embedding, batch_size, streaming): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer, TextStreamer import intel_extension_for_pytorch as ipex model_path = get_model_path(repo_id, local_model_hub) @@ -928,7 +928,7 @@ def run_transformer_int4_fp16_gpu_win(repo_id, cpu_embedding, batch_size, streaming): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer, TextStreamer import intel_extension_for_pytorch as ipex model_path = get_model_path(repo_id, local_model_hub) @@ -1038,7 +1038,7 @@ def run_transformer_int4_loadlowbit_gpu_win(repo_id, cpu_embedding, batch_size, streaming): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, GPTJForCausalLM, 
LlamaTokenizer, TextStreamer import intel_extension_for_pytorch as ipex model_path = get_model_path(repo_id, local_model_hub) @@ -1140,7 +1140,7 @@ def run_transformer_autocast_bf16( repo_id, num_trials, num_beams, batch_size): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, LlamaTokenizer model_path = get_model_path(repo_id, local_model_hub) @@ -1209,7 +1209,7 @@ def run_bigdl_ipex_bf16(repo_id, num_trials, num_beams, batch_size): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, LlamaTokenizer os.environ["BIGDL_OPT_IPEX"] = "true" @@ -1280,7 +1280,7 @@ def run_bigdl_ipex_int4(repo_id, num_trials, num_beams, batch_size): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, LlamaTokenizer os.environ["BIGDL_OPT_IPEX"] = "true" @@ -1350,7 +1350,7 @@ def run_bigdl_ipex_int8(repo_id, num_trials, num_beams, batch_size): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, LlamaTokenizer os.environ["BIGDL_OPT_IPEX"] = "true" @@ -1434,7 +1434,7 @@ def run_deepspeed_optimize_model_gpu(repo_id, os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", "29500") from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM, LlamaTokenizer - from bigdl.llm import optimize_model + from ipex_llm import optimize_model import intel_extension_for_pytorch as ipex import deepspeed from deepspeed.accelerator.cpu_accelerator import CPU_Accelerator @@ -1535,9 +1535,9 @@ def run_speculative_cpu(repo_id, num_trials, num_beams, batch_size): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, LlamaTokenizer - from bigdl.llm.transformers.convert import get_enable_ipex + from ipex_llm.transformers.convert import get_enable_ipex _enable_ipex = get_enable_ipex() @@ -1615,7 +1615,7 @@ def run_speculative_gpu(repo_id, num_trials, num_beams, batch_size): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, LlamaTokenizer model_path = get_model_path(repo_id, local_model_hub) diff --git a/python/llm/dev/benchmark/all-in-one/save.py b/python/llm/dev/benchmark/all-in-one/save.py index ea3ed638..48aa3d98 100644 --- a/python/llm/dev/benchmark/all-in-one/save.py +++ b/python/llm/dev/benchmark/all-in-one/save.py @@ -30,7 +30,7 @@ current_dir = os.path.dirname(os.path.realpath(__file__)) def save_model_in_low_bit(repo_id, local_model_hub, low_bit): - from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM + from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, LlamaTokenizer model_path = get_model_path(repo_id, local_model_hub) # Load model in 4 bit, diff --git a/python/llm/dev/benchmark/ceval/eval.py b/python/llm/dev/benchmark/ceval/eval.py index e0530d46..8f8637ed 100644 --- a/python/llm/dev/benchmark/ceval/eval.py +++ b/python/llm/dev/benchmark/ceval/eval.py @@ -21,7 +21,7 @@ import torch import json 
from tqdm import tqdm -from bigdl.llm.utils.common.log4Error import invalidInputError +from ipex_llm.utils.common.log4Error import invalidInputError from evaluators.qwen import QwenEvaluator from evaluators.llama import LlamaEvaluator from evaluators.chatglm import ChatGLMEvaluator diff --git a/python/llm/dev/benchmark/ceval/evaluators/chatglm.py b/python/llm/dev/benchmark/ceval/evaluators/chatglm.py index 717f386f..2c0b5ec7 100644 --- a/python/llm/dev/benchmark/ceval/evaluators/chatglm.py +++ b/python/llm/dev/benchmark/ceval/evaluators/chatglm.py @@ -22,7 +22,7 @@ from thefuzz import process from transformers import AutoTokenizer from evaluators.evaluator import Evaluator -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import AutoModel from transformers.generation.utils import LogitsProcessorList from transformers.generation.logits_process import LogitsProcessor diff --git a/python/llm/dev/benchmark/ceval/evaluators/llama.py b/python/llm/dev/benchmark/ceval/evaluators/llama.py index ba1dfc3e..c6944f72 100644 --- a/python/llm/dev/benchmark/ceval/evaluators/llama.py +++ b/python/llm/dev/benchmark/ceval/evaluators/llama.py @@ -22,7 +22,7 @@ import numpy as np import torch from transformers import LlamaTokenizer, GenerationConfig -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from evaluators.evaluator import Evaluator diff --git a/python/llm/dev/benchmark/ceval/evaluators/qwen.py b/python/llm/dev/benchmark/ceval/evaluators/qwen.py index 561bb6da..dcb1ee91 100644 --- a/python/llm/dev/benchmark/ceval/evaluators/qwen.py +++ b/python/llm/dev/benchmark/ceval/evaluators/qwen.py @@ -22,7 +22,7 @@ from thefuzz import process from transformers import AutoTokenizer from transformers.generation import GenerationConfig -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from evaluators.evaluator import Evaluator diff --git a/python/llm/dev/benchmark/harness/bigdl_llm.py b/python/llm/dev/benchmark/harness/bigdl_llm.py index b370301e..8626fc1a 100644 --- a/python/llm/dev/benchmark/harness/bigdl_llm.py +++ b/python/llm/dev/benchmark/harness/bigdl_llm.py @@ -14,7 +14,7 @@ # limitations under the License. 
# -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM import inspect from lm_eval.models.huggingface import AutoCausalLM diff --git a/python/llm/dev/benchmark/perplexity/ppl.py b/python/llm/dev/benchmark/perplexity/ppl.py index 672a5c19..1b71d9fe 100644 --- a/python/llm/dev/benchmark/perplexity/ppl.py +++ b/python/llm/dev/benchmark/perplexity/ppl.py @@ -20,7 +20,7 @@ from torch.nn import CrossEntropyLoss from tqdm import tqdm import gc -from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel +from ipex_llm.transformers import AutoModelForCausalLM, AutoModel class BigDLPPL: def __init__(self, model_path, device, **model_kwargs) -> None: diff --git a/python/llm/dev/benchmark/perplexity/run.py b/python/llm/dev/benchmark/perplexity/run.py index 27c22112..d548e984 100644 --- a/python/llm/dev/benchmark/perplexity/run.py +++ b/python/llm/dev/benchmark/perplexity/run.py @@ -21,7 +21,7 @@ from datasets import concatenate_datasets, load_dataset from transformers import AutoTokenizer from ppl import BigDLPPL -from bigdl.llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.ggml.quantize import ggml_tensor_qtype import os import json diff --git a/python/llm/dev/benchmark/whisper/run_whisper.py b/python/llm/dev/benchmark/whisper/run_whisper.py index 286025d5..97705920 100644 --- a/python/llm/dev/benchmark/whisper/run_whisper.py +++ b/python/llm/dev/benchmark/whisper/run_whisper.py @@ -15,7 +15,7 @@ # from datasets import load_dataset -from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq +from ipex_llm.transformers import AutoModelForSpeechSeq2Seq from transformers import WhisperProcessor import torch from evaluate import load diff --git a/python/llm/example/CPU/Applications/autogen/README.md b/python/llm/example/CPU/Applications/autogen/README.md index 4e112f9c..41e39727 100644 --- a/python/llm/example/CPU/Applications/autogen/README.md +++ b/python/llm/example/CPU/Applications/autogen/README.md @@ -69,11 +69,11 @@ conda activate autogen cd autogen # load the local model with cpu with your downloaded model -python -m bigdl.llm.serving.model_worker --model-path ... --device cpu +python -m ipex_llm.serving.model_worker --model-path ... --device cpu ``` Change the Model Name: -> Assume you use the model `Mistral-7B-Instruct-v0.2` and your model is downloaded to `autogen/model/Mistral-7B-Instruct-v0.2`. You should rename the model to `autogen/model/bigdl` and run `python -m bigdl.llm.serving.model_worker --model-path ... --device cpu`. This ensures the proper usage of the BigDL-adapted FastChat. +> Assume you use the model `Mistral-7B-Instruct-v0.2` and your model is downloaded to `autogen/model/Mistral-7B-Instruct-v0.2`. You should rename the model to `autogen/model/bigdl` and run `python -m ipex_llm.serving.model_worker --model-path ... --device cpu`. This ensures the proper usage of the BigDL-adapted FastChat. 
Potential Error Note: > If you get `RuntimeError: Error register to Controller` in the worker terminal, please set `export no_proxy='localhost'` to ensure the registration diff --git a/python/llm/example/CPU/Applications/hf-agent/run_agent.py b/python/llm/example/CPU/Applications/hf-agent/run_agent.py index a9a57373..8517fc58 100644 --- a/python/llm/example/CPU/Applications/hf-agent/run_agent.py +++ b/python/llm/example/CPU/Applications/hf-agent/run_agent.py @@ -19,7 +19,7 @@ import argparse from PIL import Image from transformers import AutoTokenizer, LocalAgent -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM if __name__ == "__main__": parser = argparse.ArgumentParser(description="Run agent using vicuna model") diff --git a/python/llm/example/CPU/Applications/streaming-llm/README.md b/python/llm/example/CPU/Applications/streaming-llm/README.md index 75c3a202..0bc1a627 100644 --- a/python/llm/example/CPU/Applications/streaming-llm/README.md +++ b/python/llm/example/CPU/Applications/streaming-llm/README.md @@ -3,7 +3,7 @@ In this example, we apply low-bit optimizations to [Streaming-LLM](https://github.com/mit-han-lab/streaming-llm/tree/main#efficient-streaming-language-models-with-attention-sinks) using BigDL-LLM, which can deploy low-bit(including FP4/INT4/FP8/INT8) LLMs for infinite-length inputs. Only one code change is needed to load the model using bigdl-llm as follows: ```python -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained(model_name_or_path, load_in_4bit=True, trust_remote_code=True, optimize_model=False) ``` diff --git a/python/llm/example/CPU/Applications/streaming-llm/streaming_llm/utils.py b/python/llm/example/CPU/Applications/streaming-llm/streaming_llm/utils.py index 2aa1ded1..163ccc71 100644 --- a/python/llm/example/CPU/Applications/streaming-llm/streaming_llm/utils.py +++ b/python/llm/example/CPU/Applications/streaming-llm/streaming_llm/utils.py @@ -49,7 +49,7 @@ import urllib.request import os import json # code change to import from bigdl-llm API instead of using transformers API -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import LlamaTokenizer import intel_extension_for_pytorch as ipex diff --git a/python/llm/example/CPU/Deepspeed-AutoTP/README.md b/python/llm/example/CPU/Deepspeed-AutoTP/README.md index fcd9f1be..8cc4d7ab 100644 --- a/python/llm/example/CPU/Deepspeed-AutoTP/README.md +++ b/python/llm/example/CPU/Deepspeed-AutoTP/README.md @@ -39,7 +39,7 @@ Distributed model managed by deepspeed can be further optimized with BigDL low-b ```python # Apply BigDL-LLM INT4 optimizations on transformers -from bigdl.llm import optimize_model +from ipex_llm import optimize_model model = optimize_model(model.module.to(f'cpu'), low_bit='sym_int4') model = model.to(f'cpu:{local_rank}') # move partial model to local rank diff --git a/python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py b/python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py index 91d11dfa..d42f3887 100644 --- a/python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py +++ b/python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py @@ -45,7 +45,7 @@ import os import torch from transformers import AutoModelForCausalLM, LlamaTokenizer, AutoTokenizer import deepspeed -from bigdl.llm import optimize_model +from ipex_llm import optimize_model 
import torch import intel_extension_for_pytorch as ipex import time diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py index 42cb6ed5..37843751 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py index 3842164a..4acad805 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/georgesung/llama2_7b_chat_uncensored#prompt-style diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py index 70ccef6d..1f5852b6 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import LlamaTokenizer, GPTQConfig # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/generate.py index 4d6aebc3..69d9045f 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila2/generate.py index 6badf85b..b9bc0ee2 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila2/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan/generate.py 
b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan/generate.py index bdaa7f7a..df64f80e 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py index 8d1cce0c..59dccfe8 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # prompt format referred from https://github.com/baichuan-inc/Baichuan2/issues/227 diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/bluelm/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/bluelm/generate.py index b5812ba5..07a4359e 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/bluelm/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/bluelm/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/generate.py index b5f5ab6e..e38f56c4 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py index 18f6a863..fb1423fa 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/streamchat.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/streamchat.py index 3bbf5333..5cab690d 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/streamchat.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/streamchat.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import 
AutoModel +from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py index 9372ed8a..d3d8daae 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm3/streamchat.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm3/streamchat.py index 3006299d..5094a66b 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm3/streamchat.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm3/streamchat.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codellama/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codellama/generate.py index 3a383de7..b8329d61 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codellama/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codellama/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import CodeLlamaTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codeshell/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codeshell/generate.py index 50a542f6..adc79339 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codeshell/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codeshell/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deciLM-7b/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deciLM-7b/generate.py index 6f5ede1f..b82ddc7f 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deciLM-7b/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deciLM-7b/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoTokenizer -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/Deci/DeciLM-7B-instruct#prompt-template diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek-moe/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek-moe/generate.py index 87cca75f..6bce7e4f 100644 --- 
a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek-moe/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek-moe/generate.py @@ -36,7 +36,7 @@ if __name__ == '__main__': # Load model in 4 bit, # which convert the relevant layers in the model into INT4 format - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, load_in_4bit=True).eval() model.generation_config = GenerationConfig.from_pretrained(model_path) model.generation_config.pad_token_id = model.generation_config.eos_token_id diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek/generate.py index 8a14e335..679fe2e6 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/distil-whisper/recognize.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/distil-whisper/recognize.py index e954ed78..77088018 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/distil-whisper/recognize.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/distil-whisper/recognize.py @@ -17,7 +17,7 @@ import time import argparse -from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq +from ipex_llm.transformers import AutoModelForSpeechSeq2Seq from datasets import load_dataset from transformers import pipeline from transformers.models.whisper import WhisperFeatureExtractor, WhisperTokenizer diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v1/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v1/generate.py index a159a98c..ee043e0b 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v1/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v1/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v2/generate.py index 87298307..18867636 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v2/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/falcon-7b-instruct/modelling_RW.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/falcon-7b-instruct/modelling_RW.py index 1f582bcc..f6776860 100644 --- 
a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/falcon-7b-instruct/modelling_RW.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/falcon-7b-instruct/modelling_RW.py @@ -301,7 +301,7 @@ class Attention(nn.Module): # resize qk to 4D to match apply_rotary_pos_emb_no_cache_xpu's requirements. query_layer = query_layer.reshape(batch_size, self.num_heads, q_length, self.head_dim) key_layer = key_layer.reshape(batch_size, self.num_kv, q_length, self.head_dim) - from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu + from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu query_layer, key_layer = apply_rotary_pos_emb_no_cache_xpu(query_layer, key_layer, position_ids, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/generate.py index 1c2cab8c..6419aa5a 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py index 58d5e446..91b8addc 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModelForSeq2SeqLM +from ipex_llm.transformers import AutoModelForSeq2SeqLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/fuyu/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/fuyu/generate.py index 7fe83502..271b1d4f 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/fuyu/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/fuyu/generate.py @@ -19,7 +19,7 @@ import torch import argparse import time from PIL import Image -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM if __name__ == '__main__': parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Fuyu model') diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/gemma/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/gemma/generate.py index 0c1539ff..4606e2b5 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/gemma/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/gemma/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # The instruction-tuned models use a chat template that must be adhered to for conversational use. 
diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm-xcomposer/chat.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm-xcomposer/chat.py index dd824043..6834f582 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm-xcomposer/chat.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm-xcomposer/chat.py @@ -14,14 +14,14 @@ # limitations under the License. # -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer from transformers.generation import GenerationConfig import torch import time import os import argparse -from bigdl.llm import optimize_model +from ipex_llm import optimize_model if __name__ == '__main__': parser = argparse.ArgumentParser(description='Predict Tokens using `chat()` API for InternLM-XComposer model') diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm/generate.py index 1817079d..1c33a1fe 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm2/generate.py index 00ffec57..7e05e153 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm2/generate.py @@ -39,7 +39,7 @@ if __name__ == '__main__': model_path = args.repo_id_or_model_path - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True) diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama2/generate.py index 1d6c56e2..ed5c93b0 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama2/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import LlamaTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mistral/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mistral/generate.py index 72e4e269..94e6ab48 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mistral/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mistral/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git 
a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mixtral/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mixtral/generate.py index 5e1d4065..cd8b9f60 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mixtral/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mixtral/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/moss/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/moss/generate.py index 73ec0837..786f7f0e 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/moss/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/moss/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mpt/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mpt/generate.py index eb70ea70..e4caa938 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mpt/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mpt/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer, GenerationConfig # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py index e934d99b..710d2a39 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModel,AutoModelForCausalLM +from ipex_llm.transformers import AutoModel,AutoModelForCausalLM from transformers import AutoTokenizer, GenerationConfig # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-2/generate.py index 53a95623..91930b72 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-2/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModel,AutoModelForCausalLM +from ipex_llm.transformers import AutoModel,AutoModelForCausalLM from transformers import AutoTokenizer, GenerationConfig # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phixtral/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phixtral/generate.py index 276fa09e..395481ae 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phixtral/generate.py +++ 
b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phixtral/generate.py @@ -20,7 +20,7 @@ import argparse import numpy as np from transformers import AutoTokenizer, GenerationConfig -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to # TODO: https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_mixformer_sequential.py PHI1_5_PROMPT_FORMAT = " Question:{prompt}\n\n Answer:" @@ -41,7 +41,7 @@ if __name__ == '__main__': # Load model in 4 bit, # which convert the relevant layers in the model into INT4 format - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True) diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phoenix/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phoenix/generate.py index e22733e3..264f27e2 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phoenix/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phoenix/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could revise it based on the Phoenix model you choose to use diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py index 6c017755..60d71a89 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py @@ -14,14 +14,14 @@ # limitations under the License. 
# -from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM +from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, LlamaTokenizer from transformers.generation import GenerationConfig import torch import time import os import argparse -from bigdl.llm import optimize_model +from ipex_llm import optimize_model torch.manual_seed(1234) if __name__ == '__main__': diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/generate.py index 8143e34c..4f260181 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py index becfb0cc..2b1cebf3 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py @@ -36,7 +36,7 @@ if __name__ == '__main__': model_path = args.repo_id_or_model_path - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True) diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/redpajama/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/redpajama/generate.py index 08154485..da5f69ee 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/redpajama/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/redpajama/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/replit/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/replit/generate.py index 014d5f13..0599df2c 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/replit/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/replit/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/skywork/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/skywork/generate.py index 7a0ae024..47bc1c79 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/skywork/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/skywork/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune 
the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/solar/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/solar/generate.py index 91ec5000..b84d7b61 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/solar/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/solar/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/starcoder/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/starcoder/generate.py index d1291d7d..e6a80a71 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/starcoder/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/starcoder/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/vicuna/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/vicuna/generate.py index 2b76f478..118b6084 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/vicuna/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/vicuna/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import LlamaTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/long-segment-recognize.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/long-segment-recognize.py index 7e67ae45..942e7f14 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/long-segment-recognize.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/long-segment-recognize.py @@ -19,7 +19,7 @@ import librosa import argparse from transformers import pipeline -from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq +from ipex_llm.transformers import AutoModelForSpeechSeq2Seq from transformers.models.whisper import WhisperFeatureExtractor, WhisperTokenizer diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/recognize.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/recognize.py index e4a1185d..60de9751 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/recognize.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/recognize.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq +from ipex_llm.transformers import AutoModelForSpeechSeq2Seq from transformers import WhisperProcessor from datasets import load_dataset diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/wizardcoder-python/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/wizardcoder-python/generate.py index 52f61a5b..72d5dd97 100644 --- 
a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/wizardcoder-python/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/wizardcoder-python/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import LlamaTokenizer WIZARDCODERPYTHON_PROMPT_FORMAT = """Below is an instruction that describes a task. Write a response that appropriately completes the request. diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yi/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yi/generate.py index c7024b3c..f809c44b 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yi/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yi/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # Refer to https://huggingface.co/01-ai/Yi-6B-Chat#31-use-the-chat-model diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yuan2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yuan2/generate.py index 1c792931..46115bc0 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yuan2/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yuan2/generate.py @@ -18,7 +18,7 @@ import torch, transformers import sys, os, time import argparse from transformers import LlamaTokenizer -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM # Refer to https://huggingface.co/IEITYuan/Yuan2-2B-hf#Usage YUAN2_PROMPT_FORMAT = """ diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/ziya/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/ziya/generate.py index ab5708e8..cf4914c2 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/ziya/generate.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/ziya/generate.py @@ -39,7 +39,7 @@ if __name__ == '__main__': model_path = args.repo_id_or_model_path - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM # enabling `use_cache=True` allows the model to utilize the previous # key/values attentions to speed up decoding; # to obtain optimal performance with BigDL-LLM INT4 optimizations, diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/More-Data-Types/transformers_low_bit_pipeline.py b/python/llm/example/CPU/HF-Transformers-AutoModels/More-Data-Types/transformers_low_bit_pipeline.py index 9cf9cffb..02ea399e 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/More-Data-Types/transformers_low_bit_pipeline.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/More-Data-Types/transformers_low_bit_pipeline.py @@ -15,7 +15,7 @@ # import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import LlamaTokenizer, TextGenerationPipeline if __name__ == '__main__': @@ -38,7 +38,7 @@ if __name__ == '__main__': model = AutoModelForCausalLM.load_low_bit(load_path) tokenizer = LlamaTokenizer.from_pretrained(load_path) else: - # load_in_low_bit in bigdl.llm.transformers will convert + # load_in_low_bit in ipex_llm.transformers will convert # the 
relevant layers in the model into corresponding int X format model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True) tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True) diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Save-Load/transformers_low_bit_pipeline.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Save-Load/transformers_low_bit_pipeline.py index 9cf9cffb..02ea399e 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Save-Load/transformers_low_bit_pipeline.py +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Save-Load/transformers_low_bit_pipeline.py @@ -15,7 +15,7 @@ # import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import LlamaTokenizer, TextGenerationPipeline if __name__ == '__main__': @@ -38,7 +38,7 @@ if __name__ == '__main__': model = AutoModelForCausalLM.load_low_bit(load_path) tokenizer = LlamaTokenizer.from_pretrained(load_path) else: - # load_in_low_bit in bigdl.llm.transformers will convert + # load_in_low_bit in ipex_llm.transformers will convert # the relevant layers in the model into corresponding int X format model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True) tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True) diff --git a/python/llm/example/CPU/LangChain/native_int4/docqa.py b/python/llm/example/CPU/LangChain/native_int4/docqa.py index 8f3b4bdd..ce7cf8f1 100644 --- a/python/llm/example/CPU/LangChain/native_int4/docqa.py +++ b/python/llm/example/CPU/LangChain/native_int4/docqa.py @@ -31,8 +31,8 @@ from langchain.chains.question_answering import load_qa_chain from langchain.callbacks.manager import CallbackManager from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler -from bigdl.llm.langchain.llms import * -from bigdl.llm.langchain.embeddings import * +from ipex_llm.langchain.llms import * +from ipex_llm.langchain.embeddings import * def main(args): diff --git a/python/llm/example/CPU/LangChain/native_int4/streamchat.py b/python/llm/example/CPU/LangChain/native_int4/streamchat.py index baa94db1..a0127dd0 100644 --- a/python/llm/example/CPU/LangChain/native_int4/streamchat.py +++ b/python/llm/example/CPU/LangChain/native_int4/streamchat.py @@ -21,7 +21,7 @@ import argparse -from bigdl.llm.langchain.llms import * +from ipex_llm.langchain.llms import * from langchain import PromptTemplate, LLMChain from langchain.callbacks.manager import CallbackManager from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler diff --git a/python/llm/example/CPU/LangChain/native_int4/voiceassistant.py b/python/llm/example/CPU/LangChain/native_int4/voiceassistant.py index 80718c50..c41666ca 100644 --- a/python/llm/example/CPU/LangChain/native_int4/voiceassistant.py +++ b/python/llm/example/CPU/LangChain/native_int4/voiceassistant.py @@ -23,7 +23,7 @@ from langchain import LLMChain, PromptTemplate -from bigdl.llm.langchain.llms import * +from ipex_llm.langchain.llms import * from langchain.memory import ConversationBufferWindowMemory from langchain.callbacks.manager import CallbackManager from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler diff --git a/python/llm/example/CPU/LangChain/transformers_int4/chat.py b/python/llm/example/CPU/LangChain/transformers_int4/chat.py index e6be1ca1..b3695199 100644 --- 
a/python/llm/example/CPU/LangChain/transformers_int4/chat.py +++ b/python/llm/example/CPU/LangChain/transformers_int4/chat.py @@ -21,7 +21,7 @@ import argparse -from bigdl.llm.langchain.llms import TransformersLLM, TransformersPipelineLLM +from ipex_llm.langchain.llms import TransformersLLM, TransformersPipelineLLM from langchain import PromptTemplate, LLMChain from langchain import HuggingFacePipeline diff --git a/python/llm/example/CPU/LangChain/transformers_int4/llm_math.py b/python/llm/example/CPU/LangChain/transformers_int4/llm_math.py index 456ac567..567b0071 100644 --- a/python/llm/example/CPU/LangChain/transformers_int4/llm_math.py +++ b/python/llm/example/CPU/LangChain/transformers_int4/llm_math.py @@ -25,7 +25,7 @@ import argparse from langchain.chains import LLMMathChain -from bigdl.llm.langchain.llms import TransformersLLM, TransformersPipelineLLM +from ipex_llm.langchain.llms import TransformersLLM, TransformersPipelineLLM def main(args): diff --git a/python/llm/example/CPU/LangChain/transformers_int4/rag.py b/python/llm/example/CPU/LangChain/transformers_int4/rag.py index 7d9ae45f..960d23b6 100644 --- a/python/llm/example/CPU/LangChain/transformers_int4/rag.py +++ b/python/llm/example/CPU/LangChain/transformers_int4/rag.py @@ -30,8 +30,8 @@ from langchain.text_splitter import CharacterTextSplitter from langchain.chains.question_answering import load_qa_chain from langchain.callbacks.manager import CallbackManager -from bigdl.llm.langchain.llms import TransformersLLM -from bigdl.llm.langchain.embeddings import TransformersEmbeddings +from ipex_llm.langchain.llms import TransformersLLM +from ipex_llm.langchain.embeddings import TransformersEmbeddings text_doc = ''' BigDL seamlessly scales your data analytics & AI applications from laptop to cloud, with the following libraries: diff --git a/python/llm/example/CPU/LangChain/transformers_int4/voiceassistant.py b/python/llm/example/CPU/LangChain/transformers_int4/voiceassistant.py index 7c649fdb..279a14e5 100644 --- a/python/llm/example/CPU/LangChain/transformers_int4/voiceassistant.py +++ b/python/llm/example/CPU/LangChain/transformers_int4/voiceassistant.py @@ -23,9 +23,9 @@ from langchain import LLMChain, PromptTemplate -from bigdl.llm.langchain.llms import TransformersLLM +from ipex_llm.langchain.llms import TransformersLLM from langchain.memory import ConversationBufferWindowMemory -from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq +from ipex_llm.transformers import AutoModelForSpeechSeq2Seq from transformers import WhisperProcessor import speech_recognition as sr import numpy as np diff --git a/python/llm/example/CPU/LlamaIndex/rag.py b/python/llm/example/CPU/LlamaIndex/rag.py index 9fd81ca1..c4c4c8f8 100644 --- a/python/llm/example/CPU/LlamaIndex/rag.py +++ b/python/llm/example/CPU/LlamaIndex/rag.py @@ -164,7 +164,7 @@ def main(args): embed_model = HuggingFaceEmbedding(model_name=args.embedding_model_path) # Use custom LLM in BigDL - from bigdl.llm.llamaindex.llms import BigdlLLM + from ipex_llm.llamaindex.llms import BigdlLLM llm = BigdlLLM( model_name=args.model_path, tokenizer_name=args.model_path, diff --git a/python/llm/example/CPU/ModelScope-Models/generate.py b/python/llm/example/CPU/ModelScope-Models/generate.py index 3fef46fd..274566f3 100644 --- a/python/llm/example/CPU/ModelScope-Models/generate.py +++ b/python/llm/example/CPU/ModelScope-Models/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import 
AutoModel from modelscope import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/Native-Models/native_int4_pipeline.py b/python/llm/example/CPU/Native-Models/native_int4_pipeline.py index fe55784e..aa349c29 100644 --- a/python/llm/example/CPU/Native-Models/native_int4_pipeline.py +++ b/python/llm/example/CPU/Native-Models/native_int4_pipeline.py @@ -16,11 +16,11 @@ import time import argparse -from bigdl.llm.transformers import * +from ipex_llm.transformers import * def convert(repo_id_or_model_path, model_family, tmp_path): - from bigdl.llm import llm_convert + from ipex_llm import llm_convert original_llm_path = repo_id_or_model_path bigdl_llm_path = llm_convert( model=original_llm_path, diff --git a/python/llm/example/CPU/PyTorch-Models/Model/aquila2/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/aquila2/generate.py index 6a9c7fde..9b219452 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/aquila2/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/aquila2/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/BAAI/AquilaChat2-7B/tree/main/predict.py diff --git a/python/llm/example/CPU/PyTorch-Models/Model/bark/synthesize_speech.py b/python/llm/example/CPU/PyTorch-Models/Model/bark/synthesize_speech.py index bb8a61b8..1811c36b 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/bark/synthesize_speech.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/bark/synthesize_speech.py @@ -20,7 +20,7 @@ import argparse from TTS.tts.configs.bark_config import BarkConfig from TTS.tts.models.bark import Bark -from bigdl.llm import optimize_model +from ipex_llm import optimize_model if __name__ == '__main__': diff --git a/python/llm/example/CPU/PyTorch-Models/Model/bert/extract_feature.py b/python/llm/example/CPU/PyTorch-Models/Model/bert/extract_feature.py index aab47ff7..cd0c73b7 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/bert/extract_feature.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/bert/extract_feature.py @@ -19,7 +19,7 @@ import time import argparse from transformers import BertTokenizer, BertModel -from bigdl.llm import optimize_model +from ipex_llm import optimize_model if __name__ == '__main__': diff --git a/python/llm/example/CPU/PyTorch-Models/Model/bluelm/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/bluelm/generate.py index 97d66281..d16d2331 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/bluelm/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/bluelm/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model BLUELM_PROMPT_FORMAT = "[|Human|]:{prompt}[|AI|]:" diff --git a/python/llm/example/CPU/PyTorch-Models/Model/chatglm/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/chatglm/generate.py index d3f4b6cd..89d26761 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/chatglm/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/chatglm/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModel, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm 
import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/THUDM/chatglm-6b/blob/294cb13118a1e08ad8449ca542624a5c6aecc401/modeling_chatglm.py#L1281 diff --git a/python/llm/example/CPU/PyTorch-Models/Model/chatglm3/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/chatglm3/generate.py index 72a0ab99..22fdeaad 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/chatglm3/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/chatglm3/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModel, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://github.com/THUDM/ChatGLM3/blob/main/PROMPT.md diff --git a/python/llm/example/CPU/PyTorch-Models/Model/codellama/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/codellama/generate.py index f217676a..12266e99 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/codellama/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/codellama/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, CodeLlamaTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/docs/transformers/v4.34.1/model_doc/code_llama diff --git a/python/llm/example/CPU/PyTorch-Models/Model/codeshell/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/codeshell/generate.py index cd235025..a0610bc6 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/codeshell/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/codeshell/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoTokenizer, AutoModelForCausalLM -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/WisdomShell/CodeShell-7B diff --git a/python/llm/example/CPU/PyTorch-Models/Model/deciLM-7b/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/deciLM-7b/generate.py index 2f0cdc72..8714b419 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/deciLM-7b/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/deciLM-7b/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/Deci/DeciLM-7B-instruct#prompt-template diff --git a/python/llm/example/CPU/PyTorch-Models/Model/deepseek-moe/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/deepseek-moe/generate.py index d0ca949a..af5ec2f0 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/deepseek-moe/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/deepseek-moe/generate.py @@ -36,7 +36,7 @@ if __name__ == '__main__': from transformers import AutoModelForCausalLM - from bigdl.llm import optimize_model + from ipex_llm import optimize_model model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype = torch.bfloat16, device_map = "auto", attn_implementation="eager") model.generation_config = GenerationConfig.from_pretrained(model_path) 
model.generation_config.pad_token_id = model.generation_config.eos_token_id diff --git a/python/llm/example/CPU/PyTorch-Models/Model/deepseek/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/deepseek/generate.py index b64eaa9b..1a2cbaec 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/deepseek/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/deepseek/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # Refer to https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct diff --git a/python/llm/example/CPU/PyTorch-Models/Model/distil-whisper/recognize.py b/python/llm/example/CPU/PyTorch-Models/Model/distil-whisper/recognize.py index 338db3f6..bcd7b852 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/distil-whisper/recognize.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/distil-whisper/recognize.py @@ -17,7 +17,7 @@ import time import argparse -from bigdl.llm import optimize_model +from ipex_llm import optimize_model from datasets import load_dataset from transformers import AutoModelForSpeechSeq2Seq, pipeline from transformers.models.whisper import WhisperFeatureExtractor, WhisperTokenizer diff --git a/python/llm/example/CPU/PyTorch-Models/Model/flan-t5/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/flan-t5/generate.py index 51ba2500..35e1b25d 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/flan-t5/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/flan-t5/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForSeq2SeqLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, FLAN_T5_PROMPT_FORMAT = "<|User|>:{prompt}" diff --git a/python/llm/example/CPU/PyTorch-Models/Model/fuyu/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/fuyu/generate.py index 234e4741..8e2397ba 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/fuyu/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/fuyu/generate.py @@ -19,7 +19,7 @@ import torch import argparse import time from PIL import Image -from bigdl.llm import optimize_model +from ipex_llm import optimize_model if __name__ == '__main__': parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Fuyu model') diff --git a/python/llm/example/CPU/PyTorch-Models/Model/internlm-xcomposer/chat.py b/python/llm/example/CPU/PyTorch-Models/Model/internlm-xcomposer/chat.py index 3463eb3a..dc664493 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/internlm-xcomposer/chat.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/internlm-xcomposer/chat.py @@ -20,7 +20,7 @@ import torch import time import os import argparse -from bigdl.llm import optimize_model +from ipex_llm import optimize_model if __name__ == '__main__': parser = argparse.ArgumentParser(description='Predict Tokens using `chat()` API for InternLM-XComposer model') diff --git a/python/llm/example/CPU/PyTorch-Models/Model/internlm2/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/internlm2/generate.py index afca8397..7cfb80c5 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/internlm2/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/internlm2/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from 
bigdl.llm import optimize_model +from ipex_llm import optimize_model from transformers import AutoTokenizer # you could tune the prompt based on your own model, @@ -40,7 +40,7 @@ if __name__ == '__main__': model_path = args.repo_id_or_model_path - from bigdl.llm import optimize_model + from ipex_llm import optimize_model from transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) diff --git a/python/llm/example/CPU/PyTorch-Models/Model/llama2/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/llama2/generate.py index b2c5ca70..6c4ab17a 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/llama2/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/llama2/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, LlamaTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/georgesung/llama2_7b_chat_uncensored#prompt-style diff --git a/python/llm/example/CPU/PyTorch-Models/Model/llava/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/llava/generate.py index d5d0d2ce..c27d1a50 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/llava/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/llava/generate.py @@ -56,7 +56,7 @@ from llava.mm_utils import ( KeywordsStoppingCriteria ) -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # Load the pretrained model. # Adapted from llava.model.builder.load_pretrained_model. diff --git a/python/llm/example/CPU/PyTorch-Models/Model/mamba/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/mamba/generate.py index abc050a2..9462474a 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/mamba/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/mamba/generate.py @@ -17,7 +17,7 @@ import argparse import time import torch -from bigdl.llm import optimize_model +from ipex_llm import optimize_model from transformers import AutoTokenizer from model import MambaLMHeadModel diff --git a/python/llm/example/CPU/PyTorch-Models/Model/meta-llama/cpu.patch b/python/llm/example/CPU/PyTorch-Models/Model/meta-llama/cpu.patch index fab864f8..d7b5e9dd 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/meta-llama/cpu.patch +++ b/python/llm/example/CPU/PyTorch-Models/Model/meta-llama/cpu.patch @@ -36,7 +36,7 @@ index acedf44..df4e5d6 100644 from llama import Llama, Dialog --from bigdl.llm.optimize import optimize_model +-from ipex_llm.optimize import optimize_model - def main( @@ -67,7 +67,7 @@ index 1f63bb0..0d60b9c 100755 from llama import Llama from typing import List --from bigdl.llm.optimize import optimize_model +-from ipex_llm.optimize import optimize_model - def main( ckpt_dir: str, diff --git a/python/llm/example/CPU/PyTorch-Models/Model/meta-llama/example_chat_completion.py b/python/llm/example/CPU/PyTorch-Models/Model/meta-llama/example_chat_completion.py index d50f1608..dd4863d2 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/meta-llama/example_chat_completion.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/meta-llama/example_chat_completion.py @@ -24,7 +24,7 @@ import fire from llama import Llama, Dialog -from bigdl.llm.optimize import optimize_model +from ipex_llm.optimize import optimize_model def main( diff --git a/python/llm/example/CPU/PyTorch-Models/Model/meta-llama/example_text_completion.py 
b/python/llm/example/CPU/PyTorch-Models/Model/meta-llama/example_text_completion.py index 9342cfce..3744540b 100755 --- a/python/llm/example/CPU/PyTorch-Models/Model/meta-llama/example_text_completion.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/meta-llama/example_text_completion.py @@ -23,7 +23,7 @@ import fire from llama import Llama from typing import List -from bigdl.llm.optimize import optimize_model +from ipex_llm.optimize import optimize_model def main( ckpt_dir: str, diff --git a/python/llm/example/CPU/PyTorch-Models/Model/mistral/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/mistral/generate.py index 6fa1522a..37958b67 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/mistral/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/mistral/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1#instruction-format diff --git a/python/llm/example/CPU/PyTorch-Models/Model/mixtral/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/mixtral/generate.py index d79e8a72..557f54c8 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/mixtral/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/mixtral/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1#instruction-format diff --git a/python/llm/example/CPU/PyTorch-Models/Model/openai-whisper/recognize.py b/python/llm/example/CPU/PyTorch-Models/Model/openai-whisper/recognize.py index ca2c5dc4..1b071d57 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/openai-whisper/recognize.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/openai-whisper/recognize.py @@ -19,7 +19,7 @@ import whisper import time import librosa import argparse -from bigdl.llm import optimize_model +from ipex_llm import optimize_model if __name__ == '__main__': diff --git a/python/llm/example/CPU/PyTorch-Models/Model/phi-1_5/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/phi-1_5/generate.py index f819a47f..f70da15d 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/phi-1_5/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/phi-1_5/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, GenerationConfig -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_mixformer_sequential.py diff --git a/python/llm/example/CPU/PyTorch-Models/Model/phi-2/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/phi-2/generate.py index a4f54355..319c009f 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/phi-2/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/phi-2/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, GenerationConfig -from bigdl.llm import optimize_model +from ipex_llm import optimize_model 
# you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/microsoft/phi-2 diff --git a/python/llm/example/CPU/PyTorch-Models/Model/phixtral/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/phixtral/generate.py index e66863ad..75f4ba6a 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/phixtral/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/phixtral/generate.py @@ -20,7 +20,7 @@ import argparse import numpy as np from transformers import AutoTokenizer, GenerationConfig -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to # TODO: https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_mixformer_sequential.py PHI1_5_PROMPT_FORMAT = " Question:{prompt}\n\n Answer:" diff --git a/python/llm/example/CPU/PyTorch-Models/Model/qwen-vl/chat.py b/python/llm/example/CPU/PyTorch-Models/Model/qwen-vl/chat.py index 5502a697..6ed3adec 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/qwen-vl/chat.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/qwen-vl/chat.py @@ -20,7 +20,7 @@ import torch import time import os import argparse -from bigdl.llm import optimize_model +from ipex_llm import optimize_model torch.manual_seed(1234) if __name__ == '__main__': diff --git a/python/llm/example/CPU/PyTorch-Models/Model/qwen1.5/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/qwen1.5/generate.py index da769fc3..82f97ae2 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/qwen1.5/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/qwen1.5/generate.py @@ -44,7 +44,7 @@ if __name__ == '__main__': # Load tokenizer tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - from bigdl.llm import optimize_model + from ipex_llm import optimize_model model = optimize_model(model) prompt = args.prompt diff --git a/python/llm/example/CPU/PyTorch-Models/Model/skywork/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/skywork/generate.py index 8528de69..fa52a2e9 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/skywork/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/skywork/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, SKYWORK_PROMPT_FORMAT = "{prompt}" diff --git a/python/llm/example/CPU/PyTorch-Models/Model/solar/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/solar/generate.py index 612d9aca..2ddd48af 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/solar/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/solar/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # prompt format is tuned based on the output example in this link: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/wizardcoder-python/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/wizardcoder-python/generate.py index ed9da6ff..832f7623 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/wizardcoder-python/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/wizardcoder-python/generate.py @@ -19,7 +19,7 @@ import time import argparse from 
transformers import AutoModelForCausalLM, LlamaTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model WIZARDCODERPYTHON_PROMPT_FORMAT = """Below is an instruction that describes a task. Write a response that appropriately completes the request. diff --git a/python/llm/example/CPU/PyTorch-Models/Model/yi/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/yi/generate.py index ddfe6d49..bf6af053 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/yi/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/yi/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm import optimize_model +from ipex_llm import optimize_model from transformers import AutoModelForCausalLM, AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/PyTorch-Models/Model/yuan2/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/yuan2/generate.py index 5ebbe21d..ea71ad76 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/yuan2/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/yuan2/generate.py @@ -18,7 +18,7 @@ import torch, transformers import sys, os, time import argparse from transformers import LlamaTokenizer, AutoModelForCausalLM -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # Refer to https://huggingface.co/IEITYuan/Yuan2-2B-hf#Usage YUAN2_PROMPT_FORMAT = """ diff --git a/python/llm/example/CPU/PyTorch-Models/Model/ziya/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/ziya/generate.py index 16046951..e6f2c02d 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/ziya/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Model/ziya/generate.py @@ -39,7 +39,7 @@ if __name__ == '__main__': from transformers import AutoModelForCausalLM - from bigdl.llm import optimize_model + from ipex_llm import optimize_model # enabling `use_cache=True` allows the model to utilize the previous # key/values attentions to speed up decoding; # to obtain optimal performance with BigDL-LLM `optimization_model` API optimizations, diff --git a/python/llm/example/CPU/PyTorch-Models/More-Data-Types/generate.py b/python/llm/example/CPU/PyTorch-Models/More-Data-Types/generate.py index 5e1b67cc..59c01f63 100644 --- a/python/llm/example/CPU/PyTorch-Models/More-Data-Types/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/More-Data-Types/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, LlamaTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/georgesung/llama2_7b_chat_uncensored#prompt-style diff --git a/python/llm/example/CPU/PyTorch-Models/Save-Load/generate.py b/python/llm/example/CPU/PyTorch-Models/Save-Load/generate.py index ccbee8aa..6d13258d 100644 --- a/python/llm/example/CPU/PyTorch-Models/Save-Load/generate.py +++ b/python/llm/example/CPU/PyTorch-Models/Save-Load/generate.py @@ -17,8 +17,8 @@ import torch import time import argparse -from bigdl.llm import optimize_model -from bigdl.llm.optimize import low_memory_init, load_low_bit +from ipex_llm import optimize_model +from ipex_llm.optimize import low_memory_init, load_low_bit from transformers import AutoModelForCausalLM, LlamaTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning_cpu.py 
b/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning_cpu.py index 3e3fd34d..cdf3196c 100644 --- a/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning_cpu.py +++ b/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning_cpu.py @@ -47,11 +47,11 @@ from peft import ( from utils.prompter import Prompter from transformers import BitsAndBytesConfig -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM -# import them from bigdl.llm.transformers.qlora to get a BigDL-LLM compatible Peft model -from bigdl.llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training, LoraConfig -from bigdl.llm.utils.isa_checker import ISAChecker +# import them from ipex_llm.transformers.qlora to get a BigDL-LLM compatible Peft model +from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training, LoraConfig +from ipex_llm.utils.isa_checker import ISAChecker def get_int_from_env(env_keys, default): """Returns the first positive env value found in the `env_keys` list or the default.""" diff --git a/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/utils/prompter.py b/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/utils/prompter.py index 33355129..86d6422d 100644 --- a/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/utils/prompter.py +++ b/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/utils/prompter.py @@ -34,7 +34,7 @@ import json import os.path as osp from typing import Union -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError class Prompter(object): diff --git a/python/llm/example/CPU/QLoRA-FineTuning/qlora_finetuning_cpu.py b/python/llm/example/CPU/QLoRA-FineTuning/qlora_finetuning_cpu.py index 1a8c6054..6b177056 100644 --- a/python/llm/example/CPU/QLoRA-FineTuning/qlora_finetuning_cpu.py +++ b/python/llm/example/CPU/QLoRA-FineTuning/qlora_finetuning_cpu.py @@ -21,11 +21,11 @@ import transformers from transformers import LlamaTokenizer from transformers import BitsAndBytesConfig -from bigdl.llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training, LoraConfig -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training, LoraConfig +from ipex_llm.transformers import AutoModelForCausalLM from datasets import load_dataset import argparse -from bigdl.llm.utils.isa_checker import ISAChecker +from ipex_llm.utils.isa_checker import ISAChecker current_dir = os.path.dirname(os.path.realpath(__file__)) common_util_path = os.path.join(current_dir, '..', '..', 'GPU', 'LLM-Finetuning') diff --git a/python/llm/example/CPU/Speculative-Decoding/baichuan2/speculative.py b/python/llm/example/CPU/Speculative-Decoding/baichuan2/speculative.py index bdc1e533..1010618c 100644 --- a/python/llm/example/CPU/Speculative-Decoding/baichuan2/speculative.py +++ b/python/llm/example/CPU/Speculative-Decoding/baichuan2/speculative.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM +from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer import argparse import time diff --git a/python/llm/example/CPU/Speculative-Decoding/chatglm3/speculative.py b/python/llm/example/CPU/Speculative-Decoding/chatglm3/speculative.py index b60b87b7..971e60e6 100644 --- 
a/python/llm/example/CPU/Speculative-Decoding/chatglm3/speculative.py +++ b/python/llm/example/CPU/Speculative-Decoding/chatglm3/speculative.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer import argparse import time diff --git a/python/llm/example/CPU/Speculative-Decoding/llama2/speculative.py b/python/llm/example/CPU/Speculative-Decoding/llama2/speculative.py index 5c827e55..5e3c5f8b 100644 --- a/python/llm/example/CPU/Speculative-Decoding/llama2/speculative.py +++ b/python/llm/example/CPU/Speculative-Decoding/llama2/speculative.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM +from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import LlamaTokenizer, AutoTokenizer import argparse import time diff --git a/python/llm/example/CPU/Speculative-Decoding/mistral/speculative.py b/python/llm/example/CPU/Speculative-Decoding/mistral/speculative.py index ba9d2d10..714eb430 100644 --- a/python/llm/example/CPU/Speculative-Decoding/mistral/speculative.py +++ b/python/llm/example/CPU/Speculative-Decoding/mistral/speculative.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer import argparse import time diff --git a/python/llm/example/CPU/Speculative-Decoding/qwen/speculative.py b/python/llm/example/CPU/Speculative-Decoding/qwen/speculative.py index d3e71739..c92b8512 100644 --- a/python/llm/example/CPU/Speculative-Decoding/qwen/speculative.py +++ b/python/llm/example/CPU/Speculative-Decoding/qwen/speculative.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM +from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer import argparse import time diff --git a/python/llm/example/CPU/Speculative-Decoding/starcoder/speculative.py b/python/llm/example/CPU/Speculative-Decoding/starcoder/speculative.py index 08305401..0bcd026e 100644 --- a/python/llm/example/CPU/Speculative-Decoding/starcoder/speculative.py +++ b/python/llm/example/CPU/Speculative-Decoding/starcoder/speculative.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer import argparse import time diff --git a/python/llm/example/CPU/Speculative-Decoding/vicuna/speculative.py b/python/llm/example/CPU/Speculative-Decoding/vicuna/speculative.py index 265eadf2..279f3550 100644 --- a/python/llm/example/CPU/Speculative-Decoding/vicuna/speculative.py +++ b/python/llm/example/CPU/Speculative-Decoding/vicuna/speculative.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM +from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer import argparse import time diff --git a/python/llm/example/CPU/Speculative-Decoding/ziya/speculative.py b/python/llm/example/CPU/Speculative-Decoding/ziya/speculative.py index 0d35294d..6db383c4 100644 --- a/python/llm/example/CPU/Speculative-Decoding/ziya/speculative.py +++ b/python/llm/example/CPU/Speculative-Decoding/ziya/speculative.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import 
AutoModelForCausalLM
from transformers import AutoTokenizer
import argparse
import time
diff --git a/python/llm/example/CPU/vLLM-Serving/README.md b/python/llm/example/CPU/vLLM-Serving/README.md
index 8d650d8a..a7e6ec1e 100644
--- a/python/llm/example/CPU/vLLM-Serving/README.md
+++ b/python/llm/example/CPU/vLLM-Serving/README.md
@@ -56,7 +56,7 @@ To fully utilize the continuous batching feature of the `vLLM`, you can send req
#!/bin/bash
# You may also want to adjust the `--max-num-batched-tokens` argument, it indicates the hard limit
# of batched prompt length the server will accept
-numactl -C 48-95 -m 1 python -m bigdl.llm.vllm.entrypoints.openai.api_server \
+numactl -C 48-95 -m 1 python -m ipex_llm.vllm.entrypoints.openai.api_server \
        --model /MODEL_PATH/Llama-2-7b-chat-hf-bigdl/ --port 8000 \
        --load-format 'auto' --device cpu --dtype bfloat16 \
        --load-in-low-bit sym_int4 \
diff --git a/python/llm/example/CPU/vLLM-Serving/offline_inference.py b/python/llm/example/CPU/vLLM-Serving/offline_inference.py
index 84ecb5a1..00fe1f55 100644
--- a/python/llm/example/CPU/vLLM-Serving/offline_inference.py
+++ b/python/llm/example/CPU/vLLM-Serving/offline_inference.py
@@ -31,8 +31,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from bigdl.llm.vllm.entrypoints.llm import LLM
-from bigdl.llm.vllm.sampling_params import SamplingParams
+from ipex_llm.vllm.entrypoints.llm import LLM
+from ipex_llm.vllm.sampling_params import SamplingParams
# Sample prompts.
prompts = [
diff --git a/python/llm/example/GPU/Applications/autogen/README.md b/python/llm/example/GPU/Applications/autogen/README.md
index f18aa3cb..7ac7c4eb 100644
--- a/python/llm/example/GPU/Applications/autogen/README.md
+++ b/python/llm/example/GPU/Applications/autogen/README.md
@@ -71,11 +71,11 @@ conda activate autogen
cd autogen
# load the local model with xpu with your downloaded model
-python -m bigdl.llm.serving.model_worker --model-path ... --device xpu
+python -m ipex_llm.serving.model_worker --model-path ... --device xpu
```
Model Name Note:
-> Assume you use the model `Mistral-7B-Instruct-v0.2` and your model is downloaded to `autogen/model/Mistral-7B-Instruct-v0.2`. You should rename the model to `autogen/model/bigdl` and run `python -m bigdl.llm.serving.model_worker --model-path ... --device xpu`. This ensures the proper usage of the BigDL-adapted FastChat.
+> Assume you use the model `Mistral-7B-Instruct-v0.2` and your model is downloaded to `autogen/model/Mistral-7B-Instruct-v0.2`. You should rename the model to `autogen/model/bigdl` and run `python -m ipex_llm.serving.model_worker --model-path ... --device xpu`. This ensures the proper usage of the BigDL-adapted FastChat.
Device Note:
> Please set `--device` to `xpu` to enable the Intel GPU usage.
diff --git a/python/llm/example/GPU/Applications/streaming-llm/README.md b/python/llm/example/GPU/Applications/streaming-llm/README.md
index 54aa89e2..c783bc09 100644
--- a/python/llm/example/GPU/Applications/streaming-llm/README.md
+++ b/python/llm/example/GPU/Applications/streaming-llm/README.md
@@ -3,7 +3,7 @@
In this example, we apply low-bit optimizations to [Streaming-LLM](https://github.com/mit-han-lab/streaming-llm/tree/main#efficient-streaming-language-models-with-attention-sinks) using BigDL-LLM, which can deploy low-bit(including FP4/INT4/FP8/INT8) LLMs for infinite-length inputs.
Only one code change is needed to load the model using bigdl-llm as follows:
```python
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, load_in_4bit=True, trust_remote_code=True, optimize_model=False)
```
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
index 54994c0f..a64693e8 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
@@ -34,7 +34,7 @@
os.environ["WORLD_SIZE"] = str(world_size)
os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", "29500")
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model
import torch
import time
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py
index 7402a378..c6ff5241 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py
@@ -17,7 +17,7 @@
import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
# you could tune the prompt based on your own model,
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/generate.py
index f6ca2511..d8342d8e 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/generate.py
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/generate.py
@@ -17,7 +17,7 @@
import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
import warnings
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py
index e8ee9c36..9272b727 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py
@@ -19,7 +19,7 @@
import time
import argparse
from transformers import LlamaTokenizer
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
# you could tune the prompt based on your own model,
# here the prompt tuning refers to https://huggingface.co/georgesung/llama2_7b_chat_uncensored#prompt-style
diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py
index 0f595d6e..4317730f 100644
--- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py
+++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py
@@ -17,7 +17,7 @@
import torch
import time
import argparse
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import
AutoModelForCausalLM from transformers import LlamaTokenizer, GPTQConfig # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila/generate.py index 0f233796..ec729d12 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila2/generate.py index c423904b..cc75bba7 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila2/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan/generate.py index 18c0e10d..4e34654e 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py index ac004a52..37e65743 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # prompt format referred from https://github.com/baichuan-inc/Baichuan2/issues/227 diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/bluelm/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/bluelm/generate.py index d34ff2ef..2d30cd29 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/bluelm/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/bluelm/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py index b3f86e90..1da87c11 100644 --- 
a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/streamchat.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/streamchat.py index 1e7ee00e..36752e87 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/streamchat.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/streamchat.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py index 35ecfb49..109b40ca 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/streamchat.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/streamchat.py index 9fbbd16c..3bf7fd7f 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/streamchat.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm3/streamchat.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chinese-llama2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chinese-llama2/generate.py index da977404..cf0e554f 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chinese-llama2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chinese-llama2/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import LlamaTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codellama/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codellama/generate.py index 6de2f2d7..8192d78e 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codellama/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codellama/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import CodeLlamaTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codeshell/server.py 
b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codeshell/server.py index 5a77812c..9aeed3ba 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codeshell/server.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codeshell/server.py @@ -28,7 +28,7 @@ from fastapi.middleware.cors import CORSMiddleware from pydantic import BaseModel from transformers import AutoTokenizer # from transformers import AutoModelForCausalLM, AutoModel -from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel +from ipex_llm.transformers import AutoModelForCausalLM, AutoModel from transformers.generation import GenerationConfig, TextIteratorStreamer from transformers import StoppingCriteriaList, StoppingCriteria from sse_starlette.sse import EventSourceResponse diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/deciLM-7b/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/deciLM-7b/generate.py index b9a63832..728ae71f 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/deciLM-7b/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/deciLM-7b/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoTokenizer -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/Deci/DeciLM-7B-instruct#prompt-template diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/deepseek/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/deepseek/generate.py index 7a7d9aaf..802bb284 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/deepseek/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/deepseek/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoTokenizer -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/Deci/DeciLM-7B-instruct#prompt-template diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/distil-whisper/recognize.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/distil-whisper/recognize.py index 67e3cbe4..935216fd 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/distil-whisper/recognize.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/distil-whisper/recognize.py @@ -18,7 +18,7 @@ import time import argparse from transformers import pipeline -from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq +from ipex_llm.transformers import AutoModelForSpeechSeq2Seq from transformers.models.whisper import WhisperFeatureExtractor, WhisperTokenizer from datasets import load_dataset diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v1/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v1/generate.py index b4a9c439..21d2a43b 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v1/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v1/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the 
prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v2/generate.py index 93729f74..b5182e39 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/dolly-v2/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/falcon-7b-instruct/modelling_RW.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/falcon-7b-instruct/modelling_RW.py index 1f582bcc..f6776860 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/falcon-7b-instruct/modelling_RW.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/falcon-7b-instruct/modelling_RW.py @@ -301,7 +301,7 @@ class Attention(nn.Module): # resize qk to 4D to match apply_rotary_pos_emb_no_cache_xpu's requirements. query_layer = query_layer.reshape(batch_size, self.num_heads, q_length, self.head_dim) key_layer = key_layer.reshape(batch_size, self.num_kv, q_length, self.head_dim) - from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu + from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu query_layer, key_layer = apply_rotary_pos_emb_no_cache_xpu(query_layer, key_layer, position_ids, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/generate.py index 81229c5e..85669e7c 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py index 7ebfc1f2..c0f0773b 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForSeq2SeqLM +from ipex_llm.transformers import AutoModelForSeq2SeqLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gemma/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gemma/generate.py index c8abc40f..5a328377 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gemma/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gemma/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # The instruction-tuned models use a chat template that must 
be adhered to for conversational use. diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gpt-j/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gpt-j/generate.py index d9da4b5c..87fd5c97 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gpt-j/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gpt-j/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm/generate.py index 99e5b52f..ce25713b 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2/generate.py index fdbd312c..7a751793 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model import intel_extension_for_pytorch as ipex # you could tune the prompt based on your own model, @@ -43,7 +43,7 @@ if __name__ == '__main__': # which convert the relevant layers in the model into INT4 format # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. 
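An aside before the hunk continues: a minimal sketch of the loading pattern the comment above describes, assuming a placeholder model path; `cpu_embedding=True` is only the recommendation for Intel iGPUs on Windows and can be omitted otherwise:

```python
# Minimal sketch; the model path is a placeholder, not a value from the diff.
import intel_extension_for_pytorch as ipex  # imported in the GPU examples so the 'xpu' device is available
from ipex_llm.transformers import AutoModelForCausalLM  # was: from bigdl.llm.transformers import ...

model = AutoModelForCausalLM.from_pretrained('/path/to/model/',
                                             load_in_4bit=True,      # convert the relevant layers to INT4
                                             trust_remote_code=True,
                                             cpu_embedding=True)     # keep the memory-intensive embedding layer on CPU (iGPU case)
model = model.to('xpu')  # run the low-bit layers on the Intel GPU
```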
- from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/generate.py index e9095acc..2fa0c281 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import LlamaTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mistral/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mistral/generate.py index faecbcf3..030ea9b2 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mistral/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mistral/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mixtral/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mixtral/generate.py index 79cc9995..3b52a5d4 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mixtral/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mixtral/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mpt/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mpt/generate.py index ff9b4b06..20e196a4 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mpt/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mpt/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer, GenerationConfig # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py index 0437fa5f..5bd9dbd5 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModel,AutoModelForCausalLM +from ipex_llm.transformers import AutoModel,AutoModelForCausalLM from transformers import AutoTokenizer, GenerationConfig # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-2/generate.py 
b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-2/generate.py index d5aa3a74..b199f08d 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-2/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModel,AutoModelForCausalLM +from ipex_llm.transformers import AutoModel,AutoModelForCausalLM from transformers import AutoTokenizer, GenerationConfig # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phixtral/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phixtral/generate.py index e806ba54..cbe01ebe 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phixtral/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phixtral/generate.py @@ -45,7 +45,7 @@ if __name__ == '__main__': # which convert the relevant layers in the model into INT4 format # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. - from bigdl.llm.transformers import AutoModel,AutoModelForCausalLM + from ipex_llm.transformers import AutoModel,AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True) diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py index 4701eb5a..03127136 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py @@ -21,7 +21,7 @@ import torch from transformers import AutoTokenizer from transformers.generation import GenerationConfig -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM torch.manual_seed(1234) diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/generate.py index 182a093f..7fb477a2 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py index 423f82aa..557b0d55 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model import intel_extension_for_pytorch as ipex import numpy as np @@ -38,7 +38,7 @@ if __name__ == '__main__': model_path = args.repo_id_or_model_path - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM # Load model in 4 bit, # which convert 
the relevant layers in the model into INT4 format model = AutoModelForCausalLM.from_pretrained(model_path, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/redpajama/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/redpajama/generate.py index f6be1ff9..39d97ee1 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/redpajama/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/redpajama/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/replit/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/replit/generate.py index 5720a3eb..ada97daf 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/replit/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/replit/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/rwkv4/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/rwkv4/generate.py index 2152b38d..3158bc2d 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/rwkv4/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/rwkv4/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/rwkv5/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/rwkv5/generate.py index 7099ab1b..7591abd0 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/rwkv5/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/rwkv5/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/solar/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/solar/generate.py index 7dd54586..ef37b844 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/solar/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/solar/generate.py @@ -19,7 +19,7 @@ import intel_extension_for_pytorch as ipex import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/starcoder/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/starcoder/generate.py index 8be048ff..a29e80ef 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/starcoder/generate.py +++ 
b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/starcoder/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/vicuna/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/vicuna/generate.py index 387d815f..df9686f8 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/vicuna/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/vicuna/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import LlamaTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/voiceassistant/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/voiceassistant/generate.py index 57db00f2..33d3e651 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/voiceassistant/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/voiceassistant/generate.py @@ -24,8 +24,8 @@ import inquirer # For Windows users, please remove `import sounddevice` import sounddevice -from bigdl.llm.transformers import AutoModelForCausalLM -from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq +from ipex_llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForSpeechSeq2Seq from transformers import LlamaTokenizer from transformers import WhisperProcessor from transformers import TextStreamer diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/whisper/recognize.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/whisper/recognize.py index 41729c65..4a0ca795 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/whisper/recognize.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/whisper/recognize.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForSpeechSeq2Seq +from ipex_llm.transformers import AutoModelForSpeechSeq2Seq from transformers import WhisperProcessor from datasets import load_dataset diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yi/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yi/generate.py index 4af84098..8bf80fdc 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yi/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yi/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer # Refer to https://huggingface.co/01-ai/Yi-6B-Chat#31-use-the-chat-model diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yuan2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yuan2/generate.py index c01cf828..e84e0e46 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yuan2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yuan2/generate.py @@ -19,7 +19,7 @@ import sys, os, time import intel_extension_for_pytorch as ipex import argparse from transformers import 
LlamaTokenizer -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM # Refer to https://huggingface.co/IEITYuan/Yuan2-2B-hf#Usage YUAN2_PROMPT_FORMAT = """ diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/More-Data-Types/transformers_low_bit_pipeline.py b/python/llm/example/GPU/HF-Transformers-AutoModels/More-Data-Types/transformers_low_bit_pipeline.py index 7a2f61a5..f43f0c3e 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/More-Data-Types/transformers_low_bit_pipeline.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/More-Data-Types/transformers_low_bit_pipeline.py @@ -16,7 +16,7 @@ import torch import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer, TextGenerationPipeline if __name__ == '__main__': @@ -40,7 +40,7 @@ if __name__ == '__main__': model = model.to('xpu') tokenizer = AutoTokenizer.from_pretrained(load_path) else: - # load_in_low_bit in bigdl.llm.transformers will convert + # load_in_low_bit in ipex_llm.transformers will convert # the relevant layers in the model into corresponding int X format model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True) model = model.to('xpu') diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Save-Load/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Save-Load/generate.py index ee36132a..4a5c4f51 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Save-Load/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Save-Load/generate.py @@ -17,7 +17,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import LlamaTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/LLM-Finetuning/DPO/dpo_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/DPO/dpo_finetuning.py index 3d0708b5..cb4a35f6 100644 --- a/python/llm/example/GPU/LLM-Finetuning/DPO/dpo_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/DPO/dpo_finetuning.py @@ -38,8 +38,8 @@ import transformers from transformers import AutoTokenizer, TrainingArguments, BitsAndBytesConfig from datasets import load_dataset from peft import LoraConfig -from bigdl.llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training +from ipex_llm.transformers import AutoModelForCausalLM from trl import DPOTrainer import argparse diff --git a/python/llm/example/GPU/LLM-Finetuning/HF-PEFT/alpaca-lora/finetune.py b/python/llm/example/GPU/LLM-Finetuning/HF-PEFT/alpaca-lora/finetune.py index d6360fd1..a829cd40 100644 --- a/python/llm/example/GPU/LLM-Finetuning/HF-PEFT/alpaca-lora/finetune.py +++ b/python/llm/example/GPU/LLM-Finetuning/HF-PEFT/alpaca-lora/finetune.py @@ -30,7 +30,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
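Before the fine-tuning hunks continue, a minimal sketch of the `load_in_low_bit` flow referenced in the More-Data-Types pipeline hunk above; `'sym_int4'`, the model path, and the prompt are illustrative placeholders:

```python
# Minimal sketch; assumes a placeholder checkpoint path and the 'sym_int4' low-bit format.
import intel_extension_for_pytorch as ipex  # imported in the GPU examples so the 'xpu' device is available
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

# load_in_low_bit converts the relevant layers into the chosen int X format
model = AutoModelForCausalLM.from_pretrained('/path/to/model/',
                                             load_in_low_bit='sym_int4',
                                             trust_remote_code=True)
model = model.to('xpu')

tokenizer = AutoTokenizer.from_pretrained('/path/to/model/', trust_remote_code=True)
input_ids = tokenizer.encode("What is AI?", return_tensors="pt").to('xpu')
output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```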
-from bigdl.llm import llm_patch +from ipex_llm import llm_patch llm_patch(train=True) # The following is the original LLM finetuning code using PEFT (without BigDL-LLM) diff --git a/python/llm/example/GPU/LLM-Finetuning/LoRA/alpaca_lora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/LoRA/alpaca_lora_finetuning.py index 2f238d2c..4af84ed6 100644 --- a/python/llm/example/GPU/LLM-Finetuning/LoRA/alpaca_lora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/LoRA/alpaca_lora_finetuning.py @@ -52,11 +52,11 @@ sys.path.append(common_util_path) from common.utils import Prompter, get_int_from_env, wandb_check, get_train_val_data from transformers import BitsAndBytesConfig -from bigdl.llm.transformers import AutoModelForCausalLM -# import them from bigdl.llm.transformers.qlora to get a BigDL-LLM compatible Peft model -from bigdl.llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\ +from ipex_llm.transformers import AutoModelForCausalLM +# import them from ipex_llm.transformers.qlora to get a BigDL-LLM compatible Peft model +from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\ LoraConfig -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError local_rank = get_int_from_env(["LOCAL_RANK","MPI_LOCALRANKID"], "0") world_size = get_int_from_env(["WORLD_SIZE","PMI_SIZE"], "1") diff --git a/python/llm/example/GPU/LLM-Finetuning/QA-LoRA/alpaca_qalora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/QA-LoRA/alpaca_qalora_finetuning.py index 65660115..647cf9e9 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QA-LoRA/alpaca_qalora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/QA-LoRA/alpaca_qalora_finetuning.py @@ -52,11 +52,11 @@ sys.path.append(common_util_path) from common.utils import Prompter, get_int_from_env, wandb_check, get_train_val_data from transformers import BitsAndBytesConfig -from bigdl.llm.transformers import AutoModelForCausalLM -# import them from bigdl.llm.transformers.qlora to get a BigDL-LLM compatible Peft model -from bigdl.llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\ +from ipex_llm.transformers import AutoModelForCausalLM +# import them from ipex_llm.transformers.qlora to get a BigDL-LLM compatible Peft model +from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\ LoraConfig -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError local_rank = get_int_from_env(["LOCAL_RANK","MPI_LOCALRANKID"], "0") world_size = get_int_from_env(["WORLD_SIZE","PMI_SIZE"], "1") diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py index 41f7dc88..3ffd6727 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py @@ -52,11 +52,11 @@ sys.path.append(common_util_path) from common.utils import Prompter, get_int_from_env, wandb_check, get_train_val_data from transformers import BitsAndBytesConfig -from bigdl.llm.transformers import AutoModelForCausalLM -# import them from bigdl.llm.transformers.qlora to get a BigDL-LLM compatible Peft model -from bigdl.llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\ +from ipex_llm.transformers import AutoModelForCausalLM +# 
import them from ipex_llm.transformers.qlora to get a BigDL-LLM compatible Peft model +from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\ LoraConfig -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError local_rank = get_int_from_env(["LOCAL_RANK","MPI_LOCALRANKID"], "0") world_size = get_int_from_env(["WORLD_SIZE","PMI_SIZE"], "1") diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/save_low_bit_70b_model.py b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/save_low_bit_70b_model.py index b3b044fa..63d4e8b4 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/save_low_bit_70b_model.py +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/save_low_bit_70b_model.py @@ -15,7 +15,7 @@ # from transformers import LlamaTokenizer -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM import torch import argparse diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/simple-example/qlora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/QLoRA/simple-example/qlora_finetuning.py index 2435e797..b8a1fb8e 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/simple-example/qlora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/simple-example/qlora_finetuning.py @@ -21,8 +21,8 @@ import transformers from transformers import LlamaTokenizer from peft import LoraConfig from transformers import BitsAndBytesConfig -from bigdl.llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training +from ipex_llm.transformers import AutoModelForCausalLM from datasets import load_dataset import argparse diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/trl-example/qlora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/QLoRA/trl-example/qlora_finetuning.py index db1f0656..dc78ae33 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/trl-example/qlora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/trl-example/qlora_finetuning.py @@ -21,8 +21,8 @@ import transformers from transformers import LlamaTokenizer from peft import LoraConfig from transformers import BitsAndBytesConfig -from bigdl.llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training +from ipex_llm.transformers import AutoModelForCausalLM from datasets import load_dataset from trl import SFTTrainer import argparse diff --git a/python/llm/example/GPU/LLM-Finetuning/ReLora/alpaca_relora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/ReLora/alpaca_relora_finetuning.py index 7fd2b5fd..2a2ff947 100644 --- a/python/llm/example/GPU/LLM-Finetuning/ReLora/alpaca_relora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/ReLora/alpaca_relora_finetuning.py @@ -52,12 +52,12 @@ sys.path.append(common_util_path) from common.utils import Prompter, get_int_from_env, wandb_check, get_train_val_data from transformers import BitsAndBytesConfig -from bigdl.llm.transformers import AutoModelForCausalLM -from bigdl.llm.transformers.relora import ReLoRATrainer -# import them from bigdl.llm.transformers.qlora to get a BigDL-LLM compatible Peft model -from bigdl.llm.transformers.qlora import 
get_peft_model, prepare_model_for_kbit_training,\ +from ipex_llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers.relora import ReLoRATrainer +# import them from ipex_llm.transformers.qlora to get a BigDL-LLM compatible Peft model +from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\ LoraConfig -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError local_rank = get_int_from_env(["LOCAL_RANK","MPI_LOCALRANKID"], "0") world_size = get_int_from_env(["WORLD_SIZE","PMI_SIZE"], "1") diff --git a/python/llm/example/GPU/LLM-Finetuning/common/utils/prompter.py b/python/llm/example/GPU/LLM-Finetuning/common/utils/prompter.py index 835b80bc..f8ae79cb 100644 --- a/python/llm/example/GPU/LLM-Finetuning/common/utils/prompter.py +++ b/python/llm/example/GPU/LLM-Finetuning/common/utils/prompter.py @@ -34,7 +34,7 @@ import json import os.path as osp from typing import Union -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError class Prompter(object): diff --git a/python/llm/example/GPU/LLM-Finetuning/common/utils/util.py b/python/llm/example/GPU/LLM-Finetuning/common/utils/util.py index e8bd0a2f..76931f2c 100644 --- a/python/llm/example/GPU/LLM-Finetuning/common/utils/util.py +++ b/python/llm/example/GPU/LLM-Finetuning/common/utils/util.py @@ -141,9 +141,9 @@ def get_train_val_data(data, tokenizer, prompter, train_on_inputs, def merge_adapter(base_model, tokenizer, adapter_path, output_path): """Merge the adapter into the original model and save""" import torch - from bigdl.llm.transformers.qlora import PeftModel, LoraConfig - from bigdl.llm.transformers import AutoModelForCausalLM - from bigdl.llm.transformers.low_bit_linear import get_block_size + from ipex_llm.transformers.qlora import PeftModel, LoraConfig + from ipex_llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers.low_bit_linear import get_block_size import tempfile import shutil diff --git a/python/llm/example/GPU/LangChain/transformer_int4_gpu/chat.py b/python/llm/example/GPU/LangChain/transformer_int4_gpu/chat.py index e5ced801..96df015f 100644 --- a/python/llm/example/GPU/LangChain/transformer_int4_gpu/chat.py +++ b/python/llm/example/GPU/LangChain/transformer_int4_gpu/chat.py @@ -21,7 +21,7 @@ import argparse -from bigdl.llm.langchain.llms import TransformersLLM, TransformersPipelineLLM +from ipex_llm.langchain.llms import TransformersLLM, TransformersPipelineLLM from langchain import PromptTemplate, LLMChain from langchain import HuggingFacePipeline diff --git a/python/llm/example/GPU/LangChain/transformer_int4_gpu/rag.py b/python/llm/example/GPU/LangChain/transformer_int4_gpu/rag.py index 1e46d8d4..a1633dd3 100644 --- a/python/llm/example/GPU/LangChain/transformer_int4_gpu/rag.py +++ b/python/llm/example/GPU/LangChain/transformer_int4_gpu/rag.py @@ -31,8 +31,8 @@ from langchain.text_splitter import CharacterTextSplitter from langchain.chains.question_answering import load_qa_chain from langchain.callbacks.manager import CallbackManager -from bigdl.llm.langchain.llms import TransformersLLM -from bigdl.llm.langchain.embeddings import TransformersEmbeddings +from ipex_llm.langchain.llms import TransformersLLM +from ipex_llm.langchain.embeddings import TransformersEmbeddings text_doc = ''' BigDL seamlessly scales your data analytics & AI applications from laptop to cloud, with the following libraries: diff --git a/python/llm/example/GPU/LlamaIndex/rag.py 
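Taken together, the finetuning hunks above only swap the import root from `bigdl.llm` to `ipex_llm`; the QLoRA workflow itself is unchanged. The sketch below spells out the renamed imports end to end. It is a minimal, hypothetical example: the model path and LoRA hyper-parameters are placeholders, and the `load_in_low_bit="nf4"` / `torch_dtype` arguments are assumptions based on the QLoRA simple-example rather than values shown in this patch.

```python
import torch
from transformers import LlamaTokenizer
from ipex_llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training, LoraConfig

base_model = "/path/to/llama-2-7b-hf/"  # placeholder checkpoint path
tokenizer = LlamaTokenizer.from_pretrained(base_model)

# Load the base model in 4-bit NF4 and move it to the Intel GPU (assumed kwargs)
model = AutoModelForCausalLM.from_pretrained(base_model,
                                             load_in_low_bit="nf4",
                                             torch_dtype=torch.float16)
model = model.to("xpu")

# Same PEFT-style flow as upstream peft, using the compatible helpers imported above
model = prepare_model_for_kbit_training(model)
lora_config = LoraConfig(r=8, lora_alpha=32,
                         target_modules=["q_proj", "k_proj", "v_proj"],
                         lora_dropout=0.05, bias="none", task_type="CAUSAL_LM")
model = get_peft_model(model, lora_config)
```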
b/python/llm/example/GPU/LlamaIndex/rag.py index 7fb1146e..97dc3ae7 100644 --- a/python/llm/example/GPU/LlamaIndex/rag.py +++ b/python/llm/example/GPU/LlamaIndex/rag.py @@ -163,7 +163,7 @@ def main(args): embed_model = HuggingFaceEmbedding(model_name=args.embedding_model_path) # Use custom LLM in BigDL - from bigdl.llm.llamaindex.llms import BigdlLLM + from ipex_llm.llamaindex.llms import BigdlLLM llm = BigdlLLM( model_name=args.model_path, tokenizer_name=args.model_path, diff --git a/python/llm/example/GPU/ModelScope-Models/Save-Load/generate.py b/python/llm/example/GPU/ModelScope-Models/Save-Load/generate.py index f3f1414e..8c7070ba 100644 --- a/python/llm/example/GPU/ModelScope-Models/Save-Load/generate.py +++ b/python/llm/example/GPU/ModelScope-Models/Save-Load/generate.py @@ -17,7 +17,7 @@ import torch import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from modelscope import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/ModelScope-Models/generate.py b/python/llm/example/GPU/ModelScope-Models/generate.py index 48bce466..b4fd6637 100644 --- a/python/llm/example/GPU/ModelScope-Models/generate.py +++ b/python/llm/example/GPU/ModelScope-Models/generate.py @@ -19,7 +19,7 @@ import time import argparse import numpy as np -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import AutoModel from modelscope import AutoTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py b/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py index 4247fdcc..cc6c3c48 100644 --- a/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py +++ b/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py @@ -20,7 +20,7 @@ import intel_extension_for_pytorch as ipex import time import argparse -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import LlamaTokenizer # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/PyTorch-Models/Model/aquila2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/aquila2/generate.py index 5733e2a3..98491948 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/aquila2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/aquila2/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/BAAI/AquilaChat2-7B/tree/main/predict.py diff --git a/python/llm/example/GPU/PyTorch-Models/Model/baichuan/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/baichuan/generate.py index 108ccb00..52f8adf0 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/baichuan/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/baichuan/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model BAICHUAN_PROMPT_FORMAT = "{prompt} " diff --git a/python/llm/example/GPU/PyTorch-Models/Model/baichuan2/generate.py 
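The LangChain and LlamaIndex integrations above follow the same rename. As a rough sketch (the prompt template and model path are placeholders, and `from_model_id` plus its `model_kwargs` are assumed to keep their BigDL-LLM signatures), a chain built on the renamed `TransformersLLM` would look like this; the LlamaIndex `BigdlLLM` wrapper moves to `ipex_llm.llamaindex.llms` in the same way.

```python
from langchain import LLMChain, PromptTemplate
from ipex_llm.langchain.llms import TransformersLLM

template = "USER: {question}\nASSISTANT:"  # hypothetical prompt format
prompt = PromptTemplate(template=template, input_variables=["question"])

# Load a local model through the renamed LangChain wrapper
llm = TransformersLLM.from_model_id(
    model_id="/path/to/model/",
    model_kwargs={"temperature": 0, "trust_remote_code": True},
)

chain = LLMChain(prompt=prompt, llm=llm)
print(chain.run("What is IPEX-LLM?"))
```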
b/python/llm/example/GPU/PyTorch-Models/Model/baichuan2/generate.py index 15d3db8a..215370b4 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/baichuan2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/baichuan2/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # prompt format referred from https://github.com/baichuan-inc/Baichuan2/issues/227 # and https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/main/generation_utils.py#L7-L49 diff --git a/python/llm/example/GPU/PyTorch-Models/Model/bark/synthesize_speech.py b/python/llm/example/GPU/PyTorch-Models/Model/bark/synthesize_speech.py index c7c3a3f0..1e830107 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/bark/synthesize_speech.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/bark/synthesize_speech.py @@ -20,7 +20,7 @@ import scipy import time import argparse -from bigdl.llm import optimize_model +from ipex_llm import optimize_model from transformers import AutoProcessor, BarkModel diff --git a/python/llm/example/GPU/PyTorch-Models/Model/bluelm/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/bluelm/generate.py index d62eea25..ac6e0842 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/bluelm/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/bluelm/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model BLUELM_PROMPT_FORMAT = "{prompt} " diff --git a/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/generate.py index bd7ac596..71b6ceea 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModel, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/THUDM/chatglm2-6b/blob/main/modeling_chatglm.py#L1007 diff --git a/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/streamchat.py b/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/streamchat.py index 78bf75d9..1e860e80 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/streamchat.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/chatglm2/streamchat.py @@ -20,7 +20,7 @@ import argparse import numpy as np from transformers import AutoModel, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model if __name__ == '__main__': diff --git a/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/generate.py index a6432ef3..1568e085 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModel, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://github.com/THUDM/ChatGLM3/blob/main/PROMPT.md diff --git 
a/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/streamchat.py b/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/streamchat.py index 569f6ec7..20f8b33c 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/streamchat.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/chatglm3/streamchat.py @@ -20,7 +20,7 @@ import argparse import numpy as np from transformers import AutoModel, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model if __name__ == '__main__': diff --git a/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py index c65b3079..9d09c857 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, CodeLlamaTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/docs/transformers/v4.34.1/model_doc/code_llama diff --git a/python/llm/example/GPU/PyTorch-Models/Model/deciLM-7b/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/deciLM-7b/generate.py index d2f32ce7..3a4c7e52 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/deciLM-7b/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/deciLM-7b/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/Deci/DeciLM-7B-instruct#prompt-template diff --git a/python/llm/example/GPU/PyTorch-Models/Model/deepseek/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/deepseek/generate.py index 70baa8a3..be92fd95 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/deepseek/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/deepseek/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # Refer to https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct diff --git a/python/llm/example/GPU/PyTorch-Models/Model/distil-whisper/recognize.py b/python/llm/example/GPU/PyTorch-Models/Model/distil-whisper/recognize.py index 2c5326b1..313b1a06 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/distil-whisper/recognize.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/distil-whisper/recognize.py @@ -17,7 +17,7 @@ import time import argparse -from bigdl.llm import optimize_model +from ipex_llm import optimize_model from datasets import load_dataset from transformers import AutoModelForSpeechSeq2Seq, pipeline from transformers.models.whisper import WhisperFeatureExtractor, WhisperTokenizer diff --git a/python/llm/example/GPU/PyTorch-Models/Model/dolly-v1/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/dolly-v1/generate.py index 403135de..a084b615 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/dolly-v1/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/dolly-v1/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from 
bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/databricks/dolly-v1-6b#generate-text diff --git a/python/llm/example/GPU/PyTorch-Models/Model/dolly-v2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/dolly-v2/generate.py index a9f218bd..9445f406 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/dolly-v2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/dolly-v2/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/databricks/dolly-v2-12b/blob/main/instruct_pipeline.py#L15 diff --git a/python/llm/example/GPU/PyTorch-Models/Model/flan-t5/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/flan-t5/generate.py index c9ded902..11eedd25 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/flan-t5/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/flan-t5/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForSeq2SeqLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, FLAN_T5_PROMPT_FORMAT = "<|User|>:{prompt}" diff --git a/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py index 570edfbe..799bb62c 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model import intel_extension_for_pytorch as ipex # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/PyTorch-Models/Model/llama2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/llama2/generate.py index 81042d06..3fe07715 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/llama2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/llama2/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/georgesung/llama2_7b_chat_uncensored#prompt-style diff --git a/python/llm/example/GPU/PyTorch-Models/Model/llama2/low_memory_generate.py b/python/llm/example/GPU/PyTorch-Models/Model/llama2/low_memory_generate.py index 105e1d5f..55c9e70b 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/llama2/low_memory_generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/llama2/low_memory_generate.py @@ -32,8 +32,8 @@ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer from transformers.generation import GenerationMixin from transformers.modeling_outputs import CausalLMOutputWithPast -from bigdl.llm import optimize_model -from bigdl.llm.transformers.low_bit_linear import FP4Params, LowBitLinear +from ipex_llm import optimize_model +from ipex_llm.transformers.low_bit_linear import FP4Params, LowBitLinear MAX_LENGTH = 4096 # you could tune the prompt based 
on your own model, diff --git a/python/llm/example/GPU/PyTorch-Models/Model/llava/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/llava/generate.py index 0bf3f23d..6a6e5a4a 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/llava/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/llava/generate.py @@ -56,7 +56,7 @@ from llava.mm_utils import ( KeywordsStoppingCriteria ) -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # Load the pretrained model. # Adapted from llava.model.builder.load_pretrained_model. diff --git a/python/llm/example/GPU/PyTorch-Models/Model/mamba/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/mamba/generate.py index 2f4c4879..e1b392b2 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/mamba/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/mamba/generate.py @@ -18,7 +18,7 @@ import argparse import time import torch import intel_extension_for_pytorch as ipex -from bigdl.llm import optimize_model +from ipex_llm import optimize_model from transformers import AutoTokenizer from model import MambaLMHeadModel diff --git a/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py index d05ee560..459d23e5 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1#instruction-format diff --git a/python/llm/example/GPU/PyTorch-Models/Model/mixtral/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/mixtral/generate.py index 83d74959..ae8f0a97 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/mixtral/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/mixtral/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1#instruction-format diff --git a/python/llm/example/GPU/PyTorch-Models/Model/phi-1_5/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/phi-1_5/generate.py index 743192e2..827b59bd 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/phi-1_5/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/phi-1_5/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, GenerationConfig -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_mixformer_sequential.py diff --git a/python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py index a91b64e7..3c629f84 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/phi-2/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModel, AutoTokenizer, 
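Every GPU `PyTorch-Models` example above changes only one line: `optimize_model` now comes from `ipex_llm` instead of `bigdl.llm`. The generic pattern those examples share is roughly the following sketch (model path and prompt are placeholders):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from ipex_llm import optimize_model

model_path = "/path/to/model/"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True,
                                             torch_dtype=torch.float16)

# One call applies the low-bit optimizations, then the model runs on the Intel GPU
model = optimize_model(model)
model = model.to("xpu")

with torch.inference_mode():
    input_ids = tokenizer.encode("Once upon a time", return_tensors="pt").to("xpu")
    output = model.generate(input_ids, max_new_tokens=32)
    print(tokenizer.decode(output[0], skip_special_tokens=True))
```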
AutoModelForCausalLM, GenerationConfig -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/microsoft/phi-2 diff --git a/python/llm/example/GPU/PyTorch-Models/Model/phixtral/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/phixtral/generate.py index 991377fd..8287a37b 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/phixtral/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/phixtral/generate.py @@ -21,7 +21,7 @@ import numpy as np from transformers import AutoTokenizer, GenerationConfig import intel_extension_for_pytorch as ipex -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/PyTorch-Models/Model/qwen-vl/chat.py b/python/llm/example/GPU/PyTorch-Models/Model/qwen-vl/chat.py index 29adf173..839d7772 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/qwen-vl/chat.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/qwen-vl/chat.py @@ -21,7 +21,7 @@ import torch from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.generation import GenerationConfig -from bigdl.llm import optimize_model +from ipex_llm import optimize_model torch.manual_seed(1234) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/qwen1.5/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/qwen1.5/generate.py index 04b4779d..8b81a4f5 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/qwen1.5/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/qwen1.5/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model import intel_extension_for_pytorch as ipex import numpy as np @@ -39,7 +39,7 @@ if __name__ == '__main__': from transformers import AutoModelForCausalLM - from bigdl.llm import optimize_model + from ipex_llm import optimize_model model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype = torch.float16, diff --git a/python/llm/example/GPU/PyTorch-Models/Model/replit/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/replit/generate.py index 57f2c9f6..73eecb01 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/replit/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/replit/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, REPLIT_PROMPT_FORMAT = "{prompt}" diff --git a/python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py index 930a3881..95388061 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py @@ -20,7 +20,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # prompt format is tuned based on the output example in this link: diff --git a/python/llm/example/GPU/PyTorch-Models/Model/speech-t5/synthesize_speech.py b/python/llm/example/GPU/PyTorch-Models/Model/speech-t5/synthesize_speech.py index 
342d6229..9776a039 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/speech-t5/synthesize_speech.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/speech-t5/synthesize_speech.py @@ -43,7 +43,7 @@ import torch import time import argparse -from bigdl.llm import optimize_model +from ipex_llm import optimize_model from transformers import SpeechT5Processor, SpeechT5HifiGan, SpeechT5ForTextToSpeech from datasets import load_dataset import soundfile as sf diff --git a/python/llm/example/GPU/PyTorch-Models/Model/starcoder/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/starcoder/generate.py index a4e0a08a..380d63c3 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/starcoder/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/starcoder/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, AutoTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, STARCODER_PROMPT_FORMAT = "{prompt}" diff --git a/python/llm/example/GPU/PyTorch-Models/Model/yi/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/yi/generate.py index d6649004..d08d6087 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/yi/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/yi/generate.py @@ -18,7 +18,7 @@ import torch import time import argparse -from bigdl.llm import optimize_model +from ipex_llm import optimize_model from transformers import AutoModelForCausalLM, AutoTokenizer # you could tune the prompt based on your own model diff --git a/python/llm/example/GPU/PyTorch-Models/Model/yuan2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/yuan2/generate.py index 9615f943..31179c8c 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/yuan2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/yuan2/generate.py @@ -19,7 +19,7 @@ import sys, os, time import intel_extension_for_pytorch as ipex import argparse from transformers import LlamaTokenizer, AutoModelForCausalLM -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # Refer to https://huggingface.co/IEITYuan/Yuan2-2B-hf#Usage YUAN2_PROMPT_FORMAT = """ diff --git a/python/llm/example/GPU/PyTorch-Models/More-Data-Types/generate.py b/python/llm/example/GPU/PyTorch-Models/More-Data-Types/generate.py index ddd85a94..6caec894 100644 --- a/python/llm/example/GPU/PyTorch-Models/More-Data-Types/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/More-Data-Types/generate.py @@ -19,7 +19,7 @@ import time import argparse from transformers import AutoModelForCausalLM, LlamaTokenizer -from bigdl.llm import optimize_model +from ipex_llm import optimize_model # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/georgesung/llama2_7b_chat_uncensored#prompt-style diff --git a/python/llm/example/GPU/PyTorch-Models/Save-Load/generate.py b/python/llm/example/GPU/PyTorch-Models/Save-Load/generate.py index b19c3571..9a289568 100644 --- a/python/llm/example/GPU/PyTorch-Models/Save-Load/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Save-Load/generate.py @@ -17,8 +17,8 @@ import torch import time import argparse -from bigdl.llm import optimize_model -from bigdl.llm.optimize import low_memory_init, load_low_bit +from ipex_llm import optimize_model +from ipex_llm.optimize import low_memory_init, load_low_bit from transformers import AutoModelForCausalLM, LlamaTokenizer # you could tune the prompt based on 
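The `Save-Load` example above additionally pulls `low_memory_init` and `load_low_bit` from `ipex_llm.optimize`. A plausible save-then-reload sketch is shown below; it assumes the `save_low_bit` method that BigDL-LLM attached to optimized models is still available after the rename, and all paths are placeholders.

```python
from transformers import AutoModelForCausalLM
from ipex_llm import optimize_model
from ipex_llm.optimize import low_memory_init, load_low_bit

save_dir = "./model-low-bit"  # placeholder output directory

# First run: optimize once, then persist the low-bit weights
model = AutoModelForCausalLM.from_pretrained("/path/to/model/", torch_dtype="auto",
                                             trust_remote_code=True)
model = optimize_model(model)
model.save_low_bit(save_dir)

# Later runs: build an empty skeleton and load the saved low-bit weights directly
with low_memory_init():
    model = AutoModelForCausalLM.from_pretrained(save_dir, torch_dtype="auto",
                                                 trust_remote_code=True)
model = load_low_bit(model, save_dir)
model = model.to("xpu")
```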
your own model, diff --git a/python/llm/example/GPU/Speculative-Decoding/baichuan2/speculative.py b/python/llm/example/GPU/Speculative-Decoding/baichuan2/speculative.py index c09d57a1..af970acd 100644 --- a/python/llm/example/GPU/Speculative-Decoding/baichuan2/speculative.py +++ b/python/llm/example/GPU/Speculative-Decoding/baichuan2/speculative.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer import argparse import time diff --git a/python/llm/example/GPU/Speculative-Decoding/chatglm3/speculative.py b/python/llm/example/GPU/Speculative-Decoding/chatglm3/speculative.py index 2ceb77bd..e754e7c7 100644 --- a/python/llm/example/GPU/Speculative-Decoding/chatglm3/speculative.py +++ b/python/llm/example/GPU/Speculative-Decoding/chatglm3/speculative.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.transformers import AutoModel +from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer import argparse import time diff --git a/python/llm/example/GPU/Speculative-Decoding/gpt-j/speculative.py b/python/llm/example/GPU/Speculative-Decoding/gpt-j/speculative.py index 46ee8399..4ea31b1b 100644 --- a/python/llm/example/GPU/Speculative-Decoding/gpt-j/speculative.py +++ b/python/llm/example/GPU/Speculative-Decoding/gpt-j/speculative.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer import argparse import time diff --git a/python/llm/example/GPU/Speculative-Decoding/llama2/speculative.py b/python/llm/example/GPU/Speculative-Decoding/llama2/speculative.py index 443e55c9..7a102ab5 100644 --- a/python/llm/example/GPU/Speculative-Decoding/llama2/speculative.py +++ b/python/llm/example/GPU/Speculative-Decoding/llama2/speculative.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import LlamaTokenizer import argparse import time diff --git a/python/llm/example/GPU/Speculative-Decoding/mistral/speculative.py b/python/llm/example/GPU/Speculative-Decoding/mistral/speculative.py index 9fad5d94..1f6763a3 100644 --- a/python/llm/example/GPU/Speculative-Decoding/mistral/speculative.py +++ b/python/llm/example/GPU/Speculative-Decoding/mistral/speculative.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer import argparse import time diff --git a/python/llm/example/GPU/Speculative-Decoding/qwen/speculative.py b/python/llm/example/GPU/Speculative-Decoding/qwen/speculative.py index d5ac109d..d1761af2 100644 --- a/python/llm/example/GPU/Speculative-Decoding/qwen/speculative.py +++ b/python/llm/example/GPU/Speculative-Decoding/qwen/speculative.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM +from ipex_llm.transformers import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer import argparse import time diff --git a/python/llm/example/GPU/vLLM-Serving/README.md b/python/llm/example/GPU/vLLM-Serving/README.md index e09206b1..595b0d84 100644 --- a/python/llm/example/GPU/vLLM-Serving/README.md +++ b/python/llm/example/GPU/vLLM-Serving/README.md @@ -73,7 +73,7 @@ To fully utilize the continuous batching feature 
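For the Speculative-Decoding examples above, only the `AutoModelForCausalLM` import moves to `ipex_llm.transformers`. The loading pattern sketched below is an assumption modelled on the BigDL-LLM self-speculative-decoding examples; in particular the `speculative=True` and `optimize_model=True` keyword arguments are not shown in this patch and should be checked against the actual example files.

```python
import torch
from transformers import LlamaTokenizer
from ipex_llm.transformers import AutoModelForCausalLM

model_path = "/path/to/Llama-2-13b-chat-hf/"  # placeholder
tokenizer = LlamaTokenizer.from_pretrained(model_path)

# Assumed keyword arguments; see the note above
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             optimize_model=True,
                                             torch_dtype=torch.float16,
                                             speculative=True,
                                             trust_remote_code=True,
                                             use_cache=True)
model = model.to("xpu")

with torch.inference_mode():
    input_ids = tokenizer("Tell me about Intel GPUs.", return_tensors="pt").input_ids.to("xpu")
    output = model.generate(input_ids, max_new_tokens=128, do_sample=False)
    print(tokenizer.decode(output[0], skip_special_tokens=True))
```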
of the `vLLM`, you can send req #!/bin/bash # You may also want to adjust the `--max-num-batched-tokens` argument, it indicates the hard limit # of batched prompt length the server will accept -python -m bigdl.llm.vllm.entrypoints.openai.api_server \ +python -m ipex_llm.vllm.entrypoints.openai.api_server \ --model /MODEL_PATH/Llama-2-7b-chat-hf/ --port 8000 \ --load-format 'auto' --device xpu --dtype bfloat16 \ --load-in-low-bit sym_int4 \ diff --git a/python/llm/example/GPU/vLLM-Serving/offline_inference.py b/python/llm/example/GPU/vLLM-Serving/offline_inference.py index f74dbcd0..327cfb55 100644 --- a/python/llm/example/GPU/vLLM-Serving/offline_inference.py +++ b/python/llm/example/GPU/vLLM-Serving/offline_inference.py @@ -31,8 +31,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from bigdl.llm.vllm.entrypoints.llm import LLM -from bigdl.llm.vllm.sampling_params import SamplingParams +from ipex_llm.vllm.entrypoints.llm import LLM +from ipex_llm.vllm.sampling_params import SamplingParams # Sample prompts. prompts = [ diff --git a/python/llm/portable-zip/chat-ui.bat b/python/llm/portable-zip/chat-ui.bat index b4748a3d..cb43dc66 100644 --- a/python/llm/portable-zip/chat-ui.bat +++ b/python/llm/portable-zip/chat-ui.bat @@ -11,7 +11,7 @@ if errorlevel 1 ( ) echo [1/3] Controller started successfully -powershell -Command "Start-Process -FilePath PowerShell -ArgumentList '-Command', '& { .\python-embed\python.exe -m bigdl.llm.serving.model_worker --model-path %modelpath% --device cpu > zip_model_worker.log 2>&1 }' -NoNewWindow" +powershell -Command "Start-Process -FilePath PowerShell -ArgumentList '-Command', '& { .\python-embed\python.exe -m ipex_llm.serving.model_worker --model-path %modelpath% --device cpu > zip_model_worker.log 2>&1 }' -NoNewWindow" timeout /t 1 /nobreak >nul 2>&1 :loop2 powershell -Command "$output = Get-Content zip_model_worker.log; if($null -eq $output -or !($output | Select-String -Pattern 'Uvicorn running on')) { exit 1 } else { exit 0 }" diff --git a/python/llm/portable-zip/chat.py b/python/llm/portable-zip/chat.py index 18bd319f..439bdd31 100644 --- a/python/llm/portable-zip/chat.py +++ b/python/llm/portable-zip/chat.py @@ -51,7 +51,7 @@ from transformers.generation.stopping_criteria import StoppingCriteriaList from colorama import Fore -from bigdl.llm import optimize_model +from ipex_llm import optimize_model from kv_cache import StartRecentKVCache HUMAN_ID = "" diff --git a/python/llm/setup.py b/python/llm/setup.py index b1dcca9f..69a5c39b 100644 --- a/python/llm/setup.py +++ b/python/llm/setup.py @@ -311,11 +311,11 @@ def setup_package(): packages=get_llm_packages(), package_dir={"": "src"}, package_data={ - "bigdl.llm": package_data[platform_name] + ["cli/prompts/*.txt"] + ["transformers/gguf/models/model_implement/*/*.json"]}, + "ipex_llm": package_data[platform_name] + ["cli/prompts/*.txt"] + ["transformers/gguf/models/model_implement/*/*.json"]}, include_package_data=True, entry_points={ "console_scripts": [ - 'llm-convert=bigdl.llm.convert_model:main' + 'llm-convert=ipex_llm.convert_model:main' ] }, extras_require={"all": all_requires, diff --git a/python/llm/src/bigdl/__init__.py b/python/llm/src/bigdl/__init__.py deleted file mode 100644 index 30646857..00000000 --- a/python/llm/src/bigdl/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# -# Copyright 2016 The BigDL Authors. 
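The vLLM entry points are renamed as well, both for the OpenAI-compatible server shown in the README hunk and for offline use. A minimal offline sketch based on the renamed `offline_inference.py` imports follows; the constructor arguments mirror the server flags above (`device`, `dtype`, `load-in-low-bit`) and are assumptions rather than values copied from the example.

```python
from ipex_llm.vllm.entrypoints.llm import LLM
from ipex_llm.vllm.sampling_params import SamplingParams

# Sample prompts and standard vLLM sampling settings
prompts = ["Hello, my name is", "The capital of France is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(model="/MODEL_PATH/Llama-2-7b-chat-hf/",
          device="xpu",
          dtype="bfloat16",
          load_in_low_bit="sym_int4")

for output in llm.generate(prompts, sampling_params):
    print(output.prompt, "->", output.outputs[0].text)
```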
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# This would makes sure Python is aware there is more than one sub-package within bigdl, -# physically located elsewhere. -# Otherwise there would be module not found error in non-pip's setting as Python would -# only search the first bigdl package and end up finding only one sub-package. -import pkgutil -__path__ = pkgutil.extend_path(__path__, __name__) # type: ignore diff --git a/python/llm/src/bigdl/llm/__init__.py b/python/llm/src/ipex_llm/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/__init__.py rename to python/llm/src/ipex_llm/__init__.py diff --git a/python/llm/src/bigdl/llm/cli/llm-chat b/python/llm/src/ipex_llm/cli/llm-chat similarity index 96% rename from python/llm/src/bigdl/llm/cli/llm-chat rename to python/llm/src/ipex_llm/cli/llm-chat index 0b84171d..6f480fae 100755 --- a/python/llm/src/bigdl/llm/cli/llm-chat +++ b/python/llm/src/ipex_llm/cli/llm-chat @@ -8,7 +8,7 @@ n_predict=512 EXTRA_ARGS=('--color') -llm_dir="$(dirname "$(python -c "import bigdl.llm;print(bigdl.llm.__file__)")")" +llm_dir="$(dirname "$(python -c "import ipex_llm;print(ipex_llm.__file__)")")" lib_dir="$llm_dir/libs" prompts_dir="$llm_dir/cli/prompts" diff --git a/python/llm/src/bigdl/llm/cli/llm-chat.ps1 b/python/llm/src/ipex_llm/cli/llm-chat.ps1 similarity index 92% rename from python/llm/src/bigdl/llm/cli/llm-chat.ps1 rename to python/llm/src/ipex_llm/cli/llm-chat.ps1 index b3ff0403..6e2a8747 100644 --- a/python/llm/src/bigdl/llm/cli/llm-chat.ps1 +++ b/python/llm/src/ipex_llm/cli/llm-chat.ps1 @@ -1,8 +1,8 @@ -$llm_dir = (Split-Path -Parent (python -c "import bigdl.llm;print(bigdl.llm.__file__)")) +$llm_dir = (Split-Path -Parent (python -c "import ipex_llm;print(ipex_llm.__file__)")) $lib_dir = Join-Path $llm_dir "libs" $prompt_dir = Join-Path $llm_dir "cli/prompts" -$vnni_enable = ((python -c "from bigdl.llm.utils.isa_checker import check_avx_vnni;print(check_avx_vnni())").ToLower() -eq "true") +$vnni_enable = ((python -c "from ipex_llm.utils.isa_checker import check_avx_vnni;print(check_avx_vnni())").ToLower() -eq "true") $model_family = "" $threads = 8 # Number of tokens to predict (made it larger than default because we want a long interaction) diff --git a/python/llm/src/bigdl/llm/cli/llm-cli b/python/llm/src/ipex_llm/cli/llm-cli similarity index 96% rename from python/llm/src/bigdl/llm/cli/llm-cli rename to python/llm/src/ipex_llm/cli/llm-cli index a145c09a..fdb182ea 100755 --- a/python/llm/src/bigdl/llm/cli/llm-cli +++ b/python/llm/src/ipex_llm/cli/llm-cli @@ -6,7 +6,7 @@ threads=8 n_predict=128 -llm_dir="$(dirname "$(python -c "import bigdl.llm;print(bigdl.llm.__file__)")")" +llm_dir="$(dirname "$(python -c "import ipex_llm;print(ipex_llm.__file__)")")" lib_dir="$llm_dir/libs" # Function to display help message diff --git a/python/llm/src/bigdl/llm/cli/llm-cli.ps1 b/python/llm/src/ipex_llm/cli/llm-cli.ps1 similarity index 92% rename from python/llm/src/bigdl/llm/cli/llm-cli.ps1 rename to 
python/llm/src/ipex_llm/cli/llm-cli.ps1 index c30138c2..bebcb044 100755 --- a/python/llm/src/bigdl/llm/cli/llm-cli.ps1 +++ b/python/llm/src/ipex_llm/cli/llm-cli.ps1 @@ -1,8 +1,8 @@ -$llm_dir = (Split-Path -Parent (python -c "import bigdl.llm;print(bigdl.llm.__file__)")) +$llm_dir = (Split-Path -Parent (python -c "import ipex_llm;print(ipex_llm.__file__)")) $lib_dir = Join-Path $llm_dir "libs" -$vnni_enable = ((python -c "from bigdl.llm.utils.isa_checker import check_avx_vnni;print(check_avx_vnni())").ToLower() -eq "true") +$vnni_enable = ((python -c "from ipex_llm.utils.isa_checker import check_avx_vnni;print(check_avx_vnni())").ToLower() -eq "true") $model_family = "" $threads = 8 $n_predict = 128 diff --git a/python/llm/src/bigdl/llm/cli/prompts/chat-with-llm.txt b/python/llm/src/ipex_llm/cli/prompts/chat-with-llm.txt similarity index 100% rename from python/llm/src/bigdl/llm/cli/prompts/chat-with-llm.txt rename to python/llm/src/ipex_llm/cli/prompts/chat-with-llm.txt diff --git a/python/llm/src/bigdl/llm/convert_model.py b/python/llm/src/ipex_llm/convert_model.py similarity index 96% rename from python/llm/src/bigdl/llm/convert_model.py rename to python/llm/src/ipex_llm/convert_model.py index 026dcae6..8d9ce2b4 100644 --- a/python/llm/src/bigdl/llm/convert_model.py +++ b/python/llm/src/ipex_llm/convert_model.py @@ -15,7 +15,7 @@ # -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError import argparse import os @@ -75,7 +75,7 @@ def llm_convert(model, :return: the path string to the converted lower precision checkpoint. """ if model_format == "pth": - from bigdl.llm.ggml.convert_model import convert_model as ggml_convert_model + from ipex_llm.ggml.convert_model import convert_model as ggml_convert_model _, _used_args = _special_kwarg_check(kwargs=kwargs, check_args=["tmp_path"]) return ggml_convert_model(input_path=model, @@ -85,7 +85,7 @@ def llm_convert(model, **_used_args, ) elif model_format == "gptq": - from bigdl.llm.gptq.convert.convert_gptq_to_ggml import convert_gptq2ggml + from ipex_llm.gptq.convert.convert_gptq_to_ggml import convert_gptq2ggml invalidInputError(model_family == "llama" and outtype == 'int4', "Convert GPTQ models should always " "specify `--model-family llama --dtype int4` in the command line.") diff --git a/python/llm/src/bigdl/llm/format.sh b/python/llm/src/ipex_llm/format.sh similarity index 100% rename from python/llm/src/bigdl/llm/format.sh rename to python/llm/src/ipex_llm/format.sh diff --git a/python/llm/src/bigdl/llm/ggml/__init__.py b/python/llm/src/ipex_llm/ggml/__init__.py similarity index 85% rename from python/llm/src/bigdl/llm/ggml/__init__.py rename to python/llm/src/ipex_llm/ggml/__init__.py index 21ab9d44..6df2d794 100644 --- a/python/llm/src/bigdl/llm/ggml/__init__.py +++ b/python/llm/src/ipex_llm/ggml/__init__.py @@ -19,13 +19,13 @@ # Otherwise there would be module not found error in non-pip's setting as Python would # only search the first bigdl package and end up finding only one sub-package. -from bigdl.llm.utils.common import LazyImport +from ipex_llm.utils.common import LazyImport import os -convert_model = LazyImport('bigdl.llm.ggml.convert_model.convert_model') +convert_model = LazyImport('ipex_llm.ggml.convert_model.convert_model') # Default is false, set to true to auto importing glibc_checker. 
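`convert_model.py` keeps the `llm_convert` entry point (and the `llm-convert` console script registered in `setup.py` above) under the new package name. A rough usage sketch follows, with placeholder paths; the argument names are assumptions inferred from the `model_format` / `outtype` / `model_family` handling visible in the hunk.

```python
from ipex_llm import llm_convert

# Convert a PyTorch ("pth") checkpoint into a 4-bit native GGML binary
converted_path = llm_convert(model="/path/to/llama-7b-hf/",
                             outfile="/path/to/output/",
                             outtype="int4",
                             model_family="llama",
                             model_format="pth")
print(converted_path)
```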
BIGDL_GLIBC_CHECK = os.getenv("BIGDL_GLIBC_CHECK", 'False').lower() in ('true', '1', 't') if BIGDL_GLIBC_CHECK: - from bigdl.llm.utils.glibc_checker import check_glibc_version + from ipex_llm.utils.glibc_checker import check_glibc_version check_glibc_version() diff --git a/python/llm/src/bigdl/llm/ggml/convert.py b/python/llm/src/ipex_llm/ggml/convert.py similarity index 98% rename from python/llm/src/bigdl/llm/ggml/convert.py rename to python/llm/src/ipex_llm/ggml/convert.py index 0ab511ba..cb4f6efb 100644 --- a/python/llm/src/bigdl/llm/ggml/convert.py +++ b/python/llm/src/ipex_llm/ggml/convert.py @@ -40,8 +40,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.utils.convert_util import * +from ipex_llm.utils.common import invalidInputError +from ipex_llm.utils.convert_util import * from pathlib import Path import os diff --git a/python/llm/src/bigdl/llm/ggml/convert_model.py b/python/llm/src/ipex_llm/ggml/convert_model.py similarity index 97% rename from python/llm/src/bigdl/llm/ggml/convert_model.py rename to python/llm/src/ipex_llm/ggml/convert_model.py index 5b76dca7..074a7d35 100644 --- a/python/llm/src/bigdl/llm/ggml/convert_model.py +++ b/python/llm/src/ipex_llm/ggml/convert_model.py @@ -16,9 +16,9 @@ import os import time from pathlib import Path -from bigdl.llm.ggml.convert import _convert_to_ggml, _convert_chatglm -from bigdl.llm.ggml.quantize import quantize -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.ggml.convert import _convert_to_ggml, _convert_chatglm +from ipex_llm.ggml.quantize import quantize +from ipex_llm.utils.common import invalidInputError import argparse import tempfile diff --git a/python/llm/src/bigdl/llm/ggml/model/__init__.py b/python/llm/src/ipex_llm/ggml/model/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/ggml/model/__init__.py rename to python/llm/src/ipex_llm/ggml/model/__init__.py diff --git a/python/llm/src/bigdl/llm/ggml/model/bloom/__init__.py b/python/llm/src/ipex_llm/ggml/model/bloom/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/ggml/model/bloom/__init__.py rename to python/llm/src/ipex_llm/ggml/model/bloom/__init__.py diff --git a/python/llm/src/bigdl/llm/ggml/model/bloom/bloom.py b/python/llm/src/ipex_llm/ggml/model/bloom/bloom.py similarity index 99% rename from python/llm/src/bigdl/llm/ggml/model/bloom/bloom.py rename to python/llm/src/ipex_llm/ggml/model/bloom/bloom.py index 349cec3a..6299e203 100644 --- a/python/llm/src/bigdl/llm/ggml/model/bloom/bloom.py +++ b/python/llm/src/ipex_llm/ggml/model/bloom/bloom.py @@ -47,8 +47,8 @@ from .bloom_cpp import bloom_load, bloom_free, bloom_run from .bloom_cpp import bloom_tokenize, bloom_detokenize, bloom_forward, bloom_eval, bloom_embed -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.ggml.model.generation import GenerationMixin +from ipex_llm.utils.common import invalidInputError +from ipex_llm.ggml.model.generation import GenerationMixin from typing import List, Optional, Generator, Sequence, Union import time import uuid diff --git a/python/llm/src/bigdl/llm/ggml/model/bloom/bloom_cpp.py b/python/llm/src/ipex_llm/ggml/model/bloom/bloom_cpp.py similarity index 98% rename from python/llm/src/bigdl/llm/ggml/model/bloom/bloom_cpp.py rename to python/llm/src/ipex_llm/ggml/model/bloom/bloom_cpp.py index 912fa729..a7178a8e 100644 --- a/python/llm/src/bigdl/llm/ggml/model/bloom/bloom_cpp.py +++ 
b/python/llm/src/ipex_llm/ggml/model/bloom/bloom_cpp.py @@ -64,8 +64,8 @@ from ctypes import ( c_size_t, ) import pathlib -from bigdl.llm.utils.utils import get_shared_lib_info -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.utils import get_shared_lib_info +from ipex_llm.utils.common import invalidInputError # Load the library diff --git a/python/llm/src/bigdl/llm/ggml/model/chatglm/__init__.py b/python/llm/src/ipex_llm/ggml/model/chatglm/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/ggml/model/chatglm/__init__.py rename to python/llm/src/ipex_llm/ggml/model/chatglm/__init__.py diff --git a/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm.py b/python/llm/src/ipex_llm/ggml/model/chatglm/chatglm.py similarity index 99% rename from python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm.py rename to python/llm/src/ipex_llm/ggml/model/chatglm/chatglm.py index 66341b0e..cd1efba6 100644 --- a/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm.py +++ b/python/llm/src/ipex_llm/ggml/model/chatglm/chatglm.py @@ -48,8 +48,8 @@ from .chatglm_cpp import chatglm_load, chatglm_tokenize, chatglm_detokenize, \ chatglm_forward, chatglm_eos_token -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.ggml.model.generation import GenerationMixin +from ipex_llm.utils.common import invalidInputError +from ipex_llm.ggml.model.generation import GenerationMixin from typing import List, Optional, Generator, Sequence, Union import time import uuid diff --git a/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm_cpp.py b/python/llm/src/ipex_llm/ggml/model/chatglm/chatglm_cpp.py similarity index 97% rename from python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm_cpp.py rename to python/llm/src/ipex_llm/ggml/model/chatglm/chatglm_cpp.py index 47513e54..ce136cdb 100644 --- a/python/llm/src/bigdl/llm/ggml/model/chatglm/chatglm_cpp.py +++ b/python/llm/src/ipex_llm/ggml/model/chatglm/chatglm_cpp.py @@ -23,7 +23,7 @@ from typing import List from pathlib import Path -from bigdl.llm.libs.chatglm_C import Pipeline, GenerationConfig +from ipex_llm.libs.chatglm_C import Pipeline, GenerationConfig class ChatGLMContext: diff --git a/python/llm/src/bigdl/llm/ggml/model/generation/__init__.py b/python/llm/src/ipex_llm/ggml/model/generation/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/ggml/model/generation/__init__.py rename to python/llm/src/ipex_llm/ggml/model/generation/__init__.py diff --git a/python/llm/src/bigdl/llm/ggml/model/generation/utils.py b/python/llm/src/ipex_llm/ggml/model/generation/utils.py similarity index 99% rename from python/llm/src/bigdl/llm/ggml/model/generation/utils.py rename to python/llm/src/ipex_llm/ggml/model/generation/utils.py index 5e61bdfc..710c729d 100644 --- a/python/llm/src/bigdl/llm/ggml/model/generation/utils.py +++ b/python/llm/src/ipex_llm/ggml/model/generation/utils.py @@ -21,7 +21,7 @@ from typing import Optional, Union, Sequence, List -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError import torch diff --git a/python/llm/src/bigdl/llm/ggml/model/gptneox/__init__.py b/python/llm/src/ipex_llm/ggml/model/gptneox/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/ggml/model/gptneox/__init__.py rename to python/llm/src/ipex_llm/ggml/model/gptneox/__init__.py diff --git a/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox.py b/python/llm/src/ipex_llm/ggml/model/gptneox/gptneox.py similarity index 99% rename from 
python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox.py rename to python/llm/src/ipex_llm/ggml/model/gptneox/gptneox.py index 6d07e216..41b62413 100644 --- a/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox.py +++ b/python/llm/src/ipex_llm/ggml/model/gptneox/gptneox.py @@ -54,8 +54,8 @@ import multiprocessing import ctypes from typing import List, Optional, Union, Generator, Sequence, Iterator, Deque, Tuple from collections import deque, OrderedDict -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.ggml.model.generation import GenerationMixin +from ipex_llm.utils.common import invalidInputError +from ipex_llm.ggml.model.generation import GenerationMixin from . import gptneox_cpp from .gptneox_types import * diff --git a/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox_cpp.py b/python/llm/src/ipex_llm/ggml/model/gptneox/gptneox_cpp.py similarity index 99% rename from python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox_cpp.py rename to python/llm/src/ipex_llm/ggml/model/gptneox/gptneox_cpp.py index edc5cc81..1de5ee4e 100644 --- a/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox_cpp.py +++ b/python/llm/src/ipex_llm/ggml/model/gptneox/gptneox_cpp.py @@ -63,8 +63,8 @@ from ctypes import ( c_size_t, ) import pathlib -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.utils.utils import get_shared_lib_info +from ipex_llm.utils.common import invalidInputError +from ipex_llm.utils.utils import get_shared_lib_info # Load the library diff --git a/python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox_types.py b/python/llm/src/ipex_llm/ggml/model/gptneox/gptneox_types.py similarity index 100% rename from python/llm/src/bigdl/llm/ggml/model/gptneox/gptneox_types.py rename to python/llm/src/ipex_llm/ggml/model/gptneox/gptneox_types.py diff --git a/python/llm/src/bigdl/llm/ggml/model/llama/__init__.py b/python/llm/src/ipex_llm/ggml/model/llama/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/ggml/model/llama/__init__.py rename to python/llm/src/ipex_llm/ggml/model/llama/__init__.py diff --git a/python/llm/src/bigdl/llm/ggml/model/llama/llama.py b/python/llm/src/ipex_llm/ggml/model/llama/llama.py similarity index 99% rename from python/llm/src/bigdl/llm/ggml/model/llama/llama.py rename to python/llm/src/ipex_llm/ggml/model/llama/llama.py index 9319d390..7d34957d 100644 --- a/python/llm/src/bigdl/llm/ggml/model/llama/llama.py +++ b/python/llm/src/ipex_llm/ggml/model/llama/llama.py @@ -54,8 +54,8 @@ import math import multiprocessing from typing import List, Optional, Union, Generator, Sequence, Iterator, Deque, Tuple from collections import deque, OrderedDict -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.ggml.model.generation import GenerationMixin +from ipex_llm.utils.common import invalidInputError +from ipex_llm.ggml.model.generation import GenerationMixin from . 
import llama_cpp from .llama_types import * diff --git a/python/llm/src/bigdl/llm/ggml/model/llama/llama_cpp.py b/python/llm/src/ipex_llm/ggml/model/llama/llama_cpp.py similarity index 99% rename from python/llm/src/bigdl/llm/ggml/model/llama/llama_cpp.py rename to python/llm/src/ipex_llm/ggml/model/llama/llama_cpp.py index b7b5d2ed..bea2fef5 100644 --- a/python/llm/src/bigdl/llm/ggml/model/llama/llama_cpp.py +++ b/python/llm/src/ipex_llm/ggml/model/llama/llama_cpp.py @@ -63,8 +63,8 @@ from ctypes import ( c_size_t, ) import pathlib -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.utils.utils import get_shared_lib_info +from ipex_llm.utils.common import invalidInputError +from ipex_llm.utils.utils import get_shared_lib_info # Load the library diff --git a/python/llm/src/bigdl/llm/ggml/model/llama/llama_types.py b/python/llm/src/ipex_llm/ggml/model/llama/llama_types.py similarity index 100% rename from python/llm/src/bigdl/llm/ggml/model/llama/llama_types.py rename to python/llm/src/ipex_llm/ggml/model/llama/llama_types.py diff --git a/python/llm/src/bigdl/llm/ggml/model/starcoder/__init__.py b/python/llm/src/ipex_llm/ggml/model/starcoder/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/ggml/model/starcoder/__init__.py rename to python/llm/src/ipex_llm/ggml/model/starcoder/__init__.py diff --git a/python/llm/src/bigdl/llm/ggml/model/starcoder/starcoder.py b/python/llm/src/ipex_llm/ggml/model/starcoder/starcoder.py similarity index 99% rename from python/llm/src/bigdl/llm/ggml/model/starcoder/starcoder.py rename to python/llm/src/ipex_llm/ggml/model/starcoder/starcoder.py index c00935cb..13fb174d 100644 --- a/python/llm/src/bigdl/llm/ggml/model/starcoder/starcoder.py +++ b/python/llm/src/ipex_llm/ggml/model/starcoder/starcoder.py @@ -48,8 +48,8 @@ from .starcoder_cpp import starcoder_load, starcoder_free, starcoder_run from .starcoder_cpp import starcoder_tokenize, starcoder_detokenize from .starcoder_cpp import starcoder_forward, starcoder_eval, starcoder_embed -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.ggml.model.generation import GenerationMixin +from ipex_llm.utils.common import invalidInputError +from ipex_llm.ggml.model.generation import GenerationMixin from typing import List, Optional, Generator, Sequence, Union import time import uuid diff --git a/python/llm/src/bigdl/llm/ggml/model/starcoder/starcoder_cpp.py b/python/llm/src/ipex_llm/ggml/model/starcoder/starcoder_cpp.py similarity index 98% rename from python/llm/src/bigdl/llm/ggml/model/starcoder/starcoder_cpp.py rename to python/llm/src/ipex_llm/ggml/model/starcoder/starcoder_cpp.py index 2b0d80ee..1346324f 100644 --- a/python/llm/src/bigdl/llm/ggml/model/starcoder/starcoder_cpp.py +++ b/python/llm/src/ipex_llm/ggml/model/starcoder/starcoder_cpp.py @@ -64,8 +64,8 @@ from ctypes import ( c_size_t, ) import pathlib -from bigdl.llm.utils.utils import get_shared_lib_info -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.utils import get_shared_lib_info +from ipex_llm.utils.common import invalidInputError # Load the library diff --git a/python/llm/src/bigdl/llm/ggml/quantize.py b/python/llm/src/ipex_llm/ggml/quantize.py similarity index 99% rename from python/llm/src/bigdl/llm/ggml/quantize.py rename to python/llm/src/ipex_llm/ggml/quantize.py index 382b15eb..15d36202 100644 --- a/python/llm/src/bigdl/llm/ggml/quantize.py +++ b/python/llm/src/ipex_llm/ggml/quantize.py @@ -16,7 +16,7 @@ import os import subprocess -from 
bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError import platform from pathlib import Path diff --git a/python/llm/src/bigdl/llm/gptq/__init__.py b/python/llm/src/ipex_llm/gptq/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/gptq/__init__.py rename to python/llm/src/ipex_llm/gptq/__init__.py diff --git a/python/llm/src/bigdl/llm/gptq/convert/__init__.py b/python/llm/src/ipex_llm/gptq/convert/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/gptq/convert/__init__.py rename to python/llm/src/ipex_llm/gptq/convert/__init__.py diff --git a/python/llm/src/bigdl/llm/gptq/convert/convert_gptq_to_ggml.py b/python/llm/src/ipex_llm/gptq/convert/convert_gptq_to_ggml.py similarity index 99% rename from python/llm/src/bigdl/llm/gptq/convert/convert_gptq_to_ggml.py rename to python/llm/src/ipex_llm/gptq/convert/convert_gptq_to_ggml.py index 8ae3a0ba..9cd66d89 100644 --- a/python/llm/src/bigdl/llm/gptq/convert/convert_gptq_to_ggml.py +++ b/python/llm/src/ipex_llm/gptq/convert/convert_gptq_to_ggml.py @@ -29,7 +29,7 @@ import numpy as np import torch from sentencepiece import SentencePieceProcessor from pathlib import Path -from bigdl.llm.utils.common.log4Error import invalidInputError +from ipex_llm.utils.common.log4Error import invalidInputError def write_header(fout, shape, dst_name, ftype_cur): diff --git a/python/llm/src/bigdl/llm/langchain/__init__.py b/python/llm/src/ipex_llm/langchain/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/langchain/__init__.py rename to python/llm/src/ipex_llm/langchain/__init__.py diff --git a/python/llm/src/bigdl/llm/langchain/embeddings/__init__.py b/python/llm/src/ipex_llm/langchain/embeddings/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/langchain/embeddings/__init__.py rename to python/llm/src/ipex_llm/langchain/embeddings/__init__.py diff --git a/python/llm/src/bigdl/llm/langchain/embeddings/bigdlllm.py b/python/llm/src/ipex_llm/langchain/embeddings/bigdlllm.py similarity index 94% rename from python/llm/src/bigdl/llm/langchain/embeddings/bigdlllm.py rename to python/llm/src/ipex_llm/langchain/embeddings/bigdlllm.py index c4686af3..1947039a 100644 --- a/python/llm/src/bigdl/llm/langchain/embeddings/bigdlllm.py +++ b/python/llm/src/ipex_llm/langchain/embeddings/bigdlllm.py @@ -61,7 +61,7 @@ class BigdlNativeEmbeddings(BaseModel, Embeddings): Example: .. 
code-block:: python - from bigdl.llm.langchain.embeddings import BigdlNativeEmbeddings + from ipex_llm.langchain.embeddings import BigdlNativeEmbeddings llama = BigdlNativeEmbeddings(model_path="/path/to/model.bin") """ @@ -72,10 +72,10 @@ class BigdlNativeEmbeddings(BaseModel, Embeddings): """The model family: currently supports llama, gptneox, bloom and starcoder.""" family_info = { - 'llama': {'module': "bigdl.llm.models", 'class': "Llama"}, - 'bloom': {'module': "bigdl.llm.models", 'class': "Bloom"}, - 'gptneox': {'module': "bigdl.llm.models", 'class': "Gptneox"}, - 'starcoder': {'module':"bigdl.llm.models", 'class': "Starcoder"}, + 'llama': {'module': "ipex_llm.models", 'class': "Llama"}, + 'bloom': {'module': "ipex_llm.models", 'class': "Bloom"}, + 'gptneox': {'module': "ipex_llm.models", 'class': "Gptneox"}, + 'starcoder': {'module':"ipex_llm.models", 'class': "Starcoder"}, } #: :meta private: """Info necessary for different model family initiation and configure.""" @@ -156,7 +156,7 @@ class BigdlNativeEmbeddings(BaseModel, Embeddings): values["client"] = class_(model_path, embedding=True, **model_params) - # from bigdl.llm.ggml.model.llama import Llama + # from ipex_llm.ggml.model.llama import Llama # values["client"] = Llama(model_path, embedding=True, **model_params) @@ -205,14 +205,14 @@ class _BaseEmbeddings(BaseModel, Embeddings): """Wrapper around bigdl-llm embedding models. param model_path: If running with ``native int4``, the path should be converted BigDL-LLM - optimized ggml binary checkpoint, which should be converted by ``bigdl.llm.llm_convert``. + optimized ggml binary checkpoint, which should be converted by ``ipex_llm.llm_convert``. If running with ``transformers int4``, the path should be the huggingface repo id to be downloaded or the huggingface checkpoint folder. Example: .. 
code-block:: python - from bigdl.llm.langchain.embeddings import LlamaEmbeddings + from ipex_llm.langchain.embeddings import LlamaEmbeddings llama = LlamaEmbeddings(model_path="/path/to/model.bin") """ @@ -313,7 +313,7 @@ class _BaseEmbeddings(BaseModel, Embeddings): values["client"] = TransformersEmbeddings.from_model_id(model_path, model_kwargs, **kwargs) - # from bigdl.llm.ggml.model.llama import Llama + # from ipex_llm.ggml.model.llama import Llama # values["client"] = Llama(model_path, embedding=True, **model_params) @@ -366,19 +366,19 @@ class _BaseEmbeddings(BaseModel, Embeddings): class LlamaEmbeddings(_BaseEmbeddings): ggml_model = "Llama" - ggml_module = "bigdl.llm.models" + ggml_module = "ipex_llm.models" class BloomEmbeddings(_BaseEmbeddings): ggml_model = "Bloom" - ggml_module = "bigdl.llm.models" + ggml_module = "ipex_llm.models" class GptneoxEmbeddings(_BaseEmbeddings): ggml_model = "Gptneox" - ggml_module = "bigdl.llm.models" + ggml_module = "ipex_llm.models" class StarcoderEmbeddings(_BaseEmbeddings): ggml_model = "Starcoder" - ggml_module = "bigdl.llm.models" + ggml_module = "ipex_llm.models" diff --git a/python/llm/src/bigdl/llm/langchain/embeddings/transformersembeddings.py b/python/llm/src/ipex_llm/langchain/embeddings/transformersembeddings.py similarity index 98% rename from python/llm/src/bigdl/llm/langchain/embeddings/transformersembeddings.py rename to python/llm/src/ipex_llm/langchain/embeddings/transformersembeddings.py index 9c69f474..dcc1c733 100644 --- a/python/llm/src/bigdl/llm/langchain/embeddings/transformersembeddings.py +++ b/python/llm/src/ipex_llm/langchain/embeddings/transformersembeddings.py @@ -64,7 +64,7 @@ class TransformersEmbeddings(BaseModel, Embeddings): Example: .. code-block:: python - from bigdl.llm.langchain.embeddings import TransformersEmbeddings + from ipex_llm.langchain.embeddings import TransformersEmbeddings embeddings = TransformersEmbeddings.from_model_id(model_id) """ @@ -101,7 +101,7 @@ class TransformersEmbeddings(BaseModel, Embeddings): An object of TransformersEmbeddings. """ try: - from bigdl.llm.transformers import AutoModel + from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer, LlamaTokenizer except ImportError: diff --git a/python/llm/src/bigdl/llm/langchain/llms/__init__.py b/python/llm/src/ipex_llm/langchain/llms/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/langchain/llms/__init__.py rename to python/llm/src/ipex_llm/langchain/llms/__init__.py diff --git a/python/llm/src/bigdl/llm/langchain/llms/bigdlllm.py b/python/llm/src/ipex_llm/langchain/llms/bigdlllm.py similarity index 96% rename from python/llm/src/bigdl/llm/langchain/llms/bigdlllm.py rename to python/llm/src/ipex_llm/langchain/llms/bigdlllm.py index 3a606a06..afd51780 100644 --- a/python/llm/src/bigdl/llm/langchain/llms/bigdlllm.py +++ b/python/llm/src/ipex_llm/langchain/llms/bigdlllm.py @@ -61,7 +61,7 @@ class BigdlNativeLLM(LLM): Example: .. 
code-block:: python - from bigdl.llm.langchain.llms import BigdlNativeLLM + from ipex_llm.langchain.llms import BigdlNativeLLM llm = BigdlNativeLLM(model_path="/path/to/llama/model") """ @@ -73,11 +73,11 @@ class BigdlNativeLLM(LLM): """The model family: currently supports llama, gptneox, bloom, starcoder and chatglm.""" family_info = { - 'llama': {'module': "bigdl.llm.models" , 'class': "Llama"}, - 'bloom': {'module': "bigdl.llm.models", 'class': "Bloom"}, - 'gptneox': {'module': "bigdl.llm.models", 'class': "Gptneox"}, - 'starcoder': {'module':"bigdl.llm.models", 'class': "Starcoder"}, - 'chatglm': {'module':"bigdl.llm.ggml.model.chatglm", 'class': "ChatGLM"}, + 'llama': {'module': "ipex_llm.models" , 'class': "Llama"}, + 'bloom': {'module': "ipex_llm.models", 'class': "Bloom"}, + 'gptneox': {'module': "ipex_llm.models", 'class': "Gptneox"}, + 'starcoder': {'module':"ipex_llm.models", 'class': "Starcoder"}, + 'chatglm': {'module':"ipex_llm.ggml.model.chatglm", 'class': "ChatGLM"}, } #: :meta private: """Info necessary for different model families initiation and configure.""" @@ -286,7 +286,7 @@ class BigdlNativeLLM(LLM): Example: .. code-block:: python - from bigdl.llm.langchain.llms import BigdlNativeLLM + from ipex_llm.langchain.llms import BigdlNativeLLM llm = BigdlNativeLLM(model_path="/path/to/local/llama/model.bin") llm("This is a prompt.") """ @@ -331,7 +331,7 @@ class BigdlNativeLLM(LLM): Example: .. code-block:: python - from bigdl.llm.langchain.llms import BigdlNativeLLM + from ipex_llm.langchain.llms import BigdlNativeLLM llm = BigdlNativeLLM( model_path="/path/to/local/model.bin", temperature = 0.5 @@ -364,7 +364,7 @@ class _BaseCausalLM(LLM): Example: .. code-block:: python - from bigdl.llm.langchain.llms import LlamaLLM + from ipex_llm.langchain.llms import LlamaLLM llm = LlamaLLM(model_path="/path/to/llama/model") """ @@ -588,7 +588,7 @@ class _BaseCausalLM(LLM): Example: .. code-block:: python - from bigdl.llm.langchain.llms import LlamaLLM + from ipex_llm.langchain.llms import LlamaLLM llm = LlamaLLM(model_path="/path/to/local/llama/model.bin") llm("This is a prompt.") """ @@ -636,7 +636,7 @@ class _BaseCausalLM(LLM): Example: .. 
code-block:: python - from bigdl.llm.langchain.llms import LlamaLLM + from ipex_llm.langchain.llms import LlamaLLM llm = LlamaLLM( model_path="/path/to/local/model.bin", temperature = 0.5 @@ -675,24 +675,24 @@ class _BaseCausalLM(LLM): class LlamaLLM(_BaseCausalLM): ggml_model = "Llama" - ggml_module = "bigdl.llm.models" + ggml_module = "ipex_llm.models" class BloomLLM(_BaseCausalLM): ggml_model = "Bloom" - ggml_module = "bigdl.llm.models" + ggml_module = "ipex_llm.models" class GptneoxLLM(_BaseCausalLM): ggml_model = "Gptneox" - ggml_module = "bigdl.llm.models" + ggml_module = "ipex_llm.models" class ChatGLMLLM(_BaseCausalLM): ggml_model = "ChatGLM" - ggml_module = "bigdl.llm.ggml.model.chatglm" + ggml_module = "ipex_llm.ggml.model.chatglm" class StarcoderLLM(_BaseCausalLM): ggml_model = "Starcoder" - ggml_module = "bigdl.llm.models" + ggml_module = "ipex_llm.models" diff --git a/python/llm/src/bigdl/llm/langchain/llms/transformersllm.py b/python/llm/src/ipex_llm/langchain/llms/transformersllm.py similarity index 98% rename from python/llm/src/bigdl/llm/langchain/llms/transformersllm.py rename to python/llm/src/ipex_llm/langchain/llms/transformersllm.py index 85eebf78..f3498e9d 100644 --- a/python/llm/src/bigdl/llm/langchain/llms/transformersllm.py +++ b/python/llm/src/ipex_llm/langchain/llms/transformersllm.py @@ -64,7 +64,7 @@ class TransformersLLM(LLM): Example: .. code-block:: python - from bigdl.llm.langchain.llms import TransformersLLM + from ipex_llm.langchain.llms import TransformersLLM llm = TransformersLLM.from_model_id(model_id="THUDM/chatglm-6b") """ @@ -106,7 +106,7 @@ class TransformersLLM(LLM): An object of TransformersLLM. """ try: - from bigdl.llm.transformers import ( + from ipex_llm.transformers import ( AutoModel, AutoModelForCausalLM, # AutoModelForSeq2SeqLM, @@ -170,7 +170,7 @@ class TransformersLLM(LLM): An object of TransformersLLM. """ try: - from bigdl.llm.transformers import ( + from ipex_llm.transformers import ( AutoModel, AutoModelForCausalLM, ) diff --git a/python/llm/src/bigdl/llm/langchain/llms/transformerspipelinellm.py b/python/llm/src/ipex_llm/langchain/llms/transformerspipelinellm.py similarity index 98% rename from python/llm/src/bigdl/llm/langchain/llms/transformerspipelinellm.py rename to python/llm/src/ipex_llm/langchain/llms/transformerspipelinellm.py index 0049e841..42bc68ed 100644 --- a/python/llm/src/bigdl/llm/langchain/llms/transformerspipelinellm.py +++ b/python/llm/src/ipex_llm/langchain/llms/transformerspipelinellm.py @@ -66,7 +66,7 @@ class TransformersPipelineLLM(LLM): Example: .. 
code-block:: python - from bigdl.llm.langchain.llms import TransformersPipelineLLM + from ipex_llm.langchain.llms import TransformersPipelineLLM llm = TransformersPipelineLLM.from_model_id(model_id="decapoda-research/llama-7b-hf") """ @@ -94,7 +94,7 @@ class TransformersPipelineLLM(LLM): ) -> LLM: """Construct the pipeline object from model_id and task.""" try: - from bigdl.llm.transformers import ( + from ipex_llm.transformers import ( AutoModel, AutoModelForCausalLM, # AutoModelForSeq2SeqLM, diff --git a/python/llm/src/bigdl/llm/llamaindex/__init__.py b/python/llm/src/ipex_llm/llamaindex/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/llamaindex/__init__.py rename to python/llm/src/ipex_llm/llamaindex/__init__.py diff --git a/python/llm/src/bigdl/llm/llamaindex/llms/__init__.py b/python/llm/src/ipex_llm/llamaindex/llms/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/llamaindex/llms/__init__.py rename to python/llm/src/ipex_llm/llamaindex/llms/__init__.py diff --git a/python/llm/src/bigdl/llm/llamaindex/llms/bigdlllm.py b/python/llm/src/ipex_llm/llamaindex/llms/bigdlllm.py similarity index 98% rename from python/llm/src/bigdl/llm/llamaindex/llms/bigdlllm.py rename to python/llm/src/ipex_llm/llamaindex/llms/bigdlllm.py index d6f61854..96550f6a 100644 --- a/python/llm/src/bigdl/llm/llamaindex/llms/bigdlllm.py +++ b/python/llm/src/ipex_llm/llamaindex/llms/bigdlllm.py @@ -91,7 +91,7 @@ class BigdlLLM(CustomLLM): Example: .. code-block:: python - from bigdl.llm.llamaindex.llms import BigdlLLM + from ipex_llm.llamaindex.llms import BigdlLLM llm = BigdlLLM(model_path="/path/to/llama/model") """ @@ -234,7 +234,7 @@ class BigdlLLM(CustomLLM): None. """ model_kwargs = model_kwargs or {} - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM if model: self._model = model else: @@ -244,7 +244,7 @@ class BigdlLLM(CustomLLM): trust_remote_code=True, **model_kwargs ) except: - from bigdl.llm.transformers import AutoModel + from ipex_llm.transformers import AutoModel self._model = AutoModel.from_pretrained(model_name, load_in_4bit=True, **model_kwargs) diff --git a/python/llm/src/bigdl/llm/llm_patching.py b/python/llm/src/ipex_llm/llm_patching.py similarity index 92% rename from python/llm/src/bigdl/llm/llm_patching.py rename to python/llm/src/ipex_llm/llm_patching.py index d84ca4f1..8c0a94e5 100644 --- a/python/llm/src/bigdl/llm/llm_patching.py +++ b/python/llm/src/ipex_llm/llm_patching.py @@ -17,7 +17,7 @@ import transformers import importlib import sys -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError from enum import Enum bigdl_patched = None # None or 'Train' or 'Inference' @@ -43,7 +43,7 @@ def llm_patch(train=False): # Initial version of patch for llm finetuning, inference support TBD if train: - from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel + from ipex_llm.transformers import AutoModelForCausalLM, AutoModel replace_attr(transformers, "AutoModelForCausalLM", AutoModelForCausalLM) replace_attr(transformers, "LlamaForCausalLM", AutoModelForCausalLM) replace_attr(transformers, "AutoModel", AutoModel) @@ -53,7 +53,7 @@ def llm_patch(train=False): invalidInputError(not import_peft_check, 'llm_patch() should be called at the beginning of your code.') import peft - from bigdl.llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\ + from ipex_llm.transformers.qlora import get_peft_model, 
prepare_model_for_kbit_training,\ LoraConfig, TrainingArguments replace_attr(transformers, "TrainingArguments", TrainingArguments) get_peft_model_original = getattr(peft, "get_peft_model") diff --git a/python/llm/src/bigdl/llm/models.py b/python/llm/src/ipex_llm/models.py similarity index 79% rename from python/llm/src/bigdl/llm/models.py rename to python/llm/src/ipex_llm/models.py index 0a1a2123..9157af42 100644 --- a/python/llm/src/bigdl/llm/models.py +++ b/python/llm/src/ipex_llm/models.py @@ -19,9 +19,9 @@ # Otherwise there would be module not found error in non-pip's setting as Python would # only search the first bigdl package and end up finding only one sub-package. -from bigdl.llm.ggml.model.llama import Llama -from bigdl.llm.ggml.model.gptneox import Gptneox -from bigdl.llm.ggml.model.bloom import Bloom -from bigdl.llm.ggml.model.starcoder import Starcoder +from ipex_llm.ggml.model.llama import Llama +from ipex_llm.ggml.model.gptneox import Gptneox +from ipex_llm.ggml.model.bloom import Bloom +from ipex_llm.ggml.model.starcoder import Starcoder # temporarily disable until linux binary file for chatglm ready -# from bigdl.llm.ggml.model.chatglm import ChatGLM +# from ipex_llm.ggml.model.chatglm import ChatGLM diff --git a/python/llm/src/bigdl/llm/optimize.py b/python/llm/src/ipex_llm/optimize.py similarity index 96% rename from python/llm/src/bigdl/llm/optimize.py rename to python/llm/src/ipex_llm/optimize.py index 75b1760b..ee1afc4d 100644 --- a/python/llm/src/bigdl/llm/optimize.py +++ b/python/llm/src/ipex_llm/optimize.py @@ -22,9 +22,9 @@ from torch.nn.modules import Module from torch.nn.modules.module import _IncompatibleKeys from accelerate import init_empty_weights from accelerate.utils import set_module_tensor_to_device -from bigdl.llm.ggml.quantize import ggml_tensor_qtype -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.utils import extract_local_archive_file, get_local_shard_files +from ipex_llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.utils.common import invalidInputError +from ipex_llm.transformers.utils import extract_local_archive_file, get_local_shard_files import transformers import warnings from transformers import PreTrainedModel @@ -145,7 +145,7 @@ def load_low_bit(model, model_path): >>> # Example 1: >>> # Take ChatGLM2-6B model as an example >>> # Make sure you have saved the optimized model by calling 'save_low_bit' - >>> from bigdl.llm.optimize import low_memory_init, load_low_bit + >>> from ipex_llm.optimize import low_memory_init, load_low_bit >>> with low_memory_init(): # Fast and low cost by loading model on meta device >>> model = AutoModel.from_pretrained(saved_dir, >>> torch_dtype="auto", @@ -157,7 +157,7 @@ def load_low_bit(model, model_path): >>> # alternatively, you can obtain the model instance through traditional loading method. >>> # Take OpenAI Whisper model as an example >>> # Make sure you have saved the optimized model by calling 'save_low_bit' - >>> from bigdl.llm.optimize import load_low_bit + >>> from ipex_llm.optimize import load_low_bit >>> model = whisper.load_model('tiny') # A model instance through traditional loading method >>> model = load_low_bit(model, saved_dir) # Load the optimized model """ @@ -216,7 +216,7 @@ def optimize_model(model, low_bit='sym_int4', optimize_llm=True, modules_to_not_ :return: The optimized model. 
>>> # Take OpenAI Whisper model as an example - >>> from bigdl.llm import optimize_model + >>> from ipex_llm import optimize_model >>> model = whisper.load_model('tiny') # Load whisper model under pytorch framework >>> model = optimize_model(model) # With only one line code change >>> # Use the optimized model without other API change diff --git a/python/llm/src/bigdl/llm/serving/__init__.py b/python/llm/src/ipex_llm/serving/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/serving/__init__.py rename to python/llm/src/ipex_llm/serving/__init__.py diff --git a/python/llm/src/bigdl/llm/serving/fastchat/README.md b/python/llm/src/ipex_llm/serving/fastchat/README.md similarity index 87% rename from python/llm/src/bigdl/llm/serving/fastchat/README.md rename to python/llm/src/ipex_llm/serving/fastchat/README.md index c78b1179..20c4893a 100644 --- a/python/llm/src/bigdl/llm/serving/fastchat/README.md +++ b/python/llm/src/ipex_llm/serving/fastchat/README.md @@ -71,10 +71,10 @@ Then we can run model workers ```bash # On CPU -python3 -m bigdl.llm.serving.fastchat.model_worker --model-path PATH/TO/bigdl-7b --device cpu +python3 -m ipex_llm.serving.fastchat.model_worker --model-path PATH/TO/bigdl-7b --device cpu # On GPU -python3 -m bigdl.llm.serving.fastchat.model_worker --model-path PATH/TO/bigdl-7b --device xpu +python3 -m ipex_llm.serving.fastchat.model_worker --model-path PATH/TO/bigdl-7b --device xpu ``` If you run successfully using `BigDL` backend, you can see the output in log like this: @@ -94,14 +94,14 @@ To run the `bigdl_worker` on CPU, using the following code: source bigdl-llm-init -t # Available low_bit format including sym_int4, sym_int8, bf16 etc. -python3 -m bigdl.llm.serving.fastchat.bigdl_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "sym_int4" --trust-remote-code --device "cpu" +python3 -m ipex_llm.serving.fastchat.bigdl_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "sym_int4" --trust-remote-code --device "cpu" ``` For GPU example: ```bash # Available low_bit format including sym_int4, sym_int8, fp16 etc. 
-python3 -m bigdl.llm.serving.fastcaht.bigdl_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "sym_int4" --trust-remote-code --device "xpu" +python3 -m ipex_llm.serving.fastchat.bigdl_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "sym_int4" --trust-remote-code --device "xpu" ``` For a full list of accepted arguments, you can refer to the main method of the `bigdl_worker.py` @@ -114,10 +114,10 @@ To run using the `vLLM_worker`, we don't need to change model name, just simply ```bash # On CPU -python3 -m bigdl.llm.serving.fastchat.vllm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --device cpu +python3 -m ipex_llm.serving.fastchat.vllm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --device cpu # On GPU -python3 -m bigdl.llm.serving.fastchat.vllm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --device xpu +python3 -m ipex_llm.serving.fastchat.vllm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --device xpu ``` ### Launch Gradio web server diff --git a/python/llm/src/bigdl/llm/serving/fastchat/__init__.py b/python/llm/src/ipex_llm/serving/fastchat/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/serving/fastchat/__init__.py rename to python/llm/src/ipex_llm/serving/fastchat/__init__.py diff --git a/python/llm/src/bigdl/llm/serving/fastchat/bigdl_llm_model.py b/python/llm/src/ipex_llm/serving/fastchat/bigdl_llm_model.py similarity index 97% rename from python/llm/src/bigdl/llm/serving/fastchat/bigdl_llm_model.py rename to python/llm/src/ipex_llm/serving/fastchat/bigdl_llm_model.py index fafb0a36..ba405239 100644 --- a/python/llm/src/bigdl/llm/serving/fastchat/bigdl_llm_model.py +++ b/python/llm/src/ipex_llm/serving/fastchat/bigdl_llm_model.py @@ -33,7 +33,7 @@ from transformers import AutoTokenizer from typing import Dict, List, Optional import math import psutil -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError is_fastchat_patched = False _mapping_fastchat = None @@ -63,7 +63,7 @@ def load_model_base(self, model_path: str, from_pretrained_kwargs: dict): use_fast=self.use_fast_tokenizer, revision=revision, ) - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained( model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs ) @@ -76,7 +76,7 @@ def load_model_chatglm(self, model_path: str, from_pretrained_kwargs: dict): tokenizer = AutoTokenizer.from_pretrained( model_path, trust_remote_code=True, revision=revision ) - from bigdl.llm.transformers import AutoModel + from ipex_llm.transformers import AutoModel model = AutoModel.from_pretrained( model_path, trust_remote_code=True, load_in_4bit=True, **from_pretrained_kwargs ) @@ -246,7 +246,7 @@ class BigDLLLMAdapter(BaseModelAdapter): model_path, use_fast=False, revision=revision, trust_remote_code=True ) print("Customized bigdl-llm loader") - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained( model_path, load_in_4bit=True, @@ -268,7 +268,7 @@ class BigDLLMLOWBITAdapter(BaseModelAdapter): model_path, use_fast=False, revision=revision ) print("Customized bigdl-llm loader") - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM model = AutoModelForCausalLM.load_low_bit(model_path) return model, tokenizer diff --git a/python/llm/src/bigdl/llm/serving/fastchat/bigdl_worker.py
b/python/llm/src/ipex_llm/serving/fastchat/bigdl_worker.py similarity index 99% rename from python/llm/src/bigdl/llm/serving/fastchat/bigdl_worker.py rename to python/llm/src/ipex_llm/serving/fastchat/bigdl_worker.py index 35c009f7..b357c8d3 100644 --- a/python/llm/src/bigdl/llm/serving/fastchat/bigdl_worker.py +++ b/python/llm/src/ipex_llm/serving/fastchat/bigdl_worker.py @@ -43,7 +43,7 @@ from fastchat.serve.base_model_worker import ( ) from fastchat.utils import get_context_length, is_partial_stop -from bigdl.llm.transformers.loader import load_model +from ipex_llm.transformers.loader import load_model from transformers import TextIteratorStreamer app = FastAPI() diff --git a/python/llm/src/bigdl/llm/serving/fastchat/model_worker.py b/python/llm/src/ipex_llm/serving/fastchat/model_worker.py similarity index 99% rename from python/llm/src/bigdl/llm/serving/fastchat/model_worker.py rename to python/llm/src/ipex_llm/serving/fastchat/model_worker.py index 13b98057..c822769f 100644 --- a/python/llm/src/bigdl/llm/serving/fastchat/model_worker.py +++ b/python/llm/src/ipex_llm/serving/fastchat/model_worker.py @@ -28,7 +28,7 @@ import time from typing import List, Optional import threading import uuid -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError from fastapi import FastAPI, Request, BackgroundTasks from fastapi.responses import StreamingResponse, JSONResponse diff --git a/python/llm/src/bigdl/llm/serving/fastchat/vllm_worker.py b/python/llm/src/ipex_llm/serving/fastchat/vllm_worker.py similarity index 97% rename from python/llm/src/bigdl/llm/serving/fastchat/vllm_worker.py rename to python/llm/src/ipex_llm/serving/fastchat/vllm_worker.py index 780ca2ae..53f99746 100644 --- a/python/llm/src/bigdl/llm/serving/fastchat/vllm_worker.py +++ b/python/llm/src/ipex_llm/serving/fastchat/vllm_worker.py @@ -35,10 +35,10 @@ import uvicorn # from vllm.engine.arg_utils import AsyncEngineArgs # from vllm.sampling_params import SamplingParams # from vllm.utils import random_uuid -from bigdl.llm.vllm.engine.async_llm_engine import AsyncLLMEngine -from bigdl.llm.vllm.engine.arg_utils import AsyncEngineArgs -from bigdl.llm.vllm.sampling_params import SamplingParams -from bigdl.llm.vllm.utils import random_uuid +from ipex_llm.vllm.engine.async_llm_engine import AsyncLLMEngine +from ipex_llm.vllm.engine.arg_utils import AsyncEngineArgs +from ipex_llm.vllm.sampling_params import SamplingParams +from ipex_llm.vllm.utils import random_uuid import numpy as np diff --git a/python/llm/src/bigdl/llm/transformers/__init__.py b/python/llm/src/ipex_llm/transformers/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/__init__.py rename to python/llm/src/ipex_llm/transformers/__init__.py diff --git a/python/llm/src/bigdl/llm/transformers/awq/__init__.py b/python/llm/src/ipex_llm/transformers/awq/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/awq/__init__.py rename to python/llm/src/ipex_llm/transformers/awq/__init__.py diff --git a/python/llm/src/bigdl/llm/transformers/awq/act.py b/python/llm/src/ipex_llm/transformers/awq/act.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/awq/act.py rename to python/llm/src/ipex_llm/transformers/awq/act.py diff --git a/python/llm/src/bigdl/llm/transformers/awq/awq.py b/python/llm/src/ipex_llm/transformers/awq/awq.py similarity index 97% rename from python/llm/src/bigdl/llm/transformers/awq/awq.py rename to 
python/llm/src/ipex_llm/transformers/awq/awq.py index 308a62f1..c4f822e9 100644 --- a/python/llm/src/bigdl/llm/transformers/awq/awq.py +++ b/python/llm/src/ipex_llm/transformers/awq/awq.py @@ -52,9 +52,9 @@ from transformers.models.opt.modeling_opt import OPTForCausalLM from transformers.models.llama.modeling_llama import LlamaForCausalLM from transformers.models.bloom.modeling_bloom import BloomBlock from transformers import AwqConfig, AutoConfig -from bigdl.llm.transformers.awq.linear import WQLinear_GEMM, WQLinear_GEMV +from ipex_llm.transformers.awq.linear import WQLinear_GEMM, WQLinear_GEMV from huggingface_hub import snapshot_download -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError layer_type_dict = { @@ -151,7 +151,7 @@ def get_layer_type(config): def scale_activations(module): - from bigdl.llm.transformers.awq.act import ScaledActivation + from ipex_llm.transformers.awq.act import ScaledActivation param = next(module.parameters()) dtype = param.dtype device = param.device diff --git a/python/llm/src/bigdl/llm/transformers/awq/awq_config.py b/python/llm/src/ipex_llm/transformers/awq/awq_config.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/awq/awq_config.py rename to python/llm/src/ipex_llm/transformers/awq/awq_config.py index 79d011f5..0f6fe60d 100644 --- a/python/llm/src/bigdl/llm/transformers/awq/awq_config.py +++ b/python/llm/src/ipex_llm/transformers/awq/awq_config.py @@ -34,7 +34,7 @@ # from dataclasses import dataclass -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError from transformers.utils.quantization_config import QuantizationConfigMixin from transformers.utils.quantization_config import AwqBackendPackingMethod,\ AWQLinearVersion, QuantizationMethod diff --git a/python/llm/src/bigdl/llm/transformers/awq/linear.py b/python/llm/src/ipex_llm/transformers/awq/linear.py similarity index 99% rename from python/llm/src/bigdl/llm/transformers/awq/linear.py rename to python/llm/src/ipex_llm/transformers/awq/linear.py index b2e4958f..750ffc53 100644 --- a/python/llm/src/bigdl/llm/transformers/awq/linear.py +++ b/python/llm/src/ipex_llm/transformers/awq/linear.py @@ -43,7 +43,7 @@ import torch import torch.nn as nn -from bigdl.llm.utils.common import invalidOperationError, invalidInputError +from ipex_llm.utils.common import invalidOperationError, invalidInputError from transformers import AwqConfig from transformers.utils.quantization_config import AwqBackendPackingMethod diff --git a/python/llm/src/bigdl/llm/transformers/bmm.py b/python/llm/src/ipex_llm/transformers/bmm.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/bmm.py rename to python/llm/src/ipex_llm/transformers/bmm.py diff --git a/python/llm/src/bigdl/llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py similarity index 90% rename from python/llm/src/bigdl/llm/transformers/convert.py rename to python/llm/src/ipex_llm/transformers/convert.py index f21b7207..820dbecc 100644 --- a/python/llm/src/bigdl/llm/transformers/convert.py +++ b/python/llm/src/ipex_llm/transformers/convert.py @@ -42,12 +42,12 @@ from accelerate import init_empty_weights import warnings import transformers import importlib.util -from bigdl.llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.ggml.quantize import ggml_tensor_qtype from .utils import logger, get_cur_qtype_and_imatrix from typing import Union import numpy as np import os -from 
bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError from typing import List, Optional, Tuple, Union @@ -76,7 +76,7 @@ if is_auto_gptq_available(): from auto_gptq.utils.peft_utils import QuantLinearCuda, QuantLinearCudaOld if is_auto_awq_available(): - from bigdl.llm.transformers.awq.linear import WQLinear_GEMM + from ipex_llm.transformers.awq.linear import WQLinear_GEMM from transformers.utils.quantization_config import AwqBackendPackingMethod @@ -120,7 +120,7 @@ def is_linear_module(module): def convert_gptq(module, awq=False, llm_awq=False): - from bigdl.llm.transformers.low_bit_linear import get_block_size + from ipex_llm.transformers.low_bit_linear import get_block_size Q4_1 = get_block_size("asym_int4") scales = module.scales @@ -194,9 +194,9 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None, imatrix_data=None, embedding_qtype=None, model_type=None, torch_dtype=torch.float32, enable_xetla=False): - from bigdl.llm.transformers.low_bit_linear import LowBitLinear, FP4Params, \ + from ipex_llm.transformers.low_bit_linear import LowBitLinear, FP4Params, \ FP16Linear, BF16Linear - from bigdl.llm.transformers.embedding import LLMEmbedding, LowBitEmbedding + from ipex_llm.transformers.embedding import LLMEmbedding, LowBitEmbedding has_been_replaced = False for name, module in model.named_children(): @@ -288,7 +288,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None, optimize_lm_head=optimize_lm_head ) device = module.weight.data.device - from bigdl.llm.transformers.utils import get_ipex_version + from ipex_llm.transformers.utils import get_ipex_version if get_ipex_version() < "2.1.10+xpu": new_linear._parameters['weight'] = nn.Parameter(module.weight) else: @@ -389,7 +389,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None, def replace_with_low_bit_linear_for_module(model, qtype, module_name=None, modules_to_not_convert=None, current_key_name=None, convert_shape_only=False, torch_dtype="auto"): - from bigdl.llm.transformers.low_bit_linear import LowBitLinear, FP4Params, \ + from ipex_llm.transformers.low_bit_linear import LowBitLinear, FP4Params, \ FP16Linear, BF16Linear has_been_replaced = False @@ -479,7 +479,7 @@ def replace_with_low_bit_linear_for_module(model, qtype, module_name=None, mp_group=mp_group, ) device = module.weight.data.device - from bigdl.llm.transformers.utils import get_ipex_version + from ipex_llm.transformers.utils import get_ipex_version if get_ipex_version() < "2.1.10+xpu": new_linear._parameters['weight'] = nn.Parameter(module.weight) else: @@ -592,7 +592,7 @@ def _optimize_pre(model): not model.config.is_decoder and model.config.position_embedding_type == "absolute" ): - from bigdl.llm.transformers.models.bert import merge_qkv + from ipex_llm.transformers.models.bert import merge_qkv model.apply(merge_qkv) if model.config.model_type == "qwen": position_ids = torch.arange(0, model.config.max_position_embeddings) @@ -695,8 +695,8 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True, # currently put interpolation execution into cpu visual_module_name = model.transformer.visual.__class__.__module__ visual_module = importlib.import_module(visual_module_name) - from bigdl.llm.transformers.models.qwen_vl import qwen_vl_vision_transformer_forward - from bigdl.llm.transformers.models.qwen_vl import qwen_vl_resampler_forward + from ipex_llm.transformers.models.qwen_vl import qwen_vl_vision_transformer_forward + from 
ipex_llm.transformers.models.qwen_vl import qwen_vl_resampler_forward convert_forward(model, visual_module.VisionTransformer, qwen_vl_vision_transformer_forward @@ -710,7 +710,7 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True, def convert_bigdl_other_module(model, dtype): # Convert modules outside of bigdl linear to corresponding dtype - from bigdl.llm.transformers.low_bit_linear import LowBitLinear, \ + from ipex_llm.transformers.low_bit_linear import LowBitLinear, \ FP16Linear, BF16Linear for module in model.modules(): if list(module.children()) == []: @@ -745,7 +745,7 @@ def _optimize_ipex(model, qtype=ggml_tensor_qtype["bf16"]): import intel_extension_for_pytorch as ipex from intel_extension_for_pytorch.transformers.optimize import model_convert_reference from transformers.modeling_attn_mask_utils import AttentionMaskConverter - from bigdl.llm.transformers.convert_ipex import ( + from ipex_llm.transformers.convert_ipex import ( _ipex_optimize_model, _ipex_jit, _make_causal_mask, _llama_model_forward_4_35, convert_function, GLM_get_masks, ) @@ -780,13 +780,13 @@ def _optimize_ipex(model, qtype=ggml_tensor_qtype["bf16"]): def _optimize_post(model, lightweight_bmm=False): from packaging import version - from bigdl.llm.transformers.models.llama import llama_attention_forward_4_31 - from bigdl.llm.transformers.models.llama import llama_attention_selective_batching_forward_4_31 - from bigdl.llm.transformers.models.llama import llama_model_selective_batching_forward_4_31 - from bigdl.llm.transformers.models.llama import llama_rms_norm_forward - from bigdl.llm.transformers.models.llama import llama_mlp_forward - from bigdl.llm.transformers.models.llama import llama_decoder_forward - from bigdl.llm.transformers.models.llama import llama_model_forward + from ipex_llm.transformers.models.llama import llama_attention_forward_4_31 + from ipex_llm.transformers.models.llama import llama_attention_selective_batching_forward_4_31 + from ipex_llm.transformers.models.llama import llama_model_selective_batching_forward_4_31 + from ipex_llm.transformers.models.llama import llama_rms_norm_forward + from ipex_llm.transformers.models.llama import llama_mlp_forward + from ipex_llm.transformers.models.llama import llama_decoder_forward + from ipex_llm.transformers.models.llama import llama_model_forward from transformers.modeling_utils import PreTrainedModel # All huggingface format models are inherited from `PreTrainedModel` @@ -813,8 +813,8 @@ def _optimize_post(model, lightweight_bmm=False): llama_decoder_forward) if version.parse(trans_version) >= version.parse("4.36.0"): # transformers version >= 4.36.0 - from bigdl.llm.transformers.models.llama import llama_attention_forward_4_36 - from bigdl.llm.transformers.models.llama import llama_model_forward_4_36 + from ipex_llm.transformers.models.llama import llama_attention_forward_4_36 + from ipex_llm.transformers.models.llama import llama_model_forward_4_36 convert_forward( model, transformers.models.llama.modeling_llama.LlamaAttention, @@ -850,7 +850,7 @@ def _optimize_post(model, lightweight_bmm=False): pass # convert all nn.LayerNorm - from bigdl.llm.transformers.models.bloom import bloom_layer_norm_forward + from ipex_llm.transformers.models.bloom import bloom_layer_norm_forward convert_forward(model, nn.LayerNorm, bloom_layer_norm_forward) @@ -861,7 +861,7 @@ def _optimize_post(model, lightweight_bmm=False): # chatglm2-6b-32k modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from 
bigdl.llm.transformers.models.chatglm2_32k import chatglm2_32k_attention_forward + from ipex_llm.transformers.models.chatglm2_32k import chatglm2_32k_attention_forward convert_forward(model, module.SelfAttention, chatglm2_32k_attention_forward) @@ -870,9 +870,9 @@ def _optimize_post(model, lightweight_bmm=False): # chatglm2-6b modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.chatglm2 import chatglm2_attention_forward - from bigdl.llm.transformers.models.chatglm2 import chatglm_rms_norm_forward - from bigdl.llm.transformers.models.chatglm2 import chatglm2_model_forward + from ipex_llm.transformers.models.chatglm2 import chatglm2_attention_forward + from ipex_llm.transformers.models.chatglm2 import chatglm_rms_norm_forward + from ipex_llm.transformers.models.chatglm2 import chatglm2_model_forward convert_forward(model, module.SelfAttention, chatglm2_attention_forward) @@ -886,7 +886,7 @@ def _optimize_post(model, lightweight_bmm=False): # chatglm-6b modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.chatglm import chatglm_attention_forward + from ipex_llm.transformers.models.chatglm import chatglm_attention_forward convert_forward(model, module.SelfAttention, chatglm_attention_forward @@ -896,7 +896,7 @@ def _optimize_post(model, lightweight_bmm=False): modeling_module_name = model.__class__.__module__ attention_module_name = '.'.join(modeling_module_name.split('.')[:-1]) + ".attention" module = importlib.import_module(attention_module_name) - from bigdl.llm.transformers.models.mpt import mpt_multihead_attention_forward + from ipex_llm.transformers.models.mpt import mpt_multihead_attention_forward convert_forward(model, module.MultiheadAttention, mpt_multihead_attention_forward @@ -905,7 +905,7 @@ def _optimize_post(model, lightweight_bmm=False): # dolly-v1-6b modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.gptj import gptj_attention_forward, gptj_model_forward,\ + from ipex_llm.transformers.models.gptj import gptj_attention_forward, gptj_model_forward,\ gptj_block_forward convert_forward(model, module.GPTJAttention, @@ -919,7 +919,7 @@ def _optimize_post(model, lightweight_bmm=False): elif "bloom" in model.config.model_type: modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.bloom import bloom_attention_forward + from ipex_llm.transformers.models.bloom import bloom_attention_forward convert_forward(model, module.BloomAttention, bloom_attention_forward @@ -931,7 +931,7 @@ def _optimize_post(model, lightweight_bmm=False): if "RWForCausalLM" in model.config.architectures: if model.config.hidden_size == 4544: # falcon-7b need to check performance drop after kv cache support. 
- # from bigdl.llm.transformers.models.falcon import rw_attention_forward_7b + # from ipex_llm.transformers.models.falcon import rw_attention_forward_7b # convert_forward(model, # module.Attention, # rw_attention_forward_7b @@ -939,7 +939,7 @@ def _optimize_post(model, lightweight_bmm=False): pass else: # falcon-40b - from bigdl.llm.transformers.models.falcon import rw_attention_forward_40b + from ipex_llm.transformers.models.falcon import rw_attention_forward_40b convert_forward(model, module.Attention, rw_attention_forward_40b @@ -949,7 +949,7 @@ def _optimize_post(model, lightweight_bmm=False): # falcon-180b and new falcon-40b if version.parse(trans_version) >= version.parse("4.36.0"): # transformers version >= 4.36.0 - from bigdl.llm.transformers.models.falcon import \ + from ipex_llm.transformers.models.falcon import \ falcon_attention_forward_4_36 convert_forward(model, @@ -957,7 +957,7 @@ def _optimize_post(model, lightweight_bmm=False): falcon_attention_forward_4_36 ) else: - from bigdl.llm.transformers.models.falcon import falcon_attention_forward + from ipex_llm.transformers.models.falcon import falcon_attention_forward convert_forward(model, module.FalconAttention, falcon_attention_forward @@ -969,8 +969,8 @@ def _optimize_post(model, lightweight_bmm=False): # baichuan2-7B modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.baichuan2 import baichuan_attention_forward_7b - from bigdl.llm.transformers.models.baichuan2 import baichuan_mlp_forward + from ipex_llm.transformers.models.baichuan2 import baichuan_attention_forward_7b + from ipex_llm.transformers.models.baichuan2 import baichuan_mlp_forward convert_forward(model, module.Attention, baichuan_attention_forward_7b @@ -985,10 +985,10 @@ def _optimize_post(model, lightweight_bmm=False): # baichuan2-13B modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.baichuan2 import baichuan_attention_forward_13b - from bigdl.llm.transformers.models.baichuan2 import baichuan_13b_rms_norm_forward - from bigdl.llm.transformers.models.baichuan2 import baichuan_mlp_forward - from bigdl.llm.transformers.models.baichuan2 import baichuan_13b_get_alibi_mask + from ipex_llm.transformers.models.baichuan2 import baichuan_attention_forward_13b + from ipex_llm.transformers.models.baichuan2 import baichuan_13b_rms_norm_forward + from ipex_llm.transformers.models.baichuan2 import baichuan_mlp_forward + from ipex_llm.transformers.models.baichuan2 import baichuan_13b_get_alibi_mask convert_forward(model, module.BaichuanAttention, baichuan_attention_forward_13b @@ -1010,7 +1010,7 @@ def _optimize_post(model, lightweight_bmm=False): # baichuan-7B modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.baichuan import baichuan_attention_forward_7b + from ipex_llm.transformers.models.baichuan import baichuan_attention_forward_7b convert_forward(model, module.Attention, baichuan_attention_forward_7b @@ -1022,8 +1022,8 @@ def _optimize_post(model, lightweight_bmm=False): # baichuan-13B modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.baichuan import baichuan_attention_forward_13b - from bigdl.llm.transformers.models.baichuan2 import baichuan_13b_rms_norm_forward + from ipex_llm.transformers.models.baichuan 
import baichuan_attention_forward_13b + from ipex_llm.transformers.models.baichuan2 import baichuan_13b_rms_norm_forward convert_forward(model, module.BaichuanAttention, baichuan_attention_forward_13b @@ -1033,7 +1033,7 @@ def _optimize_post(model, lightweight_bmm=False): module.RMSNorm, baichuan_13b_rms_norm_forward) elif model.config.model_type == "gpt_neox": - from bigdl.llm.transformers.models.gptneox import gptneox_attention_forward + from ipex_llm.transformers.models.gptneox import gptneox_attention_forward convert_forward(model, transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXAttention, gptneox_attention_forward @@ -1041,8 +1041,8 @@ def _optimize_post(model, lightweight_bmm=False): elif model.config.model_type == "internlm": modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.internlm import internlm_attention_forward - from bigdl.llm.transformers.models.internlm import internlm2_attention_forward + from ipex_llm.transformers.models.internlm import internlm_attention_forward + from ipex_llm.transformers.models.internlm import internlm2_attention_forward try: convert_forward(model, module.InternLM2Attention, @@ -1068,7 +1068,7 @@ def _optimize_post(model, lightweight_bmm=False): # for Qwen-VL-Chat modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.qwen_vl import qwen_attention_forward_vl + from ipex_llm.transformers.models.qwen_vl import qwen_attention_forward_vl convert_forward(model, module.QWenAttention, qwen_attention_forward_vl @@ -1077,10 +1077,10 @@ def _optimize_post(model, lightweight_bmm=False): # for Qwen-7B and Qwen-14B modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.qwen import qwen_attention_forward - from bigdl.llm.transformers.models.qwen import qwen_mlp_forward - from bigdl.llm.transformers.models.chatglm2 import chatglm_rms_norm_forward - from bigdl.llm.transformers.models.qwen import qwen_model_forward + from ipex_llm.transformers.models.qwen import qwen_attention_forward + from ipex_llm.transformers.models.qwen import qwen_mlp_forward + from ipex_llm.transformers.models.chatglm2 import chatglm_rms_norm_forward + from ipex_llm.transformers.models.qwen import qwen_model_forward convert_forward(model, module.QWenAttention, qwen_attention_forward @@ -1098,8 +1098,8 @@ def _optimize_post(model, lightweight_bmm=False): # for Qwen1.5-7B modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.qwen2 import qwen2_model_forward - from bigdl.llm.transformers.models.qwen2 import qwen2_attention_forward + from ipex_llm.transformers.models.qwen2 import qwen2_model_forward + from ipex_llm.transformers.models.qwen2 import qwen2_attention_forward convert_forward(model, module.Qwen2Model, qwen2_model_forward) @@ -1115,7 +1115,7 @@ def _optimize_post(model, lightweight_bmm=False): elif model.config.model_type == "aquila": modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.aquila import aquila_attention_forward + from ipex_llm.transformers.models.aquila import aquila_attention_forward convert_forward(model, module.AquilaAttention, aquila_attention_forward @@ -1130,7 +1130,7 @@ def _optimize_post(model, lightweight_bmm=False): "to run 
Mixtral models.") modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.mixtral import mixtral_moeblock_forward, \ + from ipex_llm.transformers.models.mixtral import mixtral_moeblock_forward, \ mixtral_attention_forward, mixtral_mlp_forward, mixtral_model_forward convert_forward(model, module.MixtralAttention, @@ -1153,7 +1153,7 @@ def _optimize_post(model, lightweight_bmm=False): # For phixtral, limit the condition to avoid applying on phi-2 hosted by ModelScope modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.phixtral import phixtral_moeblock_forward, \ + from ipex_llm.transformers.models.phixtral import phixtral_moeblock_forward, \ phixtral_mlp_forward convert_forward(model, module.MoE, @@ -1177,8 +1177,8 @@ def _optimize_post(model, lightweight_bmm=False): if version.parse(trans_version) >= version.parse("4.36.0"): modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.mistral import mistral_attention_forward_4_36 - from bigdl.llm.transformers.models.mistral import mistral_model_forward_4_36 + from ipex_llm.transformers.models.mistral import mistral_attention_forward_4_36 + from ipex_llm.transformers.models.mistral import mistral_model_forward_4_36 convert_forward(model, module.MistralAttention, mistral_attention_forward_4_36 @@ -1196,7 +1196,7 @@ def _optimize_post(model, lightweight_bmm=False): else: modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.mistral import mistral_attention_forward + from ipex_llm.transformers.models.mistral import mistral_attention_forward convert_forward(model, module.MistralAttention, mistral_attention_forward @@ -1210,9 +1210,9 @@ def _optimize_post(model, lightweight_bmm=False): elif model.config.model_type == "gemma": modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.gemma import gemma_attention_forward - from bigdl.llm.transformers.models.gemma import gemma_rms_norm_forward - from bigdl.llm.transformers.models.gemma import gemma_mlp_forward + from ipex_llm.transformers.models.gemma import gemma_attention_forward + from ipex_llm.transformers.models.gemma import gemma_rms_norm_forward + from ipex_llm.transformers.models.gemma import gemma_mlp_forward convert_forward(model, module.GemmaAttention, gemma_attention_forward, @@ -1231,7 +1231,7 @@ def _optimize_post(model, lightweight_bmm=False): llama_rms_norm_forward) elif model.config.model_type == "whisper" and lightweight_bmm: if platform.system().lower() == 'windows': - from bigdl.llm.transformers.bmm import SafeBMM + from ipex_llm.transformers.bmm import SafeBMM modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) old_fwd = module.WhisperAttention.forward @@ -1247,8 +1247,8 @@ def _optimize_post(model, lightweight_bmm=False): # rwkv v4 modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.rwkv4 import rwkv_attention_forward - from bigdl.llm.transformers.models.rwkv4 import rwkv_ffn_forward + from ipex_llm.transformers.models.rwkv4 import rwkv_attention_forward + from ipex_llm.transformers.models.rwkv4 import 
rwkv_ffn_forward convert_forward(model, module.RwkvSelfAttention, rwkv_attention_forward) @@ -1259,9 +1259,9 @@ def _optimize_post(model, lightweight_bmm=False): # rwkv v5 modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.rwkv5 import rwkv_attention_forward - from bigdl.llm.transformers.models.rwkv5 import rwkv_ffn_forward_wrapper - from bigdl.llm.transformers.models.rwkv5 import rwkv_model_forward_wrapper + from ipex_llm.transformers.models.rwkv5 import rwkv_attention_forward + from ipex_llm.transformers.models.rwkv5 import rwkv_ffn_forward_wrapper + from ipex_llm.transformers.models.rwkv5 import rwkv_model_forward_wrapper convert_forward(model, module.RwkvSelfAttention, rwkv_attention_forward) @@ -1276,7 +1276,7 @@ def _optimize_post(model, lightweight_bmm=False): elif model.config.model_type == "deci": modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.decilm import decilm_attention_forward_4_35_2 + from ipex_llm.transformers.models.decilm import decilm_attention_forward_4_35_2 convert_forward(model, module.LlamaRMSNorm, llama_rms_norm_forward) @@ -1290,8 +1290,8 @@ def _optimize_post(model, lightweight_bmm=False): # starcoder modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.gptbigcode import _attn_wrapper - from bigdl.llm.transformers.models.gptbigcode import gptbigcode_attention_forward + from ipex_llm.transformers.models.gptbigcode import _attn_wrapper + from ipex_llm.transformers.models.gptbigcode import gptbigcode_attention_forward convert_forward(model, module.GPTBigCodeAttention, gptbigcode_attention_forward) @@ -1303,8 +1303,8 @@ def _optimize_post(model, lightweight_bmm=False): elif model.config.model_type == 'yuan': modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.yuan import yuan_attention_forward - # from bigdl.llm.transformers.models.yuan import yuan_mlp_forward + from ipex_llm.transformers.models.yuan import yuan_attention_forward + # from ipex_llm.transformers.models.yuan import yuan_mlp_forward convert_forward(model, module.YuanAttention, yuan_attention_forward @@ -1320,8 +1320,8 @@ def _optimize_post(model, lightweight_bmm=False): ): modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - from bigdl.llm.transformers.models.bert import self_attention_forward - from bigdl.llm.transformers.models.bert import encoder_forward + from ipex_llm.transformers.models.bert import self_attention_forward + from ipex_llm.transformers.models.bert import encoder_forward convert_forward(model, module.BertSelfAttention, self_attention_forward) diff --git a/python/llm/src/bigdl/llm/transformers/convert_ipex.py b/python/llm/src/ipex_llm/transformers/convert_ipex.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/convert_ipex.py rename to python/llm/src/ipex_llm/transformers/convert_ipex.py index b50f6c04..4d6764bf 100644 --- a/python/llm/src/bigdl/llm/transformers/convert_ipex.py +++ b/python/llm/src/ipex_llm/transformers/convert_ipex.py @@ -35,7 +35,7 @@ # limitations under the License. 
import torch -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError from typing import List, Optional, Tuple, Union from intel_extension_for_pytorch.transformers.optimize import ( lowering_class_cpu, @@ -46,8 +46,8 @@ from intel_extension_for_pytorch.cpu._auto_kernel_selection import ( _using_tpp, _disable_tpp ) -from bigdl.llm.ggml.quantize import ggml_tensor_qtype -from bigdl.llm.transformers.convert import get_enable_ipex +from ipex_llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.transformers.convert import get_enable_ipex import os diff --git a/python/llm/src/bigdl/llm/transformers/embedding.py b/python/llm/src/ipex_llm/transformers/embedding.py similarity index 97% rename from python/llm/src/bigdl/llm/transformers/embedding.py rename to python/llm/src/ipex_llm/transformers/embedding.py index 4054418f..8031e020 100644 --- a/python/llm/src/bigdl/llm/transformers/embedding.py +++ b/python/llm/src/ipex_llm/transformers/embedding.py @@ -20,8 +20,8 @@ from torch import Tensor from torch.nn import functional as F from torch.nn import Parameter from typing import Optional -from bigdl.llm.transformers.low_bit_linear import FP4Params -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.transformers.low_bit_linear import FP4Params +from ipex_llm.utils.common import invalidInputError # To prevent insufficient available memory when moving embedding from XPU back to CPU, diff --git a/python/llm/src/bigdl/llm/transformers/gguf/__init__.py b/python/llm/src/ipex_llm/transformers/gguf/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/__init__.py rename to python/llm/src/ipex_llm/transformers/gguf/__init__.py diff --git a/python/llm/src/bigdl/llm/transformers/gguf/api.py b/python/llm/src/ipex_llm/transformers/gguf/api.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/gguf/api.py rename to python/llm/src/ipex_llm/transformers/gguf/api.py index 020a91ba..05203fe0 100644 --- a/python/llm/src/bigdl/llm/transformers/gguf/api.py +++ b/python/llm/src/ipex_llm/transformers/gguf/api.py @@ -15,7 +15,7 @@ # import torch -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError qtype_map = { diff --git a/python/llm/src/bigdl/llm/transformers/gguf/gguf.py b/python/llm/src/ipex_llm/transformers/gguf/gguf.py similarity index 99% rename from python/llm/src/bigdl/llm/transformers/gguf/gguf.py rename to python/llm/src/ipex_llm/transformers/gguf/gguf.py index 8d80e5f8..199299f2 100644 --- a/python/llm/src/bigdl/llm/transformers/gguf/gguf.py +++ b/python/llm/src/ipex_llm/transformers/gguf/gguf.py @@ -25,7 +25,7 @@ import numpy from io import BufferedReader from tqdm import tqdm -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError class GGUFReader: diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/__init__.py b/python/llm/src/ipex_llm/transformers/gguf/models/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/models/__init__.py rename to python/llm/src/ipex_llm/transformers/gguf/models/__init__.py diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/baichuan.py b/python/llm/src/ipex_llm/transformers/gguf/models/baichuan.py similarity index 97% rename from python/llm/src/bigdl/llm/transformers/gguf/models/baichuan.py rename to python/llm/src/ipex_llm/transformers/gguf/models/baichuan.py index 49e81d48..effea840 100644 --- 
a/python/llm/src/bigdl/llm/transformers/gguf/models/baichuan.py +++ b/python/llm/src/ipex_llm/transformers/gguf/models/baichuan.py @@ -24,8 +24,8 @@ from .model_implement.baichuan.modeling_baichuan import BaiChuanForCausalLM from .model_implement.baichuan.tokenization_baichuan import BaiChuanTokenizer from ..gguf import GGUFFileLoader -from bigdl.llm.ggml.quantize import ggml_tensor_qtype -from bigdl.llm.transformers.convert import replace_with_low_bit_linear_for_module +from ipex_llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.transformers.convert import replace_with_low_bit_linear_for_module def load_gguf_baichuan(loader: GGUFFileLoader, dtype: torch.dtype = torch.float, diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/bloom.py b/python/llm/src/ipex_llm/transformers/gguf/models/bloom.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/models/bloom.py rename to python/llm/src/ipex_llm/transformers/gguf/models/bloom.py diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/falcon.py b/python/llm/src/ipex_llm/transformers/gguf/models/falcon.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/models/falcon.py rename to python/llm/src/ipex_llm/transformers/gguf/models/falcon.py diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/llama.py b/python/llm/src/ipex_llm/transformers/gguf/models/llama.py similarity index 97% rename from python/llm/src/bigdl/llm/transformers/gguf/models/llama.py rename to python/llm/src/ipex_llm/transformers/gguf/models/llama.py index f40eeab3..86cdadfb 100644 --- a/python/llm/src/bigdl/llm/transformers/gguf/models/llama.py +++ b/python/llm/src/ipex_llm/transformers/gguf/models/llama.py @@ -22,8 +22,8 @@ from tempfile import NamedTemporaryFile from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer from ..gguf import GGUFFileLoader -from bigdl.llm.ggml.quantize import ggml_tensor_qtype -from bigdl.llm.transformers.convert import replace_with_low_bit_linear_for_module +from ipex_llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.transformers.convert import replace_with_low_bit_linear_for_module def load_gguf_llama(loader: GGUFFileLoader, dtype: torch.dtype = torch.float, diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/mistral.py b/python/llm/src/ipex_llm/transformers/gguf/models/mistral.py similarity index 97% rename from python/llm/src/bigdl/llm/transformers/gguf/models/mistral.py rename to python/llm/src/ipex_llm/transformers/gguf/models/mistral.py index ba1feae4..d67ec3c8 100644 --- a/python/llm/src/bigdl/llm/transformers/gguf/models/mistral.py +++ b/python/llm/src/ipex_llm/transformers/gguf/models/mistral.py @@ -22,8 +22,8 @@ from tempfile import NamedTemporaryFile from transformers import MistralConfig, MistralForCausalLM, LlamaTokenizer from ..gguf import GGUFFileLoader -from bigdl.llm.ggml.quantize import ggml_tensor_qtype -from bigdl.llm.transformers.convert import replace_with_low_bit_linear_for_module +from ipex_llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.transformers.convert import replace_with_low_bit_linear_for_module def load_gguf_mistral(loader: GGUFFileLoader, dtype: torch.dtype = torch.float, diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/mixtral.py b/python/llm/src/ipex_llm/transformers/gguf/models/mixtral.py similarity index 97% rename from python/llm/src/bigdl/llm/transformers/gguf/models/mixtral.py rename to python/llm/src/ipex_llm/transformers/gguf/models/mixtral.py index 
23fc70a3..62a24b3f 100644 --- a/python/llm/src/bigdl/llm/transformers/gguf/models/mixtral.py +++ b/python/llm/src/ipex_llm/transformers/gguf/models/mixtral.py @@ -22,8 +22,8 @@ from tempfile import NamedTemporaryFile from transformers import MixtralConfig, MixtralForCausalLM, LlamaTokenizer from ..gguf import GGUFFileLoader -from bigdl.llm.ggml.quantize import ggml_tensor_qtype -from bigdl.llm.transformers.convert import replace_with_low_bit_linear_for_module +from ipex_llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.transformers.convert import replace_with_low_bit_linear_for_module def load_gguf_mixtral(loader: GGUFFileLoader, dtype: torch.dtype = torch.float, diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/baichuan/__init__.py b/python/llm/src/ipex_llm/transformers/gguf/models/model_implement/baichuan/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/baichuan/__init__.py rename to python/llm/src/ipex_llm/transformers/gguf/models/model_implement/baichuan/__init__.py diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/baichuan/configuration_baichuan.py b/python/llm/src/ipex_llm/transformers/gguf/models/model_implement/baichuan/configuration_baichuan.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/baichuan/configuration_baichuan.py rename to python/llm/src/ipex_llm/transformers/gguf/models/model_implement/baichuan/configuration_baichuan.py diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/baichuan/modeling_baichuan.py b/python/llm/src/ipex_llm/transformers/gguf/models/model_implement/baichuan/modeling_baichuan.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/baichuan/modeling_baichuan.py rename to python/llm/src/ipex_llm/transformers/gguf/models/model_implement/baichuan/modeling_baichuan.py diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/baichuan/tokenization_baichuan.py b/python/llm/src/ipex_llm/transformers/gguf/models/model_implement/baichuan/tokenization_baichuan.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/baichuan/tokenization_baichuan.py rename to python/llm/src/ipex_llm/transformers/gguf/models/model_implement/baichuan/tokenization_baichuan.py diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/bloom/tokenizer.json b/python/llm/src/ipex_llm/transformers/gguf/models/model_implement/bloom/tokenizer.json similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/bloom/tokenizer.json rename to python/llm/src/ipex_llm/transformers/gguf/models/model_implement/bloom/tokenizer.json diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/falcon/tokenizer.json b/python/llm/src/ipex_llm/transformers/gguf/models/model_implement/falcon/tokenizer.json similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/falcon/tokenizer.json rename to python/llm/src/ipex_llm/transformers/gguf/models/model_implement/falcon/tokenizer.json diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/mpt/tokenizer.json b/python/llm/src/ipex_llm/transformers/gguf/models/model_implement/mpt/tokenizer.json similarity index 100% rename from 
python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/mpt/tokenizer.json rename to python/llm/src/ipex_llm/transformers/gguf/models/model_implement/mpt/tokenizer.json diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/yuan2/__init__.py b/python/llm/src/ipex_llm/transformers/gguf/models/model_implement/yuan2/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/yuan2/__init__.py rename to python/llm/src/ipex_llm/transformers/gguf/models/model_implement/yuan2/__init__.py diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/yuan2/configuration_yuan.py b/python/llm/src/ipex_llm/transformers/gguf/models/model_implement/yuan2/configuration_yuan.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/yuan2/configuration_yuan.py rename to python/llm/src/ipex_llm/transformers/gguf/models/model_implement/yuan2/configuration_yuan.py diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/yuan2/yuan_hf_model.py b/python/llm/src/ipex_llm/transformers/gguf/models/model_implement/yuan2/yuan_hf_model.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/models/model_implement/yuan2/yuan_hf_model.py rename to python/llm/src/ipex_llm/transformers/gguf/models/model_implement/yuan2/yuan_hf_model.py diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/mpt.py b/python/llm/src/ipex_llm/transformers/gguf/models/mpt.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/models/mpt.py rename to python/llm/src/ipex_llm/transformers/gguf/models/mpt.py diff --git a/python/llm/src/bigdl/llm/transformers/gguf/models/yuan2.py b/python/llm/src/ipex_llm/transformers/gguf/models/yuan2.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/gguf/models/yuan2.py rename to python/llm/src/ipex_llm/transformers/gguf/models/yuan2.py diff --git a/python/llm/src/bigdl/llm/transformers/kv.py b/python/llm/src/ipex_llm/transformers/kv.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/kv.py rename to python/llm/src/ipex_llm/transformers/kv.py diff --git a/python/llm/src/bigdl/llm/transformers/layers/rope_embedding.py b/python/llm/src/ipex_llm/transformers/layers/rope_embedding.py similarity index 93% rename from python/llm/src/bigdl/llm/transformers/layers/rope_embedding.py rename to python/llm/src/ipex_llm/transformers/layers/rope_embedding.py index b3af61dd..0c6c3714 100644 --- a/python/llm/src/bigdl/llm/transformers/layers/rope_embedding.py +++ b/python/llm/src/ipex_llm/transformers/layers/rope_embedding.py @@ -16,10 +16,10 @@ import torch import logging -from bigdl.llm.transformers.xpu_customize_fwd import custom_fwd, custom_bwd -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.transformers.xpu_customize_fwd import custom_fwd, custom_bwd +from ipex_llm.utils.common import invalidInputError -LOG = logging.getLogger("bigdl.llm.rope_embedding") +LOG = logging.getLogger("ipex_llm.rope_embedding") # Fast RoPE for finetuning, split the q and k diff --git a/python/llm/src/bigdl/llm/transformers/load_config.yaml b/python/llm/src/ipex_llm/transformers/load_config.yaml similarity index 100% rename from python/llm/src/bigdl/llm/transformers/load_config.yaml rename to python/llm/src/ipex_llm/transformers/load_config.yaml diff --git a/python/llm/src/bigdl/llm/transformers/loader.py b/python/llm/src/ipex_llm/transformers/loader.py 
similarity index 97% rename from python/llm/src/bigdl/llm/transformers/loader.py rename to python/llm/src/ipex_llm/transformers/loader.py index 4c3250f8..2876f896 100644 --- a/python/llm/src/bigdl/llm/transformers/loader.py +++ b/python/llm/src/ipex_llm/transformers/loader.py @@ -17,11 +17,11 @@ import torch -from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel, get_enable_ipex +from ipex_llm.transformers import AutoModelForCausalLM, AutoModel, get_enable_ipex import time from datetime import date import argparse -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer LLAMA_IDS = ['llama', 'vicuna', 'merged-baize'] diff --git a/python/llm/src/bigdl/llm/transformers/low_bit_linear.py b/python/llm/src/ipex_llm/transformers/low_bit_linear.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/low_bit_linear.py rename to python/llm/src/ipex_llm/transformers/low_bit_linear.py index 702e7da5..f59724ba 100644 --- a/python/llm/src/bigdl/llm/transformers/low_bit_linear.py +++ b/python/llm/src/ipex_llm/transformers/low_bit_linear.py @@ -42,22 +42,22 @@ from typing import Optional, TypeVar, Union, overload -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError import os import torch import torch.nn.functional as F from torch import Tensor, device, dtype, nn from operator import mul from functools import reduce -from bigdl.llm.transformers.xpu_customize_fwd import custom_fwd, custom_bwd -from bigdl.llm.transformers.utils import get_autocast_dtype, get_xpu_device_type, \ +from ipex_llm.transformers.xpu_customize_fwd import custom_fwd, custom_bwd +from ipex_llm.transformers.utils import get_autocast_dtype, get_xpu_device_type, \ get_ipex_version T = TypeVar("T", bound="torch.nn.Module") -import bigdl.llm.ggml.model.llama.llama_cpp as ggml +import ipex_llm.ggml.model.llama.llama_cpp as ggml import ctypes -from bigdl.llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.ggml.quantize import ggml_tensor_qtype TORCH_LINEAR_THRESHOLD = int(os.getenv("BIGDL_LLM_LINEAR_THRESHOLD", "512")) SYM_INT4 = ggml_tensor_qtype["sym_int4"] @@ -88,7 +88,7 @@ Q2_K = ggml_tensor_qtype["q2_k"] # Note this format cannot be used directly in IPEX's mm_int4, which expects # row major but packing two consecutive columns. 
def q4_0_xpu_transpose(ggml_weight, weight_shape): - from bigdl.llm.transformers.low_bit_linear import get_block_size + from ipex_llm.transformers.low_bit_linear import get_block_size Q4_0 = get_block_size("sym_int4") n, k = weight_shape @@ -586,7 +586,7 @@ class LowBitLinear(nn.Linear): try: import intel_extension_for_pytorch import linear_q4_0 - from bigdl.llm.transformers.models.utils import use_xmx + from ipex_llm.transformers.models.utils import use_xmx except ModuleNotFoundError: invalidInputError(False, "Please `pip install bigdl_core_xe` first.") @@ -646,7 +646,7 @@ class LowBitLinear(nn.Linear): if self.training and x.requires_grad: result = MatMulLowBitCPU.apply(x, self.weight) else: - from bigdl.llm.utils.isa_checker import is_server, is_spr + from ipex_llm.utils.isa_checker import is_server, is_spr # convert if necessary, and compute a linear result if is_server() and (not is_spr()) and \ diff --git a/python/llm/src/bigdl/llm/transformers/model.py b/python/llm/src/ipex_llm/transformers/model.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/model.py rename to python/llm/src/ipex_llm/transformers/model.py index 374d7887..f4c5bd25 100644 --- a/python/llm/src/bigdl/llm/transformers/model.py +++ b/python/llm/src/ipex_llm/transformers/model.py @@ -42,9 +42,9 @@ from transformers.configuration_utils import PretrainedConfig from .utils import extract_local_archive_file, \ load_state_dict, \ get_local_shard_files, load_imatrix_data -from bigdl.llm.ggml.quantize import ggml_tensor_qtype -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.gguf.api import load_gguf_model +from ipex_llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.utils.common import invalidInputError +from ipex_llm.transformers.gguf.api import load_gguf_model import torch import warnings import copy @@ -94,7 +94,7 @@ def save_low_bit(self, *args, **kwargs): def _load_pre(): from transformers import GPTJModel - from bigdl.llm.transformers.models.gptj import gptj_model_new_init + from ipex_llm.transformers.models.gptj import gptj_model_new_init GPTJModel.__init__ = gptj_model_new_init @@ -218,7 +218,7 @@ class _BaseAutoModelClass: kwargs["modules_to_not_convert"] = ["lm_head"] load_in_8bit = kwargs.pop("load_in_8bit", False) - from bigdl.llm.llm_patching import bigdl_patched + from ipex_llm.llm_patching import bigdl_patched if bigdl_patched == 'Train': global patched_training_mode if load_in_low_bit == "nf4" or load_in_low_bit == "sym_int4" or load_in_4bit: @@ -234,7 +234,7 @@ class _BaseAutoModelClass: if load_in_4bit or load_in_low_bit: if config_dict.get("quantization_config", None) is not None: - from bigdl.llm.transformers.low_bit_linear import get_block_size + from ipex_llm.transformers.low_bit_linear import get_block_size q_config = config_dict["quantization_config"] if q_config["quant_method"] == "gptq": invalidInputError(q_config["bits"] == 4, @@ -260,7 +260,7 @@ class _BaseAutoModelClass: user_quantization_config = GPTQConfig(bits=4, use_exllama=False) kwargs["quantization_config"] = user_quantization_config elif q_config["quant_method"] == "awq": - from bigdl.llm.transformers.awq.awq_config import AwqConfig + from ipex_llm.transformers.awq.awq_config import AwqConfig awq_config = AwqConfig.from_dict(q_config) invalidInputError(awq_config.bits == 4, "Only 4-bit awq is supported in bigdl-llm.") @@ -347,7 +347,7 @@ class _BaseAutoModelClass: :return: An optimized bigdl-llm model and a huggingface tokenizer """ - from bigdl.llm.optimize import 
optimize_model as optimize_model_fn + from ipex_llm.optimize import optimize_model as optimize_model_fn model, tokenizer = load_gguf_model(fpath, dtype=torch.half, low_bit=low_bit) model = optimize_model_fn(model, low_bit=low_bit, optimize_llm=optimize_model, @@ -390,7 +390,7 @@ class _BaseAutoModelClass: # https://github.com/casper-hansen/AutoAWQ/blob/main/awq/models/base.py#L147 from accelerate import init_empty_weights, infer_auto_device_map, \ load_checkpoint_in_model - from bigdl.llm.transformers.awq.awq import _replace_with_awq_layers, \ + from ipex_llm.transformers.awq.awq import _replace_with_awq_layers, \ get_layer_type, _load_config awq_config = quant_config model_weights_path, config = _load_config(args[0], '', max_new_tokens=None, diff --git a/python/llm/src/bigdl/llm/transformers/modelling_bigdl.py b/python/llm/src/ipex_llm/transformers/modelling_bigdl.py similarity index 91% rename from python/llm/src/bigdl/llm/transformers/modelling_bigdl.py rename to python/llm/src/ipex_llm/transformers/modelling_bigdl.py index 4c7ba671..e81b6fdc 100644 --- a/python/llm/src/bigdl/llm/transformers/modelling_bigdl.py +++ b/python/llm/src/ipex_llm/transformers/modelling_bigdl.py @@ -22,7 +22,7 @@ import importlib import logging -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError from .model import * @@ -40,7 +40,7 @@ class BigdlNativeForCausalLM: **kwargs): """ :param pretrained_model_name_or_path: Path for converted BigDL-LLM optimized ggml - binary checkpoint. The checkpoint should be converted by ``bigdl.llm.llm_convert``. + binary checkpoint. The checkpoint should be converted by ``ipex_llm.llm_convert``. :param model_family: The model family of the pretrained checkpoint. Currently we support ``"llama"``, ``"bloom"``, ``"gptneox"``, ``"starcoder"`` and ``"chatglm"``. @@ -67,19 +67,19 @@ class BigdlNativeForCausalLM: ggml_model_path = pretrained_model_name_or_path if model_family == 'llama': - from bigdl.llm.ggml.model.llama import Llama + from ipex_llm.ggml.model.llama import Llama return Llama(model_path=ggml_model_path, **kwargs) elif model_family == 'gptneox': - from bigdl.llm.ggml.model.gptneox import Gptneox + from ipex_llm.ggml.model.gptneox import Gptneox return Gptneox(model_path=ggml_model_path, **kwargs) elif model_family == 'bloom': - from bigdl.llm.ggml.model.bloom import Bloom + from ipex_llm.ggml.model.bloom import Bloom return Bloom(model_path=ggml_model_path, **kwargs) elif model_family == 'starcoder': - from bigdl.llm.ggml.model.starcoder import Starcoder + from ipex_llm.ggml.model.starcoder import Starcoder return Starcoder(model_path=ggml_model_path, **kwargs) elif model_family == 'chatglm': - from bigdl.llm.ggml.model.chatglm import ChatGLM + from ipex_llm.ggml.model.chatglm import ChatGLM return ChatGLM(model_path=ggml_model_path, **kwargs) @@ -98,7 +98,7 @@ class _BaseGGMLClass: """ :param pretrained_model_name_or_path: Path for model checkpoint. If running with ``native int4``, the path should be converted BigDL-LLM optimized - ggml binary checkpoint, which should be converted by ``bigdl.llm.llm_convert``. + ggml binary checkpoint, which should be converted by ``ipex_llm.llm_convert``. If running with ``transformers int4``, the path should be the huggingface repo id to be downloaded or the huggingface checkpoint folder. :param native: Load model to either BigDL-LLM optimized Transformer or Native (ggml) int4. 
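The hunks above only retarget import paths from bigdl.llm to ipex_llm; the public entry points in model.py and modelling_bigdl.py keep their existing signatures. As a quick sanity check of the renamed package, a minimal sketch is shown below. It is not part of this patch: the checkpoint path, the prompt, and the low_bit value are placeholders, and the from_gguf signature is assumed from the load_gguf_model / optimize_model calls visible in the model.py hunk.

```python
# Illustrative sketch only -- not part of this patch. Assumes the renamed
# ipex_llm package exposes the same public API shown in the hunks above.
import torch
from ipex_llm.transformers import AutoModelForCausalLM

# from_gguf() wraps load_gguf_model() + optimize_model(), as in the model.py hunk;
# "/path/to/model.gguf" and low_bit="sym_int4" are placeholder arguments.
model, tokenizer = AutoModelForCausalLM.from_gguf("/path/to/model.gguf",
                                                  low_bit="sym_int4")

with torch.inference_mode():
    inputs = tokenizer("What is AI?", return_tensors="pt")
    output = model.generate(**inputs, max_new_tokens=32)
    print(tokenizer.decode(output[0], skip_special_tokens=True))
```

Code that previously imported these entry points from bigdl.llm.transformers should only need the module path updated; the keyword arguments are unchanged by this patch.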
@@ -132,30 +132,30 @@ class _BaseGGMLClass: class LlamaForCausalLM(_BaseGGMLClass): - GGML_Module = "bigdl.llm.models" + GGML_Module = "ipex_llm.models" GGML_Model = "Llama" HF_Class = AutoModelForCausalLM class ChatGLMForCausalLM(_BaseGGMLClass): - GGML_Module = "bigdl.llm.ggml.model.chatglm" + GGML_Module = "ipex_llm.ggml.model.chatglm" GGML_Model = "ChatGLM" HF_Class = AutoModel class GptneoxForCausalLM(_BaseGGMLClass): - GGML_Module = "bigdl.llm.models" + GGML_Module = "ipex_llm.models" GGML_Model = "Gptneox" HF_Class = AutoModelForCausalLM class BloomForCausalLM(_BaseGGMLClass): - GGML_Module = "bigdl.llm.models" + GGML_Module = "ipex_llm.models" GGML_Model = "Bloom" HF_Class = AutoModelForCausalLM class StarcoderForCausalLM(_BaseGGMLClass): - GGML_Module = "bigdl.llm.models" + GGML_Module = "ipex_llm.models" GGML_Model = "Starcoder" HF_Class = AutoModelForCausalLM diff --git a/python/llm/src/bigdl/llm/transformers/models/__init__.py b/python/llm/src/ipex_llm/transformers/models/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/models/__init__.py rename to python/llm/src/ipex_llm/transformers/models/__init__.py diff --git a/python/llm/src/bigdl/llm/transformers/models/aquila.py b/python/llm/src/ipex_llm/transformers/models/aquila.py similarity index 96% rename from python/llm/src/bigdl/llm/transformers/models/aquila.py rename to python/llm/src/ipex_llm/transformers/models/aquila.py index 68ca7a01..1b1d252a 100644 --- a/python/llm/src/bigdl/llm/transformers/models/aquila.py +++ b/python/llm/src/ipex_llm/transformers/models/aquila.py @@ -42,11 +42,11 @@ import torch import torch.utils.checkpoint from torch import nn -from bigdl.llm.transformers.models.utils import extend_kv_cache, init_kv_cache, \ +from ipex_llm.transformers.models.utils import extend_kv_cache, init_kv_cache, \ append_kv_cache, is_enough_kv_cache_room_4_31 -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu -from bigdl.llm.utils.common import log4Error +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu +from ipex_llm.utils.common import log4Error KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/models/baichuan.py b/python/llm/src/ipex_llm/transformers/models/baichuan.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/models/baichuan.py rename to python/llm/src/ipex_llm/transformers/models/baichuan.py index 29e7968f..0c9e8216 100644 --- a/python/llm/src/bigdl/llm/transformers/models/baichuan.py +++ b/python/llm/src/ipex_llm/transformers/models/baichuan.py @@ -26,14 +26,14 @@ import torch.utils.checkpoint from torch import nn import torch.nn.functional as F from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.models.utils import use_flash_attention, use_esimd_sdp -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, \ +from ipex_llm.utils.common import invalidInputError +from ipex_llm.transformers.models.utils import use_flash_attention, use_esimd_sdp +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, \ append_kv_cache, is_enough_kv_cache_room_4_31 -from bigdl.llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ +from ipex_llm.transformers.models.utils import 
init_fp8_kv_cache, append_fp8_kv_cache, \ restore_fp8_kv_cache, use_quantize_kv_cache -from bigdl.llm.transformers.models.utils import rotate_half, apply_rotary_pos_emb -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu +from ipex_llm.transformers.models.utils import rotate_half, apply_rotary_pos_emb +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/models/baichuan2.py b/python/llm/src/ipex_llm/transformers/models/baichuan2.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/models/baichuan2.py rename to python/llm/src/ipex_llm/transformers/models/baichuan2.py index 1cb4b117..38a47592 100644 --- a/python/llm/src/bigdl/llm/transformers/models/baichuan2.py +++ b/python/llm/src/ipex_llm/transformers/models/baichuan2.py @@ -23,15 +23,15 @@ from typing import Optional, Tuple import torch import torch.utils.checkpoint from torch.nn import functional as F -from bigdl.llm.ggml.quantize import ggml_tensor_qtype -from bigdl.llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ +from ipex_llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ restore_fp8_kv_cache, use_quantize_kv_cache -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, \ +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, \ append_kv_cache, is_enough_kv_cache_room_4_31 -from bigdl.llm.transformers.models.utils import use_flash_attention, use_esimd_sdp -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb, SILU -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu -from bigdl.llm.transformers.models.utils import mlp_fusion_check +from ipex_llm.transformers.models.utils import use_flash_attention, use_esimd_sdp +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb, SILU +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu +from ipex_llm.transformers.models.utils import mlp_fusion_check from transformers.utils import logging logger = logging.get_logger(__name__) diff --git a/python/llm/src/bigdl/llm/transformers/models/bert.py b/python/llm/src/ipex_llm/transformers/models/bert.py similarity index 99% rename from python/llm/src/bigdl/llm/transformers/models/bert.py rename to python/llm/src/ipex_llm/transformers/models/bert.py index 27abd988..4c83ba6c 100644 --- a/python/llm/src/bigdl/llm/transformers/models/bert.py +++ b/python/llm/src/ipex_llm/transformers/models/bert.py @@ -36,7 +36,7 @@ import math import torch from typing import Optional, Tuple from transformers.models.bert.modeling_bert import BertSelfAttention, BertEncoder -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError def merge_qkv(module: torch.nn.Module): diff --git a/python/llm/src/bigdl/llm/transformers/models/bloom.py b/python/llm/src/ipex_llm/transformers/models/bloom.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/models/bloom.py rename to python/llm/src/ipex_llm/transformers/models/bloom.py index 4438270f..46489e8b 100644 --- a/python/llm/src/bigdl/llm/transformers/models/bloom.py +++ b/python/llm/src/ipex_llm/transformers/models/bloom.py @@ -37,8 +37,8 @@ from typing import Optional, Tuple import torch import torch.utils.checkpoint from torch.nn import functional as F -from 
bigdl.llm.transformers.models.utils import use_fused_layer_norm -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import use_fused_layer_norm +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/models/chatglm.py b/python/llm/src/ipex_llm/transformers/models/chatglm.py similarity index 99% rename from python/llm/src/bigdl/llm/transformers/models/chatglm.py rename to python/llm/src/ipex_llm/transformers/models/chatglm.py index 4adcc722..ac9a98a1 100644 --- a/python/llm/src/bigdl/llm/transformers/models/chatglm.py +++ b/python/llm/src/ipex_llm/transformers/models/chatglm.py @@ -22,7 +22,7 @@ import torch import torch.utils.checkpoint import torch.nn.functional as F from typing import Optional, Tuple -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache def rotate_half(x): diff --git a/python/llm/src/bigdl/llm/transformers/models/chatglm2.py b/python/llm/src/ipex_llm/transformers/models/chatglm2.py similarity index 99% rename from python/llm/src/bigdl/llm/transformers/models/chatglm2.py rename to python/llm/src/ipex_llm/transformers/models/chatglm2.py index 1db8424a..c6dae7a9 100644 --- a/python/llm/src/bigdl/llm/transformers/models/chatglm2.py +++ b/python/llm/src/ipex_llm/transformers/models/chatglm2.py @@ -22,10 +22,10 @@ import torch from typing import Optional, Tuple, List import torch.nn.functional as F from transformers.modeling_outputs import BaseModelOutputWithPast -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache -from bigdl.llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ restore_fp8_kv_cache, use_quantize_kv_cache -from bigdl.llm.transformers.models.utils import use_esimd_sdp +from ipex_llm.transformers.models.utils import use_esimd_sdp KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/models/chatglm2_32k.py b/python/llm/src/ipex_llm/transformers/models/chatglm2_32k.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/models/chatglm2_32k.py rename to python/llm/src/ipex_llm/transformers/models/chatglm2_32k.py index d85861f2..94856152 100644 --- a/python/llm/src/bigdl/llm/transformers/models/chatglm2_32k.py +++ b/python/llm/src/ipex_llm/transformers/models/chatglm2_32k.py @@ -20,7 +20,7 @@ import torch from typing import Optional, Tuple, Union, List, Callable, Dict, Any import torch.nn.functional as F -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/models/decilm.py b/python/llm/src/ipex_llm/transformers/models/decilm.py similarity index 95% rename from python/llm/src/bigdl/llm/transformers/models/decilm.py rename to python/llm/src/ipex_llm/transformers/models/decilm.py index 788f4bab..67bc5e49 100644 --- a/python/llm/src/bigdl/llm/transformers/models/decilm.py +++ 
b/python/llm/src/ipex_llm/transformers/models/decilm.py @@ -34,12 +34,12 @@ import torch from typing import Optional, Tuple import torch.nn.functional as F -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache -from bigdl.llm.transformers.models.utils import is_enough_kv_cache_room_4_31, \ +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_31, \ apply_rotary_pos_emb -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu -from bigdl.llm.transformers.models.llama import should_use_fuse_rope, repeat_kv -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu +from ipex_llm.transformers.models.llama import should_use_fuse_rope, repeat_kv +from ipex_llm.utils.common import invalidInputError KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/models/falcon.py b/python/llm/src/ipex_llm/transformers/models/falcon.py similarity index 99% rename from python/llm/src/bigdl/llm/transformers/models/falcon.py rename to python/llm/src/ipex_llm/transformers/models/falcon.py index aa4abc14..4932aeab 100644 --- a/python/llm/src/bigdl/llm/transformers/models/falcon.py +++ b/python/llm/src/ipex_llm/transformers/models/falcon.py @@ -37,8 +37,8 @@ from typing import Optional, Tuple import torch from torch.nn import functional as F -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache +from ipex_llm.utils.common import invalidInputError +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache import warnings diff --git a/python/llm/src/bigdl/llm/transformers/models/gemma.py b/python/llm/src/ipex_llm/transformers/models/gemma.py similarity index 96% rename from python/llm/src/bigdl/llm/transformers/models/gemma.py rename to python/llm/src/ipex_llm/transformers/models/gemma.py index 9400c034..26934f03 100644 --- a/python/llm/src/bigdl/llm/transformers/models/gemma.py +++ b/python/llm/src/ipex_llm/transformers/models/gemma.py @@ -35,12 +35,12 @@ from typing import Optional, Tuple import torch from torch import nn -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_cache_freq_xpu -from bigdl.llm.transformers.models.utils import mlp_fusion_check, GELU -from bigdl.llm.transformers.models.utils import is_enough_kv_cache_room_4_36, rotate_half -from bigdl.llm.transformers.low_bit_linear import SYM_INT4, FP8E5 +from ipex_llm.utils.common import invalidInputError +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_cache_freq_xpu +from ipex_llm.transformers.models.utils import mlp_fusion_check, GELU +from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_36, rotate_half +from ipex_llm.transformers.low_bit_linear import SYM_INT4, FP8E5 KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/models/gptbigcode.py b/python/llm/src/ipex_llm/transformers/models/gptbigcode.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/models/gptbigcode.py rename to 
python/llm/src/ipex_llm/transformers/models/gptbigcode.py index 8a38f22e..611b9fba 100644 --- a/python/llm/src/bigdl/llm/transformers/models/gptbigcode.py +++ b/python/llm/src/ipex_llm/transformers/models/gptbigcode.py @@ -46,7 +46,7 @@ def gptbigcode_attention_forward( if encoder_hidden_states is not None: if not hasattr(self, "q_attn") or not self.is_cross_attention: - from bigdl.llm.utils.common import invalidInputError + from ipex_llm.utils.common import invalidInputError invalidInputError( False, "If class is used as cross attention," + diff --git a/python/llm/src/bigdl/llm/transformers/models/gptj.py b/python/llm/src/ipex_llm/transformers/models/gptj.py similarity index 99% rename from python/llm/src/bigdl/llm/transformers/models/gptj.py rename to python/llm/src/ipex_llm/transformers/models/gptj.py index 794cf291..38df3cb1 100644 --- a/python/llm/src/bigdl/llm/transformers/models/gptj.py +++ b/python/llm/src/ipex_llm/transformers/models/gptj.py @@ -19,12 +19,12 @@ import torch from typing import Optional, Tuple, Union -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, \ +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, \ apply_rotary_pos_emb, append_kv_cache, apply_ipex_rotate_every_two from transformers.utils.import_utils import is_torch_fx_proxy from transformers.modeling_outputs import BaseModelOutputWithPast from transformers.models.gptj.modeling_gptj import GPTJModel -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/models/gptneox.py b/python/llm/src/ipex_llm/transformers/models/gptneox.py similarity index 96% rename from python/llm/src/bigdl/llm/transformers/models/gptneox.py rename to python/llm/src/ipex_llm/transformers/models/gptneox.py index ca29845a..52466042 100644 --- a/python/llm/src/bigdl/llm/transformers/models/gptneox.py +++ b/python/llm/src/ipex_llm/transformers/models/gptneox.py @@ -33,10 +33,10 @@ import torch from typing import Optional, Tuple -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, \ +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, \ append_kv_cache, is_enough_kv_cache_room_4_31 -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/models/internlm.py b/python/llm/src/ipex_llm/transformers/models/internlm.py similarity index 97% rename from python/llm/src/bigdl/llm/transformers/models/internlm.py rename to python/llm/src/ipex_llm/transformers/models/internlm.py index e75e67d4..038a63d8 100644 --- a/python/llm/src/bigdl/llm/transformers/models/internlm.py +++ b/python/llm/src/ipex_llm/transformers/models/internlm.py @@ -42,11 +42,11 @@ from typing import Optional, Tuple import torch import torch.utils.checkpoint from torch import nn -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, \ +from ipex_llm.utils.common import invalidInputError +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, \ append_kv_cache, is_enough_kv_cache_room_4_31 -from 
bigdl.llm.transformers.models.utils import apply_rotary_pos_emb -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/models/llama.py b/python/llm/src/ipex_llm/transformers/models/llama.py similarity index 99% rename from python/llm/src/bigdl/llm/transformers/models/llama.py rename to python/llm/src/ipex_llm/transformers/models/llama.py index 3646253c..45d944c5 100644 --- a/python/llm/src/bigdl/llm/transformers/models/llama.py +++ b/python/llm/src/ipex_llm/transformers/models/llama.py @@ -39,20 +39,20 @@ from typing import Optional, Tuple, Union, List import math import os import torch.nn.functional as F -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache -from bigdl.llm.transformers.models.utils import SILU -from bigdl.llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import SILU +from ipex_llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ restore_fp8_kv_cache, use_quantize_kv_cache -from bigdl.llm.transformers.models.utils import is_enough_kv_cache_room_4_31, \ +from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_31, \ apply_rotary_pos_emb, is_enough_kv_cache_room_4_36 -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu -from bigdl.llm.transformers.models.utils import use_flash_attention, use_esimd_sdp -from bigdl.llm.transformers.models.utils import mlp_fusion_check, fp16_fusion_check +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu +from ipex_llm.transformers.models.utils import use_flash_attention, use_esimd_sdp +from ipex_llm.transformers.models.utils import mlp_fusion_check, fp16_fusion_check from transformers.modeling_outputs import BaseModelOutputWithPast from transformers.models.llama.modeling_llama import LlamaModel -from bigdl.llm.transformers.low_bit_linear import SYM_INT4, FP8E5, IQ2_XXS -from bigdl.llm.ggml.quantize import ggml_tensor_qtype -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.transformers.low_bit_linear import SYM_INT4, FP8E5, IQ2_XXS +from ipex_llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.utils.common import invalidInputError try: from transformers.cache_utils import Cache, DynamicCache @@ -106,7 +106,7 @@ def llama_model_forward_4_36( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: - from bigdl.llm.transformers.kv import DynamicFp8Cache + from ipex_llm.transformers.kv import DynamicFp8Cache use_cache = use_cache if use_cache is not None else self.config.use_cache if use_cache and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input_ids): if not isinstance(past_key_values, DynamicFp8Cache): @@ -1558,7 +1558,7 @@ def llama_attention_fast_forward( kv_seq_len += past_key_value[0].shape[-2] if use_fast_rope: - from bigdl.llm.transformers.layers.rope_embedding import apply_fast_rope_embedding + from ipex_llm.transformers.layers.rope_embedding import apply_fast_rope_embedding query_states, key_states = apply_fast_rope_embedding(query_states, key_states, position_ids, diff --git 
a/python/llm/src/bigdl/llm/transformers/models/mistral.py b/python/llm/src/ipex_llm/transformers/models/mistral.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/models/mistral.py rename to python/llm/src/ipex_llm/transformers/models/mistral.py index 3db06cfe..5c7a6343 100644 --- a/python/llm/src/bigdl/llm/transformers/models/mistral.py +++ b/python/llm/src/ipex_llm/transformers/models/mistral.py @@ -43,16 +43,16 @@ from torch import nn import torch.nn.functional as F from transformers.modeling_outputs import BaseModelOutputWithPast from transformers.models.mistral.modeling_mistral import MistralModel -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache -from bigdl.llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ +from ipex_llm.utils.common import invalidInputError +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ restore_fp8_kv_cache, use_quantize_kv_cache -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb, \ +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb, \ apply_rotary_pos_emb_no_cache_xpu -from bigdl.llm.transformers.models.utils import is_enough_kv_cache_room_4_31, \ +from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_31, \ is_enough_kv_cache_room_4_36 -from bigdl.llm.transformers.low_bit_linear import SYM_INT4, FP8E5, IQ2_XXS -from bigdl.llm.transformers.models.utils import use_flash_attention, use_esimd_sdp +from ipex_llm.transformers.low_bit_linear import SYM_INT4, FP8E5, IQ2_XXS +from ipex_llm.transformers.models.utils import use_flash_attention, use_esimd_sdp try: from transformers.cache_utils import Cache except ImportError: @@ -138,7 +138,7 @@ def mistral_model_forward_4_36( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: - from bigdl.llm.transformers.kv import DynamicFp8Cache + from ipex_llm.transformers.kv import DynamicFp8Cache use_cache = use_cache if use_cache is not None else self.config.use_cache if use_cache and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input_ids): if not isinstance(past_key_values, DynamicFp8Cache): diff --git a/python/llm/src/bigdl/llm/transformers/models/mixtral.py b/python/llm/src/ipex_llm/transformers/models/mixtral.py similarity index 97% rename from python/llm/src/bigdl/llm/transformers/models/mixtral.py rename to python/llm/src/ipex_llm/transformers/models/mixtral.py index 0e31e238..f5c836ac 100644 --- a/python/llm/src/bigdl/llm/transformers/models/mixtral.py +++ b/python/llm/src/ipex_llm/transformers/models/mixtral.py @@ -48,15 +48,15 @@ from transformers.modeling_attn_mask_utils import ( import torch from torch import nn import torch.nn.functional as F -from bigdl.llm.ggml.quantize import ggml_tensor_qtype -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb,\ +from ipex_llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.utils.common import invalidInputError +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb,\ 
apply_rotary_pos_emb_cache_freq_xpu, is_enough_kv_cache_room_4_36 -from bigdl.llm.transformers.models.mistral import should_use_fuse_rope, use_decoding_fast_path -from bigdl.llm.transformers.models.utils import use_flash_attention, use_esimd_sdp -from bigdl.llm.transformers.models.utils import mlp_fusion_check, SILU -from bigdl.llm.transformers.low_bit_linear import IQ2_XXS +from ipex_llm.transformers.models.mistral import should_use_fuse_rope, use_decoding_fast_path +from ipex_llm.transformers.models.utils import use_flash_attention, use_esimd_sdp +from ipex_llm.transformers.models.utils import mlp_fusion_check, SILU +from ipex_llm.transformers.low_bit_linear import IQ2_XXS KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/models/mpt.py b/python/llm/src/ipex_llm/transformers/models/mpt.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/models/mpt.py rename to python/llm/src/ipex_llm/transformers/models/mpt.py index a09ef771..4d4a191a 100644 --- a/python/llm/src/bigdl/llm/transformers/models/mpt.py +++ b/python/llm/src/ipex_llm/transformers/models/mpt.py @@ -22,8 +22,8 @@ import torch from einops import rearrange import math import torch.nn.functional as F -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.models.utils import extend_kv_cache, init_kv_cache, append_kv_cache +from ipex_llm.utils.common import invalidInputError +from ipex_llm.transformers.models.utils import extend_kv_cache, init_kv_cache, append_kv_cache KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/models/phixtral.py b/python/llm/src/ipex_llm/transformers/models/phixtral.py similarity index 91% rename from python/llm/src/bigdl/llm/transformers/models/phixtral.py rename to python/llm/src/ipex_llm/transformers/models/phixtral.py index 272ab53b..66595d5c 100644 --- a/python/llm/src/bigdl/llm/transformers/models/phixtral.py +++ b/python/llm/src/ipex_llm/transformers/models/phixtral.py @@ -43,14 +43,14 @@ from typing import Optional, Tuple import torch from torch import nn import torch.nn.functional as F -from bigdl.llm.ggml.quantize import ggml_tensor_qtype -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb,\ +from ipex_llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.utils.common import invalidInputError +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb,\ apply_rotary_pos_emb_no_cache_xpu, is_enough_kv_cache_room_4_36 -from bigdl.llm.transformers.models.mistral import should_use_fuse_rope, use_decoding_fast_path -from bigdl.llm.transformers.models.utils import use_flash_attention -from bigdl.llm.transformers.models.utils import mlp_fusion_check +from ipex_llm.transformers.models.mistral import should_use_fuse_rope, use_decoding_fast_path +from ipex_llm.transformers.models.utils import use_flash_attention +from ipex_llm.transformers.models.utils import mlp_fusion_check KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/models/qwen.py b/python/llm/src/ipex_llm/transformers/models/qwen.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/models/qwen.py rename to python/llm/src/ipex_llm/transformers/models/qwen.py index 321a307e..833ff866 100644 --- 
a/python/llm/src/bigdl/llm/transformers/models/qwen.py +++ b/python/llm/src/ipex_llm/transformers/models/qwen.py @@ -36,15 +36,15 @@ try: except ImportError: rearrange = None -from bigdl.llm.transformers.models.utils import extend_kv_cache, init_kv_cache, append_kv_cache -from bigdl.llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ +from ipex_llm.transformers.models.utils import extend_kv_cache, init_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ restore_fp8_kv_cache, use_quantize_kv_cache -from bigdl.llm.transformers.models.utils import rotate_half, SILU -from bigdl.llm.transformers.models.utils import mlp_fusion_check -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_cache_freq_xpu -from bigdl.llm.transformers.models.utils import use_flash_attention, use_esimd_sdp -from bigdl.llm.utils.common import invalidInputError, invalidOperationError -from bigdl.llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.transformers.models.utils import rotate_half, SILU +from ipex_llm.transformers.models.utils import mlp_fusion_check +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_cache_freq_xpu +from ipex_llm.transformers.models.utils import use_flash_attention, use_esimd_sdp +from ipex_llm.utils.common import invalidInputError, invalidOperationError +from ipex_llm.ggml.quantize import ggml_tensor_qtype from transformers.modeling_outputs import BaseModelOutputWithPast apply_rotary_emb_func = None diff --git a/python/llm/src/bigdl/llm/transformers/models/qwen2.py b/python/llm/src/ipex_llm/transformers/models/qwen2.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/models/qwen2.py rename to python/llm/src/ipex_llm/transformers/models/qwen2.py index 81864db0..faf14a87 100644 --- a/python/llm/src/bigdl/llm/transformers/models/qwen2.py +++ b/python/llm/src/ipex_llm/transformers/models/qwen2.py @@ -45,14 +45,14 @@ import torch import torch.nn as nn import torch.nn.functional as F -from bigdl.llm.transformers.models.llama import repeat_kv -from bigdl.llm.transformers.models.utils import extend_kv_cache, append_kv_cache -from bigdl.llm.transformers.models.utils import use_quantize_kv_cache, restore_fp8_kv_cache -from bigdl.llm.transformers.models.utils import is_enough_kv_cache_room_4_36 -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_cache_freq_xpu -from bigdl.llm.transformers.kv import DynamicFp8Cache -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.models.utils import use_flash_attention, use_esimd_sdp +from ipex_llm.transformers.models.llama import repeat_kv +from ipex_llm.transformers.models.utils import extend_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import use_quantize_kv_cache, restore_fp8_kv_cache +from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_36 +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_cache_freq_xpu +from ipex_llm.transformers.kv import DynamicFp8Cache +from ipex_llm.utils.common import invalidInputError +from ipex_llm.transformers.models.utils import use_flash_attention, use_esimd_sdp from transformers.models.qwen2.modeling_qwen2 import Qwen2Model, apply_rotary_pos_emb from transformers.models.qwen2.modeling_qwen2 import _prepare_4d_causal_attention_mask_for_sdpa from transformers.models.qwen2.modeling_qwen2 import _prepare_4d_causal_attention_mask @@ -404,7 +404,7 @@ def 
qwen2_attention_forward_quantized( attn_weights = None return attn_output, attn_weights, past_key_value -from bigdl.llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.ggml.quantize import ggml_tensor_qtype SYM_INT4 = ggml_tensor_qtype["sym_int4"] FP8E5 = ggml_tensor_qtype["fp8_e5m2"] diff --git a/python/llm/src/bigdl/llm/transformers/models/qwen_vl.py b/python/llm/src/ipex_llm/transformers/models/qwen_vl.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/models/qwen_vl.py rename to python/llm/src/ipex_llm/transformers/models/qwen_vl.py index 4094ae14..7c66f9ea 100644 --- a/python/llm/src/bigdl/llm/transformers/models/qwen_vl.py +++ b/python/llm/src/ipex_llm/transformers/models/qwen_vl.py @@ -30,8 +30,8 @@ import torch import torch.nn.functional as F import torch.utils.checkpoint from transformers.utils import logging -from bigdl.llm.transformers.models.utils import extend_kv_cache, init_kv_cache, append_kv_cache -from bigdl.llm.transformers.models.utils import rotate_half +from ipex_llm.transformers.models.utils import extend_kv_cache, init_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import rotate_half KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/models/rwkv4.py b/python/llm/src/ipex_llm/transformers/models/rwkv4.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/models/rwkv4.py rename to python/llm/src/ipex_llm/transformers/models/rwkv4.py diff --git a/python/llm/src/bigdl/llm/transformers/models/rwkv5.py b/python/llm/src/ipex_llm/transformers/models/rwkv5.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/models/rwkv5.py rename to python/llm/src/ipex_llm/transformers/models/rwkv5.py diff --git a/python/llm/src/bigdl/llm/transformers/models/utils.py b/python/llm/src/ipex_llm/transformers/models/utils.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/models/utils.py rename to python/llm/src/ipex_llm/transformers/models/utils.py index 40230638..1a4e1f0b 100644 --- a/python/llm/src/bigdl/llm/transformers/models/utils.py +++ b/python/llm/src/ipex_llm/transformers/models/utils.py @@ -16,9 +16,9 @@ import os import torch -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.ggml.quantize import ggml_tensor_qtype -from bigdl.llm.transformers.utils import get_ipex_version, get_xpu_device_type +from ipex_llm.utils.common import invalidInputError +from ipex_llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.transformers.utils import get_ipex_version, get_xpu_device_type FP8_KV_ALLOC_LENGTH = 512 SYM_INT4 = ggml_tensor_qtype["sym_int4"] @@ -177,7 +177,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, model_family): def apply_ipex_rotate_every_two(q, k, cos, sin): # ipex's apply_rotary_embedding_two_qk can change the origin storage, # so q/k will get the result directly. 
- from bigdl.llm.transformers.utils import get_ipex_version + from ipex_llm.transformers.utils import get_ipex_version if get_ipex_version() >= "2.1.10+xpu": torch.ops.torch_ipex.apply_rotary_embedding_two_qk( q, k, sin, cos, q, k diff --git a/python/llm/src/bigdl/llm/transformers/models/yuan.py b/python/llm/src/ipex_llm/transformers/models/yuan.py similarity index 97% rename from python/llm/src/bigdl/llm/transformers/models/yuan.py rename to python/llm/src/ipex_llm/transformers/models/yuan.py index 015835d7..f17b0ec7 100644 --- a/python/llm/src/bigdl/llm/transformers/models/yuan.py +++ b/python/llm/src/ipex_llm/transformers/models/yuan.py @@ -28,14 +28,14 @@ from typing import Optional, Tuple import torch import torch.nn as nn -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb, \ +from ipex_llm.utils.common import invalidInputError +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb, \ apply_rotary_pos_emb_cache_freq_xpu, mlp_fusion_check, fp16_fusion_check -from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache -from bigdl.llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ restore_fp8_kv_cache, use_quantize_kv_cache -from bigdl.llm.transformers.models.utils import is_enough_kv_cache_room_4_31, SILU -from bigdl.llm.transformers.low_bit_linear import SYM_INT4, FP8E5 +from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_31, SILU +from ipex_llm.transformers.low_bit_linear import SYM_INT4, FP8E5 KV_CACHE_ALLOC_BLOCK_LENGTH = 256 diff --git a/python/llm/src/bigdl/llm/transformers/qlora.py b/python/llm/src/ipex_llm/transformers/qlora.py similarity index 95% rename from python/llm/src/bigdl/llm/transformers/qlora.py rename to python/llm/src/ipex_llm/transformers/qlora.py index 852f59e5..5c9f3b54 100644 --- a/python/llm/src/bigdl/llm/transformers/qlora.py +++ b/python/llm/src/ipex_llm/transformers/qlora.py @@ -51,15 +51,15 @@ import torch import logging from torch.nn import Linear, Embedding -from bigdl.llm.transformers.low_bit_linear import LowBitLinear, BF16Linear, get_qk_size +from ipex_llm.transformers.low_bit_linear import LowBitLinear, BF16Linear, get_qk_size from peft.tuners.lora import LoraLayer -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.transformers.utils import get_autocast_dtype -from bigdl.llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.utils.common import invalidInputError +from ipex_llm.transformers.utils import get_autocast_dtype +from ipex_llm.ggml.quantize import ggml_tensor_qtype import functools -from bigdl.llm.transformers import training_patch +from ipex_llm.transformers import training_patch -LOG = logging.getLogger("bigdl.llm.qlora") +LOG = logging.getLogger("ipex_llm.qlora") class LoraLowBitLinear(LowBitLinear, LoraLayer): @@ -246,7 +246,7 @@ class LoraConfig(LoraConfigBase): def __init__(self, *args, **kwargs): self.training_mode = kwargs.pop("training_mode", "qlora") super().__init__(*args, **kwargs) - from bigdl.llm.llm_patching import bigdl_patched + from ipex_llm.llm_patching import bigdl_patched if bigdl_patched == 'Train': from .model import patched_training_mode self.training_mode = patched_training_mode @@ -274,7 +274,7 @@ def get_peft_model(*args, **kwargs): 
old_create_new_module)) try: - from bigdl.llm.llm_patching import bigdl_patched + from ipex_llm.llm_patching import bigdl_patched if bigdl_patched == 'Train': from peft import get_peft_model_original else: @@ -384,8 +384,8 @@ def cast_lora_weight(model, dtype=torch.bfloat16): def _optimize_post(model): import transformers from packaging import version - from bigdl.llm.transformers.convert import convert_forward - from bigdl.llm.transformers.models.llama import llama_attention_fast_forward + from ipex_llm.transformers.convert import convert_forward + from ipex_llm.transformers.models.llama import llama_attention_fast_forward trans_version = transformers.__version__ if version.parse(trans_version) >= version.parse("4.31.0"): diff --git a/python/llm/src/bigdl/llm/transformers/relora.py b/python/llm/src/ipex_llm/transformers/relora.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/relora.py rename to python/llm/src/ipex_llm/transformers/relora.py index e1c37a0c..37676f70 100644 --- a/python/llm/src/bigdl/llm/transformers/relora.py +++ b/python/llm/src/ipex_llm/transformers/relora.py @@ -54,11 +54,11 @@ from transformers import ( ) from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR import torch.distributed as dist -from bigdl.llm.transformers.qlora import LoraLowBitLinear -from bigdl.llm.transformers.low_bit_linear import FP4Params -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.transformers.qlora import LoraLowBitLinear +from ipex_llm.transformers.low_bit_linear import FP4Params +from ipex_llm.utils.common import invalidInputError -LOG = logging.getLogger("bigdl.llm.relora") +LOG = logging.getLogger("ipex_llm.relora") class ReLoRATrainer(Trainer): diff --git a/python/llm/src/bigdl/llm/transformers/speculative.py b/python/llm/src/ipex_llm/transformers/speculative.py similarity index 99% rename from python/llm/src/bigdl/llm/transformers/speculative.py rename to python/llm/src/ipex_llm/transformers/speculative.py index 323a1d9c..74866fee 100644 --- a/python/llm/src/bigdl/llm/transformers/speculative.py +++ b/python/llm/src/ipex_llm/transformers/speculative.py @@ -29,14 +29,14 @@ from packaging import version from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union from transformers import top_k_top_p_filtering, GenerationConfig, \ LogitsProcessorList, StoppingCriteriaList -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError from transformers.modeling_outputs import CausalLMOutputWithPast # patch GenerationMixin.generate from transformers import GenerationMixin original_generate = GenerationMixin.generate query_group_size = 16 -logger = logging.getLogger("bigdl.llm.speculative") +logger = logging.getLogger("ipex_llm.speculative") @torch.no_grad() @@ -370,7 +370,7 @@ def _update_past_key_values_storage_cpu(self, past_key_values, past_key_values_s def _check_and_extend_kv_cache(past_key_values, max_step_draft, kv_alloc_block_len=256, model_type="llama"): - from bigdl.llm.transformers.models.utils import is_enough_kv_cache_room_4_31, \ + from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_31, \ extend_kv_cache enough_kv_room = True if model_type not in ["chatglm", "qwen", "baichuan", "llama", "mistral", @@ -534,7 +534,7 @@ def speculative_generate(self, past_key_values = None past_key_values_storage = [] - from bigdl.llm.transformers.convert import get_enable_ipex + from ipex_llm.transformers.convert import get_enable_ipex _enable_ipex = 
get_enable_ipex() if _enable_ipex: diff --git a/python/llm/src/bigdl/llm/transformers/training_patch.py b/python/llm/src/ipex_llm/transformers/training_patch.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/training_patch.py rename to python/llm/src/ipex_llm/transformers/training_patch.py diff --git a/python/llm/src/bigdl/llm/transformers/utils.py b/python/llm/src/ipex_llm/transformers/utils.py similarity index 99% rename from python/llm/src/bigdl/llm/transformers/utils.py rename to python/llm/src/ipex_llm/transformers/utils.py index ba6fba9b..39211fd4 100644 --- a/python/llm/src/bigdl/llm/transformers/utils.py +++ b/python/llm/src/ipex_llm/transformers/utils.py @@ -41,7 +41,7 @@ # SOFTWARE. import os from transformers.modeling_utils import _add_variant -from bigdl.llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.ggml.quantize import ggml_tensor_qtype from ..utils.common import invalidInputError from typing import Union, Optional import torch diff --git a/python/llm/src/bigdl/llm/transformers/xpu_customize_fwd.py b/python/llm/src/ipex_llm/transformers/xpu_customize_fwd.py similarity index 100% rename from python/llm/src/bigdl/llm/transformers/xpu_customize_fwd.py rename to python/llm/src/ipex_llm/transformers/xpu_customize_fwd.py diff --git a/python/llm/src/bigdl/llm/utils/__init__.py b/python/llm/src/ipex_llm/utils/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/utils/__init__.py rename to python/llm/src/ipex_llm/utils/__init__.py diff --git a/python/llm/src/bigdl/llm/utils/common/__init__.py b/python/llm/src/ipex_llm/utils/common/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/utils/common/__init__.py rename to python/llm/src/ipex_llm/utils/common/__init__.py diff --git a/python/llm/src/bigdl/llm/utils/common/lazyimport.py b/python/llm/src/ipex_llm/utils/common/lazyimport.py similarity index 95% rename from python/llm/src/bigdl/llm/utils/common/lazyimport.py rename to python/llm/src/ipex_llm/utils/common/lazyimport.py index 6380831d..ac76dc01 100644 --- a/python/llm/src/bigdl/llm/utils/common/lazyimport.py +++ b/python/llm/src/ipex_llm/utils/common/lazyimport.py @@ -25,8 +25,8 @@ class LazyImport: Lazy import python module until use. 
Example: - >>> from bigdl.llm.utils.common import LazyImport - >>> _convert_to_ggml = LazyImport('bigdl.llm.ggml.convert._convert_to_ggml') + >>> from ipex_llm.utils.common import LazyImport + >>> _convert_to_ggml = LazyImport('ipex_llm.ggml.convert._convert_to_ggml') >>> _convert_to_ggml(model_path, outfile_dir) """ def __init__(self, module_name: str): diff --git a/python/llm/src/bigdl/llm/utils/common/log4Error.py b/python/llm/src/ipex_llm/utils/common/log4Error.py similarity index 100% rename from python/llm/src/bigdl/llm/utils/common/log4Error.py rename to python/llm/src/ipex_llm/utils/common/log4Error.py diff --git a/python/llm/src/bigdl/llm/utils/convert_chatglm.py b/python/llm/src/ipex_llm/utils/convert_chatglm.py similarity index 99% rename from python/llm/src/bigdl/llm/utils/convert_chatglm.py rename to python/llm/src/ipex_llm/utils/convert_chatglm.py index 466287a3..d25e569c 100644 --- a/python/llm/src/bigdl/llm/utils/convert_chatglm.py +++ b/python/llm/src/ipex_llm/utils/convert_chatglm.py @@ -55,7 +55,7 @@ import torch from tabulate import tabulate from tqdm import tqdm from transformers import AutoModel, AutoTokenizer -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError GGML_QK8_0 = 64 GGML_QK4_0 = 64 diff --git a/python/llm/src/bigdl/llm/utils/convert_util.py b/python/llm/src/ipex_llm/utils/convert_util.py similarity index 99% rename from python/llm/src/bigdl/llm/utils/convert_util.py rename to python/llm/src/ipex_llm/utils/convert_util.py index 8a9e5059..5570f3c9 100644 --- a/python/llm/src/bigdl/llm/utils/convert_util.py +++ b/python/llm/src/ipex_llm/utils/convert_util.py @@ -65,7 +65,7 @@ from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union) import numpy as np from sentencepiece import SentencePieceProcessor -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError import os from pathlib import Path @@ -1782,7 +1782,7 @@ def _convert_chatglm_hf_to_ggml(model_path, outfile_dir, outtype): invalidInputError(outtype in ["q4_0", "q4_1"], "For now we only support quantization type 'q4_0' and 'q4_1' " "in chatglm family.") - from bigdl.llm.utils.convert_chatglm import _convert_chatglm_hf_to_ggml_ + from ipex_llm.utils.convert_chatglm import _convert_chatglm_hf_to_ggml_ return _convert_chatglm_hf_to_ggml_(model_path, outfile, outtype) diff --git a/python/llm/src/bigdl/llm/utils/glibc_checker.py b/python/llm/src/ipex_llm/utils/glibc_checker.py similarity index 97% rename from python/llm/src/bigdl/llm/utils/glibc_checker.py rename to python/llm/src/ipex_llm/utils/glibc_checker.py index 4368ee1b..21d9edc4 100644 --- a/python/llm/src/bigdl/llm/utils/glibc_checker.py +++ b/python/llm/src/ipex_llm/utils/glibc_checker.py @@ -18,7 +18,7 @@ import os import platform from packaging import version from importlib.metadata import distribution, PackageNotFoundError -from bigdl.llm.utils.common import log4Error +from ipex_llm.utils.common import log4Error class GlibcChecker: diff --git a/python/llm/src/bigdl/llm/utils/ipex_importer.py b/python/llm/src/ipex_llm/utils/ipex_importer.py similarity index 100% rename from python/llm/src/bigdl/llm/utils/ipex_importer.py rename to python/llm/src/ipex_llm/utils/ipex_importer.py diff --git a/python/llm/src/bigdl/llm/utils/isa_checker.py b/python/llm/src/ipex_llm/utils/isa_checker.py similarity index 100% rename from python/llm/src/bigdl/llm/utils/isa_checker.py rename to 
python/llm/src/ipex_llm/utils/isa_checker.py diff --git a/python/llm/src/bigdl/llm/utils/lazy_load_torch.py b/python/llm/src/ipex_llm/utils/lazy_load_torch.py similarity index 100% rename from python/llm/src/bigdl/llm/utils/lazy_load_torch.py rename to python/llm/src/ipex_llm/utils/lazy_load_torch.py diff --git a/python/llm/src/bigdl/llm/utils/utils.py b/python/llm/src/ipex_llm/utils/utils.py similarity index 95% rename from python/llm/src/bigdl/llm/utils/utils.py rename to python/llm/src/ipex_llm/utils/utils.py index 974bf4a1..5b906632 100644 --- a/python/llm/src/bigdl/llm/utils/utils.py +++ b/python/llm/src/ipex_llm/utils/utils.py @@ -16,7 +16,7 @@ import sys import pathlib -from bigdl.llm.utils.common import invalidInputError, invalidOperationError +from ipex_llm.utils.common import invalidInputError, invalidOperationError def get_shared_lib_info(lib_base_name: str): diff --git a/python/llm/src/bigdl/llm/vllm/config.py b/python/llm/src/ipex_llm/vllm/config.py similarity index 99% rename from python/llm/src/bigdl/llm/vllm/config.py rename to python/llm/src/ipex_llm/vllm/config.py index 11c999c8..c386a90e 100644 --- a/python/llm/src/bigdl/llm/vllm/config.py +++ b/python/llm/src/ipex_llm/vllm/config.py @@ -34,8 +34,8 @@ from typing import Optional import torch from transformers import AutoConfig, PretrainedConfig -from bigdl.llm.vllm.logger import init_logger -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.vllm.logger import init_logger +from ipex_llm.utils.common import invalidInputError logger = init_logger(__name__) diff --git a/python/llm/src/bigdl/llm/vllm/core/policy.py b/python/llm/src/ipex_llm/vllm/core/policy.py similarity index 95% rename from python/llm/src/bigdl/llm/vllm/core/policy.py rename to python/llm/src/ipex_llm/vllm/core/policy.py index 9a468a6d..2ec56693 100644 --- a/python/llm/src/bigdl/llm/vllm/core/policy.py +++ b/python/llm/src/ipex_llm/vllm/core/policy.py @@ -33,8 +33,8 @@ from typing import List -from bigdl.llm.vllm.sequence import SequenceGroup -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.vllm.sequence import SequenceGroup +from ipex_llm.utils.common import invalidInputError class Policy: diff --git a/python/llm/src/bigdl/llm/vllm/core/scheduler.py b/python/llm/src/ipex_llm/vllm/core/scheduler.py similarity index 98% rename from python/llm/src/bigdl/llm/vllm/core/scheduler.py rename to python/llm/src/ipex_llm/vllm/core/scheduler.py index b41ea166..0667d4e9 100644 --- a/python/llm/src/bigdl/llm/vllm/core/scheduler.py +++ b/python/llm/src/ipex_llm/vllm/core/scheduler.py @@ -38,13 +38,13 @@ import enum import time from typing import Dict, Iterable, List, Optional, Tuple, Union -from bigdl.llm.vllm.config import SchedulerConfig -from bigdl.llm.vllm.core.policy import PolicyFactory -from bigdl.llm.vllm.logger import init_logger -from bigdl.llm.vllm.sequence import SequenceData, SequenceStatus -from bigdl.llm.vllm.sequence import (Sequence, SequenceGroup, +from ipex_llm.vllm.config import SchedulerConfig +from ipex_llm.vllm.core.policy import PolicyFactory +from ipex_llm.vllm.logger import init_logger +from ipex_llm.vllm.sequence import SequenceData, SequenceStatus +from ipex_llm.vllm.sequence import (Sequence, SequenceGroup, SequenceGroupMetadata) -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError logger = init_logger(__name__) diff --git a/python/llm/src/bigdl/llm/vllm/engine/__init__.py b/python/llm/src/ipex_llm/vllm/engine/__init__.py similarity index 100% rename 
from python/llm/src/bigdl/llm/vllm/engine/__init__.py rename to python/llm/src/ipex_llm/vllm/engine/__init__.py diff --git a/python/llm/src/bigdl/llm/vllm/engine/arg_utils.py b/python/llm/src/ipex_llm/vllm/engine/arg_utils.py similarity index 99% rename from python/llm/src/bigdl/llm/vllm/engine/arg_utils.py rename to python/llm/src/ipex_llm/vllm/engine/arg_utils.py index dc9857b1..5e6d4357 100644 --- a/python/llm/src/bigdl/llm/vllm/engine/arg_utils.py +++ b/python/llm/src/ipex_llm/vllm/engine/arg_utils.py @@ -38,7 +38,7 @@ import argparse import dataclasses from dataclasses import dataclass from typing import Optional, Tuple -from bigdl.llm.vllm.config import ModelConfig, SchedulerConfig +from ipex_llm.vllm.config import ModelConfig, SchedulerConfig @dataclass diff --git a/python/llm/src/bigdl/llm/vllm/engine/async_llm_engine.py b/python/llm/src/ipex_llm/vllm/engine/async_llm_engine.py similarity index 98% rename from python/llm/src/bigdl/llm/vllm/engine/async_llm_engine.py rename to python/llm/src/ipex_llm/vllm/engine/async_llm_engine.py index 46eaf741..119725cf 100644 --- a/python/llm/src/bigdl/llm/vllm/engine/async_llm_engine.py +++ b/python/llm/src/ipex_llm/vllm/engine/async_llm_engine.py @@ -39,13 +39,13 @@ import time from functools import partial from typing import (Any, Dict, Iterable, List, Optional, Set, Tuple, Type, Union) -from bigdl.llm.vllm.config import ModelConfig -from bigdl.llm.vllm.engine.arg_utils import AsyncEngineArgs -from bigdl.llm.vllm.engine.llm_engine import LLMEngine -from bigdl.llm.vllm.logger import init_logger -from bigdl.llm.vllm.outputs import RequestOutput -from bigdl.llm.vllm.sampling_params import SamplingParams -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.vllm.config import ModelConfig +from ipex_llm.vllm.engine.arg_utils import AsyncEngineArgs +from ipex_llm.vllm.engine.llm_engine import LLMEngine +from ipex_llm.vllm.logger import init_logger +from ipex_llm.vllm.outputs import RequestOutput +from ipex_llm.vllm.sampling_params import SamplingParams +from ipex_llm.utils.common import invalidInputError logger = init_logger(__name__) diff --git a/python/llm/src/bigdl/llm/vllm/engine/llm_engine.py b/python/llm/src/ipex_llm/vllm/engine/llm_engine.py similarity index 97% rename from python/llm/src/bigdl/llm/vllm/engine/llm_engine.py rename to python/llm/src/ipex_llm/vllm/engine/llm_engine.py index d56ed482..b8575e72 100644 --- a/python/llm/src/bigdl/llm/vllm/engine/llm_engine.py +++ b/python/llm/src/ipex_llm/vllm/engine/llm_engine.py @@ -38,13 +38,13 @@ import time from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Union, Dict -from bigdl.llm.vllm.config import ModelConfig, SchedulerConfig -from bigdl.llm.vllm.core.scheduler import SchedulerOutputs, FixedWindowScheduler -from bigdl.llm.vllm.engine.arg_utils import EngineArgs -from bigdl.llm.vllm.logger import init_logger -from bigdl.llm.vllm.outputs import RequestOutput -from bigdl.llm.vllm.sampling_params import SamplingParams -from bigdl.llm.vllm.sequence import ( +from ipex_llm.vllm.config import ModelConfig, SchedulerConfig +from ipex_llm.vllm.core.scheduler import SchedulerOutputs, FixedWindowScheduler +from ipex_llm.vllm.engine.arg_utils import EngineArgs +from ipex_llm.vllm.logger import init_logger +from ipex_llm.vllm.outputs import RequestOutput +from ipex_llm.vllm.sampling_params import SamplingParams +from ipex_llm.vllm.sequence import ( SamplerOutput, Sequence, SequenceGroup, @@ -52,11 +52,11 @@ from bigdl.llm.vllm.sequence import ( SequenceStatus, 
SequenceOutputs, ) -from bigdl.llm.vllm.transformers_utils.tokenizer import get_tokenizer, detokenize_incrementally -from bigdl.llm.vllm.utils import ( +from ipex_llm.vllm.transformers_utils.tokenizer import get_tokenizer, detokenize_incrementally +from ipex_llm.vllm.utils import ( Counter, ) -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError logger = init_logger(__name__) @@ -158,7 +158,7 @@ class LLMEngine: def _init_workers(self): # Lazy import the Worker to avoid importing torch.cuda/xformers # before CUDA_VISIBLE_DEVICES is set in the Worker - from bigdl.llm.vllm.worker.worker import ( + from ipex_llm.vllm.worker.worker import ( Worker, ) # pylint: disable=import-outside-toplevel diff --git a/python/llm/src/bigdl/llm/vllm/entrypoints/__init__.py b/python/llm/src/ipex_llm/vllm/entrypoints/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/vllm/entrypoints/__init__.py rename to python/llm/src/ipex_llm/vllm/entrypoints/__init__.py diff --git a/python/llm/src/bigdl/llm/vllm/entrypoints/api_server.py b/python/llm/src/ipex_llm/vllm/entrypoints/api_server.py similarity index 94% rename from python/llm/src/bigdl/llm/vllm/entrypoints/api_server.py rename to python/llm/src/ipex_llm/vllm/entrypoints/api_server.py index 6dcdf7ec..e636435e 100644 --- a/python/llm/src/bigdl/llm/vllm/entrypoints/api_server.py +++ b/python/llm/src/ipex_llm/vllm/entrypoints/api_server.py @@ -39,10 +39,10 @@ from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, Response, StreamingResponse import uvicorn -from bigdl.llm.vllm.engine.arg_utils import AsyncEngineArgs -from bigdl.llm.vllm.engine.async_llm_engine import AsyncLLMEngine -from bigdl.llm.vllm.sampling_params import SamplingParams -from bigdl.llm.vllm.utils import random_uuid +from ipex_llm.vllm.engine.arg_utils import AsyncEngineArgs +from ipex_llm.vllm.engine.async_llm_engine import AsyncLLMEngine +from ipex_llm.vllm.sampling_params import SamplingParams +from ipex_llm.vllm.utils import random_uuid TIMEOUT_KEEP_ALIVE = 5 # seconds. TIMEOUT_TO_PREVENT_DEADLOCK = 1 # seconds. 
diff --git a/python/llm/src/bigdl/llm/vllm/entrypoints/llm.py b/python/llm/src/ipex_llm/vllm/entrypoints/llm.py similarity index 97% rename from python/llm/src/bigdl/llm/vllm/entrypoints/llm.py rename to python/llm/src/ipex_llm/vllm/entrypoints/llm.py index 8ed3c790..66942f02 100644 --- a/python/llm/src/bigdl/llm/vllm/entrypoints/llm.py +++ b/python/llm/src/ipex_llm/vllm/entrypoints/llm.py @@ -38,12 +38,12 @@ from typing import List, Optional, Union from tqdm import tqdm from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast -from bigdl.llm.vllm.engine.arg_utils import EngineArgs -from bigdl.llm.vllm.engine.llm_engine import LLMEngine +from ipex_llm.vllm.engine.arg_utils import EngineArgs +from ipex_llm.vllm.engine.llm_engine import LLMEngine -from bigdl.llm.vllm.outputs import RequestOutput -from bigdl.llm.vllm.sampling_params import SamplingParams -from bigdl.llm.vllm.utils import Counter +from ipex_llm.vllm.outputs import RequestOutput +from ipex_llm.vllm.sampling_params import SamplingParams +from ipex_llm.vllm.utils import Counter class LLM: diff --git a/python/llm/src/bigdl/llm/vllm/entrypoints/openai/__init__.py b/python/llm/src/ipex_llm/vllm/entrypoints/openai/__init__.py similarity index 100% rename from python/llm/src/bigdl/llm/vllm/entrypoints/openai/__init__.py rename to python/llm/src/ipex_llm/vllm/entrypoints/openai/__init__.py diff --git a/python/llm/src/bigdl/llm/vllm/entrypoints/openai/api_server.py b/python/llm/src/ipex_llm/vllm/entrypoints/openai/api_server.py similarity index 98% rename from python/llm/src/bigdl/llm/vllm/entrypoints/openai/api_server.py rename to python/llm/src/ipex_llm/vllm/entrypoints/openai/api_server.py index 27d904fc..9e1c2011 100644 --- a/python/llm/src/bigdl/llm/vllm/entrypoints/openai/api_server.py +++ b/python/llm/src/ipex_llm/vllm/entrypoints/openai/api_server.py @@ -52,24 +52,24 @@ from fastapi.responses import JSONResponse, StreamingResponse from packaging import version import numpy as np -from bigdl.llm.vllm.engine.arg_utils import AsyncEngineArgs -from bigdl.llm.vllm.engine.async_llm_engine import AsyncLLMEngine -from bigdl.llm.vllm.entrypoints.openai.protocol import ( +from ipex_llm.vllm.engine.arg_utils import AsyncEngineArgs +from ipex_llm.vllm.engine.async_llm_engine import AsyncLLMEngine +from ipex_llm.vllm.entrypoints.openai.protocol import ( CompletionResponse, CompletionResponseChoice, CompletionResponseStreamChoice, CompletionStreamResponse, ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage, DeltaMessage, ErrorResponse, LogProbs, ModelCard, ModelPermission, UsageInfo) -from bigdl.llm.vllm.entrypoints.openai.openai_protocol import ( +from ipex_llm.vllm.entrypoints.openai.openai_protocol import ( CompletionRequest, ChatCompletionRequest, ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse, ModelList) -from bigdl.llm.vllm.logger import init_logger -from bigdl.llm.vllm.outputs import RequestOutput -from bigdl.llm.vllm.sampling_params import SamplingParams -from bigdl.llm.vllm.transformers_utils.tokenizer import get_tokenizer +from ipex_llm.vllm.logger import init_logger +from ipex_llm.vllm.outputs import RequestOutput +from ipex_llm.vllm.sampling_params import SamplingParams +from ipex_llm.vllm.transformers_utils.tokenizer import get_tokenizer import uuid -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError try: import fastchat diff --git a/python/llm/src/bigdl/llm/vllm/entrypoints/openai/openai_protocol.py 
b/python/llm/src/ipex_llm/vllm/entrypoints/openai/openai_protocol.py similarity index 99% rename from python/llm/src/bigdl/llm/vllm/entrypoints/openai/openai_protocol.py rename to python/llm/src/ipex_llm/vllm/entrypoints/openai/openai_protocol.py index 5297d55b..f7e46aee 100644 --- a/python/llm/src/bigdl/llm/vllm/entrypoints/openai/openai_protocol.py +++ b/python/llm/src/ipex_llm/vllm/entrypoints/openai/openai_protocol.py @@ -43,7 +43,7 @@ from typing import Dict, List, Literal, Optional, Union from pydantic import BaseModel, Field -from bigdl.llm.vllm.utils import random_uuid +from ipex_llm.vllm.utils import random_uuid # bigdl-llm change start # summary: add token time recording logic diff --git a/python/llm/src/bigdl/llm/vllm/entrypoints/openai/protocol.py b/python/llm/src/ipex_llm/vllm/entrypoints/openai/protocol.py similarity index 97% rename from python/llm/src/bigdl/llm/vllm/entrypoints/openai/protocol.py rename to python/llm/src/ipex_llm/vllm/entrypoints/openai/protocol.py index 5c1218f8..24186f55 100644 --- a/python/llm/src/bigdl/llm/vllm/entrypoints/openai/protocol.py +++ b/python/llm/src/ipex_llm/vllm/entrypoints/openai/protocol.py @@ -41,8 +41,8 @@ import time from typing import Dict, List, Literal, Optional, Union from pydantic import BaseModel, Field -from bigdl.llm.vllm.utils import random_uuid -from bigdl.llm.vllm.entrypoints.openai.openai_protocol import ( +from ipex_llm.vllm.utils import random_uuid +from ipex_llm.vllm.entrypoints.openai.openai_protocol import ( ErrorResponse, ModelPermission, ModelCard, UsageInfo, LogProbs, ChatMessage, DeltaMessage ) diff --git a/python/llm/src/bigdl/llm/vllm/logger.py b/python/llm/src/ipex_llm/vllm/logger.py similarity index 100% rename from python/llm/src/bigdl/llm/vllm/logger.py rename to python/llm/src/ipex_llm/vllm/logger.py diff --git a/python/llm/src/bigdl/llm/vllm/model_executor/__init__.py b/python/llm/src/ipex_llm/vllm/model_executor/__init__.py similarity index 90% rename from python/llm/src/bigdl/llm/vllm/model_executor/__init__.py rename to python/llm/src/ipex_llm/vllm/model_executor/__init__.py index 54804458..d920a7a2 100644 --- a/python/llm/src/bigdl/llm/vllm/model_executor/__init__.py +++ b/python/llm/src/ipex_llm/vllm/model_executor/__init__.py @@ -14,7 +14,7 @@ # limitations under the License. 
# -from bigdl.llm.vllm.model_executor.model_loader import get_model +from ipex_llm.vllm.model_executor.model_loader import get_model __all__ = [ "get_model", diff --git a/python/llm/src/bigdl/llm/vllm/model_executor/input_metadata.py b/python/llm/src/ipex_llm/vllm/model_executor/input_metadata.py similarity index 97% rename from python/llm/src/bigdl/llm/vllm/model_executor/input_metadata.py rename to python/llm/src/ipex_llm/vllm/model_executor/input_metadata.py index 0a7a24e5..7547078c 100644 --- a/python/llm/src/bigdl/llm/vllm/model_executor/input_metadata.py +++ b/python/llm/src/ipex_llm/vllm/model_executor/input_metadata.py @@ -34,8 +34,8 @@ from typing import Dict, List, Optional, Tuple import torch # from xformers.ops import AttentionBias -from bigdl.llm.vllm.sequence import SequenceData -from bigdl.llm.vllm.sampling_params import SamplingParams +from ipex_llm.vllm.sequence import SequenceData +from ipex_llm.vllm.sampling_params import SamplingParams class InputMetadata: diff --git a/python/llm/src/bigdl/llm/vllm/model_executor/layers/bigdl_sampler.py b/python/llm/src/ipex_llm/vllm/model_executor/layers/bigdl_sampler.py similarity index 99% rename from python/llm/src/bigdl/llm/vllm/model_executor/layers/bigdl_sampler.py rename to python/llm/src/ipex_llm/vllm/model_executor/layers/bigdl_sampler.py index a355ef19..a3675ae6 100644 --- a/python/llm/src/bigdl/llm/vllm/model_executor/layers/bigdl_sampler.py +++ b/python/llm/src/ipex_llm/vllm/model_executor/layers/bigdl_sampler.py @@ -39,10 +39,10 @@ from typing import Dict, List, Optional, Tuple import torch import torch.nn as nn -from bigdl.llm.vllm.model_executor.input_metadata import InputMetadata +from ipex_llm.vllm.model_executor.input_metadata import InputMetadata -from bigdl.llm.vllm.sampling_params import SamplingParams, SamplingType -from bigdl.llm.vllm.sequence import (SamplerOutput, SequenceGroupMetadata, +from ipex_llm.vllm.sampling_params import SamplingParams, SamplingType +from ipex_llm.vllm.sequence import (SamplerOutput, SequenceGroupMetadata, SequenceData, SequenceOutputs) import time diff --git a/python/llm/src/bigdl/llm/vllm/model_executor/model_loader.py b/python/llm/src/ipex_llm/vllm/model_executor/model_loader.py similarity index 90% rename from python/llm/src/bigdl/llm/vllm/model_executor/model_loader.py rename to python/llm/src/ipex_llm/vllm/model_executor/model_loader.py index f4a449f0..ed3bfb18 100644 --- a/python/llm/src/bigdl/llm/vllm/model_executor/model_loader.py +++ b/python/llm/src/ipex_llm/vllm/model_executor/model_loader.py @@ -38,14 +38,14 @@ import torch import torch.nn as nn from transformers import PretrainedConfig -from bigdl.llm.vllm.config import ModelConfig -from bigdl.llm.vllm.model_executor.models.bigdl_llama import BigDLLlamaForCausalLM -from bigdl.llm.vllm.model_executor.models.bigdl_mixtral import BigDLMixtralForCausalLM -from bigdl.llm.vllm.model_executor.models.bigdl_mistral import BigDLMistralForCausalLM -from bigdl.llm.vllm.model_executor.models.bigdl_chatglm import BigDLChatGLMForCausalLM -from bigdl.llm.vllm.model_executor.models.bigdl_baichuan import BigDLBaichuanForCausalLM +from ipex_llm.vllm.config import ModelConfig +from ipex_llm.vllm.model_executor.models.bigdl_llama import BigDLLlamaForCausalLM +from ipex_llm.vllm.model_executor.models.bigdl_mixtral import BigDLMixtralForCausalLM +from ipex_llm.vllm.model_executor.models.bigdl_mistral import BigDLMistralForCausalLM +from ipex_llm.vllm.model_executor.models.bigdl_chatglm import BigDLChatGLMForCausalLM +from 
ipex_llm.vllm.model_executor.models.bigdl_baichuan import BigDLBaichuanForCausalLM -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError # bigdl-llm Intel specified code change # bigdl-llm change start diff --git a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_baichuan.py b/python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_baichuan.py similarity index 95% rename from python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_baichuan.py rename to python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_baichuan.py index 0369f9f2..3e39edf1 100644 --- a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_baichuan.py +++ b/python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_baichuan.py @@ -20,13 +20,13 @@ from torch import nn from transformers import AutoTokenizer, PreTrainedTokenizerBase, LlamaConfig from typing import Optional, Tuple, List, Type, Dict -from bigdl.llm.vllm.sequence import SequenceOutputs, SequenceGroupMetadata -from bigdl.llm.vllm.model_executor.layers.bigdl_sampler import BigDLSampler -from bigdl.llm.vllm.model_executor.models.bigdl_model import BigDLModelForCausalLM -from bigdl.llm.vllm.logger import init_logger +from ipex_llm.vllm.sequence import SequenceOutputs, SequenceGroupMetadata +from ipex_llm.vllm.model_executor.layers.bigdl_sampler import BigDLSampler +from ipex_llm.vllm.model_executor.models.bigdl_model import BigDLModelForCausalLM +from ipex_llm.vllm.logger import init_logger import math import time -from bigdl.llm.vllm.model_executor.input_metadata import InputMetadata +from ipex_llm.vllm.model_executor.input_metadata import InputMetadata import os from transformers.generation.logits_process import ( LogitsProcessorList, @@ -68,9 +68,9 @@ class BigDLBaichuanForCausalLM(BigDLModelForCausalLM): super().__init__(config, device, max_model_len) self.config = config # Always enable bigdl-llm model - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM # TODO: we will need to pass the argument through command line argument - # from bigdl.llm import optimize_model + # from ipex_llm import optimize_model torch_dtype = 'auto' if load_in_low_bit == 'bf16': diff --git a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_chatglm.py b/python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_chatglm.py similarity index 95% rename from python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_chatglm.py rename to python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_chatglm.py index c20e57e5..5a95c63d 100644 --- a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_chatglm.py +++ b/python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_chatglm.py @@ -20,13 +20,13 @@ from torch import nn from transformers import AutoTokenizer, PreTrainedTokenizerBase, LlamaConfig from typing import Optional, Tuple, List, Type, Dict -from bigdl.llm.vllm.sequence import SequenceOutputs, SequenceGroupMetadata -from bigdl.llm.vllm.model_executor.layers.bigdl_sampler import BigDLSampler -from bigdl.llm.vllm.model_executor.models.bigdl_model import BigDLModelForCausalLM -from bigdl.llm.vllm.logger import init_logger +from ipex_llm.vllm.sequence import SequenceOutputs, SequenceGroupMetadata +from ipex_llm.vllm.model_executor.layers.bigdl_sampler import BigDLSampler +from ipex_llm.vllm.model_executor.models.bigdl_model import BigDLModelForCausalLM +from ipex_llm.vllm.logger import init_logger import math import time -from 
bigdl.llm.vllm.model_executor.input_metadata import InputMetadata +from ipex_llm.vllm.model_executor.input_metadata import InputMetadata from transformers.generation.logits_process import ( LogitsProcessorList, RepetitionPenaltyLogitsProcessor, @@ -63,7 +63,7 @@ class BigDLChatGLMForCausalLM(BigDLModelForCausalLM): super().__init__(config, device, max_model_len) self.config = config # TODO(gc): later change this to a switch? - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM torch_dtype = 'auto' if load_in_low_bit == 'bf16': diff --git a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_llama.py b/python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_llama.py similarity index 95% rename from python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_llama.py rename to python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_llama.py index e5d66f37..1ab8695b 100644 --- a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_llama.py +++ b/python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_llama.py @@ -20,13 +20,13 @@ from torch import nn from transformers import AutoTokenizer, PreTrainedTokenizerBase, LlamaConfig from typing import Optional, Tuple, List, Type, Dict -from bigdl.llm.vllm.sequence import SequenceOutputs, SequenceGroupMetadata -from bigdl.llm.vllm.model_executor.layers.bigdl_sampler import BigDLSampler -from bigdl.llm.vllm.model_executor.models.bigdl_model import BigDLModelForCausalLM -from bigdl.llm.vllm.logger import init_logger +from ipex_llm.vllm.sequence import SequenceOutputs, SequenceGroupMetadata +from ipex_llm.vllm.model_executor.layers.bigdl_sampler import BigDLSampler +from ipex_llm.vllm.model_executor.models.bigdl_model import BigDLModelForCausalLM +from ipex_llm.vllm.logger import init_logger import math import time -from bigdl.llm.vllm.model_executor.input_metadata import InputMetadata +from ipex_llm.vllm.model_executor.input_metadata import InputMetadata import os from transformers.generation.logits_process import ( LogitsProcessorList, @@ -68,9 +68,9 @@ class BigDLLlamaForCausalLM(BigDLModelForCausalLM): super().__init__(config, device, max_model_len) self.config = config # Always enable bigdl-llm model - from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM # TODO: we will need to pass the argument through command line argument - # from bigdl.llm import optimize_model + # from ipex_llm import optimize_model torch_dtype = 'auto' if load_in_low_bit == 'bf16': diff --git a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_mistral.py b/python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_mistral.py similarity index 94% rename from python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_mistral.py rename to python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_mistral.py index 9bdf572a..9640c282 100644 --- a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_mistral.py +++ b/python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_mistral.py @@ -20,13 +20,13 @@ from torch import nn from transformers import AutoTokenizer, PreTrainedTokenizerBase, LlamaConfig from typing import Optional, Tuple, List, Type, Dict -from bigdl.llm.vllm.sequence import SequenceOutputs, SequenceGroupMetadata -from bigdl.llm.vllm.model_executor.layers.bigdl_sampler import BigDLSampler -from bigdl.llm.vllm.model_executor.models.bigdl_model import BigDLModelForCausalLM -from bigdl.llm.vllm.logger import init_logger +from 
ipex_llm.vllm.sequence import SequenceOutputs, SequenceGroupMetadata +from ipex_llm.vllm.model_executor.layers.bigdl_sampler import BigDLSampler +from ipex_llm.vllm.model_executor.models.bigdl_model import BigDLModelForCausalLM +from ipex_llm.vllm.logger import init_logger import math import time -from bigdl.llm.vllm.model_executor.input_metadata import InputMetadata +from ipex_llm.vllm.model_executor.input_metadata import InputMetadata from transformers.generation.logits_process import ( LogitsProcessorList, RepetitionPenaltyLogitsProcessor, @@ -63,8 +63,8 @@ class BigDLMistralForCausalLM(BigDLModelForCausalLM): super().__init__(config, device, max_model_len) self.config = config # TODO(gc): later change this to a switch? - from bigdl.llm.transformers import AutoModelForCausalLM - # from bigdl.llm import optimize_model + from ipex_llm.transformers import AutoModelForCausalLM + # from ipex_llm import optimize_model torch_dtype = 'auto' diff --git a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_mixtral.py b/python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_mixtral.py similarity index 95% rename from python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_mixtral.py rename to python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_mixtral.py index c5be91c3..8946910a 100644 --- a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_mixtral.py +++ b/python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_mixtral.py @@ -20,13 +20,13 @@ from torch import nn from transformers import AutoTokenizer, PreTrainedTokenizerBase, LlamaConfig from typing import Optional, Tuple, List, Type, Dict -from bigdl.llm.vllm.sequence import SequenceOutputs, SequenceGroupMetadata -from bigdl.llm.vllm.model_executor.layers.bigdl_sampler import BigDLSampler -from bigdl.llm.vllm.model_executor.models.bigdl_model import BigDLModelForCausalLM -from bigdl.llm.vllm.logger import init_logger +from ipex_llm.vllm.sequence import SequenceOutputs, SequenceGroupMetadata +from ipex_llm.vllm.model_executor.layers.bigdl_sampler import BigDLSampler +from ipex_llm.vllm.model_executor.models.bigdl_model import BigDLModelForCausalLM +from ipex_llm.vllm.logger import init_logger import math import time -from bigdl.llm.vllm.model_executor.input_metadata import InputMetadata +from ipex_llm.vllm.model_executor.input_metadata import InputMetadata from transformers.generation.logits_process import ( LogitsProcessorList, RepetitionPenaltyLogitsProcessor, @@ -63,7 +63,7 @@ class BigDLMixtralForCausalLM(BigDLModelForCausalLM): super().__init__(config, device, max_model_len) self.config = config # TODO(gc): later change this to a switch? 
- from bigdl.llm.transformers import AutoModelForCausalLM + from ipex_llm.transformers import AutoModelForCausalLM torch_dtype = 'auto' diff --git a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_model.py b/python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_model.py similarity index 97% rename from python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_model.py rename to python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_model.py index a81993dc..5d7ecf28 100644 --- a/python/llm/src/bigdl/llm/vllm/model_executor/models/bigdl_model.py +++ b/python/llm/src/ipex_llm/vllm/model_executor/models/bigdl_model.py @@ -19,9 +19,9 @@ from torch import nn from typing import Optional, Tuple, List, Type, Dict from transformers import LlamaConfig -from bigdl.llm.vllm.sequence import SequenceOutputs, SequenceGroupMetadata -from bigdl.llm.transformers.models.utils import extend_kv_cache -from bigdl.llm.vllm.logger import init_logger +from ipex_llm.vllm.sequence import SequenceOutputs, SequenceGroupMetadata +from ipex_llm.transformers.models.utils import extend_kv_cache +from ipex_llm.vllm.logger import init_logger logger = init_logger(__name__) diff --git a/python/llm/src/bigdl/llm/vllm/model_executor/utils.py b/python/llm/src/ipex_llm/vllm/model_executor/utils.py similarity index 100% rename from python/llm/src/bigdl/llm/vllm/model_executor/utils.py rename to python/llm/src/ipex_llm/vllm/model_executor/utils.py diff --git a/python/llm/src/bigdl/llm/vllm/outputs.py b/python/llm/src/ipex_llm/vllm/outputs.py similarity index 98% rename from python/llm/src/bigdl/llm/vllm/outputs.py rename to python/llm/src/ipex_llm/vllm/outputs.py index 15b5ca7b..ab608631 100644 --- a/python/llm/src/bigdl/llm/vllm/outputs.py +++ b/python/llm/src/ipex_llm/vllm/outputs.py @@ -35,7 +35,7 @@ from typing import Dict, List, Optional -from bigdl.llm.vllm.sequence import SequenceGroup, SequenceStatus +from ipex_llm.vllm.sequence import SequenceGroup, SequenceStatus class CompletionOutput: diff --git a/python/llm/src/bigdl/llm/vllm/sampling_params.py b/python/llm/src/ipex_llm/vllm/sampling_params.py similarity index 99% rename from python/llm/src/bigdl/llm/vllm/sampling_params.py rename to python/llm/src/ipex_llm/vllm/sampling_params.py index 3d0bfb0e..af7091c1 100644 --- a/python/llm/src/bigdl/llm/vllm/sampling_params.py +++ b/python/llm/src/ipex_llm/vllm/sampling_params.py @@ -35,7 +35,7 @@ from enum import IntEnum from functools import cached_property from typing import List, Optional, Union -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.utils.common import invalidInputError _SAMPLING_EPS = 1e-5 diff --git a/python/llm/src/bigdl/llm/vllm/sequence.py b/python/llm/src/ipex_llm/vllm/sequence.py similarity index 99% rename from python/llm/src/bigdl/llm/vllm/sequence.py rename to python/llm/src/ipex_llm/vllm/sequence.py index 987c8a1b..3594e90b 100644 --- a/python/llm/src/bigdl/llm/vllm/sequence.py +++ b/python/llm/src/ipex_llm/vllm/sequence.py @@ -35,8 +35,8 @@ import copy import enum import time from typing import Dict, List, Optional, Union -from bigdl.llm.vllm.sampling_params import SamplingParams -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.vllm.sampling_params import SamplingParams +from ipex_llm.utils.common import invalidInputError class SequenceStatus(enum.Enum): diff --git a/python/llm/src/bigdl/llm/vllm/transformers_utils/__init__.py b/python/llm/src/ipex_llm/vllm/transformers_utils/__init__.py similarity index 100% rename from 
python/llm/src/bigdl/llm/vllm/transformers_utils/__init__.py rename to python/llm/src/ipex_llm/vllm/transformers_utils/__init__.py diff --git a/python/llm/src/bigdl/llm/vllm/transformers_utils/tokenizer.py b/python/llm/src/ipex_llm/vllm/transformers_utils/tokenizer.py similarity index 98% rename from python/llm/src/bigdl/llm/vllm/transformers_utils/tokenizer.py rename to python/llm/src/ipex_llm/vllm/transformers_utils/tokenizer.py index 727763d6..589496db 100644 --- a/python/llm/src/bigdl/llm/vllm/transformers_utils/tokenizer.py +++ b/python/llm/src/ipex_llm/vllm/transformers_utils/tokenizer.py @@ -35,8 +35,8 @@ from typing import List, Optional, Tuple, Union from transformers import (AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast) -from bigdl.llm.vllm.logger import init_logger -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.vllm.logger import init_logger +from ipex_llm.utils.common import invalidInputError logger = init_logger(__name__) diff --git a/python/llm/src/bigdl/llm/vllm/utils.py b/python/llm/src/ipex_llm/vllm/utils.py similarity index 94% rename from python/llm/src/bigdl/llm/vllm/utils.py rename to python/llm/src/ipex_llm/vllm/utils.py index b1520671..821d47c9 100644 --- a/python/llm/src/bigdl/llm/vllm/utils.py +++ b/python/llm/src/ipex_llm/vllm/utils.py @@ -37,8 +37,8 @@ from typing import List, Optional, Tuple, Union from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast -from bigdl.llm.vllm.logger import init_logger -from bigdl.llm.utils.common import invalidInputError +from ipex_llm.vllm.logger import init_logger +from ipex_llm.utils.common import invalidInputError logger = init_logger(__name__) diff --git a/python/llm/src/bigdl/llm/vllm/worker/worker.py b/python/llm/src/ipex_llm/vllm/worker/worker.py similarity index 96% rename from python/llm/src/bigdl/llm/vllm/worker/worker.py rename to python/llm/src/ipex_llm/vllm/worker/worker.py index b0005e79..f5805131 100644 --- a/python/llm/src/bigdl/llm/vllm/worker/worker.py +++ b/python/llm/src/ipex_llm/vllm/worker/worker.py @@ -40,13 +40,13 @@ import warnings import numpy as np import random -from bigdl.llm.vllm.config import ModelConfig, SchedulerConfig -from bigdl.llm.vllm.model_executor.model_loader import get_model -from bigdl.llm.vllm.model_executor.input_metadata import InputMetadata -from bigdl.llm.vllm.sampling_params import SamplingParams -from bigdl.llm.vllm.sequence import SequenceData, SamplerOutput, SequenceGroupMetadata -from bigdl.llm.utils.common import invalidInputError -from bigdl.llm.vllm.model_executor.utils import set_random_seed +from ipex_llm.vllm.config import ModelConfig, SchedulerConfig +from ipex_llm.vllm.model_executor.model_loader import get_model +from ipex_llm.vllm.model_executor.input_metadata import InputMetadata +from ipex_llm.vllm.sampling_params import SamplingParams +from ipex_llm.vllm.sequence import SequenceData, SamplerOutput, SequenceGroupMetadata +from ipex_llm.utils.common import invalidInputError +from ipex_llm.vllm.model_executor.utils import set_random_seed class Worker: diff --git a/python/llm/test/convert/test_convert_model.py b/python/llm/test/convert/test_convert_model.py index 1a0495d6..450e97fb 100644 --- a/python/llm/test/convert/test_convert_model.py +++ b/python/llm/test/convert/test_convert_model.py @@ -20,9 +20,9 @@ import tempfile from unittest import TestCase import shutil -from bigdl.llm import llm_convert -from bigdl.llm.transformers import AutoModelForCausalLM -from bigdl.llm.optimize import optimize_model, 
load_low_bit, low_memory_init +from ipex_llm import llm_convert +from ipex_llm.transformers import AutoModelForCausalLM +from ipex_llm.optimize import optimize_model, load_low_bit, low_memory_init llama_model_path = os.environ.get('LLAMA_ORIGIN_PATH') diff --git a/python/llm/test/inference/test_call_models.py b/python/llm/test/inference/test_call_models.py index 1f888da5..e9b7175d 100644 --- a/python/llm/test/inference/test_call_models.py +++ b/python/llm/test/inference/test_call_models.py @@ -15,8 +15,8 @@ # -from bigdl.llm.models import Llama, Bloom, Gptneox, Starcoder -from bigdl.llm.transformers import LlamaForCausalLM, BloomForCausalLM, \ +from ipex_llm.models import Llama, Bloom, Gptneox, Starcoder +from ipex_llm.transformers import LlamaForCausalLM, BloomForCausalLM, \ GptneoxForCausalLM, StarcoderForCausalLM import pytest from unittest import TestCase diff --git a/python/llm/test/inference/test_optimize_model_api.py b/python/llm/test/inference/test_optimize_model_api.py index ff1269e5..99e3be5e 100644 --- a/python/llm/test/inference/test_optimize_model_api.py +++ b/python/llm/test/inference/test_optimize_model_api.py @@ -19,7 +19,7 @@ import os import pytest import time import torch -from bigdl.llm import optimize_model +from ipex_llm import optimize_model class TestOptimizeAPI(unittest.TestCase): diff --git a/python/llm/test/inference/test_transformers_api.py b/python/llm/test/inference/test_transformers_api.py index f69abea0..1a72801c 100644 --- a/python/llm/test/inference/test_transformers_api.py +++ b/python/llm/test/inference/test_transformers_api.py @@ -22,7 +22,7 @@ import time import torch import pytest -from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM, AutoModelForSpeechSeq2Seq +from ipex_llm.transformers import AutoModel, AutoModelForCausalLM, AutoModelForSpeechSeq2Seq from transformers import AutoTokenizer, LlamaTokenizer class TestTransformersAPI(unittest.TestCase): @@ -93,7 +93,7 @@ class TestTransformersAPI(unittest.TestCase): self.assertTrue(res) def test_transformers_chatglm_for_causallm(self): - from bigdl.llm.transformers import ChatGLMForCausalLM + from ipex_llm.transformers import ChatGLMForCausalLM model_path = os.environ.get('ORIGINAL_CHATGLM2_6B_PATH') model = ChatGLMForCausalLM.from_pretrained(model_path, native=False, trust_remote_code=True, load_in_4bit=True) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) diff --git a/python/llm/test/inference/test_transformesr_api_434.py b/python/llm/test/inference/test_transformesr_api_434.py index 67cfcd63..4de49e66 100644 --- a/python/llm/test/inference/test_transformesr_api_434.py +++ b/python/llm/test/inference/test_transformesr_api_434.py @@ -19,7 +19,7 @@ import pytest import tempfile import torch -from bigdl.llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer diff --git a/python/llm/test/inference_gpu/test_layer_fast_rope.py b/python/llm/test/inference_gpu/test_layer_fast_rope.py index 9861c913..79e6f5a0 100644 --- a/python/llm/test/inference_gpu/test_layer_fast_rope.py +++ b/python/llm/test/inference_gpu/test_layer_fast_rope.py @@ -30,7 +30,7 @@ from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding from transformers.models.llama.modeling_llama import ( apply_rotary_pos_emb as apply_rotary_pos_emb_llama, ) -from bigdl.llm.transformers.layers.rope_embedding import apply_fast_rope_embedding +from ipex_llm.transformers.layers.rope_embedding import 
apply_fast_rope_embedding device = os.environ['DEVICE'] print(f'Running on {device}') diff --git a/python/llm/test/inference_gpu/test_transformers_api.py b/python/llm/test/inference_gpu/test_transformers_api.py index 9a9bb8a3..ae9c6b9b 100644 --- a/python/llm/test/inference_gpu/test_transformers_api.py +++ b/python/llm/test/inference_gpu/test_transformers_api.py @@ -20,7 +20,7 @@ import pytest import tempfile import torch -from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel, AutoModelForSpeechSeq2Seq +from ipex_llm.transformers import AutoModelForCausalLM, AutoModel, AutoModelForSpeechSeq2Seq from transformers import LlamaTokenizer, AutoTokenizer device = os.environ['DEVICE'] diff --git a/python/llm/test/inference_gpu/test_transformers_api_RMSNorm.py b/python/llm/test/inference_gpu/test_transformers_api_RMSNorm.py index 6876ca75..db3aa485 100644 --- a/python/llm/test/inference_gpu/test_transformers_api_RMSNorm.py +++ b/python/llm/test/inference_gpu/test_transformers_api_RMSNorm.py @@ -20,7 +20,7 @@ import gc import pytest import torch -from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel +from ipex_llm.transformers import AutoModelForCausalLM, AutoModel from transformers import LlamaTokenizer, AutoTokenizer device = os.environ['DEVICE'] diff --git a/python/llm/test/inference_gpu/test_transformers_api_attention.py b/python/llm/test/inference_gpu/test_transformers_api_attention.py index 0a7a6ac2..bf9df673 100644 --- a/python/llm/test/inference_gpu/test_transformers_api_attention.py +++ b/python/llm/test/inference_gpu/test_transformers_api_attention.py @@ -20,7 +20,7 @@ import gc import pytest import torch -from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel +from ipex_llm.transformers import AutoModelForCausalLM, AutoModel from transformers import LlamaTokenizer, AutoTokenizer device = os.environ['DEVICE'] diff --git a/python/llm/test/inference_gpu/test_transformers_api_final_logits.py b/python/llm/test/inference_gpu/test_transformers_api_final_logits.py index 02e3cc27..1aff7719 100644 --- a/python/llm/test/inference_gpu/test_transformers_api_final_logits.py +++ b/python/llm/test/inference_gpu/test_transformers_api_final_logits.py @@ -20,7 +20,7 @@ import gc import pytest import torch -from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel +from ipex_llm.transformers import AutoModelForCausalLM, AutoModel from transformers import LlamaTokenizer, AutoTokenizer device = os.environ['DEVICE'] diff --git a/python/llm/test/inference_gpu/test_transformers_api_layernorm.py b/python/llm/test/inference_gpu/test_transformers_api_layernorm.py index 0dbb4fe8..68a15d8a 100644 --- a/python/llm/test/inference_gpu/test_transformers_api_layernorm.py +++ b/python/llm/test/inference_gpu/test_transformers_api_layernorm.py @@ -19,7 +19,7 @@ import pytest import gc import torch -from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel +from ipex_llm.transformers import AutoModelForCausalLM, AutoModel from transformers import LlamaTokenizer, AutoTokenizer device = os.environ['DEVICE'] diff --git a/python/llm/test/inference_gpu/test_transformers_api_mlp.py b/python/llm/test/inference_gpu/test_transformers_api_mlp.py index 1c01b259..e614e561 100644 --- a/python/llm/test/inference_gpu/test_transformers_api_mlp.py +++ b/python/llm/test/inference_gpu/test_transformers_api_mlp.py @@ -19,7 +19,7 @@ import gc import pytest import torch -from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel +from ipex_llm.transformers import AutoModelForCausalLM, 
AutoModel from transformers import LlamaTokenizer, AutoTokenizer device = os.environ['DEVICE'] diff --git a/python/llm/test/install/test_install.py b/python/llm/test/install/test_install.py index 11a35ed0..74c12dee 100644 --- a/python/llm/test/install/test_install.py +++ b/python/llm/test/install/test_install.py @@ -15,7 +15,7 @@ # -import bigdl.llm +import ipex_llm import pytest from unittest import TestCase @@ -23,8 +23,8 @@ from unittest import TestCase class Test_LLM_Basics(TestCase): def test_naive(self): - from bigdl.llm.ggml import quantize - from bigdl.llm.utils.common import invalidInputError + from ipex_llm.ggml import quantize + from ipex_llm.utils.common import invalidInputError pass diff --git a/python/llm/test/langchain/test_langchain.py b/python/llm/test/langchain/test_langchain.py index 782744b9..33a60d20 100644 --- a/python/llm/test/langchain/test_langchain.py +++ b/python/llm/test/langchain/test_langchain.py @@ -14,8 +14,8 @@ # limitations under the License. # -from bigdl.llm.langchain.embeddings import * -from bigdl.llm.langchain.llms import * +from ipex_llm.langchain.embeddings import * +from ipex_llm.langchain.llms import * import pytest from unittest import TestCase import os diff --git a/python/llm/test/langchain/test_transformers_api.py b/python/llm/test/langchain/test_transformers_api.py index c8e2ac53..cbaaa1e0 100644 --- a/python/llm/test/langchain/test_transformers_api.py +++ b/python/llm/test/langchain/test_transformers_api.py @@ -14,9 +14,9 @@ # limitations under the License. # -from bigdl.llm.langchain.llms import TransformersLLM, TransformersPipelineLLM, \ +from ipex_llm.langchain.llms import TransformersLLM, TransformersPipelineLLM, \ LlamaLLM, BloomLLM -from bigdl.llm.langchain.embeddings import TransformersEmbeddings, LlamaEmbeddings, \ +from ipex_llm.langchain.embeddings import TransformersEmbeddings, LlamaEmbeddings, \ BloomEmbeddings diff --git a/python/llm/test/langchain_gpu/test_transformers_api.py b/python/llm/test/langchain_gpu/test_transformers_api.py index a983cb7f..b4e714bd 100644 --- a/python/llm/test/langchain_gpu/test_transformers_api.py +++ b/python/llm/test/langchain_gpu/test_transformers_api.py @@ -14,9 +14,9 @@ # limitations under the License. 
# -from bigdl.llm.langchain.llms import TransformersLLM, TransformersPipelineLLM, \ +from ipex_llm.langchain.llms import TransformersLLM, TransformersPipelineLLM, \ LlamaLLM, BloomLLM -from bigdl.llm.langchain.embeddings import TransformersEmbeddings, LlamaEmbeddings, \ +from ipex_llm.langchain.embeddings import TransformersEmbeddings, LlamaEmbeddings, \ BloomEmbeddings import pytest diff --git a/python/llm/test/llamaindex/test_llamaindex.py b/python/llm/test/llamaindex/test_llamaindex.py index c0ebf4c3..1e7c8cfe 100644 --- a/python/llm/test/llamaindex/test_llamaindex.py +++ b/python/llm/test/llamaindex/test_llamaindex.py @@ -17,7 +17,7 @@ import pytest from unittest import TestCase import os -from bigdl.llm.llamaindex.llms import BigdlLLM +from ipex_llm.llamaindex.llms import BigdlLLM class Test_LlamaIndex_Transformers_API(TestCase): def setUp(self): diff --git a/python/llm/test/llamaindex_gpu/test_llamaindex.py b/python/llm/test/llamaindex_gpu/test_llamaindex.py index 920ce534..b894a37f 100644 --- a/python/llm/test/llamaindex_gpu/test_llamaindex.py +++ b/python/llm/test/llamaindex_gpu/test_llamaindex.py @@ -18,7 +18,7 @@ import torch import pytest from unittest import TestCase import os -from bigdl.llm.llamaindex.llms import BigdlLLM +from ipex_llm.llamaindex.llms import BigdlLLM class Test_LlamaIndex_Transformers_API(TestCase): def setUp(self):
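The hunks above apply one mechanical namespace migration — module paths under `bigdl.llm` become `ipex_llm` — while file contents, public class names such as `BigdlLLM`, and existing "bigdl-llm" comments are left untouched. The snippet below is only a minimal sketch of what the rename means for downstream imports; the toy `torch.nn.Linear` module and the `low_bit` argument are illustrative assumptions, not taken from this patch.

```python
import torch

# Pre-rename namespace (shown for comparison only):
#   from bigdl.llm import optimize_model
# Post-rename namespace used throughout this patch:
from ipex_llm import optimize_model

# Hypothetical toy module; real usage would pass a loaded Hugging Face model instead.
model = torch.nn.Linear(8, 8)
# optimize_model is assumed to keep its pre-rename behaviour of returning a
# low-bit-optimized copy of the module it is given.
model = optimize_model(model, low_bit="sym_int4")
```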