Merge branch 'intel-analytics:main' into MargarettMao-parent_folder

commit 63a9a736be
295 changed files with 1756 additions and 600 deletions
.github/actions/llm/setup-llm-env/action.yml (2 changed lines)

@@ -19,7 +19,7 @@ runs:
 sed -i 's/"bigdl-core-xe==" + CORE_XE_VERSION + "/"bigdl-core-xe/g' python/llm/setup.py
 sed -i 's/"bigdl-core-xe-esimd==" + CORE_XE_VERSION + "/"bigdl-core-xe-esimd/g' python/llm/setup.py
 sed -i 's/"bigdl-core-xe-21==" + CORE_XE_VERSION/"bigdl-core-xe-21"/g' python/llm/setup.py
-sed -i 's/"bigdl-core-xe-esimd-21==" + CORE_XE_VERSION + "/"bigdl-core-xe-esimd-21/g' python/llm/setup.py
+sed -i 's/"bigdl-core-xe-esimd-21==" + CORE_XE_VERSION/"bigdl-core-xe-esimd-21"/g' python/llm/setup.py

 pip install requests
 if [[ ${{ runner.os }} == 'Linux' ]]; then
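As a quick sanity check of the corrected pattern, the new sed expression can be exercised against a representative line; the setup.py fragment below is assumed for illustration, not quoted from the repository:

```bash
# Feed a hypothetical line from python/llm/setup.py through the new sed expression
echo '"bigdl-core-xe-esimd-21==" + CORE_XE_VERSION,' \
  | sed 's/"bigdl-core-xe-esimd-21==" + CORE_XE_VERSION/"bigdl-core-xe-esimd-21"/g'
# expected output: "bigdl-core-xe-esimd-21",
```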
.github/workflows/llm-c-evaluation.yml (10 changed lines)

@@ -95,7 +95,7 @@ jobs:
 strategy:
 fail-fast: false
 matrix:
-python-version: ["3.9"]
+python-version: ["3.11"]
 model_name: ${{ fromJson(needs.set-matrix.outputs.model_name) }}
 precision: ${{ fromJson(needs.set-matrix.outputs.precision) }}
 device: [xpu]

@@ -193,10 +193,10 @@ jobs:
 runs-on: ubuntu-latest
 steps:
 - uses: actions/checkout@v3
-- name: Set up Python 3.9
+- name: Set up Python 3.11
 uses: actions/setup-python@v4
 with:
-python-version: 3.9
+python-version: 3.11
 - name: Install dependencies
 shell: bash
 run: |

@@ -230,10 +230,10 @@ jobs:
 runs-on: ["self-hosted", "llm", "accuracy1", "accuracy-nightly"]
 steps:
 - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # actions/checkout@v3
-- name: Set up Python 3.9
+- name: Set up Python 3.11
 uses: actions/setup-python@v4
 with:
-python-version: 3.9
+python-version: 3.11
 - name: Install dependencies
 shell: bash
 run: |
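Since the same 3.9-to-3.11 bump is repeated across many workflow files, a quick repo-wide check for leftover pins can be useful; a hedged sketch, run from the repository root:

```bash
# List any workflow lines that still pin Python 3.9 (matrix entries or setup-python inputs)
grep -rn --include='*.yml' -e 'python-version: \["3.9"\]' -e 'python-version: 3.9' .github/
```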
.github/workflows/llm-harness-evaluation.yml (12 changed lines)

@@ -105,7 +105,7 @@ jobs:
 strategy:
 fail-fast: false
 matrix:
-python-version: ["3.9"]
+python-version: ["3.11"]
 model_name: ${{ fromJson(needs.set-matrix.outputs.model_name) }}
 task: ${{ fromJson(needs.set-matrix.outputs.task) }}
 precision: ${{ fromJson(needs.set-matrix.outputs.precision) }}

@@ -189,7 +189,7 @@ jobs:
 fi

 python run_llb.py \
---model bigdl-llm \
+--model ipex-llm \
 --pretrained ${MODEL_PATH} \
 --precision ${{ matrix.precision }} \
 --device ${{ matrix.device }} \

@@ -216,10 +216,10 @@ jobs:
 runs-on: ubuntu-latest
 steps:
 - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # actions/checkout@v3
-- name: Set up Python 3.9
+- name: Set up Python 3.11
 uses: actions/setup-python@v4
 with:
-python-version: 3.9
+python-version: 3.11
 - name: Install dependencies
 shell: bash
 run: |

@@ -243,10 +243,10 @@ jobs:
 runs-on: ["self-hosted", "llm", "accuracy1", "accuracy-nightly"]
 steps:
 - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # actions/checkout@v3
-- name: Set up Python 3.9
+- name: Set up Python 3.11
 uses: actions/setup-python@v4
 with:
-python-version: 3.9
+python-version: 3.11
 - name: Install dependencies
 shell: bash
 run: |
.github/workflows/llm-nightly-test.yml (4 changed lines)

@@ -34,10 +34,10 @@ jobs:
 include:
 - os: windows
 instruction: AVX-VNNI-UT
-python-version: "3.9"
+python-version: "3.11"
 - os: ubuntu-20.04-lts
 instruction: avx512
-python-version: "3.9"
+python-version: "3.11"
 runs-on: [self-hosted, llm, "${{matrix.instruction}}", "${{matrix.os}}"]
 env:
 ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
.github/workflows/llm-ppl-evaluation.yml (10 changed lines)

@@ -104,7 +104,7 @@ jobs:
 strategy:
 fail-fast: false
 matrix:
-python-version: ["3.9"]
+python-version: ["3.11"]
 model_name: ${{ fromJson(needs.set-matrix.outputs.model_name) }}
 precision: ${{ fromJson(needs.set-matrix.outputs.precision) }}
 seq_len: ${{ fromJson(needs.set-matrix.outputs.seq_len) }}

@@ -201,10 +201,10 @@ jobs:
 runs-on: ubuntu-latest
 steps:
 - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # actions/checkout@v3
-- name: Set up Python 3.9
+- name: Set up Python 3.11
 uses: actions/setup-python@v4
 with:
-python-version: 3.9
+python-version: 3.11
 - name: Install dependencies
 shell: bash
 run: |

@@ -227,10 +227,10 @@ jobs:
 runs-on: ["self-hosted", "llm", "accuracy1", "accuracy-nightly"]
 steps:
 - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # actions/checkout@v3
-- name: Set up Python 3.9
+- name: Set up Python 3.11
 uses: actions/setup-python@v4
 with:
-python-version: 3.9
+python-version: 3.11
 - name: Install dependencies
 shell: bash
 run: |
.github/workflows/llm-whisper-evaluation.yml (6 changed lines)

@@ -81,7 +81,7 @@ jobs:
 strategy:
 fail-fast: false
 matrix:
-python-version: ["3.9"]
+python-version: ["3.11"]
 model_name: ${{ fromJson(needs.set-matrix.outputs.model_name) }}
 task: ${{ fromJson(needs.set-matrix.outputs.task) }}
 precision: ${{ fromJson(needs.set-matrix.outputs.precision) }}

@@ -158,10 +158,10 @@ jobs:
 runs-on: ["self-hosted", "llm", "perf"]
 steps:
 - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # actions/checkout@v3
-- name: Set up Python 3.9
+- name: Set up Python 3.11
 uses: actions/setup-python@v4
 with:
-python-version: 3.9
+python-version: 3.11

 - name: Set output path
 shell: bash
.github/workflows/llm_example_tests.yml (2 changed lines)

@@ -39,7 +39,7 @@ jobs:
 strategy:
 fail-fast: false
 matrix:
-python-version: ["3.9"]
+python-version: ["3.11"]
 instruction: ["AVX512"]
 runs-on: [ self-hosted, llm,"${{matrix.instruction}}", ubuntu-20.04-lts ]
 env:
.github/workflows/llm_performance_tests.yml (12 changed lines)

@@ -33,7 +33,7 @@ jobs:
 strategy:
 fail-fast: false
 matrix:
-python-version: ["3.9"]
+python-version: ["3.11"]
 runs-on: [self-hosted, llm, perf]
 env:
 OMP_NUM_THREADS: 16

@@ -163,7 +163,7 @@ jobs:
 strategy:
 fail-fast: false
 matrix:
-python-version: ["3.9"]
+python-version: ["3.11"]
 runs-on: [self-hosted, llm, spr-perf]
 env:
 OMP_NUM_THREADS: 16

@@ -238,10 +238,10 @@ jobs:
 include:
 - os: windows
 platform: dp
-python-version: "3.9"
+python-version: "3.11"
 # - os: windows
 # platform: lp
-# python-version: "3.9"
+# python-version: "3.11"
 runs-on: [self-hosted, "${{ matrix.os }}", llm, perf-core, "${{ matrix.platform }}"]
 env:
 ANALYTICS_ZOO_ROOT: ${{ github.workspace }}

@@ -309,7 +309,7 @@ jobs:
 matrix:
 include:
 - os: windows
-python-version: "3.9"
+python-version: "3.11"
 runs-on: [self-hosted, "${{ matrix.os }}", llm, perf-igpu]
 env:
 ANALYTICS_ZOO_ROOT: ${{ github.workspace }}

@@ -380,7 +380,7 @@ jobs:
 - name: Create env for html generation
 shell: cmd
 run: |
-call conda create -n html-gen python=3.9 -y
+call conda create -n html-gen python=3.11 -y
 call conda activate html-gen

 pip install pandas==1.5.3
(additional workflow file — path not captured in this view)

@@ -30,7 +30,7 @@ jobs:
 strategy:
 fail-fast: false
 matrix:
-python-version: ["3.9"]
+python-version: ["3.11"]
 runs-on: [self-hosted, llm, perf]
 env:
 OMP_NUM_THREADS: 16

@@ -154,7 +154,7 @@ jobs:
 strategy:
 fail-fast: false
 matrix:
-python-version: ["3.9"]
+python-version: ["3.11"]
 runs-on: [self-hosted, llm, perf]
 env:
 OMP_NUM_THREADS: 16
(additional workflow file — path not captured in this view)

@@ -29,7 +29,7 @@ jobs:
 strategy:
 fail-fast: false
 matrix:
-python-version: ["3.9"]
+python-version: ["3.11"]
 runs-on: [self-hosted, llm, spr01-perf]
 env:
 OMP_NUM_THREADS: 16

@@ -87,7 +87,7 @@ jobs:
 strategy:
 fail-fast: false
 matrix:
-python-version: ["3.9"]
+python-version: ["3.11"]
 runs-on: [self-hosted, llm, spr01-perf]
 env:
 OMP_NUM_THREADS: 16
.github/workflows/llm_unit_tests.yml (4 changed lines)

@@ -51,7 +51,7 @@ jobs:
 if [ ${{ github.event_name }} == 'schedule' ]; then
 python_version='["3.9", "3.10", "3.11"]'
 else
-python_version='["3.9"]'
+python_version='["3.11"]'
 fi
 list=$(echo ${python_version} | jq -c)
 echo "python-version=${list}" >> "$GITHUB_OUTPUT"

@@ -224,6 +224,7 @@ jobs:
 run: |
 pip install llama-index-readers-file llama-index-vector-stores-postgres llama-index-embeddings-huggingface
 pip install transformers==4.31.0
+pip install "pydantic>=2.0.0"
 bash python/llm/test/run-llm-llamaindex-tests.sh
 llm-unit-test-on-arc:
 needs: [setup-python-version, llm-cpp-build]

@@ -398,4 +399,5 @@ jobs:
 pip install --pre --upgrade ipex-llm[xpu_2.0] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
 source /home/arda/intel/oneapi/setvars.sh
 fi
+pip install "pydantic>=2.0.0"
 bash python/llm/test/run-llm-llamaindex-tests-gpu.sh
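The python_version selection shown in the first hunk above can be dry-run outside of Actions; the event name is hard-coded here purely for illustration:

```bash
# Reproduce the python-version matrix output locally (requires jq)
event_name="push"                       # assumed value; in CI this is ${{ github.event_name }}
if [ "${event_name}" == "schedule" ]; then
  python_version='["3.9", "3.10", "3.11"]'
else
  python_version='["3.11"]'
fi
list=$(echo ${python_version} | jq -c)
echo "python-version=${list}"           # in CI this line is appended to $GITHUB_OUTPUT
```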
(file path not captured in this view — content appears to be the project README)

@@ -7,7 +7,7 @@
 **`IPEX-LLM`** is a PyTorch library for running **LLM** on Intel CPU and GPU *(e.g., local PC with iGPU, discrete GPU such as Arc, Flex and Max)* with very low latency[^1].
 > [!NOTE]
 > - *It is built on top of **Intel Extension for PyTorch** (**`IPEX`**), as well as the excellent work of **`llama.cpp`**, **`bitsandbytes`**, **`vLLM`**, **`qlora`**, **`AutoGPTQ`**, **`AutoAWQ`**, etc.*
-> - *It provides seamless integration with [llama.cpp](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/llama_cpp_quickstart.html), [Text-Generation-WebUI](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/webui_quickstart.html), [HuggingFace transformers](python/llm/example/GPU/HF-Transformers-AutoModels), [HuggingFace PEFT](python/llm/example/GPU/LLM-Finetuning), [LangChain](python/llm/example/GPU/LangChain), [LlamaIndex](python/llm/example/GPU/LlamaIndex), [DeepSpeed-AutoTP](python/llm/example/GPU/Deepspeed-AutoTP), [vLLM](python/llm/example/GPU/vLLM-Serving), [FastChat](python/llm/src/ipex_llm/serving/fastchat), [HuggingFace TRL](python/llm/example/GPU/LLM-Finetuning/DPO), [AutoGen](python/llm/example/CPU/Applications/autogen), [ModeScope](python/llm/example/GPU/ModelScope-Models), etc.*
+> - *It provides seamless integration with [llama.cpp](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/llama_cpp_quickstart.html), [ollama](https://ipex-llm.readthedocs.io/en/main/doc/LLM/Quickstart/ollama_quickstart.html), [Text-Generation-WebUI](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/webui_quickstart.html), [HuggingFace transformers](python/llm/example/GPU/HF-Transformers-AutoModels), [HuggingFace PEFT](python/llm/example/GPU/LLM-Finetuning), [LangChain](python/llm/example/GPU/LangChain), [LlamaIndex](python/llm/example/GPU/LlamaIndex), [DeepSpeed-AutoTP](python/llm/example/GPU/Deepspeed-AutoTP), [vLLM](python/llm/example/GPU/vLLM-Serving), [FastChat](python/llm/src/ipex_llm/serving/fastchat), [HuggingFace TRL](python/llm/example/GPU/LLM-Finetuning/DPO), [AutoGen](python/llm/example/CPU/Applications/autogen), [ModeScope](python/llm/example/GPU/ModelScope-Models), etc.*
 > - ***50+ models** have been optimized/verified on `ipex-llm` (including LLaMA2, Mistral, Mixtral, Gemma, LLaVA, Whisper, ChatGLM, Baichuan, Qwen, RWKV, and more); see the complete list [here](#verified-models).*

 ## `ipex-llm` Demo

@@ -48,9 +48,10 @@ See the demo of running [*Text-Generation-WebUI*](https://ipex-llm.readthedocs.i
 </table>

 ## Latest Update 🔥
+- [2024/04] `ipex-llm` now provides C++ interface, which can be used as an accelerated backend for running [llama.cpp](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/llama_cpp_quickstart.html) and [ollama](https://ipex-llm.readthedocs.io/en/main/doc/LLM/Quickstart/ollama_quickstart.html) on Intel GPU.
 - [2024/03] `bigdl-llm` has now become `ipex-llm` (see the migration guide [here](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/bigdl_llm_migration.html)); you may find the original `BigDL` project [here](https://github.com/intel-analytics/bigdl-2.x).
 - [2024/02] `ipex-llm` now supports directly loading model from [ModelScope](python/llm/example/GPU/ModelScope-Models) ([魔搭](python/llm/example/CPU/ModelScope-Models)).
-- [2024/02] `ipex-llm` added inital **INT2** support (based on llama.cpp [IQ2](python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2) mechanism), which makes it possible to run large-size LLM (e.g., Mixtral-8x7B) on Intel GPU with 16GB VRAM.
+- [2024/02] `ipex-llm` added initial **INT2** support (based on llama.cpp [IQ2](python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2) mechanism), which makes it possible to run large-size LLM (e.g., Mixtral-8x7B) on Intel GPU with 16GB VRAM.
 - [2024/02] Users can now use `ipex-llm` through [Text-Generation-WebUI](https://github.com/intel-analytics/text-generation-webui) GUI.
 - [2024/02] `ipex-llm` now supports *[Self-Speculative Decoding](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Inference/Self_Speculative_Decoding.html)*, which in practice brings **~30% speedup** for FP16 and BF16 inference latency on Intel [GPU](python/llm/example/GPU/Speculative-Decoding) and [CPU](python/llm/example/CPU/Speculative-Decoding) respectively.
 - [2024/02] `ipex-llm` now supports a comprehensive list of LLM **finetuning** on Intel GPU (including [LoRA](python/llm/example/GPU/LLM-Finetuning/LoRA), [QLoRA](python/llm/example/GPU/LLM-Finetuning/QLoRA), [DPO](python/llm/example/GPU/LLM-Finetuning/DPO), [QA-LoRA](python/llm/example/GPU/LLM-Finetuning/QA-LoRA) and [ReLoRA](python/llm/example/GPU/LLM-Finetuning/ReLora)).

@@ -81,7 +82,8 @@ See the demo of running [*Text-Generation-WebUI*](https://ipex-llm.readthedocs.i
 - *For more details, please refer to the [installation guide](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Overview/install.html)*

 ### Run `ipex-llm`
-- [llama.cpp](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/llama_cpp_quickstart.html): running **ipex-llm for llama.cpp** (*using C++ interface of `ipex-llm` as an accelerated backend for `llama.cpp` on Intel GPU*)
+- [llama.cpp](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/llama_cpp_quickstart.html): running **llama.cpp** (*using C++ interface of `ipex-llm` as an accelerated backend for `llama.cpp`*) on Intel GPU
+- [ollama](https://ipex-llm.readthedocs.io/en/main/doc/LLM/Quickstart/ollama_quickstart.html): running **ollama** (*using C++ interface of `ipex-llm` as an accelerated backend for `ollama`*) on Intel GPU
 - [vLLM](python/llm/example/GPU/vLLM-Serving): running `ipex-llm` in `vLLM` on both Intel [GPU](python/llm/example/GPU/vLLM-Serving) and [CPU](python/llm/example/CPU/vLLM-Serving)
 - [FastChat](python/llm/src/ipex_llm/serving/fastchat): running `ipex-llm` in `FastChat` serving on on both Intel GPU and CPU
 - [LangChain-Chatchat RAG](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/chatchat_quickstart.html): running `ipex-llm` in `LangChain-Chatchat` (*Knowledge Base QA using **RAG** pipeline*)
(Dockerfile — path not captured in this view)

@@ -21,7 +21,7 @@ RUN echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] h
 RUN mkdir /ipex_llm/data && mkdir /ipex_llm/model && \
 # install pytorch 2.0.1
 apt-get update && \
-apt-get install -y python3-pip python3.9-dev python3-wheel git software-properties-common && \
+apt-get install -y python3-pip python3.11-dev python3-wheel git software-properties-common && \
 pip3 install --upgrade pip && \
 export PIP_DEFAULT_TIMEOUT=100 && \
 pip install --upgrade torch==2.1.0 --index-url https://download.pytorch.org/whl/cpu && \

@@ -37,9 +37,9 @@ RUN mkdir /ipex_llm/data && mkdir /ipex_llm/model && \
 pip install -r /ipex_llm/requirements.txt && \
 # install python
 add-apt-repository ppa:deadsnakes/ppa -y && \
-apt-get install -y python3.9 && \
+apt-get install -y python3.11 && \
 rm /usr/bin/python3 && \
-ln -s /usr/bin/python3.9 /usr/bin/python3 && \
+ln -s /usr/bin/python3.11 /usr/bin/python3 && \
 ln -s /usr/bin/python3 /usr/bin/python && \
 pip install --no-cache requests argparse cryptography==3.3.2 urllib3 && \
 pip install --upgrade requests && \
(Dockerfile — path not captured in this view)

@@ -21,7 +21,7 @@ RUN echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] h
 RUN mkdir -p /ipex_llm/data && mkdir -p /ipex_llm/model && \
 # install pytorch 2.1.0
 apt-get update && \
-apt-get install -y --no-install-recommends python3-pip python3.9-dev python3-wheel python3.9-distutils git software-properties-common && \
+apt-get install -y --no-install-recommends python3-pip python3.11-dev python3-wheel python3.11-distutils git software-properties-common && \
 apt-get clean && \
 rm -rf /var/lib/apt/lists/* && \
 pip3 install --upgrade pip && \
(Dockerfile — path not captured in this view)

@@ -22,7 +22,7 @@ RUN echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] h
 RUN mkdir -p /ipex_llm/data && mkdir -p /ipex_llm/model && \
 apt-get update && \
 apt install -y --no-install-recommends openssh-server openssh-client libcap2-bin gnupg2 ca-certificates \
-python3-pip python3.9-dev python3-wheel python3.9-distutils git software-properties-common && \
+python3-pip python3.11-dev python3-wheel python3.11-distutils git software-properties-common && \
 apt-get clean && \
 rm -rf /var/lib/apt/lists/* && \
 mkdir -p /var/run/sshd && \
(Dockerfile — path not captured in this view)

@@ -18,15 +18,15 @@ RUN curl -fsSL https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-P
 apt-get install -y curl wget git gnupg gpg-agent software-properties-common libunwind8-dev vim less && \
 # install Intel GPU driver
 apt-get install -y intel-opencl-icd intel-level-zero-gpu=1.3.26241.33-647~22.04 level-zero level-zero-dev --allow-downgrades && \
-# install python 3.9
+# install python 3.11
 ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone && \
 env DEBIAN_FRONTEND=noninteractive apt-get update && \
 add-apt-repository ppa:deadsnakes/ppa -y && \
-apt-get install -y python3.9 && \
+apt-get install -y python3.11 && \
 rm /usr/bin/python3 && \
-ln -s /usr/bin/python3.9 /usr/bin/python3 && \
+ln -s /usr/bin/python3.11 /usr/bin/python3 && \
 ln -s /usr/bin/python3 /usr/bin/python && \
-apt-get install -y python3-pip python3.9-dev python3-wheel python3.9-distutils && \
+apt-get install -y python3-pip python3.11-dev python3-wheel python3.11-distutils && \
 curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
 # install XPU ipex-llm
 pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ && \
(Dockerfile — path not captured in this view)

@@ -9,22 +9,30 @@ ENV PYTHONUNBUFFERED=1

 COPY ./start-notebook.sh /llm/start-notebook.sh

-# Install PYTHON 3.9
+# Update the software sources
 RUN env DEBIAN_FRONTEND=noninteractive apt-get update && \
+# Install essential packages
 apt install software-properties-common libunwind8-dev vim less -y && \
+# Install git, curl, and wget
+apt-get install -y git curl wget && \
+# Install Python 3.11
+# Add Python 3.11 PPA repository
 add-apt-repository ppa:deadsnakes/ppa -y && \
-apt-get install -y python3.9 git curl wget && \
+# Install Python 3.11
+apt-get install -y python3.11 && \
+# Remove the original /usr/bin/python3 symbolic link
 rm /usr/bin/python3 && \
-ln -s /usr/bin/python3.9 /usr/bin/python3 && \
+# Create a symbolic link pointing to Python 3.11 at /usr/bin/python3
+ln -s /usr/bin/python3.11 /usr/bin/python3 && \
+# Create a symbolic link pointing to /usr/bin/python3 at /usr/bin/python
 ln -s /usr/bin/python3 /usr/bin/python && \
-apt-get install -y python3-pip python3.9-dev python3-wheel python3.9-distutils && \
+# Install Python 3.11 development and utility packages
+apt-get install -y python3-pip python3.11-dev python3-wheel python3.11-distutils && \
+# Download and install pip, install FastChat from source requires PEP 660 support
 curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
-# Install FastChat from source requires PEP 660 support
 python3 get-pip.py && \
 rm get-pip.py && \
 pip install --upgrade requests argparse urllib3 && \
-pip3 install --no-cache-dir --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu && \
-pip install --pre --upgrade ipex-llm[all] && \
 # Download ipex-llm-tutorial
 cd /llm && \
 pip install --upgrade jupyterlab && \
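The same Python 3.9-to-3.11 swap recurs across these Dockerfiles; condensed into a standalone shell sketch (Ubuntu base image and the deadsnakes PPA assumed), the pattern is:

```bash
# Install Python 3.11 from the deadsnakes PPA and make it the default python3/python
add-apt-repository ppa:deadsnakes/ppa -y
apt-get update
apt-get install -y python3.11 python3-pip python3.11-dev python3-wheel python3.11-distutils
rm /usr/bin/python3
ln -s /usr/bin/python3.11 /usr/bin/python3
ln -s /usr/bin/python3 /usr/bin/python
# pip is bootstrapped separately because installing FastChat from source requires PEP 660 support
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py && rm get-pip.py
```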
(Dockerfile — path not captured in this view)

@@ -20,16 +20,16 @@ RUN curl -fsSL https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-P
 wget -qO - https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg && \
 echo 'deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc' | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
 rm /etc/apt/sources.list.d/intel-graphics.list && \
-# Install PYTHON 3.9 and IPEX-LLM[xpu]
+# Install PYTHON 3.11 and IPEX-LLM[xpu]
 ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone && \
 env DEBIAN_FRONTEND=noninteractive apt-get update && \
 apt install software-properties-common libunwind8-dev vim less -y && \
 add-apt-repository ppa:deadsnakes/ppa -y && \
-apt-get install -y python3.9 git curl wget && \
+apt-get install -y python3.11 git curl wget && \
 rm /usr/bin/python3 && \
-ln -s /usr/bin/python3.9 /usr/bin/python3 && \
+ln -s /usr/bin/python3.11 /usr/bin/python3 && \
 ln -s /usr/bin/python3 /usr/bin/python && \
-apt-get install -y python3-pip python3.9-dev python3-wheel python3.9-distutils && \
+apt-get install -y python3-pip python3.11-dev python3-wheel python3.11-distutils && \
 curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
 # Install FastChat from source requires PEP 660 support
 python3 get-pip.py && \
(Dockerfile — path not captured in this view)

@@ -16,7 +16,7 @@ RUN cd /llm && \
 # Fix Trivy CVE Issues
 pip install Jinja2==3.1.3 transformers==4.36.2 gradio==4.19.2 cryptography==42.0.4 && \
 # Fix Qwen model adpater in fastchat
-patch /usr/local/lib/python3.9/dist-packages/fastchat/model/model_adapter.py < /llm/model_adapter.py.patch && \
+patch /usr/local/lib/python3.11/dist-packages/fastchat/model/model_adapter.py < /llm/model_adapter.py.patch && \
 chmod +x /opt/entrypoint.sh && \
 chmod +x /sbin/tini && \
 cp /sbin/tini /usr/bin/tini
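Hard-coding the dist-packages path means this patch step must be edited on every Python bump; a hedged alternative (assuming `fastchat` is already importable inside the image) resolves the path at build time instead:

```bash
# Locate the installed fastchat package instead of hard-coding /usr/local/lib/python3.11/...
FASTCHAT_DIR=$(python3 -c "import fastchat, os; print(os.path.dirname(fastchat.__file__))")
patch "${FASTCHAT_DIR}/model/model_adapter.py" < /llm/model_adapter.py.patch
```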
(documentation landing page — path not captured in this view)

@@ -29,13 +29,16 @@
 <a href="doc/LLM/Quickstart/docker_windows_gpu.html">Install IPEX-LLM in Docker on Windows with Intel GPU</a>
 </li>
 <li>
-<a href="doc/LLM/Quickstart/chatchat_quickstart.html">Run Langchain-Chatchat (RAG Application) on Intel GPU</a>
+<a href="doc/LLM/Quickstart/chatchat_quickstart.html">Run Local RAG using Langchain-Chatchat on Intel GPU</a>
 </li>
 <li>
 <a href="doc/LLM/Quickstart/webui_quickstart.html">Run Text Generation WebUI on Intel GPU</a>
 </li>
 <li>
-<a href="doc/LLM/Quickstart/continue_quickstart.html">Run Code Copilot (Continue) in VSCode with Intel GPU</a>
+<a href="doc/LLM/Quickstart/continue_quickstart.html">Run Coding Copilot (Continue) in VSCode with Intel GPU</a>
+</li>
+<li>
+<a href="doc/LLM/Quickstart/open_webui_with_ollama_quickstart.html">Run Open WebUI with IPEX-LLM on Intel GPU</a>
 </li>
 <li>
 <a href="doc/LLM/Quickstart/benchmark_quickstart.html">Run Performance Benchmarking with IPEX-LLM</a>
(documentation table of contents — path not captured in this view)

@@ -25,6 +25,7 @@ subtrees:
 - file: doc/LLM/Quickstart/docker_windows_gpu
 - file: doc/LLM/Quickstart/chatchat_quickstart
 - file: doc/LLM/Quickstart/webui_quickstart
+- file: doc/LLM/Quickstart/open_webui_with_ollama_quickstart
 - file: doc/LLM/Quickstart/continue_quickstart
 - file: doc/LLM/Quickstart/benchmark_quickstart
 - file: doc/LLM/Quickstart/llama_cpp_quickstart
(CPU installation guide — path not captured in this view)

@@ -17,7 +17,7 @@ Please refer to [Environment Setup](#environment-setup) for more information.

 .. important::

-``ipex-llm`` is tested with Python 3.9, 3.10 and 3.11; Python 3.9 is recommended for best practices.
+``ipex-llm`` is tested with Python 3.9, 3.10 and 3.11; Python 3.11 is recommended for best practices.
 ```

 ## Recommended Requirements

@@ -39,10 +39,10 @@ Here list the recommended hardware and OS for smooth IPEX-LLM optimization exper

 For optimal performance with LLM models using IPEX-LLM optimizations on Intel CPUs, here are some best practices for setting up environment:

-First we recommend using [Conda](https://docs.conda.io/en/latest/miniconda.html) to create a python 3.9 enviroment:
+First we recommend using [Conda](https://docs.conda.io/en/latest/miniconda.html) to create a python 3.11 enviroment:

 ```bash
-conda create -n llm python=3.9
+conda create -n llm python=3.11
 conda activate llm

 pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
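As a hedged follow-up to the snippet above, a minimal post-install check (environment and package names taken from the quoted docs) is:

```bash
# Verify that the new env really resolves to Python 3.11 and that ipex-llm installed against it
conda activate llm
python --version        # expect Python 3.11.x
pip show ipex-llm       # prints the installed ipex-llm version and location
```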
(GPU installation guide — path not captured in this view)

@@ -22,10 +22,10 @@ To apply Intel GPU acceleration, there're several prerequisite steps for tools i

 * Step 4: Install Intel® oneAPI Base Toolkit 2024.0:

-First, Create a Python 3.9 enviroment and activate it. In Anaconda Prompt:
+First, Create a Python 3.11 enviroment and activate it. In Anaconda Prompt:

 ```cmd
-conda create -n llm python=3.9 libuv
+conda create -n llm python=3.11 libuv

 conda activate llm
 ```

@@ -33,7 +33,7 @@ To apply Intel GPU acceleration, there're several prerequisite steps for tools i
 ```eval_rst
 .. important::

-``ipex-llm`` is tested with Python 3.9, 3.10 and 3.11. Python 3.9 is recommended for best practices.
+``ipex-llm`` is tested with Python 3.9, 3.10 and 3.11. Python 3.11 is recommended for best practices.
 ```

 Then, use `pip` to install the Intel oneAPI Base Toolkit 2024.0:

@@ -93,17 +93,17 @@ If you encounter network issues when installing IPEX, you can also install IPEX-
 Download the wheels on Windows system:

 ```
-wget https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/torch-2.1.0a0%2Bcxx11.abi-cp39-cp39-win_amd64.whl
-wget https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/torchvision-0.16.0a0%2Bcxx11.abi-cp39-cp39-win_amd64.whl
-wget https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/intel_extension_for_pytorch-2.1.10%2Bxpu-cp39-cp39-win_amd64.whl
+wget https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/torch-2.1.0a0%2Bcxx11.abi-cp311-cp311-win_amd64.whl
+wget https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/torchvision-0.16.0a0%2Bcxx11.abi-cp311-cp311-win_amd64.whl
+wget https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/intel_extension_for_pytorch-2.1.10%2Bxpu-cp311-cp311-win_amd64.whl
 ```

 You may install dependencies directly from the wheel archives and then install `ipex-llm` using following commands:

 ```
-pip install torch-2.1.0a0+cxx11.abi-cp39-cp39-win_amd64.whl
-pip install torchvision-0.16.0a0+cxx11.abi-cp39-cp39-win_amd64.whl
-pip install intel_extension_for_pytorch-2.1.10+xpu-cp39-cp39-win_amd64.whl
+pip install torch-2.1.0a0+cxx11.abi-cp311-cp311-win_amd64.whl
+pip install torchvision-0.16.0a0+cxx11.abi-cp311-cp311-win_amd64.whl
+pip install intel_extension_for_pytorch-2.1.10+xpu-cp311-cp311-win_amd64.whl

 pip install --pre --upgrade ipex-llm[xpu]
 ```

@@ -111,7 +111,7 @@ pip install --pre --upgrade ipex-llm[xpu]
 ```eval_rst
 .. note::

-All the wheel packages mentioned here are for Python 3.9. If you would like to use Python 3.10 or 3.11, you should modify the wheel names for ``torch``, ``torchvision``, and ``intel_extension_for_pytorch`` by replacing ``cp39`` with ``cp310`` or ``cp311``, respectively.
+All the wheel packages mentioned here are for Python 3.11. If you would like to use Python 3.9 or 3.10, you should modify the wheel names for ``torch``, ``torchvision``, and ``intel_extension_for_pytorch`` by replacing ``cp311`` with ``cp39`` or ``cp310``, respectively.
 ```

 ### Runtime Configuration

@@ -164,7 +164,7 @@ If you met error when importing `intel_extension_for_pytorch`, please ensure tha

 * Ensure that `libuv` is installed in your conda environment. This can be done during the creation of the environment with the command:
 ```cmd
-conda create -n llm python=3.9 libuv
+conda create -n llm python=3.11 libuv
 ```
 If you missed `libuv`, you can add it to your existing environment through
 ```cmd

@@ -399,12 +399,12 @@ IPEX-LLM GPU support on Linux has been verified on:
 ### Install IPEX-LLM
 #### Install IPEX-LLM From PyPI

-We recommend using [miniconda](https://docs.conda.io/en/latest/miniconda.html) to create a python 3.9 enviroment:
+We recommend using [miniconda](https://docs.conda.io/en/latest/miniconda.html) to create a python 3.11 enviroment:

 ```eval_rst
 .. important::

-``ipex-llm`` is tested with Python 3.9, 3.10 and 3.11. Python 3.9 is recommended for best practices.
+``ipex-llm`` is tested with Python 3.9, 3.10 and 3.11. Python 3.11 is recommended for best practices.
 ```

 ```eval_rst

@@ -422,7 +422,7 @@ We recommend using [miniconda](https://docs.conda.io/en/latest/miniconda.html) t

 .. code-block:: bash

-conda create -n llm python=3.9
+conda create -n llm python=3.11
 conda activate llm

 pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/

@@ -439,7 +439,7 @@ We recommend using [miniconda](https://docs.conda.io/en/latest/miniconda.html) t

 .. code-block:: bash

-conda create -n llm python=3.9
+conda create -n llm python=3.11
 conda activate llm

 pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/

@@ -461,7 +461,7 @@ We recommend using [miniconda](https://docs.conda.io/en/latest/miniconda.html) t

 .. code-block:: bash

-conda create -n llm python=3.9
+conda create -n llm python=3.11
 conda activate llm

 pip install --pre --upgrade ipex-llm[xpu_2.0] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/

@@ -470,7 +470,7 @@ We recommend using [miniconda](https://docs.conda.io/en/latest/miniconda.html) t

 .. code-block:: bash

-conda create -n llm python=3.9
+conda create -n llm python=3.11
 conda activate llm

 pip install --pre --upgrade ipex-llm[xpu_2.0] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/

@@ -488,18 +488,18 @@ If you encounter network issues when installing IPEX, you can also install IPEX-
 .. code-block:: bash

 # get the wheels on Linux system for IPEX 2.1.10+xpu
-wget https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/torch-2.1.0a0%2Bcxx11.abi-cp39-cp39-linux_x86_64.whl
-wget https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/torchvision-0.16.0a0%2Bcxx11.abi-cp39-cp39-linux_x86_64.whl
-wget https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/intel_extension_for_pytorch-2.1.10%2Bxpu-cp39-cp39-linux_x86_64.whl
+wget https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/torch-2.1.0a0%2Bcxx11.abi-cp311-cp311-linux_x86_64.whl
+wget https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/torchvision-0.16.0a0%2Bcxx11.abi-cp311-cp311-linux_x86_64.whl
+wget https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/intel_extension_for_pytorch-2.1.10%2Bxpu-cp311-cp311-linux_x86_64.whl

 Then you may install directly from the wheel archives using following commands:

 .. code-block:: bash

 # install the packages from the wheels
-pip install torch-2.1.0a0+cxx11.abi-cp39-cp39-linux_x86_64.whl
-pip install torchvision-0.16.0a0+cxx11.abi-cp39-cp39-linux_x86_64.whl
-pip install intel_extension_for_pytorch-2.1.10+xpu-cp39-cp39-linux_x86_64.whl
+pip install torch-2.1.0a0+cxx11.abi-cp311-cp311-linux_x86_64.whl
+pip install torchvision-0.16.0a0+cxx11.abi-cp311-cp311-linux_x86_64.whl
+pip install intel_extension_for_pytorch-2.1.10+xpu-cp311-cp311-linux_x86_64.whl

 # install ipex-llm for Intel GPU
 pip install --pre --upgrade ipex-llm[xpu]

@@ -509,18 +509,18 @@ If you encounter network issues when installing IPEX, you can also install IPEX-
 .. code-block:: bash

 # get the wheels on Linux system for IPEX 2.0.110+xpu
-wget https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/torch-2.0.1a0%2Bcxx11.abi-cp39-cp39-linux_x86_64.whl
-wget https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/torchvision-0.15.2a0%2Bcxx11.abi-cp39-cp39-linux_x86_64.whl
-wget https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/intel_extension_for_pytorch-2.0.110%2Bxpu-cp39-cp39-linux_x86_64.whl
+wget https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/torch-2.0.1a0%2Bcxx11.abi-cp311-cp311-linux_x86_64.whl
+wget https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/torchvision-0.15.2a0%2Bcxx11.abi-cp311-cp311-linux_x86_64.whl
+wget https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/intel_extension_for_pytorch-2.0.110%2Bxpu-cp311-cp311-linux_x86_64.whl

 Then you may install directly from the wheel archives using following commands:

 .. code-block:: bash

 # install the packages from the wheels
-pip install torch-2.0.1a0+cxx11.abi-cp39-cp39-linux_x86_64.whl
-pip install torchvision-0.15.2a0+cxx11.abi-cp39-cp39-linux_x86_64.whl
-pip install intel_extension_for_pytorch-2.0.110+xpu-cp39-cp39-linux_x86_64.whl
+pip install torch-2.0.1a0+cxx11.abi-cp311-cp311-linux_x86_64.whl
+pip install torchvision-0.15.2a0+cxx11.abi-cp311-cp311-linux_x86_64.whl
+pip install intel_extension_for_pytorch-2.0.110+xpu-cp311-cp311-linux_x86_64.whl

 # install ipex-llm for Intel GPU
 pip install --pre --upgrade ipex-llm[xpu_2.0]

@@ -530,7 +530,7 @@ If you encounter network issues when installing IPEX, you can also install IPEX-
 ```eval_rst
 .. note::

-All the wheel packages mentioned here are for Python 3.9. If you would like to use Python 3.10 or 3.11, you should modify the wheel names for ``torch``, ``torchvision``, and ``intel_extension_for_pytorch`` by replacing ``cp39`` with ``cp310`` or ``cp311``, respectively.
+All the wheel packages mentioned here are for Python 3.11. If you would like to use Python 3.9 or 3.10, you should modify the wheel names for ``torch``, ``torchvision``, and ``intel_extension_for_pytorch`` by replacing ``cp311`` with ``cp39`` or ``cp310``, respectively.
 ```

 ### Runtime Configuration
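The note above asks readers to rename the wheels by hand for other Python versions; a small hedged helper (wheel file names taken from the quoted commands) performs the tag swap mechanically:

```bash
# Rewrite the cp311 ABI tags for a Python 3.10 install; adjust TARGET as needed
TARGET=cp310
for whl in \
    torch-2.1.0a0+cxx11.abi-cp311-cp311-linux_x86_64.whl \
    torchvision-0.16.0a0+cxx11.abi-cp311-cp311-linux_x86_64.whl \
    intel_extension_for_pytorch-2.1.10+xpu-cp311-cp311-linux_x86_64.whl; do
  echo "${whl//cp311/${TARGET}}"
done
```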
(Langchain-Chatchat quickstart — path not captured in this view)

@@ -6,8 +6,8 @@

 <table border="1" width="100%">
 <tr>
-<td align="center">English</td>
-<td align="center">简体中文</td>
+<td align="center" width="50%">English</td>
+<td align="center" width="50%">简体中文</td>
 </tr>
 <tr>
 <td><video src="https://llm-assets.readthedocs.io/en/latest/_images/langchain-chatchat-en.mp4" width="100%" controls></video></td>

@@ -33,7 +33,7 @@ See the Langchain-Chatchat architecture below ([source](https://github.com/chatc
 Follow the guide that corresponds to your specific system and GPU type from the links provided below:

 - For systems with Intel Core Ultra integrated GPU: [Windows Guide](https://github.com/intel-analytics/Langchain-Chatchat/blob/ipex-llm/INSTALL_win_mtl.md#)
-- For systems with Intel Arc A-Series GPU: [Windows Guide](https://github.com/intel-analytics/Langchain-Chatchat/blob/ipex-llm/INSTALL_windows_arc.md#) | [Linux Guide](https://github.com/intel-analytics/Langchain-Chatchat/blob/ipex-llm/INSTALL_linux_arc.md#)
+- For systems with Intel Arc A-Series GPU: [Windows Guide](https://github.com/intel-analytics/Langchain-Chatchat/blob/ipex-llm/INSTALL_win_arc.md#) | [Linux Guide](https://github.com/intel-analytics/Langchain-Chatchat/blob/ipex-llm/INSTALL_linux_arc.md#)
 - For systems with Intel Data Center Max Series GPU: [Linux Guide](https://github.com/intel-analytics/Langchain-Chatchat/blob/ipex-llm/INSTALL_linux_max.md#)
|
|
@ -1,8 +1,9 @@
|
||||||
|
|
||||||
# Run Code Copilot on Windows with Intel GPU
|
# Run Coding Copilot on Windows with Intel GPU
|
||||||
|
|
||||||
[**Continue**](https://marketplace.visualstudio.com/items?itemName=Continue.continue) is a coding copilot extension in [Microsoft Visual Studio Code](https://code.visualstudio.com/); by porting it to [`ipex-llm`](https://github.com/intel-analytics/ipex-llm), users can now easily leverage local llms running on Intel GPU (e.g., local PC with iGPU, discrete GPU such as Arc, Flex and Max) for code explanation, code generation/completion; see the demos of using Continue with [Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) running on Intel A770 GPU below.
|
[**Continue**](https://marketplace.visualstudio.com/items?itemName=Continue.continue) is a coding copilot extension in [Microsoft Visual Studio Code](https://code.visualstudio.com/); by porting it to [`ipex-llm`](https://github.com/intel-analytics/ipex-llm), users can now easily leverage local LLMs running on Intel GPU (e.g., local PC with iGPU, discrete GPU such as Arc, Flex and Max) for code explanation, code generation/completion, etc.
|
||||||
|
|
||||||
|
See the demos of using Continue with [Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) running on Intel A770 GPU below.
|
||||||
|
|
||||||
<table border="1" width="100%">
|
<table border="1" width="100%">
|
||||||
<tr>
|
<tr>
|
||||||
|
|
@ -27,7 +28,7 @@ This guide walks you through setting up and running **Continue** within _Visual
|
||||||
|
|
||||||
Visit [Run Text Generation WebUI Quickstart Guide](webui_quickstart.html), and follow the steps 1) [Install IPEX-LLM](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/webui_quickstart.html#install-ipex-llm), 2) [Install WebUI](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/webui_quickstart.html#install-the-webui) and 3) [Start the Server](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/webui_quickstart.html#start-the-webui-server) to install and start the Text Generation WebUI API Service. **Please pay attention to below items during installation:**
|
Visit [Run Text Generation WebUI Quickstart Guide](webui_quickstart.html), and follow the steps 1) [Install IPEX-LLM](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/webui_quickstart.html#install-ipex-llm), 2) [Install WebUI](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/webui_quickstart.html#install-the-webui) and 3) [Start the Server](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/webui_quickstart.html#start-the-webui-server) to install and start the Text Generation WebUI API Service. **Please pay attention to below items during installation:**
|
||||||
|
|
||||||
- The Text Generation WebUI API service requires Python version 3.10 or higher. But [IPEX-LLM installation instructions](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/webui_quickstart.html#install-ipex-llm) used ``python=3.9`` as default for creating the conda environment. We recommend changing it to ``3.11``, using below command:
|
- The Text Generation WebUI API service requires Python version 3.10 or higher. We recommend using Python 3.11, as shown below:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.11 libuv
|
conda create -n llm python=3.11 libuv
|
||||||
```
|
```
|
||||||
|
|
|
||||||
|
|
@ -13,9 +13,10 @@ This section includes efficient guide to show you how to:
|
||||||
* `Install IPEX-LLM on Windows with Intel GPU <./install_windows_gpu.html>`_
|
* `Install IPEX-LLM on Windows with Intel GPU <./install_windows_gpu.html>`_
|
||||||
* `Install IPEX-LLM in Docker on Windows with Intel GPU <./docker_windows_gpu.html>`_
|
* `Install IPEX-LLM in Docker on Windows with Intel GPU <./docker_windows_gpu.html>`_
|
||||||
* `Run Performance Benchmarking with IPEX-LLM <./benchmark_quickstart.html>`_
|
* `Run Performance Benchmarking with IPEX-LLM <./benchmark_quickstart.html>`_
|
||||||
* `Run Langchain-Chatchat (RAG Application) on Intel GPU <./chatchat_quickstart.html>`_
|
* `Run Local RAG using Langchain-Chatchat on Intel GPU <./chatchat_quickstart.html>`_
|
||||||
* `Run Text Generation WebUI on Intel GPU <./webui_quickstart.html>`_
|
* `Run Text Generation WebUI on Intel GPU <./webui_quickstart.html>`_
|
||||||
* `Run Code Copilot (Continue) in VSCode with Intel GPU <./continue_quickstart.html>`_
|
* `Run Open WebUI on Intel GPU <./open_webui_with_ollama_quickstart.html>`_
|
||||||
|
* `Run Coding Copilot (Continue) in VSCode with Intel GPU <./continue_quickstart.html>`_
|
||||||
* `Run llama.cpp with IPEX-LLM on Intel GPU <./llama_cpp_quickstart.html>`_
|
* `Run llama.cpp with IPEX-LLM on Intel GPU <./llama_cpp_quickstart.html>`_
|
||||||
* `Run Ollama with IPEX-LLM on Intel GPU <./ollama_quickstart.html>`_
|
* `Run Ollama with IPEX-LLM on Intel GPU <./ollama_quickstart.html>`_
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -144,7 +144,7 @@ You can use `conda --version` to verify you conda installation.
|
||||||
|
|
||||||
After installation, create a new python environment `llm`:
|
After installation, create a new python environment `llm`:
|
||||||
```cmd
|
```cmd
|
||||||
conda create -n llm python=3.9
|
conda create -n llm python=3.11
|
||||||
```
|
```
|
||||||
Activate the newly created environment `llm`:
|
Activate the newly created environment `llm`:
|
||||||
```cmd
|
```cmd
|
||||||
|
|
|
||||||
|
|
@ -57,7 +57,7 @@ Visit [Miniconda installation page](https://docs.anaconda.com/free/miniconda/),
|
||||||
|
|
||||||
Open the **Anaconda Prompt**. Then create a new python environment `llm` and activate it:
|
Open the **Anaconda Prompt**. Then create a new python environment `llm` and activate it:
|
||||||
```cmd
|
```cmd
|
||||||
conda create -n llm python=3.9 libuv
|
conda create -n llm python=3.11 libuv
|
||||||
conda activate llm
|
conda activate llm
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
# Run llama.cpp with IPEX-LLM on Intel GPU
|
# Run llama.cpp with IPEX-LLM on Intel GPU
|
||||||
|
|
||||||
[ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp) prvoides fast LLM inference in in pure C++ across a variety of hardware; you can now use the C++ interface of `ipex-llm` as an accelerated backend for `llama.cpp` running on Intel **GPU** *(e.g., local PC with iGPU, discrete GPU such as Arc, Flex and Max)*.
|
[ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp) provides fast LLM inference in pure C++ across a variety of hardware; you can now use the C++ interface of [`ipex-llm`](https://github.com/intel-analytics/ipex-llm) as an accelerated backend for `llama.cpp` running on Intel **GPU** *(e.g., local PC with iGPU, discrete GPU such as Arc, Flex and Max)*.
|
||||||
|
|
||||||
See the demo of running LLaMA2-7B on Intel Arc GPU below.
|
See the demo of running LLaMA2-7B on Intel Arc GPU below.
|
||||||
|
|
||||||
|
|
@ -26,7 +26,7 @@ Visit the [Install IPEX-LLM on Windows with Intel GPU Guide](https://ipex-llm.re
|
||||||
|
|
||||||
To use `llama.cpp` with IPEX-LLM, first ensure that `ipex-llm[cpp]` is installed.
|
To use `llama.cpp` with IPEX-LLM, first ensure that `ipex-llm[cpp]` is installed.
|
||||||
```cmd
|
```cmd
|
||||||
conda create -n llm-cpp python=3.9
|
conda create -n llm-cpp python=3.11
|
||||||
conda activate llm-cpp
|
conda activate llm-cpp
|
||||||
pip install --pre --upgrade ipex-llm[cpp]
|
pip install --pre --upgrade ipex-llm[cpp]
|
||||||
```
|
```
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,16 @@
|
||||||
# Run Ollama on Linux with Intel GPU
|
# Run Ollama on Linux with Intel GPU
|
||||||
|
|
||||||
The [ollama/ollama](https://github.com/ollama/ollama) is popular framework designed to build and run language models on a local machine. Now you can run Ollama with [`ipex-llm`](https://github.com/intel-analytics/ipex-llm) on Intel GPU (e.g., local PC with iGPU, discrete GPU such as Arc, Flex and Max); see the demo of running LLaMA2-7B on an Intel A770 GPU below.
|
[ollama/ollama](https://github.com/ollama/ollama) is a popular framework designed to build and run language models on a local machine; you can now use the C++ interface of [`ipex-llm`](https://github.com/intel-analytics/ipex-llm) as an accelerated backend for `ollama` running on Intel **GPU** *(e.g., local PC with iGPU, discrete GPU such as Arc, Flex and Max)*.
|
||||||
|
|
||||||
|
```eval_rst
|
||||||
|
.. note::
|
||||||
|
Only Linux is currently supported.
|
||||||
|
```
|
||||||
|
|
||||||
|
See the demo of running LLaMA2-7B on Intel Arc GPU below.
|
||||||
|
|
||||||
<video src="https://llm-assets.readthedocs.io/en/latest/_images/ollama-linux-arc.mp4" width="100%" controls></video>
|
<video src="https://llm-assets.readthedocs.io/en/latest/_images/ollama-linux-arc.mp4" width="100%" controls></video>
|
||||||
|
|
||||||
|
|
||||||
## Quickstart
|
## Quickstart
|
||||||
|
|
||||||
### 1 Install IPEX-LLM with Ollama Binaries
|
### 1 Install IPEX-LLM with Ollama Binaries
|
||||||
|
|
@ -45,16 +51,16 @@ source /opt/intel/oneapi/setvars.sh
|
||||||
|
|
||||||
The console will display messages similar to the following:
|
The console will display messages similar to the following:
|
||||||
|
|
||||||
<a href="https://llm-assets.readthedocs.io/en/latest/_images/webui_quickstart_chat.png" target="_blank">
|
<a href="https://llm-assets.readthedocs.io/en/latest/_images/ollama_serve.png" target="_blank">
|
||||||
<img src="https://llm-assets.readthedocs.io/en/latest/_images/ollama_serve.png" width=100%; />
|
<img src="https://llm-assets.readthedocs.io/en/latest/_images/ollama_serve.png" width=100%; />
|
||||||
</a>
|
</a>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
### 4 Pull Model
|
### 4 Pull Model
|
||||||
Keep the Ollama service on and open a new terminal and pull a model, e.g. `dolphin-phi:latest`:
|
Keep the Ollama service running, open another terminal, and run `./ollama pull <model_name>` to automatically pull a model, e.g. `dolphin-phi:latest`:
|
||||||
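For instance, with the example model mentioned above:

```bash
# run in a second terminal while the Ollama service started earlier keeps running
./ollama pull dolphin-phi:latest
```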
|
|
||||||
<a href="https://llm-assets.readthedocs.io/en/latest/_images/webui_quickstart_chat.png" target="_blank">
|
<a href="https://llm-assets.readthedocs.io/en/latest/_images/ollama_pull.png" target="_blank">
|
||||||
<img src="https://llm-assets.readthedocs.io/en/latest/_images/ollama_pull.png" width=100%; />
|
<img src="https://llm-assets.readthedocs.io/en/latest/_images/ollama_pull.png" width=100%; />
|
||||||
</a>
|
</a>
|
||||||
|
|
||||||
|
|
@ -77,7 +83,7 @@ curl http://localhost:11434/api/generate -d '
|
||||||
|
|
||||||
An example output of using the model `dolphin-phi` looks like the following:
|
An example output of using the model `dolphin-phi` looks like the following:
|
||||||
|
|
||||||
<a href="https://llm-assets.readthedocs.io/en/latest/_images/webui_quickstart_chat.png" target="_blank">
|
<a href="https://llm-assets.readthedocs.io/en/latest/_images/ollama_curl.png" target="_blank">
|
||||||
<img src="https://llm-assets.readthedocs.io/en/latest/_images/ollama_curl.png" width=100%; />
|
<img src="https://llm-assets.readthedocs.io/en/latest/_images/ollama_curl.png" width=100%; />
|
||||||
</a>
|
</a>
|
||||||
|
|
||||||
|
|
@ -99,6 +105,6 @@ source /opt/intel/oneapi/setvars.sh
|
||||||
|
|
||||||
An example process of interacting with the model using `ollama run` looks like the following:
|
An example process of interacting with the model using `ollama run` looks like the following:
|
||||||
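For instance, a minimal interactive session might be started like this:

```bash
# start an interactive chat with the model pulled in the previous step
./ollama run dolphin-phi:latest
```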
|
|
||||||
<a href="https://llm-assets.readthedocs.io/en/latest/_images/webui_quickstart_chat.png" target="_blank">
|
<a href="https://llm-assets.readthedocs.io/en/latest/_images/ollama_run_1.png" target="_blank">
|
||||||
<img src="https://llm-assets.readthedocs.io/en/latest/_images/ollama_run_1.png" width=100%; /><img src="https://llm-assets.readthedocs.io/en/latest/_images/ollama_run_2.png" width=100%; />
|
<img src="https://llm-assets.readthedocs.io/en/latest/_images/ollama_run_1.png" width=100%; /><img src="https://llm-assets.readthedocs.io/en/latest/_images/ollama_run_2.png" width=100%; />
|
||||||
</a>
|
</a>
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,151 @@
|
||||||
|
# Run Open WebUI on Linux with Intel GPU
|
||||||
|
|
||||||
|
[Open WebUI](https://github.com/open-webui/open-webui) is a user-friendly GUI for running LLMs locally; by porting it to [`ipex-llm`](https://github.com/intel-analytics/ipex-llm), users can now easily run LLMs in [Open WebUI](https://github.com/open-webui/open-webui) on Intel **GPU** *(e.g., local PC with iGPU, discrete GPU such as Arc, Flex and Max)*.
|
||||||
|
|
||||||
|
See the demo of running Mistral:7B on Intel Arc A770 below.
|
||||||
|
|
||||||
|
<video src="https://llm-assets.readthedocs.io/en/latest/_images/open_webui_demo.mp4" width="100%" controls></video>
|
||||||
|
|
||||||
|
## Quickstart
|
||||||
|
|
||||||
|
This quickstart guide walks you through setting up and using [Open WebUI](https://github.com/open-webui/open-webui) with Ollama (using the C++ interface of [`ipex-llm`](https://github.com/intel-analytics/ipex-llm) as an accelerated backend).
|
||||||
|
|
||||||
|
|
||||||
|
### 1 Run Ollama on Linux with Intel GPU
|
||||||
|
|
||||||
|
Follow the instructions in the [Run Ollama on Linux with Intel GPU](ollama_quickstart.html) guide to install and run "Ollama Serve". Please ensure that the Ollama server continues to run while you're using Open WebUI.
|
||||||
|
|
||||||
|
### 2 Install and Run Open WebUI
|
||||||
|
|
||||||
|
|
||||||
|
#### Installation
|
||||||
|
|
||||||
|
```eval_rst
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Package version requirements for running Open WebUI: Node.js (>= 20.10) or Bun (>= 1.0.21), Python (>= 3.11)
|
||||||
|
```
|
||||||
|
|
||||||
|
1. Run the commands below to install Node.js & npm. Once the installation is complete, verify it by running ```node -v``` and ```npm -v``` to check the versions of Node.js and npm, respectively (a quick check is sketched right after these commands).
|
||||||
|
```sh
|
||||||
|
sudo apt update
|
||||||
|
sudo apt install nodejs
|
||||||
|
sudo apt install npm
|
||||||
|
```
|
||||||
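A quick check of the versions mentioned in step 1 might look like:

```sh
# print installed versions; Open WebUI expects Node.js >= 20.10
node -v
npm -v
```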
|
|
||||||
|
2. Use `git` to clone the [open-webui repo](https://github.com/open-webui/open-webui.git), or download the open-webui source code zip from [this link](https://github.com/open-webui/open-webui/archive/refs/heads/main.zip) and unzip it to a directory, e.g. `~/open-webui`.
|
||||||
|
|
||||||
|
3. Run the commands below to install Open WebUI.
|
||||||
|
```sh
|
||||||
|
cd ~/open-webui/
|
||||||
|
cp -RPp .env.example .env # Copy required .env file
|
||||||
|
|
||||||
|
# Build frontend
|
||||||
|
npm i
|
||||||
|
npm run build
|
||||||
|
|
||||||
|
# Install Dependencies
|
||||||
|
cd ./backend
|
||||||
|
pip install -r requirements.txt -U
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Start the service
|
||||||
|
|
||||||
|
Run the commands below to start the service:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
export no_proxy=localhost,127.0.0.1
|
||||||
|
bash start.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```eval_rst
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
If you have difficulty accessing the huggingface repositories, you may use a mirror, e.g. add `export HF_ENDPOINT=https://hf-mirror.com` before running `bash start.sh`.
|
||||||
|
```
|
||||||
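For instance, the mirror setup mentioned in the note above could be applied like this:

```sh
# optional: route Hugging Face downloads through a mirror before starting Open WebUI
export HF_ENDPOINT=https://hf-mirror.com
bash start.sh
```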
|
|
||||||
|
#### Access the WebUI
|
||||||
|
Upon successful launch, URLs to access the WebUI will be displayed in the terminal. Open the provided local URL in your browser to interact with the WebUI, e.g. http://localhost:8080/.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
### 3 Using Open WebUI
|
||||||
|
|
||||||
|
```eval_rst
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
For detailed information about how to use Open WebUI, visit the README of `open-webui official repository <https://github.com/open-webui/open-webui>`_.
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Log-in
|
||||||
|
|
||||||
|
If this is your first time using it, you need to register. After registering, log in with the registered account to access the interface.
|
||||||
|
|
||||||
|
<a href="https://llm-assets.readthedocs.io/en/latest/_images/open_webui_signup.png" target="_blank">
|
||||||
|
<img src="https://llm-assets.readthedocs.io/en/latest/_images/open_webui_signup.png" width="100%" />
|
||||||
|
</a>
|
||||||
|
|
||||||
|
|
||||||
|
<a href="https://llm-assets.readthedocs.io/en/latest/_images/open_webui_login.png" target="_blank">
|
||||||
|
<img src="https://llm-assets.readthedocs.io/en/latest/_images/open_webui_login.png" width="100%" />
|
||||||
|
</a>
|
||||||
|
|
||||||
|
#### Configure `Ollama` service URL
|
||||||
|
|
||||||
|
Access the Ollama settings through **Settings -> Connections** in the menu. By default, the **Ollama Base URL** is preset to http://localhost:11434, as illustrated in the snapshot below. To verify the status of the Ollama service connection, click the **Refresh** button located next to the textbox. If the WebUI is unable to establish a connection with the Ollama server, you will see an error message stating, `WebUI could not connect to Ollama`.
|
||||||
|
|
||||||
|
|
||||||
|
<a href="https://llm-assets.readthedocs.io/en/latest/_images/open_webui_settings_0.png" target="_blank">
|
||||||
|
<img src="https://llm-assets.readthedocs.io/en/latest/_images/open_webui_settings_0.png" width="100%" />
|
||||||
|
</a>
|
||||||
|
|
||||||
|
If the connection is successful, you will see a message stating `Service Connection Verified`, as illustrated below.
|
||||||
|
|
||||||
|
<a href="https://llm-assets.readthedocs.io/en/latest/_images/open_webui_settings.png" target="_blank">
|
||||||
|
<img src="https://llm-assets.readthedocs.io/en/latest/_images/open_webui_settings.png" width="100%" />
|
||||||
|
</a>
|
||||||
|
|
||||||
|
```eval_rst
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
If you want to use an Ollama server hosted at a different URL, simply update the **Ollama Base URL** to the new URL and press the **Refresh** button to re-confirm the connection to Ollama.
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Pull Model
|
||||||
|
|
||||||
|
Go to **Settings -> Models** in the menu, choose a model under **Pull a model from Ollama.com** using the drop-down menu, and then hit the **Download** button on the right. Ollama will automatically download the selected model for you.
|
||||||
|
|
||||||
|
<a href="https://llm-assets.readthedocs.io/en/latest/_images/open_webui_pull_models.png" target="_blank">
|
||||||
|
<img src="https://llm-assets.readthedocs.io/en/latest/_images/open_webui_pull_models.png" width="100%" />
|
||||||
|
</a>
|
||||||
|
|
||||||
|
#### Chat with the Model
|
||||||
|
|
||||||
|
Start new conversations with **New chat** in the left-side menu.
|
||||||
|
|
||||||
|
On the right side, choose a downloaded model from the **Select a model** drop-down menu at the top, input your questions into the **Send a Message** textbox at the bottom, and click the button on the right to get responses.
|
||||||
|
|
||||||
|
<a href="https://llm-assets.readthedocs.io/en/latest/_images/open_webui_select_model.png" target="_blank">
|
||||||
|
<img src="https://llm-assets.readthedocs.io/en/latest/_images/open_webui_select_model.png" width="100%" />
|
||||||
|
</a>
|
||||||
|
|
||||||
|
|
||||||
|
<br/>
|
||||||
|
Additionally, you can drag and drop a document into the textbox, allowing the LLM to access its contents. The LLM will then generate answers based on the document provided.
|
||||||
|
|
||||||
|
<a href="https://llm-assets.readthedocs.io/en/latest/_images/open_webui_chat_2.png" target="_blank">
|
||||||
|
<img src="https://llm-assets.readthedocs.io/en/latest/_images/open_webui_chat_2.png" width="100%" />
|
||||||
|
</a>
|
||||||
|
|
||||||
|
#### Exit Open WebUI
|
||||||
|
|
||||||
|
To shut down the open-webui server, use **Ctrl+C** in the terminal where the open-webui server is running, then close your browser tab.
|
||||||
|
|
||||||
|
|
||||||
|
### 4 Troubleshooting
|
||||||
|
|
||||||
|
##### Error `No module named 'torch._C'`
|
||||||
|
|
||||||
|
When you encounter the error ``ModuleNotFoundError: No module named 'torch._C'`` after executing ```bash start.sh```, you can resolve it by reinstalling PyTorch. First, use ```pip uninstall torch``` to remove the existing PyTorch installation, and then reinstall it along with its dependencies by running ```pip install torch torchvision torchaudio```.
|
||||||
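The reinstall sequence described above, collected in one place:

```sh
# remove the broken PyTorch installation, then reinstall it with its companion packages
pip uninstall torch
pip install torch torchvision torchaudio
```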
|
|
@ -33,7 +33,7 @@
|
||||||
It is built on top of <strong>Intel Extension for PyTorch</strong> (<strong><code><span>IPEX</span></code></strong>), as well as the excellent work of <strong><code><span>llama.cpp</span></code></strong>, <strong><code><span>bitsandbytes</span></code></strong>, <strong><code><span>vLLM</span></code></strong>, <strong><code><span>qlora</span></code></strong>, <strong><code><span>AutoGPTQ</span></code></strong>, <strong><code><span>AutoAWQ</span></code></strong>, etc.
|
It is built on top of <strong>Intel Extension for PyTorch</strong> (<strong><code><span>IPEX</span></code></strong>), as well as the excellent work of <strong><code><span>llama.cpp</span></code></strong>, <strong><code><span>bitsandbytes</span></code></strong>, <strong><code><span>vLLM</span></code></strong>, <strong><code><span>qlora</span></code></strong>, <strong><code><span>AutoGPTQ</span></code></strong>, <strong><code><span>AutoAWQ</span></code></strong>, etc.
|
||||||
</li></em>
|
</li></em>
|
||||||
<li><em>
|
<li><em>
|
||||||
It provides seamless integration with <a href=doc/LLM/Quickstart/llama_cpp_quickstart.html>llama.cpp</a>, <a href=doc/LLM/Quickstart/webui_quickstart.html>Text-Generation-WebUI</a>, <a href=https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/HF-Transformers-AutoModels>HuggingFace transformers</a>, <a href=https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/LLM-Finetuning>HuggingFace PEFT</a>, <a href=https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/LangChain >LangChain</a>, <a href=https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/LlamaIndex >LlamaIndex</a>, <a href=https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/Deepspeed-AutoTP >DeepSpeed-AutoTP</a>, <a href=https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/vLLM-Serving >vLLM</a>, <a href=https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/src/ipex_llm/serving/fastchat>FastChat</a>, <a href=https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/LLM-Finetuning/DPO>HuggingFace TRL</a>, <a href=https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/Applications/autogen >AutoGen</a>, <a href=https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/ModelScope-Models >ModeScope</a>, etc.
|
It provides seamless integration with <a href=doc/LLM/Quickstart/llama_cpp_quickstart.html>llama.cpp</a>, <a href=doc/LLM/Quickstart/ollama_quickstart.html>ollama</a>, <a href=doc/LLM/Quickstart/webui_quickstart.html>Text-Generation-WebUI</a>, <a href=https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/HF-Transformers-AutoModels>HuggingFace transformers</a>, <a href=https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/LLM-Finetuning>HuggingFace PEFT</a>, <a href=https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/LangChain >LangChain</a>, <a href=https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/LlamaIndex >LlamaIndex</a>, <a href=https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/Deepspeed-AutoTP >DeepSpeed-AutoTP</a>, <a href=https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/vLLM-Serving >vLLM</a>, <a href=https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/src/ipex_llm/serving/fastchat>FastChat</a>, <a href=https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/LLM-Finetuning/DPO>HuggingFace TRL</a>, <a href=https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/Applications/autogen >AutoGen</a>, <a href=https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/ModelScope-Models >ModelScope</a>, etc.
|
||||||
</li></em>
|
</li></em>
|
||||||
<li><em>
|
<li><em>
|
||||||
<strong>50+ models</strong> have been optimized/verified on <code><span>ipex-llm</span></code> (including LLaMA2, Mistral, Mixtral, Gemma, LLaVA, Whisper, ChatGLM, Baichuan, Qwen, RWKV, and more); see the complete list <a href=#verified-models>here</a>.
|
<strong>50+ models</strong> have been optimized/verified on <code><span>ipex-llm</span></code> (including LLaMA2, Mistral, Mixtral, Gemma, LLaVA, Whisper, ChatGLM, Baichuan, Qwen, RWKV, and more); see the complete list <a href=#verified-models>here</a>.
|
||||||
|
|
@ -44,6 +44,8 @@
|
||||||
************************************************
|
************************************************
|
||||||
Latest update 🔥
|
Latest update 🔥
|
||||||
************************************************
|
************************************************
|
||||||
|
|
||||||
|
* [2024/04] ``ipex-llm`` now provides C++ interface, which can be used as an accelerated backend for running `llama.cpp <doc/LLM/Quickstart/llama_cpp_quickstart.html>`_ and `ollama <doc/LLM/Quickstart/ollama_quickstart.html>`_ on Intel GPU.
|
||||||
* [2024/03] ``bigdl-llm`` has now become ``ipex-llm`` (see the migration guide `here <doc/LLM/Quickstart/bigdl_llm_migration.html>`_); you may find the original ``BigDL`` project `here <https://github.com/intel-analytics/bigdl-2.x>`_.
|
* [2024/03] ``bigdl-llm`` has now become ``ipex-llm`` (see the migration guide `here <doc/LLM/Quickstart/bigdl_llm_migration.html>`_); you may find the original ``BigDL`` project `here <https://github.com/intel-analytics/bigdl-2.x>`_.
|
||||||
* [2024/02] ``ipex-llm`` now supports directly loading model from `ModelScope <https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/ModelScope-Models>`_ (`魔搭 <https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/ModelScope-Models>`_).
|
* [2024/02] ``ipex-llm`` now supports directly loading model from `ModelScope <https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/ModelScope-Models>`_ (`魔搭 <https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/ModelScope-Models>`_).
|
||||||
* [2024/02] ``ipex-llm`` added initial **INT2** support (based on llama.cpp `IQ2 <https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2>`_ mechanism), which makes it possible to run large-size LLM (e.g., Mixtral-8x7B) on Intel GPU with 16GB VRAM.
|
* [2024/02] ``ipex-llm`` added initial **INT2** support (based on llama.cpp `IQ2 <https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2>`_ mechanism), which makes it possible to run large-size LLM (e.g., Mixtral-8x7B) on Intel GPU with 16GB VRAM.
|
||||||
|
|
@ -106,6 +108,10 @@ See the **optimized performance** of ``chatglm2-6b`` and ``llama-2-13b-chat`` mo
|
||||||
``ipex-llm`` Quickstart
|
``ipex-llm`` Quickstart
|
||||||
************************************************
|
************************************************
|
||||||
|
|
||||||
|
============================================
|
||||||
|
Install ``ipex-llm``
|
||||||
|
============================================
|
||||||
|
|
||||||
* `Windows GPU <doc/LLM/Quickstart/install_windows_gpu.html>`_: installing ``ipex-llm`` on Windows with Intel GPU
|
* `Windows GPU <doc/LLM/Quickstart/install_windows_gpu.html>`_: installing ``ipex-llm`` on Windows with Intel GPU
|
||||||
* `Linux GPU <doc/LLM/Quickstart/install_linux_gpu.html>`_: installing ``ipex-llm`` on Linux with Intel GPU
|
* `Linux GPU <doc/LLM/Quickstart/install_linux_gpu.html>`_: installing ``ipex-llm`` on Linux with Intel GPU
|
||||||
* `Docker <https://github.com/intel-analytics/ipex-llm/tree/main/docker/llm>`_: using ``ipex-llm`` dockers on Intel CPU and GPU
|
* `Docker <https://github.com/intel-analytics/ipex-llm/tree/main/docker/llm>`_: using ``ipex-llm`` dockers on Intel CPU and GPU
|
||||||
|
|
@ -118,7 +124,8 @@ See the **optimized performance** of ``chatglm2-6b`` and ``llama-2-13b-chat`` mo
|
||||||
Run ``ipex-llm``
|
Run ``ipex-llm``
|
||||||
============================================
|
============================================
|
||||||
|
|
||||||
* `llama.cpp <doc/LLM/Quickstart/llama_cpp_quickstart.html>`_: running **ipex-llm for llama.cpp** (*using C++ interface of* ``ipex-llm`` *as an accelerated backend for* ``llama.cpp`` *on Intel GPU*)
|
* `llama.cpp <doc/LLM/Quickstart/llama_cpp_quickstart.html>`_: running **llama.cpp** (*using C++ interface of* ``ipex-llm`` *as an accelerated backend for* ``llama.cpp``) on Intel GPU
|
||||||
|
* `ollama <doc/LLM/Quickstart/ollama_quickstart.html>`_: running **ollama** (*using C++ interface of* ``ipex-llm`` *as an accelerated backend for* ``ollama``) on Intel GPU
|
||||||
* `vLLM <https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/vLLM-Serving>`_: running ``ipex-llm`` in ``vLLM`` on both Intel `GPU <https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/vLLM-Serving>`_ and `CPU <https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/vLLM-Serving>`_
|
* `vLLM <https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/vLLM-Serving>`_: running ``ipex-llm`` in ``vLLM`` on both Intel `GPU <https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/vLLM-Serving>`_ and `CPU <https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/vLLM-Serving>`_
|
||||||
* `FastChat <https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/src/ipex_llm/serving/fastchat>`_: running ``ipex-llm`` in ``FastChat`` serving on both Intel GPU and CPU
|
* `FastChat <https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/src/ipex_llm/serving/fastchat>`_: running ``ipex-llm`` in ``FastChat`` serving on both Intel GPU and CPU
|
||||||
* `LangChain-Chatchat RAG <https://github.com/intel-analytics/Langchain-Chatchat>`_: running ``ipex-llm`` in ``LangChain-Chatchat`` (*Knowledge Base QA using* **RAG** *pipeline*)
|
* `LangChain-Chatchat RAG <https://github.com/intel-analytics/Langchain-Chatchat>`_: running ``ipex-llm`` in ``LangChain-Chatchat`` (*Knowledge Base QA using* **RAG** *pipeline*)
|
||||||
|
|
|
||||||
|
|
@ -30,6 +30,6 @@ Taking example above, the script will fork 3 processes, each for one xpu, to exe
|
||||||
## Results
|
## Results
|
||||||
We follow [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) to record our metrics: `acc_norm` for `hellaswag` and `arc_challenge`, `mc2` for `truthful_qa`, and `acc` for `mmlu`. For `mmlu`, there are 57 subtasks, which means users may need to average them manually to get the final result.
|
We follow [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) to record our metrics: `acc_norm` for `hellaswag` and `arc_challenge`, `mc2` for `truthful_qa`, and `acc` for `mmlu`. For `mmlu`, there are 57 subtasks, which means users may need to average them manually to get the final result.
|
||||||
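A minimal sketch (not part of the repo; the `subtask_scores` mapping is assumed) of the manual averaging mentioned above:

```python
# hypothetical helper: average per-subtask accuracies (e.g., the 57 mmlu subtasks) into one score
def average_acc(subtask_scores):
    values = list(subtask_scores.values())
    return sum(values) / len(values)

# e.g. average_acc({"abstract_algebra": 0.31, "anatomy": 0.47}) -> 0.39
```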
## Summarize the results
|
## Summarize the results
|
||||||
"""python
|
```python
|
||||||
python make_table.py <input_dir>
|
python make_table.py <input_dir>
|
||||||
"""
|
```
|
||||||
|
|
|
||||||
|
|
@ -48,7 +48,7 @@ task_to_metric = dict(
|
||||||
drop='f1'
|
drop='f1'
|
||||||
)
|
)
|
||||||
|
|
||||||
def parse_precision(precision, model="bigdl-llm"):
|
def parse_precision(precision, model="ipex-llm"):
|
||||||
result = match(r"([a-zA-Z_]+)(\d+)([a-zA-Z_\d]*)", precision)
|
result = match(r"([a-zA-Z_]+)(\d+)([a-zA-Z_\d]*)", precision)
|
||||||
datatype = result.group(1)
|
datatype = result.group(1)
|
||||||
bit = int(result.group(2))
|
bit = int(result.group(2))
|
||||||
|
|
@ -62,6 +62,6 @@ def parse_precision(precision, model="bigdl-llm"):
|
||||||
else:
|
else:
|
||||||
if model == "hf-causal":
|
if model == "hf-causal":
|
||||||
return f"bnb_type={precision}"
|
return f"bnb_type={precision}"
|
||||||
if model == "bigdl-llm":
|
if model == "ipex-llm":
|
||||||
return f"load_in_low_bit={precision}"
|
return f"load_in_low_bit={precision}"
|
||||||
raise RuntimeError(f"invald precision {precision}")
|
raise RuntimeError(f"invald precision {precision}")
|
||||||
|
|
|
||||||
|
|
@ -35,7 +35,7 @@ def force_decrease_order(Reorderer):
|
||||||
utils.Reorderer = force_decrease_order(utils.Reorderer)
|
utils.Reorderer = force_decrease_order(utils.Reorderer)
|
||||||
|
|
||||||
|
|
||||||
class BigDLLM(AutoCausalLM):
|
class IPEXLLM(AutoCausalLM):
|
||||||
AUTO_MODEL_CLASS = AutoModelForCausalLM
|
AUTO_MODEL_CLASS = AutoModelForCausalLM
|
||||||
AutoCausalLM_ARGS = inspect.getfullargspec(AutoCausalLM.__init__).args
|
AutoCausalLM_ARGS = inspect.getfullargspec(AutoCausalLM.__init__).args
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
|
|
@ -20,8 +20,8 @@ import os
|
||||||
from harness_to_leaderboard import *
|
from harness_to_leaderboard import *
|
||||||
from lm_eval import tasks, evaluator, utils, models
|
from lm_eval import tasks, evaluator, utils, models
|
||||||
|
|
||||||
from bigdl_llm import BigDLLM
|
from ipexllm import IPEXLLM
|
||||||
models.MODEL_REGISTRY['bigdl-llm'] = BigDLLM # patch bigdl-llm to harness
|
models.MODEL_REGISTRY['ipex-llm'] = IPEXLLM # patch ipex-llm to harness
|
||||||
|
|
||||||
logging.getLogger("openai").setLevel(logging.WARNING)
|
logging.getLogger("openai").setLevel(logging.WARNING)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -22,8 +22,9 @@ from lm_eval import tasks, evaluator, utils, models
|
||||||
from multiprocessing import Queue, Process
|
from multiprocessing import Queue, Process
|
||||||
import multiprocessing as mp
|
import multiprocessing as mp
|
||||||
from contextlib import redirect_stdout, redirect_stderr
|
from contextlib import redirect_stdout, redirect_stderr
|
||||||
from bigdl_llm import BigDLLM
|
|
||||||
models.MODEL_REGISTRY['bigdl-llm'] = BigDLLM # patch bigdl-llm to harness
|
from ipexllm import IPEXLLM
|
||||||
|
models.MODEL_REGISTRY['ipex-llm'] = IPEXLLM # patch ipex-llm to harness
|
||||||
|
|
||||||
logging.getLogger("openai").setLevel(logging.WARNING)
|
logging.getLogger("openai").setLevel(logging.WARNING)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -20,6 +20,6 @@ python run.py --model_path meta-llama/Llama-2-7b-chat-hf --precisions float16 sy
|
||||||
- If you want to test perplexity on pre-downloaded datasets, please specify the `<path/to/dataset>` in the `dataset_path` argument in your command.
|
- If you want to test perplexity on pre-downloaded datasets, please specify the `<path/to/dataset>` in the `dataset_path` argument in your command.
|
||||||
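For example, assuming the argument is passed as `--dataset_path` (check `run.py` for the exact flag name), the command might look like:

```bash
# hypothetical invocation with a pre-downloaded dataset directory
python run.py --model_path meta-llama/Llama-2-7b-chat-hf --precisions float16 \
    --dataset_path <path/to/dataset>
```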
|
|
||||||
## Summarize the results
|
## Summarize the results
|
||||||
"""python
|
```python
|
||||||
python make_table.py <input_dir>
|
python make_table.py <input_dir>
|
||||||
"""
|
```
|
||||||
|
|
|
||||||
|
|
@ -11,7 +11,7 @@ mkdir autogen
|
||||||
cd autogen
|
cd autogen
|
||||||
|
|
||||||
# create respective conda environment
|
# create respective conda environment
|
||||||
conda create -n autogen python=3.9
|
conda create -n autogen python=3.11
|
||||||
conda activate autogen
|
conda activate autogen
|
||||||
|
|
||||||
# install fastchat-adapted ipex-llm
|
# install fastchat-adapted ipex-llm
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ To run this example with IPEX-LLM, we have some recommended requirements for you
|
||||||
### 1. Install
|
### 1. Install
|
||||||
We suggest using conda to manage environment:
|
We suggest using conda to manage environment:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9
|
conda create -n llm python=3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install ipex-llm[all] # install ipex-llm with 'all' option
|
pip install ipex-llm[all] # install ipex-llm with 'all' option
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ model = AutoModelForCausalLM.from_pretrained(model_name_or_path, load_in_4bit=Tr
|
||||||
## Prepare Environment
|
## Prepare Environment
|
||||||
We suggest using conda to manage environment:
|
We suggest using conda to manage environment:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9
|
conda create -n llm python=3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install --pre --upgrade ipex-llm[all]
|
pip install --pre --upgrade ipex-llm[all]
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
#### 1. Install Dependencies
|
#### 1. Install Dependencies
|
||||||
|
|
||||||
Install necessary packages (here Python 3.9 is our test environment):
|
Install necessary packages (here Python 3.11 is our test environment):
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
bash install.sh
|
bash install.sh
|
||||||
|
|
|
||||||
|
|
@ -34,7 +34,7 @@ In the example [generate.py](./generate.py), we show a basic use case for a AWQ
|
||||||
We suggest using conda to manage environment:
|
We suggest using conda to manage environment:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9
|
conda create -n llm python=3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install autoawq==0.1.8 --no-deps
|
pip install autoawq==0.1.8 --no-deps
|
||||||
|
|
|
||||||
|
|
@ -25,7 +25,7 @@ We suggest using conda to manage the Python environment. For more information ab
|
||||||
|
|
||||||
After installing conda, create a Python environment for IPEX-LLM:
|
After installing conda, create a Python environment for IPEX-LLM:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9 # recommend to use Python 3.9
|
conda create -n llm python=3.11 # recommend to use Python 3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
|
pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ In the example [generate.py](./generate.py), we show a basic use case for a Llam
|
||||||
### 1. Install
|
### 1. Install
|
||||||
We suggest using conda to manage environment:
|
We suggest using conda to manage environment:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9
|
conda create -n llm python=3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install ipex-llm[all] # install ipex-llm with 'all' option
|
pip install ipex-llm[all] # install ipex-llm with 'all' option
|
||||||
|
|
|
||||||
|
|
@ -16,7 +16,7 @@ We suggest using conda to manage the Python environment. For more information ab
|
||||||
|
|
||||||
After installing conda, create a Python environment for IPEX-LLM:
|
After installing conda, create a Python environment for IPEX-LLM:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9 # recommend to use Python 3.9
|
conda create -n llm python=3.11 # recommend to use Python 3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
|
pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
|
||||||
|
|
|
||||||
|
|
@ -16,7 +16,7 @@ We suggest using conda to manage the Python environment. For more information ab
|
||||||
|
|
||||||
After installing conda, create a Python environment for IPEX-LLM:
|
After installing conda, create a Python environment for IPEX-LLM:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9 # recommend to use Python 3.9
|
conda create -n llm python=3.11 # recommend to use Python 3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
|
pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ In the example [generate.py](./generate.py), we show a basic use case for a Baic
|
||||||
### 1. Install
|
### 1. Install
|
||||||
We suggest using conda to manage environment:
|
We suggest using conda to manage environment:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9
|
conda create -n llm python=3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install ipex-llm[all] # install ipex-llm with 'all' option
|
pip install ipex-llm[all] # install ipex-llm with 'all' option
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ In the example [generate.py](./generate.py), we show a basic use case for a Baic
|
||||||
### 1. Install
|
### 1. Install
|
||||||
We suggest using conda to manage environment:
|
We suggest using conda to manage environment:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9
|
conda create -n llm python=3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install ipex-llm[all] # install ipex-llm with 'all' option
|
pip install ipex-llm[all] # install ipex-llm with 'all' option
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ In the example [generate.py](./generate.py), we show a basic use case for a Blue
|
||||||
### 1. Install
|
### 1. Install
|
||||||
We suggest using conda to manage environment:
|
We suggest using conda to manage environment:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9
|
conda create -n llm python=3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
|
pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
|
||||||
|
|
|
||||||
|
|
@ -16,10 +16,11 @@ We suggest using conda to manage the Python environment. For more information ab
|
||||||
|
|
||||||
After installing conda, create a Python environment for IPEX-LLM:
|
After installing conda, create a Python environment for IPEX-LLM:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9 # recommend to use Python 3.9
|
conda create -n llm python=3.11 # recommend to use Python 3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
|
pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
|
||||||
|
pip install "transformers<4.34.1" # chatglm cannot work with transformers 4.34.1+
|
||||||
```
|
```
|
||||||
|
|
||||||
### 2. Run
|
### 2. Run
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ In the example [generate.py](./generate.py), we show a basic use case for a Chat
|
||||||
### 1. Install
|
### 1. Install
|
||||||
We suggest using conda to manage environment:
|
We suggest using conda to manage environment:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9
|
conda create -n llm python=3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install ipex-llm[all] # install ipex-llm with 'all' option
|
pip install ipex-llm[all] # install ipex-llm with 'all' option
|
||||||
|
|
@ -80,7 +80,7 @@ In the example [streamchat.py](./streamchat.py), we show a basic use case for a
|
||||||
### 1. Install
|
### 1. Install
|
||||||
We suggest using conda to manage environment:
|
We suggest using conda to manage environment:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9
|
conda create -n llm python=3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install ipex-llm[all] # install ipex-llm with 'all' option
|
pip install ipex-llm[all] # install ipex-llm with 'all' option
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ In the example [generate.py](./generate.py), we show a basic use case for a Chat
|
||||||
### 1. Install
|
### 1. Install
|
||||||
We suggest using conda to manage environment:
|
We suggest using conda to manage environment:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9
|
conda create -n llm python=3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install --pre --upgrade ipex-llm[all] # install ipex-llm with 'all' option
|
pip install --pre --upgrade ipex-llm[all] # install ipex-llm with 'all' option
|
||||||
|
|
@ -81,7 +81,7 @@ In the example [streamchat.py](./streamchat.py), we show a basic use case for a
|
||||||
### 1. Install
|
### 1. Install
|
||||||
We suggest using conda to manage environment:
|
We suggest using conda to manage environment:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9
|
conda create -n llm python=3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install --pre --upgrade ipex-llm[all] # install ipex-llm with 'all' option
|
pip install --pre --upgrade ipex-llm[all] # install ipex-llm with 'all' option
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ In the example [generate.py](./generate.py), we show a basic use case for a Code
|
||||||
### 1. Install
|
### 1. Install
|
||||||
We suggest using conda to manage environment:
|
We suggest using conda to manage environment:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9
|
conda create -n llm python=3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install ipex-llm[all] # install ipex-llm with 'all' option
|
pip install ipex-llm[all] # install ipex-llm with 'all' option
|
||||||
|
|
|
||||||
|
|
@ -16,7 +16,7 @@ We suggest using conda to manage the Python environment. For more information ab
|
||||||
|
|
||||||
After installing conda, create a Python environment for IPEX-LLM:
|
After installing conda, create a Python environment for IPEX-LLM:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9 # recommend to use Python 3.9
|
conda create -n llm python=3.11 # recommend to use Python 3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
|
pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ In the example [generate.py](./generate.py), we show a basic use case for a Deci
|
||||||
### 1. Install
|
### 1. Install
|
||||||
We suggest using conda to manage environment:
|
We suggest using conda to manage environment:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9
|
conda create -n llm python=3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
|
pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
|
||||||
|
|
|
||||||
|
|
@ -16,7 +16,7 @@ We suggest using conda to manage the Python environment. For more information ab
|
||||||
|
|
||||||
After installing conda, create a Python environment for IPEX-LLM:
|
After installing conda, create a Python environment for IPEX-LLM:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9 # recommend to use Python 3.9
|
conda create -n llm python=3.11 # recommend to use Python 3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
|
pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ In the example [generate.py](./generate.py), we show a basic use case for a Deep
|
||||||
### 1. Install
|
### 1. Install
|
||||||
We suggest using conda to manage environment:
|
We suggest using conda to manage environment:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9
|
conda create -n llm python=3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install ipex-llm[all] # install ipex-llm with 'all' option
|
pip install ipex-llm[all] # install ipex-llm with 'all' option
|
||||||
|
|
|
||||||
|
|
@ -12,7 +12,7 @@ We suggest using conda to manage the Python environment. For more information ab
|
||||||
|
|
||||||
After installing conda, create a Python environment for IPEX-LLM:
|
After installing conda, create a Python environment for IPEX-LLM:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9
|
conda create -n llm python=3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
|
pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ In the example [generate.py](./generate.py), we show a basic use case for a Doll
|
||||||
### 1. Install
|
### 1. Install
|
||||||
We suggest using conda to manage environment:
|
We suggest using conda to manage environment:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9
|
conda create -n llm python=3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install ipex-llm[all] # install ipex-llm with 'all' option
|
pip install ipex-llm[all] # install ipex-llm with 'all' option
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ In the example [generate.py](./generate.py), we show a basic use case for a Doll
|
||||||
### 1. Install
|
### 1. Install
|
||||||
We suggest using conda to manage environment:
|
We suggest using conda to manage environment:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9
|
conda create -n llm python=3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install ipex-llm[all] # install ipex-llm with 'all' option
|
pip install ipex-llm[all] # install ipex-llm with 'all' option
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ In the example [generate.py](./generate.py), we show a basic use case for a Falc
|
||||||
### 1. Install
|
### 1. Install
|
||||||
We suggest using conda to manage environment:
|
We suggest using conda to manage environment:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9
|
conda create -n llm python=3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install ipex-llm[all] # install ipex-llm with 'all' option
|
pip install ipex-llm[all] # install ipex-llm with 'all' option
|
||||||
|
|
|
||||||
|
|
@ -12,7 +12,7 @@ We suggest using conda to manage the Python environment. For more information ab
|
||||||
|
|
||||||
After installing conda, create a Python environment for IPEX-LLM:
|
After installing conda, create a Python environment for IPEX-LLM:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9 # recommend to use Python 3.9
|
conda create -n llm python=3.11 # recommend to use Python 3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
|
pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
|
||||||
|
|
|
||||||
|
|
@ -11,7 +11,7 @@ We suggest using conda to manage the Python environment. For more information ab
|
||||||
|
|
||||||
After installing conda, create a Python environment for IPEX-LLM:
|
After installing conda, create a Python environment for IPEX-LLM:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9 # recommend to use Python 3.9
|
conda create -n llm python=3.11 # recommend to use Python 3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
|
pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
|
||||||
|
|
|
||||||
|
|
@ -14,7 +14,7 @@ We suggest using conda to manage the Python environment. For more information ab
|
||||||
|
|
||||||
After installing conda, create a Python environment for IPEX-LLM:
|
After installing conda, create a Python environment for IPEX-LLM:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9 # recommend to use Python 3.9
|
conda create -n llm python=3.11 # recommend to use Python 3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
# below command will install intel_extension_for_pytorch==2.1.10+xpu as default
|
# below command will install intel_extension_for_pytorch==2.1.10+xpu as default
|
||||||
|
|
@ -27,7 +27,7 @@ pip install transformers==4.38.1
|
||||||
#### 1.2 Installation on Windows
|
#### 1.2 Installation on Windows
|
||||||
We suggest using conda to manage environment:
|
We suggest using conda to manage environment:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9 libuv
|
conda create -n llm python=3.11 libuv
|
||||||
conda activate llm
|
conda activate llm
|
||||||
# below command will install intel_extension_for_pytorch==2.1.10+xpu as default
|
# below command will install intel_extension_for_pytorch==2.1.10+xpu as default
|
||||||
pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
|
pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
|
||||||
|
|
|
||||||
|
|
@ -11,7 +11,7 @@ We suggest using conda to manage the Python environment. For more information ab
|
||||||
|
|
||||||
After installing conda, create a Python environment for IPEX-LLM:
|
After installing conda, create a Python environment for IPEX-LLM:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9 # recommend to use Python 3.9
|
conda create -n llm python=3.11 # recommend to use Python 3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
|
pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ In the example [generate.py](./generate.py), we show a basic use case for a Inte
|
||||||
### 1. Install
|
### 1. Install
|
||||||
We suggest using conda to manage environment:
|
We suggest using conda to manage environment:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9
|
conda create -n llm python=3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install ipex-llm[all] # install ipex-llm with 'all' option
|
pip install ipex-llm[all] # install ipex-llm with 'all' option
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ In the example [generate.py](./generate.py), we show a basic use case for a Inte
|
||||||
### 1. Install
|
### 1. Install
|
||||||
We suggest using conda to manage environment:
|
We suggest using conda to manage environment:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9
|
conda create -n llm python=3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install --pre --upgrade ipex-llm[all] # install ipex-llm with 'all' option
|
pip install --pre --upgrade ipex-llm[all] # install ipex-llm with 'all' option
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ In the example [generate.py](./generate.py), we show a basic use case for a Llam
|
||||||
### 1. Install
|
### 1. Install
|
||||||
We suggest using conda to manage environment:
|
We suggest using conda to manage environment:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9
|
conda create -n llm python=3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install ipex-llm[all] # install ipex-llm with 'all' option
|
pip install ipex-llm[all] # install ipex-llm with 'all' option
|
||||||
|
|
|
||||||
|
|
@ -13,7 +13,7 @@ We suggest using conda to manage the Python environment. For more information ab
|
||||||
|
|
||||||
After installing conda, create a Python environment for IPEX-LLM:
|
After installing conda, create a Python environment for IPEX-LLM:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9 # recommend to use Python 3.9
|
conda create -n llm python=3.11 # recommend to use Python 3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
|
pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
|
||||||
|
|
|
||||||
|
|
@ -13,7 +13,7 @@ We suggest using conda to manage the Python environment. For more information ab
|
||||||
|
|
||||||
After installing conda, create a Python environment for IPEX-LLM:
|
After installing conda, create a Python environment for IPEX-LLM:
|
||||||
```bash
|
```bash
|
||||||
conda create -n llm python=3.9 # recommend to use Python 3.9
|
conda create -n llm python=3.11 # recommend to use Python 3.11
|
||||||
conda activate llm
|
conda activate llm
|
||||||
|
|
||||||
# below command will install PyTorch CPU as default
|
# below command will install PyTorch CPU as default
|
||||||
|
|
|
||||||
|
|
@@ -10,7 +10,7 @@ In the example [generate.py](./generate.py), we show a basic use case for a MOSS
 ### 1. Install
 We suggest using conda to manage environment:
 ```bash
-conda create -n llm python=3.9
+conda create -n llm python=3.11
 conda activate llm
 
 pip install ipex-llm[all] # install ipex-llm with 'all' option

@@ -9,7 +9,7 @@ In the example [generate.py](./generate.py), we show a basic use case for an MPT
 ### 1. Install
 We suggest using conda to manage environment:
 ```bash
-conda create -n llm python=3.9
+conda create -n llm python=3.11
 conda activate llm
 
 pip install ipex-llm[all] # install ipex-llm with 'all' option

@@ -16,7 +16,7 @@ We suggest using conda to manage the Python environment. For more information ab
 
 After installing conda, create a Python environment for IPEX-LLM:
 ```bash
-conda create -n llm python=3.9 # recommend to use Python 3.9
+conda create -n llm python=3.11 # recommend to use Python 3.11
 conda activate llm
 
 pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option

@@ -16,7 +16,7 @@ We suggest using conda to manage the Python environment. For more information ab
 
 After installing conda, create a Python environment for IPEX-LLM:
 ```bash
-conda create -n llm python=3.9 # recommend to use Python 3.9
+conda create -n llm python=3.11 # recommend to use Python 3.11
 conda activate llm
 
 pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option

@@ -16,7 +16,7 @@ We suggest using conda to manage the Python environment. For more information ab
 
 After installing conda, create a Python environment for IPEX-LLM:
 ```bash
-conda create -n llm python=3.9 # recommend to use Python 3.9
+conda create -n llm python=3.11 # recommend to use Python 3.11
 conda activate llm
 
 pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option

@@ -10,7 +10,7 @@ In the example [generate.py](./generate.py), we show a basic use case for a Phoe
 ### 1. Install
 We suggest using conda to manage environment:
 ```bash
-conda create -n llm python=3.9
+conda create -n llm python=3.11
 conda activate llm
 
 pip install ipex-llm[all] # install ipex-llm with 'all' option

@@ -11,7 +11,7 @@ We suggest using conda to manage the Python environment. For more information ab
 
 After installing conda, create a Python environment for IPEX-LLM:
 ```bash
-conda create -n llm python=3.9 # recommend to use Python 3.9
+conda create -n llm python=3.11 # recommend to use Python 3.11
 conda activate llm
 
 pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option

@@ -15,7 +15,7 @@ In the example [generate.py](./generate.py), we show a basic use case for a Qwen
 We suggest using conda to manage environment:
 
 ```bash
-conda create -n llm python=3.9
+conda create -n llm python=3.11
 conda activate llm
 
 pip install ipex-llm[all] # install ipex-llm with 'all' option

@@ -10,7 +10,7 @@ In the example [generate.py](./generate.py), we show a basic use case for a Qwen
 ### 1. Install
 We suggest using conda to manage environment:
 ```bash
-conda create -n llm python=3.9
+conda create -n llm python=3.11
 conda activate llm
 
 pip install --pre --upgrade ipex-llm[all] # install ipex-llm with 'all' option

@@ -10,7 +10,7 @@ In the example [generate.py](./generate.py), we show a basic use case for a RedP
 ### 1. Install
 We suggest using conda to manage environment:
 ```bash
-conda create -n llm python=3.9
+conda create -n llm python=3.11
 conda activate llm
 
 pip install ipex-llm[all] # install ipex-llm with 'all' option

@@ -11,7 +11,7 @@ We suggest using conda to manage the Python environment. For more information ab
 
 After installing conda, create a Python environment for IPEX-LLM:
 ```bash
-conda create -n llm python=3.9 # recommend to use Python 3.9
+conda create -n llm python=3.11 # recommend to use Python 3.11
 conda activate llm
 
 pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option

@@ -9,7 +9,7 @@ In the example [generate.py](./generate.py), we show a basic use case for a Skyw
 ### 1. Install
 We suggest using conda to manage environment:
 ```bash
-conda create -n llm python=3.9
+conda create -n llm python=3.11
 conda activate llm
 
 pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option

@@ -9,7 +9,7 @@ In the example [generate.py](./generate.py), we show a basic use case for a SOLA
 ### 1. Install
 We suggest using conda to manage environment:
 ```bash
-conda create -n llm python=3.9
+conda create -n llm python=3.11
 conda activate llm
 
 pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option

@@ -11,7 +11,7 @@ We suggest using conda to manage the Python environment. For more information ab
 
 After installing conda, create a Python environment for IPEX-LLM:
 ```bash
-conda create -n llm python=3.9 # recommend to use Python 3.9
+conda create -n llm python=3.11 # recommend to use Python 3.11
 conda activate llm
 
 pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option

@@ -9,7 +9,7 @@ In the example [generate.py](./generate.py), we show a basic use case for an Sta
 ### 1. Install
 We suggest using conda to manage environment:
 ```bash
-conda create -n llm python=3.9
+conda create -n llm python=3.11
 conda activate llm
 
 pip install ipex-llm[all] # install ipex-llm with 'all' option

@@ -9,7 +9,7 @@ In the example [generate.py](./generate.py), we show a basic use case for a Vicu
 ### 1. Install
 We suggest using conda to manage environment:
 ```bash
-conda create -n llm python=3.9
+conda create -n llm python=3.11
 conda activate llm
 
 pip install ipex-llm[all] # install ipex-llm with 'all' option

@@ -10,7 +10,7 @@ In the example [recognize.py](./recognize.py), we show a basic use case for a Wh
 ### 1. Install
 We suggest using conda to manage environment:
 ```bash
-conda create -n llm python=3.9
+conda create -n llm python=3.11
 conda activate llm
 
 pip install ipex-llm[all] # install ipex-llm with 'all' option

@@ -66,7 +66,7 @@ In the example [long-segment-recognize.py](./long-segment-recognize.py), we show
 ### 1. Install
 We suggest using conda to manage environment:
 ```bash
-conda create -n llm python=3.9
+conda create -n llm python=3.11
 conda activate llm
 
 pip install ipex-llm[all] # install ipex-llm with 'all' option

@@ -9,7 +9,7 @@ In the example [generate.py](./generate.py), we show a basic use case for a Wiza
 ### 1. Install
 We suggest using conda to manage environment:
 ```bash
-conda create -n llm python=3.9
+conda create -n llm python=3.11
 conda activate llm
 
 pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option

@@ -11,7 +11,7 @@ We suggest using conda to manage the Python environment. For more information ab
 
 After installing conda, create a Python environment for IPEX-LLM:
 ```bash
-conda create -n llm python=3.9
+conda create -n llm python=3.11
 conda activate llm
 
 pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option

@@ -13,7 +13,7 @@ We suggest using conda to manage the Python environment. For more information ab
 
 After installing conda, create a Python environment for IPEX-LLM:
 ```bash
-conda create -n llm python=3.9
+conda create -n llm python=3.11
 conda activate llm
 
 pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option

@@ -16,7 +16,7 @@ We suggest using conda to manage the Python environment. For more information ab
 
 After installing conda, create a Python environment for IPEX-LLM:
 ```bash
-conda create -n llm python=3.9 # recommend to use Python 3.9
+conda create -n llm python=3.11 # recommend to use Python 3.11
 conda activate llm
 
 pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option

@@ -5,7 +5,7 @@ In this example, we show a pipeline to apply IPEX-LLM low-bit optimizations (inc
 ## Prepare Environment
 We suggest using conda to manage environment:
 ```bash
-conda create -n llm python=3.9
+conda create -n llm python=3.11
 conda activate llm
 
 pip install --pre --upgrade ipex-llm[all]

@@ -5,7 +5,7 @@ In this example, we show a pipeline to apply IPEX-LLM low-bit optimizations (inc
 ## Prepare Environment
 We suggest using conda to manage environment:
 ```bash
-conda create -n llm python=3.9
+conda create -n llm python=3.11
 conda activate llm
 
 pip install --pre --upgrade ipex-llm[all]

@@ -10,7 +10,7 @@ In the example [generate.py](./generate.py), we show a basic use case for a Chat
 ### 1. Install
 We suggest using conda to manage environment:
 ```bash
-conda create -n llm python=3.9
+conda create -n llm python=3.11
 conda activate llm
 
 pip install --pre --upgrade ipex-llm[all] # install ipex-llm with 'all' option

@@ -7,7 +7,7 @@ In this example, we show a pipeline to convert a large language model to IPEX-LL
 ## Prepare Environment
 We suggest using conda to manage environment:
 ```bash
-conda create -n llm python=3.9
+conda create -n llm python=3.11
 conda activate llm
 
 pip install --pre --upgrade ipex-llm[all]

@@ -11,7 +11,7 @@ We suggest using conda to manage the Python environment. For more information ab
 
 After installing conda, create a Python environment for IPEX-LLM:
 ```bash
-conda create -n llm python=3.9 # recommend to use Python 3.9
+conda create -n llm python=3.11 # recommend to use Python 3.11
 conda activate llm
 
 pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option

@@ -11,7 +11,7 @@ We suggest using conda to manage the Python environment. For more information ab
 
 After installing conda, create a Python environment for IPEX-LLM:
 ```bash
-conda create -n llm python=3.9 # recommend to use Python 3.9
+conda create -n llm python=3.11 # recommend to use Python 3.11
 conda activate llm
 
 pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option

@@ -11,7 +11,7 @@ We suggest using conda to manage the Python environment. For more information ab
 
 After installing conda, create a Python environment for IPEX-LLM:
 ```bash
-conda create -n llm python=3.9 # recommend to use Python 3.9
+conda create -n llm python=3.11 # recommend to use Python 3.11
 conda activate llm
 
 pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option

@@ -11,7 +11,7 @@ We suggest using conda to manage the Python environment. For more information ab
 
 After installing conda, create a Python environment for IPEX-LLM:
 ```bash
-conda create -n llm python=3.9 # recommend to use Python 3.9
+conda create -n llm python=3.11 # recommend to use Python 3.11
 conda activate llm
 
 pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option

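Taken together, the hunks above apply the same one-line change to each example README: the conda environment moves from Python 3.9 to Python 3.11 while the rest of each recipe is left untouched. A consolidated sketch of the updated steps, assuming the commands shown in the hunks (the choice between the release and nightly pip line follows the individual README), looks like:

```bash
# Consolidated sketch of the updated recipe (illustrative; keep whichever pip
# line the specific example README uses).
conda create -n llm python=3.11
conda activate llm

pip install ipex-llm[all] # release build with the 'all' option
# pip install --pre --upgrade ipex-llm[all] # alternative: latest nightly build with the 'all' option
```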