From 1e00bd7bbe88b87d24a64c1dd6e41b7996764205 Mon Sep 17 00:00:00 2001 From: Qiyuan Gong Date: Wed, 15 May 2024 09:42:43 +0800 Subject: [PATCH] Re-org XPU finetune images (#10971) * Rename xpu finetune image from `ipex-llm-finetune-qlora-xpu` to `ipex-llm-finetune-xpu`. * Add axolotl to xpu finetune image. * Upgrade peft to 0.10.0, transformers to 4.36.0. * Add accelerate default config to home. --- .github/workflows/manually_build.yml | 12 +- .../workflows/manually_build_for_testing.yml | 14 +- docker/llm/README.md | 94 +++++----- docker/llm/README_backup.md | 6 +- .../llm/finetune/qlora/xpu/docker/README.md | 98 ----------- .../{qlora/xpu/docker => xpu}/Dockerfile | 28 ++- docker/llm/finetune/xpu/README.md | 160 ++++++++++++++++++ .../start-qlora-finetuning-on-xpu.sh | 4 +- 8 files changed, 245 insertions(+), 171 deletions(-) delete mode 100644 docker/llm/finetune/qlora/xpu/docker/README.md rename docker/llm/finetune/{qlora/xpu/docker => xpu}/Dockerfile (70%) create mode 100644 docker/llm/finetune/xpu/README.md rename docker/llm/finetune/{qlora/xpu/docker => xpu}/start-qlora-finetuning-on-xpu.sh (88%) diff --git a/.github/workflows/manually_build.yml b/.github/workflows/manually_build.yml index e0b8db8a..f7d1a0fd 100644 --- a/.github/workflows/manually_build.yml +++ b/.github/workflows/manually_build.yml @@ -17,7 +17,7 @@ on: - ipex-llm-finetune-lora-cpu - ipex-llm-finetune-qlora-cpu-standalone - ipex-llm-finetune-qlora-cpu-k8s - - ipex-llm-finetune-qlora-xpu + - ipex-llm-finetune-xpu tag: description: 'docker image tag (e.g. 2.1.0-SNAPSHOT)' required: true @@ -133,8 +133,8 @@ jobs: sudo docker push ${image}:latest sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG} ${image}:latest - ipex-llm-finetune-qlora-xpu: - if: ${{ inputs.artifact == 'ipex-llm-finetune-qlora-xpu' || inputs.artifact == 'all' }} + ipex-llm-finetune-xpu: + if: ${{ inputs.artifact == 'ipex-llm-finetune-xpu' || inputs.artifact == 'all' }} runs-on: [self-hosted, Shire] steps: @@ -142,12 +142,12 @@ jobs: - name: docker login run: | docker login -u ${DOCKERHUB_USERNAME} -p ${DOCKERHUB_PASSWORD} - - name: ipex-llm-finetune-qlora-xpu + - name: ipex-llm-finetune-xpu run: | echo "##############################################################" - echo "####### ipex-llm-finetune-qlora-xpu ########" + echo "####### ipex-llm-finetune-xpu ########" echo "##############################################################" - export image=intelanalytics/ipex-llm-finetune-qlora-xpu + export image=intelanalytics/ipex-llm-finetune-xpu cd docker/llm/finetune/qlora/xpu/docker sudo docker build \ --no-cache=true \ diff --git a/.github/workflows/manually_build_for_testing.yml b/.github/workflows/manually_build_for_testing.yml index 46662693..dc00e838 100644 --- a/.github/workflows/manually_build_for_testing.yml +++ b/.github/workflows/manually_build_for_testing.yml @@ -16,7 +16,7 @@ on: - all - ipex-llm-finetune-lora-cpu - ipex-llm-finetune-qlora-cpu - - ipex-llm-finetune-qlora-xpu + - ipex-llm-finetune-xpu - ipex-llm-xpu - ipex-llm-cpp-xpu - ipex-llm-cpu @@ -91,8 +91,8 @@ jobs: sudo docker push 10.239.45.10/arda/${image}:${TAG} sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG} - ipex-llm-finetune-qlora-xpu: - if: ${{ github.event.inputs.artifact == 'ipex-llm-finetune-qlora-xpu' || github.event.inputs.artifact == 'all' }} + ipex-llm-finetune-xpu: + if: ${{ github.event.inputs.artifact == 'ipex-llm-finetune-xpu' || github.event.inputs.artifact == 'all' }} runs-on: [self-hosted, Shire] steps: @@ -102,13 +102,13 
@@ jobs: - name: docker login run: | docker login -u ${DOCKERHUB_USERNAME} -p ${DOCKERHUB_PASSWORD} - - name: ipex-llm-finetune-qlora-xpu + - name: ipex-llm-finetune-xpu run: | echo "##############################################################" - echo "####### ipex-llm-finetune-qlora-xpu ########" + echo "####### ipex-llm-finetune-xpu ########" echo "##############################################################" - export image=intelanalytics/ipex-llm-finetune-qlora-xpu - cd docker/llm/finetune/qlora/xpu/docker + export image=intelanalytics/ipex-llm-finetune-xpu + cd docker/llm/finetune/xpu sudo docker build \ --no-cache=true \ --build-arg http_proxy=${HTTP_PROXY} \ diff --git a/docker/llm/README.md b/docker/llm/README.md index 8c60eccd..1691e6c6 100644 --- a/docker/llm/README.md +++ b/docker/llm/README.md @@ -1,47 +1,47 @@ -# IPEX-LLM Docker Containers - -You can run IPEX-LLM containers (via docker or k8s) for inference, serving and fine-tuning on Intel CPU and GPU. Details on how to use these containers are available at [IPEX-LLM Docker Container Guides](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/DockerGuides/index.html). - -### Prerequisites - -- Docker on Windows or Linux -- Windows Subsystem for Linux (WSL) is required if using Windows. - -### Quick Start - - -#### Pull a IPEX-LLM Docker Image -To pull IPEX-LLM Docker images from [Docker Hub](https://hub.docker.com/u/intelanalytics), use the `docker pull` command. For instance, to pull the CPU inference image: -```bash -docker pull intelanalytics/ipex-llm-cpu:2.1.0-SNAPSHOT -``` - -Available images in hub are: - -| Image Name | Description | -| --- | --- | -| intelanalytics/ipex-llm-cpu:2.1.0-SNAPSHOT | CPU Inference | -| intelanalytics/ipex-llm-xpu:2.1.0-SNAPSHOT | GPU Inference | -| intelanalytics/ipex-llm-serving-cpu:2.1.0-SNAPSHOT | CPU Serving| -| intelanalytics/ipex-llm-serving-xpu:2.1.0-SNAPSHOT | GPU Serving| -| intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.1.0-SNAPSHOT | CPU Finetuning via Docker| -|intelanalytics/ipex-llm-finetune-qlora-cpu-k8s:2.1.0-SNAPSHOT|CPU Finetuning via Kubernetes| -| intelanalytics/ipex-llm-finetune-qlora-xpu:2.1.0-SNAPSHOT| GPU Finetuning| - -#### Run a Container -Use `docker run` command to run an IPEX-LLM docker container. For detailed instructions, refer to the [IPEX-LLM Docker Container Guides](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/DockerGuides/index.html). - - -#### Build Docker Image - -To build a Docker image from source, first clone the IPEX-LLM repository and navigate to the Dockerfile directory. For example, to build the CPU inference image, navigate to `docker/llm/inference/cpu/docker`. - -Then, use the following command to build the image (replace `your_image_name` with your desired image name): - -```bash -docker build \ - --build-arg no_proxy=localhost,127.0.0.1 \ - --rm --no-cache -t your_image_name . -``` - -> Note: If you're working behind a proxy, also add args `--build-arg http_proxy=http://your_proxy_uri:port` and `--build-arg https_proxy=https://your_proxy_url:port` +# IPEX-LLM Docker Containers + +You can run IPEX-LLM containers (via docker or k8s) for inference, serving and fine-tuning on Intel CPU and GPU. Details on how to use these containers are available at [IPEX-LLM Docker Container Guides](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/DockerGuides/index.html). + +### Prerequisites + +- Docker on Windows or Linux +- Windows Subsystem for Linux (WSL) is required if using Windows. 
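+
+Before pulling any image, you can quickly verify the prerequisites, e.g. (assuming a standard Docker setup; the `/dev/dri` check only applies to Linux hosts with an Intel GPU):
+
+```bash
+docker --version
+ls /dev/dri   # Linux with Intel GPU: render devices that GPU containers will map
+```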
+ +### Quick Start + + +#### Pull a IPEX-LLM Docker Image +To pull IPEX-LLM Docker images from [Docker Hub](https://hub.docker.com/u/intelanalytics), use the `docker pull` command. For instance, to pull the CPU inference image: +```bash +docker pull intelanalytics/ipex-llm-cpu:2.1.0-SNAPSHOT +``` + +Available images in hub are: + +| Image Name | Description | +| --- | --- | +| intelanalytics/ipex-llm-cpu:2.1.0-SNAPSHOT | CPU Inference | +| intelanalytics/ipex-llm-xpu:2.1.0-SNAPSHOT | GPU Inference | +| intelanalytics/ipex-llm-serving-cpu:2.1.0-SNAPSHOT | CPU Serving| +| intelanalytics/ipex-llm-serving-xpu:2.1.0-SNAPSHOT | GPU Serving| +| intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.1.0-SNAPSHOT | CPU Finetuning via Docker| +|intelanalytics/ipex-llm-finetune-qlora-cpu-k8s:2.1.0-SNAPSHOT|CPU Finetuning via Kubernetes| +| intelanalytics/ipex-llm-finetune-qlora-xpu:2.1.0-SNAPSHOT| GPU Finetuning| + +#### Run a Container +Use `docker run` command to run an IPEX-LLM docker container. For detailed instructions, refer to the [IPEX-LLM Docker Container Guides](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/DockerGuides/index.html). + + +#### Build Docker Image + +To build a Docker image from source, first clone the IPEX-LLM repository and navigate to the Dockerfile directory. For example, to build the CPU inference image, navigate to `docker/llm/inference/cpu/docker`. + +Then, use the following command to build the image (replace `your_image_name` with your desired image name): + +```bash +docker build \ + --build-arg no_proxy=localhost,127.0.0.1 \ + --rm --no-cache -t your_image_name . +``` + +> Note: If you're working behind a proxy, also add args `--build-arg http_proxy=http://your_proxy_uri:port` and `--build-arg https_proxy=https://your_proxy_url:port` diff --git a/docker/llm/README_backup.md b/docker/llm/README_backup.md index bc7e0f54..40946108 100644 --- a/docker/llm/README_backup.md +++ b/docker/llm/README_backup.md @@ -588,12 +588,12 @@ Then you can use `./outputs/checkpoint-200-merged` as a normal huggingface trans The following shows how to fine-tune LLM with Quantization (QLoRA built on IPEX-LLM 4bit optimizations) in a docker environment, which is accelerated by Intel XPU. -### 1. Prepare ipex-llm-finetune-qlora-xpu Docker Image +### 1. Prepare ipex-llm-finetune-xpu Docker Image Run the following command: ```bash -docker pull intelanalytics/ipex-llm-finetune-qlora-xpu:2.1.0-SNAPSHOT +docker pull intelanalytics/ipex-llm-finetune-xpu:2.1.0-SNAPSHOT ``` ### 2. Prepare Base Model, Data and Start Docker Container @@ -606,7 +606,7 @@ export DATA_PATH=your_downloaded_data_path export HTTP_PROXY=your_http_proxy export HTTPS_PROXY=your_https_proxy export CONTAINER_NAME=my_container -export DOCKER_IMAGE=intelanalytics/ipex-llm-finetune-qlora-xpu:2.1.0-SNAPSHOT +export DOCKER_IMAGE=intelanalytics/ipex-llm-finetune-xpu:2.1.0-SNAPSHOT docker run -itd \ --net=host \ diff --git a/docker/llm/finetune/qlora/xpu/docker/README.md b/docker/llm/finetune/qlora/xpu/docker/README.md deleted file mode 100644 index 649b06ee..00000000 --- a/docker/llm/finetune/qlora/xpu/docker/README.md +++ /dev/null @@ -1,98 +0,0 @@ -## Fine-tune LLM with IPEX LLM Container - -The following shows how to fine-tune LLM with Quantization (QLoRA built on IPEX-LLM 4bit optimizations) in a docker environment, which is accelerated by Intel XPU. - -### 1. 
Prepare Docker Image - -You can download directly from Dockerhub like: - -```bash -docker pull intelanalytics/ipex-llm-finetune-qlora-xpu:2.1.0-SNAPSHOT -``` - -Or build the image from source: - -```bash -export HTTP_PROXY=your_http_proxy -export HTTPS_PROXY=your_https_proxy - -docker build \ - --build-arg http_proxy=${HTTP_PROXY} \ - --build-arg https_proxy=${HTTPS_PROXY} \ - -t intelanalytics/ipex-llm-finetune-qlora-xpu:2.1.0-SNAPSHOT \ - -f ./Dockerfile . -``` - -### 2. Prepare Base Model, Data and Container - -Here, we try to fine-tune a [Llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b) with [yahma/alpaca-cleaned](https://huggingface.co/datasets/yahma/alpaca-cleaned) dataset, and please download them and start a docker container with files mounted like below: - -```bash -export BASE_MODE_PATH=your_downloaded_base_model_path -export DATA_PATH=your_downloaded_data_path -export HTTP_PROXY=your_http_proxy -export HTTPS_PROXY=your_https_proxy - -docker run -itd \ - --net=host \ - --device=/dev/dri \ - --memory="32G" \ - --name=ipex-llm-finetune-qlora-xpu \ - -e http_proxy=${HTTP_PROXY} \ - -e https_proxy=${HTTPS_PROXY} \ - -v $BASE_MODE_PATH:/model \ - -v $DATA_PATH:/data/alpaca-cleaned \ - --shm-size="16g" \ - intelanalytics/ipex-llm-finetune-qlora-xpu:2.1.0-SNAPSHOT -``` - -The download and mount of base model and data to a docker container demonstrates a standard fine-tuning process. You can skip this step for a quick start, and in this way, the fine-tuning codes will automatically download the needed files: - -```bash -export HTTP_PROXY=your_http_proxy -export HTTPS_PROXY=your_https_proxy - -docker run -itd \ - --net=host \ - --device=/dev/dri \ - --memory="32G" \ - --name=ipex-llm-finetune-qlora-xpu \ - -e http_proxy=${HTTP_PROXY} \ - -e https_proxy=${HTTPS_PROXY} \ - --shm-size="16g" \ - intelanalytics/ipex-llm-finetune-qlora-xpu:2.1.0-SNAPSHOT -``` - -However, we do recommend you to handle them manually, because the automatical download can be blocked by Internet access and Huggingface authentication etc. according to different environment, and the manual method allows you to fine-tune in a custom way (with different base model and dataset). - -### 3. 
Start Fine-Tuning - -Enter the running container: - -```bash -docker exec -it ipex-llm-finetune-qlora-xpu bash -``` - -Then, start QLoRA fine-tuning: - -```bash -bash start-qlora-finetuning-on-xpu.sh -``` - -After minutes, it is expected to get results like: - -```bash -{'loss': 2.0251, 'learning_rate': 0.0002, 'epoch': 0.02} -{'loss': 1.2389, 'learning_rate': 0.00017777777777777779, 'epoch': 0.03} -{'loss': 1.032, 'learning_rate': 0.00015555555555555556, 'epoch': 0.05} -{'loss': 0.9141, 'learning_rate': 0.00013333333333333334, 'epoch': 0.06} -{'loss': 0.8505, 'learning_rate': 0.00011111111111111112, 'epoch': 0.08} -{'loss': 0.8713, 'learning_rate': 8.888888888888889e-05, 'epoch': 0.09} -{'loss': 0.8635, 'learning_rate': 6.666666666666667e-05, 'epoch': 0.11} -{'loss': 0.8853, 'learning_rate': 4.4444444444444447e-05, 'epoch': 0.12} -{'loss': 0.859, 'learning_rate': 2.2222222222222223e-05, 'epoch': 0.14} -{'loss': 0.8608, 'learning_rate': 0.0, 'epoch': 0.15} -{'train_runtime': xxxx, 'train_samples_per_second': xxxx, 'train_steps_per_second': xxxx, 'train_loss': 1.0400420665740966, 'epoch': 0.15} -100%|███████████████████████████████████████████████████████████████████████████████████| 200/200 [07:16<00:00, 2.18s/it] -TrainOutput(global_step=200, training_loss=1.0400420665740966, metrics={'train_runtime': xxxx, 'train_samples_per_second': xxxx, 'train_steps_per_second': xxxx, 'train_loss': 1.0400420665740966, 'epoch': 0.15}) -``` diff --git a/docker/llm/finetune/qlora/xpu/docker/Dockerfile b/docker/llm/finetune/xpu/Dockerfile similarity index 70% rename from docker/llm/finetune/qlora/xpu/docker/Dockerfile rename to docker/llm/finetune/xpu/Dockerfile index 8123b057..2f10ba81 100644 --- a/docker/llm/finetune/qlora/xpu/docker/Dockerfile +++ b/docker/llm/finetune/xpu/Dockerfile @@ -3,7 +3,6 @@ ARG http_proxy ARG https_proxy ENV TZ=Asia/Shanghai ARG PIP_NO_CACHE_DIR=false -ENV TRANSFORMERS_COMMIT_ID=1466677 # retrive oneapi repo public key RUN curl -fsSL https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \ @@ -18,7 +17,7 @@ RUN curl -fsSL https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-P apt-get install -y curl wget git gnupg gpg-agent software-properties-common libunwind8-dev vim less && \ # install Intel GPU driver apt-get install -y intel-opencl-icd intel-level-zero-gpu level-zero level-zero-dev --allow-downgrades && \ - # install python 3.9 + # install python 3.11 ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone && \ env DEBIAN_FRONTEND=noninteractive apt-get update && \ add-apt-repository ppa:deadsnakes/ppa -y && \ @@ -27,17 +26,28 @@ RUN curl -fsSL https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-P ln -s /usr/bin/python3.11 /usr/bin/python3 && \ ln -s /usr/bin/python3 /usr/bin/python && \ apt-get install -y python3-pip python3.11-dev python3-wheel python3.11-distutils && \ + # remove apt cache + rm -rf /var/lib/apt/lists/* && \ curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ python3 get-pip.py && \ # install XPU ipex-llm pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ && \ - # install huggingface dependencies - pip install git+https://github.com/huggingface/transformers.git@${TRANSFORMERS_COMMIT_ID} && \ - pip install peft==0.6.0 datasets accelerate==0.23.0 && \ - pip install bitsandbytes scipy && \ + # prepare finetune code and scripts git 
clone https://github.com/intel-analytics/IPEX-LLM.git && \
-    mv IPEX-LLM/python/llm/example/GPU/LLM-Finetuning/common /common && \
-    rm -r IPEX-LLM && \
-    wget https://raw.githubusercontent.com/intel-analytics/IPEX-LLM/main/python/llm/example/GPU/LLM-Finetuning/QLoRA/simple-example/qlora_finetuning.py
+    mv IPEX-LLM/python/llm/example/GPU/LLM-Finetuning /LLM-Finetuning && \
+    rm -rf IPEX-LLM && \
+    # install axolotl
+    git clone https://github.com/OpenAccess-AI-Collective/axolotl && \
+    cd axolotl && git checkout v0.4.0 && \
+    mv /LLM-Finetuning/axolotl/requirements-xpu.txt requirements.txt && \
+    pip install -e . --ignore-installed blinker && \
+    rm -rf .git && \
+    # install transformers & peft dependencies
+    pip install transformers==4.36.0 && \
+    pip install peft==0.10.0 datasets accelerate==0.23.0 && \
+    pip install bitsandbytes scipy fire && \
+    # Prepare accelerate config
+    mkdir -p /root/.cache/huggingface/accelerate && \
+    mv /LLM-Finetuning/axolotl/default_config.yaml /root/.cache/huggingface/accelerate/
 COPY ./start-qlora-finetuning-on-xpu.sh /start-qlora-finetuning-on-xpu.sh
diff --git a/docker/llm/finetune/xpu/README.md b/docker/llm/finetune/xpu/README.md
new file mode 100644
index 00000000..d9579d08
--- /dev/null
+++ b/docker/llm/finetune/xpu/README.md
@@ -0,0 +1,160 @@
+# Fine-tune LLM with IPEX-LLM Container
+
+The following shows how to fine-tune an LLM with IPEX-LLM optimizations in a Docker environment, accelerated by Intel XPU.
+
+
+With this Docker image, you can run all of the [ipex-llm fine-tuning examples on Intel GPU](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/LLM-Finetuning), including:
+
+- [LoRA](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/LLM-Finetuning/LoRA): examples of running LoRA fine-tuning
+- [QLoRA](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/LLM-Finetuning/QLoRA): examples of running QLoRA fine-tuning
+- [QA-LoRA](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/LLM-Finetuning/QA-LoRA): examples of running QA-LoRA fine-tuning
+- [ReLora](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/LLM-Finetuning/ReLora): examples of running ReLora fine-tuning
+- [DPO](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/LLM-Finetuning/DPO): examples of running DPO fine-tuning
+- [HF-PEFT](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/LLM-Finetuning/HF-PEFT): run fine-tuning on Intel GPU using unmodified Hugging Face PEFT code
+- [axolotl](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/LLM-Finetuning/axolotl): LLM fine-tuning on Intel GPU using axolotl, without writing code
+
+
+## 1. Prepare Docker Image
+
+You can pull the image directly from Docker Hub:
+
+```bash
+docker pull intelanalytics/ipex-llm-finetune-xpu:2.1.0-SNAPSHOT
+```
+
+Or build the image from source:
+
+```bash
+export HTTP_PROXY=your_http_proxy
+export HTTPS_PROXY=your_https_proxy
+
+docker build \
+  --build-arg http_proxy=${HTTP_PROXY} \
+  --build-arg https_proxy=${HTTPS_PROXY} \
+  -t intelanalytics/ipex-llm-finetune-xpu:2.1.0-SNAPSHOT \
+  -f ./Dockerfile .
+```
+
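+After pulling or building, you can optionally confirm that the image is available locally before moving on:
+
+```bash
+docker images | grep ipex-llm-finetune-xpu
+```
+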
+## 2. Prepare Base Model, Data and Container
+
+Here, we fine-tune [Llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b) with the [yahma/alpaca-cleaned](https://huggingface.co/datasets/yahma/alpaca-cleaned) dataset. Please download them first, then start a Docker container with the files mounted as below:
+
+```bash
+export BASE_MODEL_PATH=your_downloaded_base_model_path
+export DATA_PATH=your_downloaded_data_path
+export HTTP_PROXY=your_http_proxy
+export HTTPS_PROXY=your_https_proxy
+
+docker run -itd \
+  --net=host \
+  --device=/dev/dri \
+  --memory="32G" \
+  --name=ipex-llm-finetune-xpu \
+  -e http_proxy=${HTTP_PROXY} \
+  -e https_proxy=${HTTPS_PROXY} \
+  -v $BASE_MODEL_PATH:/model \
+  -v $DATA_PATH:/data/alpaca-cleaned \
+  --shm-size="16g" \
+  intelanalytics/ipex-llm-finetune-xpu:2.1.0-SNAPSHOT
+```
+
+Downloading the base model and data and mounting them into the container is the standard fine-tuning workflow. For a quick start, you can skip this step and let the fine-tuning code download the needed files automatically:
+
+```bash
+export HTTP_PROXY=your_http_proxy
+export HTTPS_PROXY=your_https_proxy
+
+docker run -itd \
+  --net=host \
+  --device=/dev/dri \
+  --memory="32G" \
+  --name=ipex-llm-finetune-xpu \
+  -e http_proxy=${HTTP_PROXY} \
+  -e https_proxy=${HTTPS_PROXY} \
+  --shm-size="16g" \
+  intelanalytics/ipex-llm-finetune-xpu:2.1.0-SNAPSHOT
+```
+
+However, we recommend handling them manually, because the automatic download may be blocked by limited Internet access or Hugging Face authentication requirements depending on your environment, and the manual method also lets you fine-tune in a custom way (with a different base model and dataset).
+
+## 3. Start Fine-Tuning
+
+### 3.1 QLoRA Llama2-7b example
+
+Enter the running container:
+
+```bash
+docker exec -it ipex-llm-finetune-xpu bash
+```
+
+Then, start QLoRA fine-tuning:
+
+```bash
+bash start-qlora-finetuning-on-xpu.sh
+```
+
+After a few minutes, you should see results like:
+
+```bash
+{'loss': 2.0251, 'learning_rate': 0.0002, 'epoch': 0.02}
+{'loss': 1.2389, 'learning_rate': 0.00017777777777777779, 'epoch': 0.03}
+{'loss': 1.032, 'learning_rate': 0.00015555555555555556, 'epoch': 0.05}
+{'loss': 0.9141, 'learning_rate': 0.00013333333333333334, 'epoch': 0.06}
+{'loss': 0.8505, 'learning_rate': 0.00011111111111111112, 'epoch': 0.08}
+{'loss': 0.8713, 'learning_rate': 8.888888888888889e-05, 'epoch': 0.09}
+{'loss': 0.8635, 'learning_rate': 6.666666666666667e-05, 'epoch': 0.11}
+{'loss': 0.8853, 'learning_rate': 4.4444444444444447e-05, 'epoch': 0.12}
+{'loss': 0.859, 'learning_rate': 2.2222222222222223e-05, 'epoch': 0.14}
+{'loss': 0.8608, 'learning_rate': 0.0, 'epoch': 0.15}
+{'train_runtime': xxxx, 'train_samples_per_second': xxxx, 'train_steps_per_second': xxxx, 'train_loss': 1.0400420665740966, 'epoch': 0.15}
+100%|███████████████████████████████████████████████████████████████████████████████████| 200/200 [07:16<00:00, 2.18s/it]
+TrainOutput(global_step=200, training_loss=1.0400420665740966, metrics={'train_runtime': xxxx, 'train_samples_per_second': xxxx, 'train_steps_per_second': xxxx, 'train_loss': 1.0400420665740966, 'epoch': 0.15})
+```
+
+### 3.2 QA-LoRA Llama2-7b example
+
+Enter the running container:
+
+```bash
+docker exec -it ipex-llm-finetune-xpu bash
+```
+
+Enter the QA-LoRA example directory:
+
+```bash
+cd /LLM-Finetuning/QA-LoRA
+```
+
+Modify the configuration in the launch scripts, e.g., `--base_model` and `--data_path` in `qalora_finetune_llama2_7b_arc_1_card.sh`.
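+
+For reference, after editing, the launch command inside the script might look like the sketch below (the Python file name and the `--output_dir` value are illustrative; only `--base_model` and `--data_path` come from this guide, and `/model` and `/data/alpaca-cleaned` are the mount points used in the `docker run` command above):
+
+```bash
+# hypothetical excerpt of qalora_finetune_llama2_7b_arc_1_card.sh after editing
+python ./alpaca_qalora_finetuning.py \
+    --base_model "/model" \
+    --data_path "/data/alpaca-cleaned" \
+    --output_dir "./ipex-llm-qalora-alpaca"
+```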
+
+Then, start QA-LoRA fine-tuning:
+
+```bash
+bash qalora_finetune_llama2_7b_arc_1_card.sh
+```
+
+For more details, please refer to the [QA-LoRA example](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/LLM-Finetuning/QA-LoRA).
+
+### 3.3 Axolotl LoRA example
+
+Enter the running container:
+
+```bash
+docker exec -it ipex-llm-finetune-xpu bash
+```
+
+Enter the axolotl example directory:
+
+```bash
+cd /LLM-Finetuning/axolotl
+```
+
+Modify the axolotl config, e.g., `base_model` and `datasets.path` in `lora.yml`.
+
+Then, start LoRA fine-tuning with axolotl:
+
+```bash
+accelerate launch finetune.py lora.yml
+```
+
+For more details, please refer to the [axolotl example](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/LLM-Finetuning/axolotl).
diff --git a/docker/llm/finetune/qlora/xpu/docker/start-qlora-finetuning-on-xpu.sh b/docker/llm/finetune/xpu/start-qlora-finetuning-on-xpu.sh
similarity index 88%
rename from docker/llm/finetune/qlora/xpu/docker/start-qlora-finetuning-on-xpu.sh
rename to docker/llm/finetune/xpu/start-qlora-finetuning-on-xpu.sh
index bdc2741b..bc599c9f 100644
--- a/docker/llm/finetune/qlora/xpu/docker/start-qlora-finetuning-on-xpu.sh
+++ b/docker/llm/finetune/xpu/start-qlora-finetuning-on-xpu.sh
@@ -14,5 +14,7 @@
 then
   DATA_PARAM="--dataset ./data/alpaca-cleaned" # otherwise, default to download from HF dataset
 fi
 
-python qlora_finetuning.py $MODEL_PARAM $DATA_PARAM
+# QLoRA example dir
+cd /LLM-Finetuning/QLoRA/simple-example/
+python qlora_finetuning.py $MODEL_PARAM $DATA_PARAM