diff --git a/.github/workflows/manually_build.yml b/.github/workflows/manually_build.yml index e5fe603d..5b2cc7af 100644 --- a/.github/workflows/manually_build.yml +++ b/.github/workflows/manually_build.yml @@ -12,6 +12,7 @@ on: - all - bigdl-llm-xpu - bigdl-llm-cpu + - bigdl-llm-serving-cpu - bigdl-ppml-gramine-base - bigdl-ppml-trusted-bigdl-llm-gramine-base - bigdl-ppml-trusted-bigdl-llm-gramine-ref @@ -114,6 +115,32 @@ jobs: sudo docker push 10.239.45.10/arda/${image}:${TAG} sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG} + bigdl-llm-serving-cpu: + if: ${{ github.event.inputs.artifact == 'bigdl-llm-serving-cpu' || github.event.inputs.artifact == 'all' }} + runs-on: [self-hosted, Shire] + steps: + - uses: actions/checkout@v3 + - name: docker login + run: | + docker login -u ${DOCKERHUB_USERNAME} -p ${DOCKERHUB_PASSWORD} + - name: bigdl-llm-serving-cpu + run: | + echo "##############################################################" + echo "####### bigdl-llm-serving-cpu ########" + echo "##############################################################" + export image=intelanalytics/bigdl-llm-serving-cpu + cd docker/llm/serving/cpu/docker + sudo docker build \ + --no-cache=true \ + --build-arg http_proxy=${HTTP_PROXY} \ + --build-arg https_proxy=${HTTPS_PROXY} \ + --build-arg no_proxy=${NO_PROXY} \ + -t ${image}:${TAG} -f ./Dockerfile . + sudo docker push ${image}:${TAG} + sudo docker tag ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG} + sudo docker push 10.239.45.10/arda/${image}:${TAG} + sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG} + bigdl-ppml-gramine-base: if: ${{ github.event.inputs.artifact == 'bigdl-ppml-gramine-base' || github.event.inputs.artifact == 'all' }} runs-on: [self-hosted, Shire] diff --git a/README.md b/README.md index 0966576e..841d9bcd 100644 --- a/README.md +++ b/README.md @@ -9,12 +9,13 @@ **[`bigdl-llm`](python/llm)** is a library for running **LLM** (large language model) on Intel **XPU** (from *Laptop* to *GPU* to *Cloud*) using **INT4** with very low latency[^1] (for any **PyTorch** model). 
-> *It is built on top of the excellent work of [llama.cpp](https://github.com/ggerganov/llama.cpp), [gptq](https://github.com/IST-DASLab/gptq), [ggml](https://github.com/ggerganov/ggml), [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), [bitsandbytes](https://github.com/TimDettmers/bitsandbytes), [qlora](https://github.com/artidoro/qlora), [gptq_for_llama](https://github.com/qwopqwop200/GPTQ-for-LLaMa), [chatglm.cpp](https://github.com/li-plus/chatglm.cpp), [redpajama.cpp](https://github.com/togethercomputer/redpajama.cpp), [gptneox.cpp](https://github.com/byroneverson/gptneox.cpp), [bloomz.cpp](https://github.com/NouamaneTazi/bloomz.cpp/), etc.* +> *It is built on top of the excellent work of [llama.cpp](https://github.com/ggerganov/llama.cpp), [ggml](https://github.com/ggerganov/ggml), [gptq](https://github.com/IST-DASLab/gptq), [bitsandbytes](https://github.com/TimDettmers/bitsandbytes), [qlora](https://github.com/artidoro/qlora), [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), [gptq_for_llama](https://github.com/qwopqwop200/GPTQ-for-LLaMa), [chatglm.cpp](https://github.com/li-plus/chatglm.cpp), [redpajama.cpp](https://github.com/togethercomputer/redpajama.cpp), [gptneox.cpp](https://github.com/byroneverson/gptneox.cpp), [bloomz.cpp](https://github.com/NouamaneTazi/bloomz.cpp/), etc.* ### Latest update +- **[New]** `bigdl-llm` now supports QLoRA finetuning on Intel GPU; see the example [here](python/llm/example/gpu/qlora_finetuning). - `bigdl-llm` now supports Intel GPU (including Arc, Flex and MAX); see the the latest GPU examples [here](python/llm/example/gpu). - `bigdl-llm` tutorial is released [here](https://github.com/intel-analytics/bigdl-llm-tutorial). -- Over 20 models have been optimized/verified on `bigdl-llm`, including *LLaMA/LLaMA2, ChatGLM/ChatGLM2, MPT, Falcon, Dolly-v1/Dolly-v2, StarCoder, Whisper, InternLM, QWen, Baichuan, MOSS,* and more; see the complete list [here](python/llm/README.md#verified-models). +- Over 20 models have been optimized/verified on `bigdl-llm`, including *LLaMA/LLaMA2, ChatGLM/ChatGLM2, MPT, Falcon, Dolly, StarCoder, Whisper, InternLM, QWen, Baichuan, Aquila, MOSS,* and more; see the complete list [here](python/llm/README.md#verified-models). ### `bigdl-llm` Demos See the ***optimized performance*** of `chatglm2-6b` and `llama-2-13b-chat` models on 12th Gen Intel Core CPU and Intel Arc GPU below.
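The INT4 support highlighted in the README entries above boils down to the one-line `optimize_model` flow that this patch documents later in `optimize_model.md`. A minimal sketch for reference; the model id is illustrative, and gated checkpoints such as Llama 2 require Hugging Face access:

```python
from transformers import LlamaForCausalLM, LlamaTokenizer
from bigdl.llm import optimize_model

# Load any PyTorch model as usual...
model = LlamaForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf',
                                         torch_dtype='auto', low_cpu_mem_usage=True)
# ...then enable BigDL-LLM low-bit optimization with one extra line (INT4 by default).
model = optimize_model(model)

# Inference code does not change after optimization.
tokenizer = LlamaTokenizer.from_pretrained('meta-llama/Llama-2-7b-chat-hf')
inputs = tokenizer("Once upon a time", return_tensors="pt")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0]))
```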
diff --git a/docker/llm/serving/cpu/docker/Dockerfile b/docker/llm/serving/cpu/docker/Dockerfile index ede2b733..e058bed5 100644 --- a/docker/llm/serving/cpu/docker/Dockerfile +++ b/docker/llm/serving/cpu/docker/Dockerfile @@ -2,10 +2,13 @@ FROM intelanalytics/bigdl-llm-cpu:2.4.0-SNAPSHOT ARG http_proxy ARG https_proxy +ARG TINI_VERSION=v0.18.0 # Disable pip's cache behavior ARG PIP_NO_CACHE_DIR=false +ADD ./entrypoint.sh /opt/entrypoint.sh +ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /sbin/tini # Install Serving Dependencies RUN mkdir /llm && \ cd /llm && \ @@ -13,7 +16,11 @@ RUN mkdir /llm && \ cd FastChat && \ git checkout dev-2023-09-22 && \ pip3 install -e ".[model_worker,webui]" && \ - cd /llm + cd /llm && \ + chmod +x /opt/entrypoint.sh && \ + chmod +x /sbin/tini && \ + cp /sbin/tini /usr/bin/tini WORKDIR /llm/ +ENTRYPOINT [ "/opt/entrypoint.sh" ] \ No newline at end of file diff --git a/docker/llm/serving/cpu/docker/entrypoint.sh b/docker/llm/serving/cpu/docker/entrypoint.sh new file mode 100644 index 00000000..99370654 --- /dev/null +++ b/docker/llm/serving/cpu/docker/entrypoint.sh @@ -0,0 +1,200 @@ +#!/bin/bash + +usage() { + echo "Usage: $0 [-m --mode ] [-h --help]" + echo "-h: Print help message." + echo "Controller mode reads the following env:" + echo "CONTROLLER_HOST (default: localhost)." + echo "CONTROLLER_PORT (default: 21001)." + echo "API_HOST (default: localhost)." + echo "API_PORT (default: 8000)." + echo "Worker mode reads the following env:" + echo "CONTROLLER_HOST (default: localhost)." + echo "CONTROLLER_PORT (default: 21001)." + echo "WORKER_HOST (default: localhost)." + echo "WORKER_PORT (default: 21002)." + echo "MODEL_PATH (default: empty)." + exit 1 +} + +# Acquire correct core_nums if using cpuset-cpus, return -1 if file not exist +calculate_total_cores() { + local cpuset_file="/sys/fs/cgroup/cpuset/cpuset.cpus" + + if [[ -f "$cpuset_file" ]]; then + local cpuset_cpus=$(cat "$cpuset_file") + cpuset_cpus=$(echo "${cpuset_cpus}" | tr -d '\n') + + local total_cores=0 + IFS=',' read -ra cpu_list <<< "$cpuset_cpus" + for cpu in "${cpu_list[@]}"; do + if [[ $cpu =~ - ]]; then + # Range of CPUs + local start_cpu=$(echo "$cpu" | cut -d'-' -f1) + local end_cpu=$(echo "$cpu" | cut -d'-' -f2) + local range_cores=$((end_cpu - start_cpu + 1)) + total_cores=$((total_cores + range_cores)) + else + # Single CPU + total_cores=$((total_cores + 1)) + fi + done + + echo $total_cores + return + fi + # Kubernetes core-binding will use this file + cpuset_file="/sys/fs/cgroup/cpuset.cpus" + if [[ -f "$cpuset_file" ]]; then + local cpuset_cpus=$(cat "$cpuset_file") + cpuset_cpus=$(echo "${cpuset_cpus}" | tr -d '\n') + + local total_cores=0 + IFS=',' read -ra cpu_list <<< "$cpuset_cpus" + for cpu in "${cpu_list[@]}"; do + if [[ $cpu =~ - ]]; then + # Range of CPUs + local start_cpu=$(echo "$cpu" | cut -d'-' -f1) + local end_cpu=$(echo "$cpu" | cut -d'-' -f2) + local range_cores=$((end_cpu - start_cpu + 1)) + total_cores=$((total_cores + range_cores)) + else + # Single CPU + total_cores=$((total_cores + 1)) + fi + done + + echo $total_cores + return + else + echo -1 + return + fi +} + +# Default values +controller_host="localhost" +controller_port="21001" +api_host="localhost" +api_port="8000" +worker_host="localhost" +worker_port="21002" +model_path="" +mode="" +omp_num_threads="" +dispatch_method="shortest_queue" # shortest_queue or lottery + +# Update rootCA config if needed +update-ca-certificates + +# Remember the value of `OMP_NUM_THREADS`: +if [[ 
-n "${OMP_NUM_THREADS}" ]]; then + omp_num_threads="${OMP_NUM_THREADS}" +fi + +# We do not have any arguments, just run bash +if [ "$#" == 0 ]; then + echo "[INFO] no command is passed in" + echo "[INFO] enter pass-through mode" + exec /usr/bin/tini -s -- "bash" +else + # Parse command-line options + options=$(getopt -o "m:h" --long "mode:,help" -n "$0" -- "$@") + if [ $? != 0 ]; then + usage + fi + eval set -- "$options" + + while true; do + case "$1" in + -m|--mode) + mode="$2" + [[ $mode == "controller" || $mode == "worker" ]] || usage + shift 2 + ;; + -h|--help) + usage + ;; + --) + shift + break + ;; + *) + usage + ;; + esac + done + + if [[ -n $CONTROLLER_HOST ]]; then + controller_host=$CONTROLLER_HOST + fi + + if [[ -n $CONTROLLER_PORT ]]; then + controller_port=$CONTROLLER_PORT + fi + + if [[ -n $API_HOST ]]; then + api_host=$API_HOST + fi + + if [[ -n $API_PORT ]]; then + api_port=$API_PORT + fi + + if [[ -n $WORKER_HOST ]]; then + worker_host=$WORKER_HOST + fi + + if [[ -n $WORKER_PORT ]]; then + worker_port=$WORKER_PORT + fi + + if [[ -n $MODEL_PATH ]]; then + model_path=$MODEL_PATH + fi + + if [[ -n $DISPATCH_METHOD ]]; then + dispatch_method=$DISPATCH_METHOD + fi + + controller_address="http://$controller_host:$controller_port" + # Execute logic based on options + if [[ $mode == "controller" ]]; then + # Logic for controller mode + # Boot Controller + api_address="http://$api_host:$api_port" + echo "Controller address: $controller_address" + echo "OpenAI API address: $api_address" + python3 -m fastchat.serve.controller --host $controller_host --port $controller_port --dispatch-method $dispatch_method & + # Boot openai api server + python3 -m fastchat.serve.openai_api_server --host $api_host --port $api_port --controller-address $controller_address + else + # Logic for non-controller(worker) mode + worker_address="http://$worker_host:$worker_port" + # Apply optimizations from bigdl-nano + source bigdl-nano-init -t + # First check if user have set OMP_NUM_THREADS by themselves + if [[ -n "${omp_num_threads}" ]]; then + echo "Setting OMP_NUM_THREADS to its original value: $omp_num_threads" + export OMP_NUM_THREADS=$omp_num_threads + else + # Use calculate_total_cores to acquire cpuset settings + # Set OMP_NUM_THREADS to correct numbers + cores=$(calculate_total_cores) + if [[ $cores == -1 || $cores == 0 ]]; then + echo "Failed to obtain the number of cores, will use the default settings OMP_NUM_THREADS=$OMP_NUM_THREADS" + else + echo "Setting OMP_NUM_THREADS to $cores" + export OMP_NUM_THREADS=$cores + fi + fi + if [[ -z "${model_path}" ]]; then + echo "Please set env MODEL_PATH used for worker" + usage + fi + echo "Worker address: $worker_address" + echo "Controller address: $controller_address" + python3 -m fastchat.serve.model_worker --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address + fi +fi + diff --git a/docker/llm/serving/cpu/kubernetes/README.md b/docker/llm/serving/cpu/kubernetes/README.md new file mode 100644 index 00000000..b0027f12 --- /dev/null +++ b/docker/llm/serving/cpu/kubernetes/README.md @@ -0,0 +1,235 @@ +## Deployment bigdl-llm serving service in K8S environment + + +## Image + +To deploy BigDL-LLM-serving cpu in Kubernetes environment, please use this image: `intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT` + +## Before deployment + +### Models + +In this document, we will use `vicuna-7b-v1.5` as the deployment model. 
+ +After downloading the model, please rename it from `vicuna-7b-v1.5` to `vicuna-7b-v1.5-bigdl` so that `bigdl-llm` is used as the backend. The `bigdl-llm` backend is used whenever the model path contains `bigdl`; otherwise, the original transformers backend is used. + +You can download the model from [here](https://huggingface.co/lmsys/vicuna-7b-v1.5). + +### Kubernetes config + +We recommend setting up your Kubernetes cluster before deployment. Most importantly, please set the `cpu-management-policy` to `static` by following this [tutorial](https://kubernetes.io/docs/tasks/administer-cluster/cpu-management-policies/). It is also helpful to set the `topology management policy` to `single-numa-node`. + +### Machine config + +Turn hyper-threading off to ensure that only physical cores are used during deployment. + +## Deployment + +### Reminder on `OMP_NUM_THREADS` + +The image's entrypoint tries to set `OMP_NUM_THREADS` to the correct number by reading the cpuset configuration from the runtime. However, this only works correctly if the `core-binding` feature is enabled. If it is not, please set the `OMP_NUM_THREADS` environment variable manually in the yaml file. + + +### Controller + +We use the following yaml file for controller deployment: + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: bigdl-fschat-a1234bd-controller + labels: + fastchat-appid: a1234bd + fastchat-app-type: controller +spec: + dnsPolicy: "ClusterFirst" + containers: + - name: fastchat-controller # fixed + image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT + imagePullPolicy: IfNotPresent + env: + - name: CONTROLLER_HOST # fixed + value: "0.0.0.0" + - name: CONTROLLER_PORT # fixed + value: "21005" + - name: API_HOST # fixed + value: "0.0.0.0" + - name: API_PORT # fixed + value: "8000" + ports: + - containerPort: 21005 + name: con-port + - containerPort: 8000 + name: api-port + resources: + requests: + memory: 16Gi + cpu: 4 + limits: + memory: 16Gi + cpu: 4 + args: ["-m", "controller"] + restartPolicy: "Never" +--- +# Service for the controller +apiVersion: v1 +kind: Service +metadata: + name: bigdl-a1234bd-fschat-controller-service +spec: + # You may also want to change this to use the cluster's feature + type: NodePort + selector: + fastchat-appid: a1234bd + fastchat-app-type: controller + ports: + - name: cont-port + protocol: TCP + port: 21005 + targetPort: 21005 + - name: api-port + protocol: TCP + port: 8000 + targetPort: 8000 +``` + +### Worker + +We use the following yaml file for worker deployment: + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: bigdl-fschat-a1234bd-worker-deployment +spec: + # Change this to the number you want + replicas: 1 + selector: + matchLabels: + fastchat: worker + template: + metadata: + labels: + fastchat: worker + spec: + dnsPolicy: "ClusterFirst" + containers: + - name: fastchat-worker # fixed + image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT + imagePullPolicy: IfNotPresent + env: + - name: CONTROLLER_HOST # fixed + value: bigdl-a1234bd-fschat-controller-service + - name: CONTROLLER_PORT # fixed + value: "21005" + - name: WORKER_HOST # fixed + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: WORKER_PORT # fixed + value: "21841" + - name: MODEL_PATH # Change this + value: "/llm/models/vicuna-7b-v1.5-bigdl/" + - name: OMP_NUM_THREADS + value: "16" + resources: + requests: + memory: 32Gi + cpu: 16 + limits: + memory: 32Gi + cpu: 16 + args: ["-m", "worker"] + volumeMounts: + - name: llm-models + mountPath: /llm/models/ + restartPolicy: "Always" +
volumes: + - name: llm-models + hostPath: + path: /home/llm/models # change this in other envs +``` + +You may want to change the `MODEL_PATH` variable in the yaml. Also, please remember to change the volume path accordingly. + + +### Testing + +#### Using openai-python + +First, install openai-python: +```bash +pip install --upgrade openai +``` + +Then, interact with model vicuna-7b-v1.5-bigdl: +```python +import openai +openai.api_key = "EMPTY" +openai.api_base = "http://localhost:8000/v1" + +model = "vicuna-7b-v1.5-bigdl" +prompt = "Once upon a time" + +# create a completion +completion = openai.Completion.create(model=model, prompt=prompt, max_tokens=64) +# print the completion +print(prompt + completion.choices[0].text) + +# create a chat completion +completion = openai.ChatCompletion.create( + model=model, + messages=[{"role": "user", "content": "Hello! What is your name?"}] +) +# print the completion +print(completion.choices[0].message.content) +``` + +#### cURL +cURL is another good tool for observing the output of the api. + +For the following examples, you may also change the service deployment address. + +List Models: +```bash +curl http://localhost:8000/v1/models +``` + +If you have `jq` installed, you can use it to format the output like this: +```bash +curl http://localhost:8000/v1/models | jq +``` + +Chat Completions: +```bash +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "YOUR_MODEL", + "messages": [{"role": "user", "content": "Hello! What is your name?"}] + }' +``` + +Text Completions: +```bash +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "YOUR_MODEL", + "prompt": "Once upon a time", + "max_tokens": 41, + "temperature": 0.5 + }' +``` + +Embeddings: +```bash +curl http://localhost:8000/v1/embeddings \ + -H "Content-Type: application/json" \ + -d '{ + "model": "YOUR_MODEL", + "input": "Hello world!" 
+ }' +``` \ No newline at end of file diff --git a/docker/llm/serving/cpu/kubernetes/clean.sh b/docker/llm/serving/cpu/kubernetes/clean.sh new file mode 100644 index 00000000..d5d1729d --- /dev/null +++ b/docker/llm/serving/cpu/kubernetes/clean.sh @@ -0,0 +1 @@ +kubectl delete -f deployment.yaml \ No newline at end of file diff --git a/docker/llm/serving/cpu/kubernetes/deployment.yaml b/docker/llm/serving/cpu/kubernetes/deployment.yaml new file mode 100644 index 00000000..bd659fd4 --- /dev/null +++ b/docker/llm/serving/cpu/kubernetes/deployment.yaml @@ -0,0 +1,109 @@ +apiVersion: v1 +kind: Pod +metadata: + name: bigdl-fschat-a1234bd-controller + labels: + fastchat-appid: a1234bd + fastchat-app-type: controller +spec: + dnsPolicy: "ClusterFirst" + containers: + - name: fastchat-controller # fixed + image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT + imagePullPolicy: IfNotPresent + env: + - name: CONTROLLER_HOST # fixed + value: "0.0.0.0" + - name: CONTROLLER_PORT # fixed + value: "21005" + - name: API_HOST # fixed + value: "0.0.0.0" + - name: API_PORT # fixed + value: "8000" + ports: + - containerPort: 21005 + name: con-port + - containerPort: 8000 + name: api-port + resources: + requests: + memory: 16Gi + cpu: 4 + limits: + memory: 16Gi + cpu: 4 + args: ["-m", "controller"] + restartPolicy: "Never" +--- +# Service for the controller +apiVersion: v1 +kind: Service +metadata: + name: bigdl-a1234bd-fschat-controller-service +spec: + # You may also want to change this to use the cluster's feature + type: NodePort + selector: + fastchat-appid: a1234bd + fastchat-app-type: controller + ports: + - name: cont-port + protocol: TCP + port: 21005 + targetPort: 21005 + - name: api-port + protocol: TCP + port: 8000 + targetPort: 8000 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: bigdl-fschat-a1234bd-worker-deployment +spec: + # Change this to the number you want + replicas: 1 + selector: + matchLabels: + fastchat: worker + template: + metadata: + labels: + fastchat: worker + spec: + dnsPolicy: "ClusterFirst" + containers: + - name: fastchat-worker # fixed + image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT + imagePullPolicy: IfNotPresent + env: + - name: CONTROLLER_HOST # fixed + value: bigdl-a1234bd-fschat-controller-service + - name: CONTROLLER_PORT # fixed + value: "21005" + - name: WORKER_HOST # fixed + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: WORKER_PORT # fixed + value: "21841" + - name: MODEL_PATH # Change this + value: "/llm/models/vicuna-7b-v1.5-bigdl/" + - name: OMP_NUM_THREADS + value: "16" + resources: + requests: + memory: 32Gi + cpu: 16 + limits: + memory: 32Gi + cpu: 16 + args: ["-m", "worker"] + volumeMounts: + - name: llm-models + mountPath: /llm/models/ + restartPolicy: "Always" + volumes: + - name: llm-models + hostPath: + path: /home/llm/models # change this in other envs \ No newline at end of file diff --git a/docs/readthedocs/source/_toc.yml b/docs/readthedocs/source/_toc.yml index 9cba0641..094ebb4d 100644 --- a/docs/readthedocs/source/_toc.yml +++ b/docs/readthedocs/source/_toc.yml @@ -38,12 +38,12 @@ subtrees: title: "Key Features" subtrees: - entries: + - file: doc/LLM/Overview/KeyFeatures/optimize_model - file: doc/LLM/Overview/KeyFeatures/transformers_style_api subtrees: - entries: - file: doc/LLM/Overview/KeyFeatures/hugging_face_format - file: doc/LLM/Overview/KeyFeatures/native_format - - file: doc/LLM/Overview/KeyFeatures/optimize_model - file: doc/LLM/Overview/KeyFeatures/langchain_api # - file: 
doc/LLM/Overview/KeyFeatures/cli - file: doc/LLM/Overview/KeyFeatures/gpu_supports diff --git a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/index.rst b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/index.rst index 4914196b..823df5a1 100644 --- a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/index.rst +++ b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/index.rst @@ -3,12 +3,12 @@ BigDL-LLM Key Features You may run the LLMs using ``bigdl-llm`` through one of the following APIs: +* `PyTorch API <./optimize_model.html>`_ * |transformers_style_api|_ * |hugging_face_transformers_format|_ * `Native Format <./native_format.html>`_ -* `General PyTorch Model Supports <./langchain_api.html>`_ * `LangChain API <./langchain_api.html>`_ * `GPU Supports <./gpu_supports.html>`_ diff --git a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/optimize_model.md b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/optimize_model.md index eeb7a3c1..ac510688 100644 --- a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/optimize_model.md +++ b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/optimize_model.md @@ -1,22 +1,27 @@ -## General PyTorch Model Supports +## PyTorch API -You may apply BigDL-LLM optimizations on any Pytorch models, not only Hugging Face *Transformers* models for acceleration. With BigDL-LLM, PyTorch models (in FP16/BF16/FP32) can be optimized with low-bit quantizations (supported precisions include INT4/INT5/INT8). +In general, you just need one-line `optimize_model` to easily optimize any loaded PyTorch model, regardless of the library or API you are using. With BigDL-LLM, PyTorch models (in FP16/BF16/FP32) can be optimized with low-bit quantizations (supported precisions include INT4, INT5, INT8, etc). -You can easily enable BigDL-LLM INT4 optimizations on any Pytorch models just as follows: +First, use any PyTorch APIs you like to load your model. To help you better understand the process, here we use [Hugging Face Transformers](https://huggingface.co/docs/transformers/index) library `LlamaForCausalLM` to load a popular model [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) as an example: ```python -# Create or load any Pytorch model -model = ... +# Create or load any Pytorch model, take Llama-2-7b-chat-hf as an example +from transformers import LlamaForCausalLM +model = LlamaForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', torch_dtype='auto', low_cpu_mem_usage=True) +``` -# Add only two lines to enable BigDL-LLM INT4 optimizations on model +Then, just need to call `optimize_model` to optimize the loaded model and INT4 optimization is applied on model by default: +```python from bigdl.llm import optimize_model + +# With only one line to enable BigDL-LLM INT4 optimization model = optimize_model(model) ``` -After optimizing the model, you may straightly run the optimized model with no API changed and less inference latency. +After optimizing the model, BigDL-LLM does not require any change in the inference code. You can use any libraries to run the optimized model with very low latency. ```eval_rst .. seealso:: - See the examples for Hugging Face *Transformers* models `here `_. And examples for other general Pytorch models can be found `here `_. + * For more detailed usage of ``optimize_model``, please refer to the `API documentation `_. 
``` diff --git a/docs/readthedocs/source/doc/LLM/Overview/install_cpu.md b/docs/readthedocs/source/doc/LLM/Overview/install_cpu.md index 763fd09a..5c2642db 100644 --- a/docs/readthedocs/source/doc/LLM/Overview/install_cpu.md +++ b/docs/readthedocs/source/doc/LLM/Overview/install_cpu.md @@ -5,9 +5,11 @@ Install BigDL-LLM for CPU supports using pip through: ```bash -pip install bigdl-llm[all] +pip install --pre --upgrade bigdl-llm[all] # install the latest bigdl-llm nightly build with 'all' option ``` +Please refer to [Environment Setup](#environment-setup) for more information. + ```eval_rst .. note:: @@ -43,7 +45,7 @@ First we recommend using [Conda](https://docs.conda.io/en/latest/miniconda.html) conda create -n llm python=3.9 conda activate llm -pip install bigdl-llm[all] # install bigdl-llm for CPU with 'all' option +pip install --pre --upgrade bigdl-llm[all] # install the latest bigdl-llm nightly build with 'all' option ``` Then for running a LLM model with BigDL-LLM optimizations (taking an `example.py` an example): diff --git a/docs/readthedocs/source/doc/LLM/Overview/install_gpu.md b/docs/readthedocs/source/doc/LLM/Overview/install_gpu.md index 5429c150..0d36c39f 100644 --- a/docs/readthedocs/source/doc/LLM/Overview/install_gpu.md +++ b/docs/readthedocs/source/doc/LLM/Overview/install_gpu.md @@ -5,9 +5,11 @@ Install BigDL-LLM for GPU supports using pip through: ```bash -pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu +pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu # install bigdl-llm for GPU ``` +Please refer to [Environment Setup](#environment-setup) for more information. + ```eval_rst .. note:: @@ -25,6 +27,12 @@ BigDL-LLM for GPU supports has been verified on: * Intel Arc™ A-Series Graphics * Intel Data Center GPU Flex Series +```eval_rst +.. note:: + + We currently support the Ubuntu 20.04 operating system or later. Windows support is in progress. +``` + To apply Intel GPU acceleration, there're several steps for tools installation and environment preparation: * Step 1, only Linux system is supported now, Ubuntu 22.04 is prefered. diff --git a/docs/readthedocs/source/doc/LLM/index.rst b/docs/readthedocs/source/doc/LLM/index.rst index f18aa1ab..e13cb0aa 100644 --- a/docs/readthedocs/source/doc/LLM/index.rst +++ b/docs/readthedocs/source/doc/LLM/index.rst @@ -32,8 +32,8 @@ BigDL-LLM +++ + :bdg-link:`PyTorch <./Overview/KeyFeatures/optimize_model.html>` | :bdg-link:`transformers-style <./Overview/KeyFeatures/transformers_style_api.html>` | - :bdg-link:`Optimize Model <./Overview/KeyFeatures/optimize_model.html>` | :bdg-link:`LangChain <./Overview/KeyFeatures/langchain_api.html>` | :bdg-link:`GPU <./Overview/KeyFeatures/gpu_supports.html>` diff --git a/docs/readthedocs/source/doc/PythonAPI/LLM/index.rst b/docs/readthedocs/source/doc/PythonAPI/LLM/index.rst index ea8d4fc0..6d6e38e1 100644 --- a/docs/readthedocs/source/doc/PythonAPI/LLM/index.rst +++ b/docs/readthedocs/source/doc/PythonAPI/LLM/index.rst @@ -4,6 +4,6 @@ BigDL-LLM API ..
toctree:: :maxdepth: 3 + optimize.rst transformers.rst langchain.rst - optimize.rst diff --git a/docs/readthedocs/source/doc/PythonAPI/LLM/optimize.rst b/docs/readthedocs/source/doc/PythonAPI/LLM/optimize.rst index a6949247..01903ada 100644 --- a/docs/readthedocs/source/doc/PythonAPI/LLM/optimize.rst +++ b/docs/readthedocs/source/doc/PythonAPI/LLM/optimize.rst @@ -1,4 +1,4 @@ -BigDL-LLM Optimize API +BigDL-LLM PyTorch API ===================== llm.optimize diff --git a/docs/readthedocs/source/index.rst b/docs/readthedocs/source/index.rst index 6602890d..8a08d6cc 100644 --- a/docs/readthedocs/source/index.rst +++ b/docs/readthedocs/source/index.rst @@ -24,9 +24,10 @@ BigDL-LLM: low-Bit LLM library ============================================ Latest update ============================================ +- **[New]** ``bigdl-llm`` now supports QLoRA finetuning on Intel GPU; see the example `here `_. - ``bigdl-llm`` now supports Intel GPU (including Arc, Flex and MAX); see the the latest GPU examples `here `_. - ``bigdl-llm`` tutorial is released `here `_. -- Over 20 models have been verified on ``bigdl-llm``, including *LLaMA/LLaMA2, ChatGLM/ChatGLM2, MPT, Falcon, Dolly-v1/Dolly-v2, StarCoder, Whisper, InternLM, QWen, Baichuan, MOSS* and more; see the complete list `here `_. +- Over 20 models have been verified on ``bigdl-llm``, including *LLaMA/LLaMA2, ChatGLM/ChatGLM2, MPT, Falcon, Dolly, StarCoder, Whisper, InternLM, QWen, Baichuan, Aquila, MOSS* and more; see the complete list `here `_. ============================================ diff --git a/python/llm/example/gpu/qlora_finetuning/README.md b/python/llm/example/gpu/qlora_finetuning/README.md index 7e14656c..7b98b1b6 100644 --- a/python/llm/example/gpu/qlora_finetuning/README.md +++ b/python/llm/example/gpu/qlora_finetuning/README.md @@ -1,4 +1,4 @@ -# Q-Lora (experimental support) +# Finetuning LLaMA Using QLoRA (experimental support) This example demonstrates how to finetune a llama2-7b model use Big-LLM 4bit optimizations using [Intel GPUs](../README.md). @@ -7,7 +7,7 @@ To run this example with BigDL-LLM on Intel GPUs, we have some recommended requi ## Example: Finetune llama2-7b using qlora -This example is ported from [bnb-4bit-training](https://colab.research.google.com/drive/1VoYNfYDKcKRQRor98Zbf2-9VQTtGJ24k?usp=sharing) +This example is ported from [bnb-4bit-training](https://colab.research.google.com/drive/1VoYNfYDKcKRQRor98Zbf2-9VQTtGJ24k?usp=sharing). The `export_merged_model.py` is ported from [alpaca-lora](https://github.com/tloen/alpaca-lora/blob/main/export_hf_checkpoint.py). ### 1. Install @@ -26,13 +26,13 @@ pip install peft==0.5.0 source /opt/intel/oneapi/setvars.sh ``` -### 3. Run +### 3.
Finetune model ``` python ./qlora_finetuning.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH ``` -### Sample Output +#### Sample Output ```log {'loss': 1.6134, 'learning_rate': 0.0002, 'epoch': 0.03} {'loss': 1.3038, 'learning_rate': 0.00017777777777777779, 'epoch': 0.06} @@ -47,4 +47,12 @@ python ./qlora_finetuning.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH {'train_runtime': 225.8005, 'train_samples_per_second': 3.543, 'train_steps_per_second': 0.886, 'train_loss': 1.211241865158081, 'epoch': 0.32} 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [03:45<00:00, 1.13s/it] TrainOutput(global_step=200, training_loss=1.211241865158081, metrics={'train_runtime': 225.8005, 'train_samples_per_second': 3.543, 'train_steps_per_second': 0.886, 'train_loss': 1.211241865158081, 'epoch': 0.32}) -``` \ No newline at end of file +``` + +### 4. Merge the adapter into the original model + +``` +python ./export_merged_model.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH --adapter_path ./outputs/checkpoint-200 --output_path ./outputs/checkpoint-200-merged +``` + +Then you can use `./outputs/checkpoint-200-merged` as a normal huggingface transformer model to do inference. diff --git a/python/llm/example/gpu/qlora_finetuning/export_merged_model.py b/python/llm/example/gpu/qlora_finetuning/export_merged_model.py new file mode 100644 index 00000000..1cf3c2ff --- /dev/null +++ b/python/llm/example/gpu/qlora_finetuning/export_merged_model.py @@ -0,0 +1,93 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This file is adapted from https://github.com/tloen/alpaca-lora/blob/main/export_hf_checkpoint.py +# +# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +import torch +import transformers +from transformers import LlamaTokenizer # noqa: F402 +from bigdl.llm.transformers.qlora import PeftModel +from bigdl.llm.transformers import AutoModelForCausalLM +import argparse + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Llama2 model') + parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-hf", + help='The huggingface repo id for the Llama2 (e.g. 
`meta-llama/Llama-2-7b-hf` and `meta-llama/Llama-2-13b-chat-hf`) to be downloaded' + ', or the path to the huggingface checkpoint folder') + parser.add_argument('--adapter_path', type=str,) + parser.add_argument('--output_path', type=str,) + + args = parser.parse_args() + base_model = model_path = args.repo_id_or_model_path + adapter_path = args.adapter_path + tokenizer = LlamaTokenizer.from_pretrained(base_model) + + base_model = AutoModelForCausalLM.from_pretrained( + base_model, + # load_in_low_bit="nf4", # should load the orignal model + torch_dtype=torch.float16, + device_map={"": "cpu"}, + ) + + first_weight = base_model.model.layers[0].self_attn.q_proj.weight + first_weight_old = first_weight.clone() + + lora_model = PeftModel.from_pretrained( + base_model, + adapter_path, + device_map={"": "cpu"}, + torch_dtype=torch.float16, + ) + + lora_weight = lora_model.base_model.model.model.layers[ + 0 + ].self_attn.q_proj.weight + + assert torch.allclose(first_weight_old, first_weight) + + # merge weights - new merging method from peft + lora_model = lora_model.merge_and_unload() + + lora_model.train(False) + + # did we do anything? + assert not torch.allclose(first_weight_old, first_weight) + + lora_model_sd = lora_model.state_dict() + deloreanized_sd = { + k.replace("base_model.model.", ""): v + for k, v in lora_model_sd.items() + if "lora" not in k + } + + base_model.save_pretrained(args.output_path, state_dict=deloreanized_sd) + tokenizer.save_pretrained(args.output_path) diff --git a/python/llm/example/gpu/qlora_finetuning/qlora_finetuning.py b/python/llm/example/gpu/qlora_finetuning/qlora_finetuning.py index 6531b483..85b5642e 100644 --- a/python/llm/example/gpu/qlora_finetuning/qlora_finetuning.py +++ b/python/llm/example/gpu/qlora_finetuning/qlora_finetuning.py @@ -45,8 +45,9 @@ if __name__ == "__main__": data = load_dataset(dataset_path) data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) model = AutoModelForCausalLM.from_pretrained(model_path, - load_in_4bit=True, + load_in_low_bit="nf4", optimize_model=False, + torch_dtype=torch.float16, modules_to_not_convert=["lm_head"],) model = model.to('xpu') model.gradient_checkpointing_enable() @@ -71,7 +72,8 @@ if __name__ == "__main__": warmup_steps=20, max_steps=200, learning_rate=2e-4, - fp16=False, # fp16 is not supported yet + save_steps=100, + fp16=True, logging_steps=20, output_dir="outputs", optim="adamw_hf", # paged_adamw_8bit is not supported yet diff --git a/python/llm/src/bigdl/llm/cli/llm-cli b/python/llm/src/bigdl/llm/cli/llm-cli index 07562747..a145c09a 100755 --- a/python/llm/src/bigdl/llm/cli/llm-cli +++ b/python/llm/src/bigdl/llm/cli/llm-cli @@ -47,7 +47,11 @@ function starcoder { } function chatglm { - command="$lib_dir/main-chatglm_vnni -t $threads -n $n_predict ${filteredArguments[*]}" + if [[ $(lscpu | grep "amx_int8") ]]; then + command="$lib_dir/main-chatglm_amx -t $threads -n $n_predict ${filteredArguments[*]}" + else + command="$lib_dir/main-chatglm_vnni -t $threads -n $n_predict ${filteredArguments[*]}" + fi echo "$command" eval "$command" } diff --git a/python/llm/src/bigdl/llm/transformers/convert.py b/python/llm/src/bigdl/llm/transformers/convert.py index 07f929c3..b0bc581d 100644 --- a/python/llm/src/bigdl/llm/transformers/convert.py +++ b/python/llm/src/bigdl/llm/transformers/convert.py @@ -135,6 +135,7 @@ def convert_forward(m, target_m, new_forward): def optimize(model): from packaging import version from bigdl.llm.transformers.models.llama import llama_attention_forward_4_31 + from 
bigdl.llm.transformers.models.llama import llama_rms_norm_forward from transformers.modeling_utils import PreTrainedModel # All huggingface format models are inherited from `PreTrainedModel` @@ -149,11 +150,16 @@ def optimize(model): model, transformers.models.llama.modeling_llama.LlamaAttention, llama_attention_forward_4_31,) + convert_forward( + model, + transformers.models.llama.modeling_llama.LlamaRMSNorm, + llama_rms_norm_forward,) else: # todo implement 4.28.0 ~ 4.30.2 pass - if "chatglm2" in model.config._name_or_path: + if "chatglm-18b" in model.config._name_or_path or "chatglm2" in model.config._name_or_path: + # chatglm-18b or chatglm2-6b modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) from bigdl.llm.transformers.models.chatglm2 import chatglm2_attention_forward_8eb45c @@ -166,6 +172,7 @@ def optimize(model): module.CoreAttention, core_attn_forward_8eb45c) elif "chatglm" in model.config._name_or_path: + # chatglm-6b modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) from bigdl.llm.transformers.models.chatglm import chatglm_attention_forward @@ -280,4 +287,20 @@ def optimize(model): module.InternLMAttention, internlm_attention_forward ) + elif model.config.model_type == "qwen": + modeling_module_name = model.__class__.__module__ + module = importlib.import_module(modeling_module_name) + from bigdl.llm.transformers.models.qwen import qwen_attention_forward + convert_forward(model, + module.QWenAttention, + qwen_attention_forward + ) + elif model.config.model_type == "aquila": + modeling_module_name = model.__class__.__module__ + module = importlib.import_module(modeling_module_name) + from bigdl.llm.transformers.models.aquila import aquila_attention_forward + convert_forward(model, + module.AquilaAttention, + aquila_attention_forward + ) return model diff --git a/python/llm/src/bigdl/llm/transformers/models/aquila.py b/python/llm/src/bigdl/llm/transformers/models/aquila.py new file mode 100644 index 00000000..84abb6b8 --- /dev/null +++ b/python/llm/src/bigdl/llm/transformers/models/aquila.py @@ -0,0 +1,157 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Some parts of this file is adapted from +# https://huggingface.co/BAAI/AquilaChat-7B/blob/main/modeling_aquila.py +# +# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from bigdl.llm.transformers.models.utils import extend_kv_cache, init_kv_cache, append_kv_cache +from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb +from bigdl.dllib.utils import log4Error + +KV_CACHE_ALLOC_BLOCK_LENGTH = 256 + + +def aquila_attention_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states)\ + .view(bsz, q_len, self.num_heads, self.head_dim)\ + .transpose(1, 2) + key_states = self.k_proj(hidden_states)\ + .view(bsz, q_len, self.num_heads, self.head_dim)\ + .transpose(1, 2) + value_states = self.v_proj(hidden_states)\ + .view(bsz, q_len, self.num_heads, self.head_dim)\ + .transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, + cos, sin, position_ids, "aquila") + # [bsz, nh, t, hd] + + if past_key_value is not None: + # reuse k, v, self_attention + cache_k = past_key_value[0] + cache_v = past_key_value[1] + if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3): + # allocate new + new_cache_k, new_cache_v = extend_kv_cache(bsz, + self.num_heads, # Support GQA + self.head_dim, + cache_k.size(2), + kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH, + dtype=cache_k.dtype, + device=hidden_states.device) + new_cache_k[:] = cache_k + new_cache_v[:] = cache_v + cache_k = new_cache_k + cache_v = new_cache_v + + key_states, value_states = append_kv_cache(cache_k, cache_v, key_states, value_states) + + elif use_cache: + max_cache_length = kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH + new_key_states, new_value_states = init_kv_cache(bsz, + self.num_heads, + self.head_dim, + kv_seq_len, + max_cache_length, + dtype=key_states.dtype, + device=hidden_states.device) + new_key_states[:] = key_states + new_value_states[:] = value_states + key_states = new_key_states + value_states = new_value_states + + past_key_value = (key_states, value_states) if use_cache else None + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + attn_weights = torch.clamp(attn_weights, min=-1024., max=1024.) 
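+    # The clamp above keeps the raw attention scores within a numerically safe range;
+    # the checks below validate their shape before the mask is added and softmax is applied.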
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + log4Error.invalidInputError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, " + f"but is {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + log4Error.invalidInputError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, " + f"but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + attn_weights = torch.max( + attn_weights, + torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device) + ) + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32)\ + .to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + log4Error.invalidInputError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, " + f"but is {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value diff --git a/python/llm/src/bigdl/llm/transformers/models/llama.py b/python/llm/src/bigdl/llm/transformers/models/llama.py index 51ddb2ee..7953670a 100644 --- a/python/llm/src/bigdl/llm/transformers/models/llama.py +++ b/python/llm/src/bigdl/llm/transformers/models/llama.py @@ -39,6 +39,7 @@ import torch.nn.functional as F from bigdl.llm.utils.common import invalidInputError from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache from bigdl.llm.transformers.models.utils import rotate_half, apply_rotary_pos_emb +from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: @@ -57,6 +58,19 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: KV_CACHE_ALLOC_BLOCK_LENGTH = 256 +def llama_rms_norm_forward(self, hidden_states): + if hidden_states.device.type == "xpu" and not (self.training and hidden_states.requires_grad): + hidden_states, _ = torch.ops.torch_ipex.rms_norm(hidden_states, + [self.weight.size(0)], self.weight) + else: + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + return hidden_states + + def llama_attention_forward_4_31( self, hidden_states: torch.Tensor, @@ -103,9 +117,20 @@ def llama_attention_forward_4_31( kv_seq_len = key_states.shape[-2] if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, - cos, sin, position_ids, "llama") + + use_fuse_rope = query_states.device.type == "xpu" + use_fuse_rope = use_fuse_rope and not (self.training and query_states.requires_grad) + use_fuse_rope = use_fuse_rope and self.config.rope_scaling is None + + if use_fuse_rope: + query_states, key_states = apply_rotary_pos_emb_no_cache_xpu(query_states, + key_states, + position_ids, + "llama") + else: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + 
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, + cos, sin, position_ids, "llama") if past_key_value is not None: # reuse k, v, self_attention diff --git a/python/llm/src/bigdl/llm/transformers/models/qwen.py b/python/llm/src/bigdl/llm/transformers/models/qwen.py new file mode 100644 index 00000000..ed2c3e51 --- /dev/null +++ b/python/llm/src/bigdl/llm/transformers/models/qwen.py @@ -0,0 +1,217 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Some parts of this file is adapted from +# https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/modeling_qwen.py +# +# Copyright (c) Alibaba Cloud. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# + +import importlib +import math +from typing import TYPE_CHECKING, Optional, Tuple, Union, Callable, List + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from transformers.utils import logging + +try: + from einops import rearrange +except ImportError: + rearrange = None + +from bigdl.llm.transformers.models.utils import extend_kv_cache, init_kv_cache, append_kv_cache +from bigdl.llm.utils.common import invalidInputError + +apply_rotary_emb_func = None + +flash_attn_unpadded_func = None + +logger = logging.get_logger(__name__) + +KV_CACHE_ALLOC_BLOCK_LENGTH = 256 + + +def _rotate_half(x): + from einops import rearrange + + x = rearrange(x, "... (j d) -> ... 
j d", j=2) + x1, x2 = x.unbind(dim=-2) + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(t, freqs): + if apply_rotary_emb_func is not None: + t_ = t.float() + freqs = freqs.squeeze(0).squeeze(1) + cos = freqs[:, : freqs.shape[-1] // 2].cos() + sin = freqs[:, : freqs.shape[-1] // 2].sin() + output = apply_rotary_emb_func(t_, cos, sin).type_as(t) + return output + else: + rot_dim = freqs.shape[-1] + t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:] + t_ = t_.float() + t_pass_ = t_pass_.float() + t_ = (t_ * freqs.cos()) + (_rotate_half(t_) * freqs.sin()) + return torch.cat((t_, t_pass_), dim=-1).type_as(t) + + +def qwen_attention_forward( + self, + hidden_states: Optional[Tuple[torch.FloatTensor]], + layer_past: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, +): + mixed_x_layer = self.c_attn(hidden_states) + query, key, value = mixed_x_layer.split(self.split_size, dim=2) + + query = self._split_heads(query, self.num_heads, self.head_dim) + key = self._split_heads(key, self.num_heads, self.head_dim) + value = self._split_heads(value, self.num_heads, self.head_dim) + + kv_seq_len = hidden_states.size()[1] + + if layer_past: + # layer past[0] shape: bs * seq_len * head_num * dim + kv_seq_len += layer_past[0].shape[1] + if ( + self.use_dynamic_ntk + and kv_seq_len == hidden_states.size()[1] + and not self.training + ): + context_value = math.log(kv_seq_len / self.seq_length, 2) + 1 + ntk_alpha = 2 ** math.ceil(context_value) - 1 + ntk_alpha = max(ntk_alpha, 1) + self._ntk_cached = ntk_alpha + else: + ntk_alpha = self._ntk_cached + rotary_pos_emb = self.rotary_emb(kv_seq_len, ntk_alpha=ntk_alpha).to( + hidden_states.device + ) + + if rotary_pos_emb is not None: + if isinstance(rotary_pos_emb, tuple): + rotary_pos_emb = rotary_pos_emb + else: + rotary_pos_emb = (rotary_pos_emb,) * 2 + + if rotary_pos_emb is not None: + q_pos_emb, k_pos_emb = rotary_pos_emb + # Slice the pos emb for current inference + cur_len = query.shape[1] + q_pos_emb = q_pos_emb[:, -cur_len:, :, :] + k_pos_emb = k_pos_emb[:, -cur_len:, :, :] + query = apply_rotary_pos_emb(query, q_pos_emb) + key = apply_rotary_pos_emb(key, k_pos_emb) + + bsz, _, n_heads, head_dim = key.size() + + if layer_past is not None: + # past_key, past_value = layer_past[0], layer_past[1] + # key = torch.cat((past_key, key), dim=1) + # value = torch.cat((past_value, value), dim=1) + cache_k = layer_past[0].transpose(1, 2) + cache_v = layer_past[1].transpose(1, 2) + if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3): + # allocate new + new_cache_k, new_cache_v = extend_kv_cache(bsz, + self.num_heads, # Support GQA + self.head_dim, + cache_k.size(2), + kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH, + dtype=cache_k.dtype, + device=hidden_states.device) + new_cache_k[:] = cache_k + new_cache_v[:] = cache_v + cache_k = new_cache_k + cache_v = new_cache_v + + key_states, value_states = append_kv_cache(cache_k, cache_v, + key.transpose(1, 2), value.transpose(1, 2)) + key = key_states.transpose(1, 2) + value = value_states.transpose(1, 2) + elif use_cache: + max_cache_length = kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH + new_key_states, new_value_states = init_kv_cache(bsz, + self.num_heads, + self.head_dim, + kv_seq_len, + max_cache_length, + 
dtype=key.dtype, + device=hidden_states.device) + new_key_states[:] = key.transpose(1, 2) + new_value_states[:] = value.transpose(1, 2) + key = new_key_states.transpose(1, 2) + value = new_value_states.transpose(1, 2) + + if use_cache: + present = (key, value) + else: + present = None + + if self.use_logn_attn and not self.training: + if self.logn_tensor.device != query.device or self.logn_tensor.dtype != query.dtype: + self.logn_tensor = self.logn_tensor.to(query.device).type_as(query) + seq_start = key.size(1) - query.size(1) + seq_end = key.size(1) + logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :] + query = query * logn_tensor.expand_as(query) + + if ( + self.use_flash_attn + and flash_attn_unpadded_func is not None + and not self.is_fp32 + and query.is_cuda + ): + q, k, v = query, key, value + context_layer = self.core_attention_flash(q, k, v) + + context_layer = rearrange( + context_layer, "b s h d -> b s (h d)" + ).contiguous() + else: + query = query.permute(0, 2, 1, 3) + key = key.permute(0, 2, 1, 3) + value = value.permute(0, 2, 1, 3) + attn_output, attn_weight = self._attn( + query, key, value, attention_mask, head_mask + ) + context_layer = self._merge_heads( + attn_output, self.num_heads, self.head_dim + ) + + attn_output = self.c_proj(context_layer) + outputs = (attn_output, present) + if output_attentions: + if ( + self.use_flash_attn + and flash_attn_unpadded_func is not None + and not self.is_fp32 + ): + invalidInputError("Cannot output attentions while using flash-attn") + else: + outputs += (attn_weight,) + + return outputs diff --git a/python/llm/src/bigdl/llm/transformers/models/utils.py b/python/llm/src/bigdl/llm/transformers/models/utils.py index 4489b268..1aed301f 100644 --- a/python/llm/src/bigdl/llm/transformers/models/utils.py +++ b/python/llm/src/bigdl/llm/transformers/models/utils.py @@ -71,7 +71,7 @@ def rotate_every_two(x): def apply_rotary_pos_emb(q, k, cos, sin, position_ids, model_family): - if model_family in ["llama", "baichuan", "internlm"]: + if model_family in ["llama", "baichuan", "internlm", "aquila"]: # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. 
cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] @@ -97,3 +97,18 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, model_family): else: invalidInputError(False, f"{model_family} is not supported.") + + +def apply_rotary_pos_emb_no_cache_xpu(q, k, position_ids, model_family): + if q.device.type != "xpu": + invalidInputError(False, + f"only xpu is supported in this function") + import linear_q4_0 + q_embed = torch.empty(q.shape, dtype=q.dtype, device=q.device) + k_embed = torch.empty(k.shape, dtype=k.dtype, device=k.device) + if model_family in ["llama", "baichuan", "internlm", "aquila", "gpt_neox"]: + linear_q4_0.apply_rotary_embedding_half_qk(q, k, position_ids, q_embed, k_embed) + return q_embed, k_embed + else: + invalidInputError(False, + f"{model_family} is not supported.") diff --git a/python/llm/src/bigdl/llm/transformers/qlora.py b/python/llm/src/bigdl/llm/transformers/qlora.py index d2728f08..2b074105 100644 --- a/python/llm/src/bigdl/llm/transformers/qlora.py +++ b/python/llm/src/bigdl/llm/transformers/qlora.py @@ -36,6 +36,7 @@ import torch from bigdl.llm.transformers.low_bit_linear import LowBitLinear from peft.tuners.lora import LoraLayer from bigdl.llm.utils.common import invalidInputError +import functools class LoraLowBitLinear(LowBitLinear, LoraLayer): @@ -94,13 +95,11 @@ class LoraLowBitLinear(LowBitLinear, LoraLayer): return result -@staticmethod -def _create_new_module(lora_config, adapter_name, target, **kwargs): - - bias = kwargs.pop("bias", False) +def _create_new_module(create_new_module_func, lora_config, adapter_name, target, **kwargs): if isinstance(target, LowBitLinear): low_bit_kwargs = kwargs.copy() + bias = low_bit_kwargs.pop("bias", False) low_bit_kwargs.update( { "qtype": target.qtype, @@ -112,9 +111,7 @@ def _create_new_module(lora_config, adapter_name, target, **kwargs): bias=bias, **low_bit_kwargs) else: - invalidInputError(False, - f"Target module {target} is not supported. " - f"Currently, only `LowBitLinear` are supported.") + new_module = create_new_module_func(lora_config, adapter_name, target, **kwargs) return new_module @@ -124,7 +121,8 @@ from peft.tuners.lora import LoraModel def get_peft_model(*args, **kwargs): old_create_new_module = LoraModel._create_new_module - LoraModel._create_new_module = _create_new_module + LoraModel._create_new_module = staticmethod(functools.partial(_create_new_module, + old_create_new_module)) try: from peft import get_peft_model as get_peft_model_original model = get_peft_model_original(*args, **kwargs) @@ -181,7 +179,8 @@ class PeftModel: def from_pretrained(*args, **kwargs): old_create_new_module = LoraModel._create_new_module - LoraModel._create_new_module = _create_new_module + LoraModel._create_new_module = staticmethod(functools.partial(_create_new_module, + old_create_new_module)) from peft import PeftModel try: model = PeftModel.from_pretrained(*args, **kwargs)
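The `qlora.py` change above follows a wrap-and-delegate pattern: the original static factory is captured, bound as the first argument of the replacement via `functools.partial`, and used as the fallback for every target that is not a `LowBitLinear`. A minimal, self-contained sketch of the same pattern, with hypothetical names rather than the real peft/BigDL classes:

```python
import functools


class LoraModelSketch:
    """Hypothetical stand-in for peft's LoraModel, used only to illustrate the patch."""

    @staticmethod
    def _create_new_module(lora_config, adapter_name, target, **kwargs):
        return f"default module for {target!r}"


def _create_low_bit_module(create_new_module_func, lora_config, adapter_name, target, **kwargs):
    # Handle the special target ourselves; delegate everything else to the original factory.
    if target == "low-bit-linear":
        return "LoraLowBitLinear(...)"
    return create_new_module_func(lora_config, adapter_name, target, **kwargs)


# Capture the original factory, then swap in the wrapped one (bound via functools.partial).
old_create_new_module = LoraModelSketch._create_new_module
LoraModelSketch._create_new_module = staticmethod(
    functools.partial(_create_low_bit_module, old_create_new_module))
try:
    print(LoraModelSketch._create_new_module(None, "default", "low-bit-linear"))
    print(LoraModelSketch._create_new_module(None, "default", "nn.Linear"))
finally:
    # Restore the original factory so the patch stays scoped to this call.
    LoraModelSketch._create_new_module = old_create_new_module
```

Wrapping the partial in `staticmethod` mirrors the descriptor type of the attribute being replaced, and capturing `old_create_new_module` before the swap is what makes the scoped fallback possible.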