Merge remote-tracking branch 'upstream/main'

This commit is contained in:
Wang 2023-10-07 09:53:52 +08:00
commit 4aee952b10
26 changed files with 1181 additions and 42 deletions

View file

@ -12,6 +12,7 @@ on:
- all
- bigdl-llm-xpu
- bigdl-llm-cpu
- bigdl-llm-serving-cpu
- bigdl-ppml-gramine-base
- bigdl-ppml-trusted-bigdl-llm-gramine-base
- bigdl-ppml-trusted-bigdl-llm-gramine-ref
@ -114,6 +115,32 @@ jobs:
sudo docker push 10.239.45.10/arda/${image}:${TAG}
sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
bigdl-llm-serving-cpu:
if: ${{ github.event.inputs.artifact == 'bigdl-llm-serving-cpu' || github.event.inputs.artifact == 'all' }}
runs-on: [self-hosted, Shire]
steps:
- uses: actions/checkout@v3
- name: docker login
run: |
docker login -u ${DOCKERHUB_USERNAME} -p ${DOCKERHUB_PASSWORD}
- name: bigdl-llm-serving-cpu
run: |
echo "##############################################################"
echo "####### bigdl-llm-serving-cpu ########"
echo "##############################################################"
export image=intelanalytics/bigdl-llm-serving-cpu
cd docker/llm/serving/cpu/docker
sudo docker build \
--no-cache=true \
--build-arg http_proxy=${HTTP_PROXY} \
--build-arg https_proxy=${HTTPS_PROXY} \
--build-arg no_proxy=${NO_PROXY} \
-t ${image}:${TAG} -f ./Dockerfile .
sudo docker push ${image}:${TAG}
sudo docker tag ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
sudo docker push 10.239.45.10/arda/${image}:${TAG}
sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
bigdl-ppml-gramine-base:
if: ${{ github.event.inputs.artifact == 'bigdl-ppml-gramine-base' || github.event.inputs.artifact == 'all' }}
runs-on: [self-hosted, Shire]

View file

@ -9,12 +9,13 @@
**[`bigdl-llm`](python/llm)** is a library for running **LLM** (large language model) on Intel **XPU** (from *Laptop* to *GPU* to *Cloud*) using **INT4** with very low latency[^1] (for any **PyTorch** model).
> *It is built on top of the excellent work of [llama.cpp](https://github.com/ggerganov/llama.cpp), [gptq](https://github.com/IST-DASLab/gptq), [ggml](https://github.com/ggerganov/ggml), [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), [bitsandbytes](https://github.com/TimDettmers/bitsandbytes), [qlora](https://github.com/artidoro/qlora), [gptq_for_llama](https://github.com/qwopqwop200/GPTQ-for-LLaMa), [chatglm.cpp](https://github.com/li-plus/chatglm.cpp), [redpajama.cpp](https://github.com/togethercomputer/redpajama.cpp), [gptneox.cpp](https://github.com/byroneverson/gptneox.cpp), [bloomz.cpp](https://github.com/NouamaneTazi/bloomz.cpp/), etc.*
> *It is built on top of the excellent work of [llama.cpp](https://github.com/ggerganov/llama.cpp), [ggml](https://github.com/ggerganov/ggml), [gptq](https://github.com/IST-DASLab/gptq), [bitsandbytes](https://github.com/TimDettmers/bitsandbytes), [qlora](https://github.com/artidoro/qlora), [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), [gptq_for_llama](https://github.com/qwopqwop200/GPTQ-for-LLaMa), [chatglm.cpp](https://github.com/li-plus/chatglm.cpp), [redpajama.cpp](https://github.com/togethercomputer/redpajama.cpp), [gptneox.cpp](https://github.com/byroneverson/gptneox.cpp), [bloomz.cpp](https://github.com/NouamaneTazi/bloomz.cpp/), etc.*
### Latest update
- **[New]** `bigdl-llm` now supports QLoRA finetuning on Intel GPU; see the example [here](python/llm/example/gpu/qlora_finetuning).
- `bigdl-llm` now supports Intel GPU (including Arc, Flex and MAX); see the latest GPU examples [here](python/llm/example/gpu).
- `bigdl-llm` tutorial is released [here](https://github.com/intel-analytics/bigdl-llm-tutorial).
- Over 20 models have been optimized/verified on `bigdl-llm`, including *LLaMA/LLaMA2, ChatGLM/ChatGLM2, MPT, Falcon, Dolly-v1/Dolly-v2, StarCoder, Whisper, InternLM, QWen, Baichuan, MOSS,* and more; see the complete list [here](python/llm/README.md#verified-models).
- Over 20 models have been optimized/verified on `bigdl-llm`, including *LLaMA/LLaMA2, ChatGLM/ChatGLM2, MPT, Falcon, Dolly, StarCoder, Whisper, InternLM, QWen, Baichuan, Aquila, MOSS,* and more; see the complete list [here](python/llm/README.md#verified-models).
### `bigdl-llm` Demos
See the ***optimized performance*** of `chatglm2-6b` and `llama-2-13b-chat` models on 12th Gen Intel Core CPU and Intel Arc GPU below.

View file

@ -2,10 +2,13 @@ FROM intelanalytics/bigdl-llm-cpu:2.4.0-SNAPSHOT
ARG http_proxy
ARG https_proxy
ARG TINI_VERSION=v0.18.0
# Disable pip's cache behavior
ARG PIP_NO_CACHE_DIR=false
ADD ./entrypoint.sh /opt/entrypoint.sh
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /sbin/tini
# Install Serving Dependencies
RUN mkdir /llm && \
cd /llm && \
@ -13,7 +16,11 @@ RUN mkdir /llm && \
cd FastChat && \
git checkout dev-2023-09-22 && \
pip3 install -e ".[model_worker,webui]" && \
cd /llm
cd /llm && \
chmod +x /opt/entrypoint.sh && \
chmod +x /sbin/tini && \
cp /sbin/tini /usr/bin/tini
WORKDIR /llm/
ENTRYPOINT [ "/opt/entrypoint.sh" ]

View file

@ -0,0 +1,200 @@
#!/bin/bash
usage() {
echo "Usage: $0 [-m --mode <controller|worker>] [-h --help]"
echo "-h: Print help message."
echo "Controller mode reads the following env:"
echo "CONTROLLER_HOST (default: localhost)."
echo "CONTROLLER_PORT (default: 21001)."
echo "API_HOST (default: localhost)."
echo "API_PORT (default: 8000)."
echo "Worker mode reads the following env:"
echo "CONTROLLER_HOST (default: localhost)."
echo "CONTROLLER_PORT (default: 21001)."
echo "WORKER_HOST (default: localhost)."
echo "WORKER_PORT (default: 21002)."
echo "MODEL_PATH (default: empty)."
exit 1
}
# Acquire the correct number of cores when cpuset-cpus is used; return -1 if the file does not exist
calculate_total_cores() {
local cpuset_file="/sys/fs/cgroup/cpuset/cpuset.cpus"
if [[ -f "$cpuset_file" ]]; then
local cpuset_cpus=$(cat "$cpuset_file")
cpuset_cpus=$(echo "${cpuset_cpus}" | tr -d '\n')
local total_cores=0
IFS=',' read -ra cpu_list <<< "$cpuset_cpus"
for cpu in "${cpu_list[@]}"; do
if [[ $cpu =~ - ]]; then
# Range of CPUs
local start_cpu=$(echo "$cpu" | cut -d'-' -f1)
local end_cpu=$(echo "$cpu" | cut -d'-' -f2)
local range_cores=$((end_cpu - start_cpu + 1))
total_cores=$((total_cores + range_cores))
else
# Single CPU
total_cores=$((total_cores + 1))
fi
done
echo $total_cores
return
fi
# Kubernetes core-binding will use this file
cpuset_file="/sys/fs/cgroup/cpuset.cpus"
if [[ -f "$cpuset_file" ]]; then
local cpuset_cpus=$(cat "$cpuset_file")
cpuset_cpus=$(echo "${cpuset_cpus}" | tr -d '\n')
local total_cores=0
IFS=',' read -ra cpu_list <<< "$cpuset_cpus"
for cpu in "${cpu_list[@]}"; do
if [[ $cpu =~ - ]]; then
# Range of CPUs
local start_cpu=$(echo "$cpu" | cut -d'-' -f1)
local end_cpu=$(echo "$cpu" | cut -d'-' -f2)
local range_cores=$((end_cpu - start_cpu + 1))
total_cores=$((total_cores + range_cores))
else
# Single CPU
total_cores=$((total_cores + 1))
fi
done
echo $total_cores
return
else
echo -1
return
fi
}
# Default values
controller_host="localhost"
controller_port="21001"
api_host="localhost"
api_port="8000"
worker_host="localhost"
worker_port="21002"
model_path=""
mode=""
omp_num_threads=""
dispatch_method="shortest_queue" # shortest_queue or lottery
# Update rootCA config if needed
update-ca-certificates
# Remember the value of `OMP_NUM_THREADS`:
if [[ -n "${OMP_NUM_THREADS}" ]]; then
omp_num_threads="${OMP_NUM_THREADS}"
fi
# If no arguments are passed in, just run bash
if [ "$#" == 0 ]; then
echo "[INFO] no command is passed in"
echo "[INFO] enter pass-through mode"
exec /usr/bin/tini -s -- "bash"
else
# Parse command-line options
options=$(getopt -o "m:h" --long "mode:,help" -n "$0" -- "$@")
if [ $? != 0 ]; then
usage
fi
eval set -- "$options"
while true; do
case "$1" in
-m|--mode)
mode="$2"
[[ $mode == "controller" || $mode == "worker" ]] || usage
shift 2
;;
-h|--help)
usage
;;
--)
shift
break
;;
*)
usage
;;
esac
done
if [[ -n $CONTROLLER_HOST ]]; then
controller_host=$CONTROLLER_HOST
fi
if [[ -n $CONTROLLER_PORT ]]; then
controller_port=$CONTROLLER_PORT
fi
if [[ -n $API_HOST ]]; then
api_host=$API_HOST
fi
if [[ -n $API_PORT ]]; then
api_port=$API_PORT
fi
if [[ -n $WORKER_HOST ]]; then
worker_host=$WORKER_HOST
fi
if [[ -n $WORKER_PORT ]]; then
worker_port=$WORKER_PORT
fi
if [[ -n $MODEL_PATH ]]; then
model_path=$MODEL_PATH
fi
if [[ -n $DISPATCH_METHOD ]]; then
dispatch_method=$DISPATCH_METHOD
fi
controller_address="http://$controller_host:$controller_port"
# Execute logic based on options
if [[ $mode == "controller" ]]; then
# Logic for controller mode
# Boot Controller
api_address="http://$api_host:$api_port"
echo "Controller address: $controller_address"
echo "OpenAI API address: $api_address"
python3 -m fastchat.serve.controller --host $controller_host --port $controller_port --dispatch-method $dispatch_method &
# Boot openai api server
python3 -m fastchat.serve.openai_api_server --host $api_host --port $api_port --controller-address $controller_address
else
# Logic for non-controller(worker) mode
worker_address="http://$worker_host:$worker_port"
# Apply optimizations from bigdl-nano
source bigdl-nano-init -t
# First check whether the user has set OMP_NUM_THREADS themselves
if [[ -n "${omp_num_threads}" ]]; then
echo "Setting OMP_NUM_THREADS to its original value: $omp_num_threads"
export OMP_NUM_THREADS=$omp_num_threads
else
# Use calculate_total_cores to acquire cpuset settings
# Set OMP_NUM_THREADS to correct numbers
cores=$(calculate_total_cores)
if [[ $cores == -1 || $cores == 0 ]]; then
echo "Failed to obtain the number of cores, will use the default settings OMP_NUM_THREADS=$OMP_NUM_THREADS"
else
echo "Setting OMP_NUM_THREADS to $cores"
export OMP_NUM_THREADS=$cores
fi
fi
if [[ -z "${model_path}" ]]; then
echo "Please set env MODEL_PATH used for worker"
usage
fi
echo "Worker address: $worker_address"
echo "Controller address: $controller_address"
python3 -m fastchat.serve.model_worker --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address
fi
fi

View file

@ -0,0 +1,235 @@
## Deploying the bigdl-llm serving service in a K8s environment
## Image
To deploy the BigDL-LLM serving service (CPU) in a Kubernetes environment, please use this image: `intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT`
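If the image is not already present on your nodes, you can pull it ahead of time (optional; with the `IfNotPresent` pull policy used in the manifests below, Kubernetes will also pull it on demand):
```bash
docker pull intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
```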
## Before deployment
### Models
In this document, we will use `vicuna-7b-v1.5` as the deployment model.
After downloading the model, please rename it from `vicuna-7b-v1.5` to `vicuna-7b-v1.5-bigdl` so that `bigdl-llm` is used as the backend. The `bigdl-llm` backend is selected whenever the model path contains `bigdl`; otherwise, the original transformers backend is used.
You can download the model from [here](https://huggingface.co/lmsys/vicuna-7b-v1.5).
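One way to do this with git (a sketch; any download method works, this one requires `git-lfs`):
```bash
# Download vicuna-7b-v1.5 from Hugging Face and rename the directory so that
# its path contains "bigdl", which selects the bigdl-llm backend.
git lfs install
git clone https://huggingface.co/lmsys/vicuna-7b-v1.5
mv vicuna-7b-v1.5 vicuna-7b-v1.5-bigdl
```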
### Kubernetes config
We recommend setting up your Kubernetes cluster before deployment. Most importantly, please set the `cpu-management-policy` to `static` by following this [tutorial](https://kubernetes.io/docs/tasks/administer-cluster/cpu-management-policies/). It is also recommended to set the `topology management policy` to `single-numa-node`.
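For reference, a rough sketch of the node-level changes involved (illustrative only; please follow the linked tutorial for the exact procedure, and note that the static CPU manager also requires reserving some CPUs for the system, e.g. via `--reserved-cpus` or `--kube-reserved`):
```bash
# Changing the CPU manager policy requires removing the old state file first.
sudo rm -f /var/lib/kubelet/cpu_manager_state
# Example kubelet flags (normally set in the kubelet config file or systemd unit):
#   --cpu-manager-policy=static
#   --topology-manager-policy=single-numa-node
#   --reserved-cpus=0
sudo systemctl restart kubelet
```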
### Machine config
Turn hyper-threading off so that only physical cores are used during deployment.
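A quick way to verify whether hyper-threading is active on a node (disabling it is typically done in the BIOS and depends on your platform):
```bash
# "Thread(s) per core: 1" means only physical cores are exposed.
lscpu | grep "Thread(s) per core"
# On recent kernels, 0 means SMT is disabled.
cat /sys/devices/system/cpu/smt/active
```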
## Deployment
### Reminder on `OMP_NUM_THREADS`
The entrypoint of the image will try to set `OMP_NUM_THREADS` to the correct value by reading the cpuset configuration from the container runtime. However, this only works correctly if the `core-binding` feature is enabled. If it is not, please set the `OMP_NUM_THREADS` environment variable manually in the yaml file.
### Controller
We use the following yaml file for controller deployment:
```yaml
apiVersion: v1
kind: Pod
metadata:
name: bigdl-fschat-a1234bd-controller
labels:
fastchat-appid: a1234bd
fastchat-app-type: controller
spec:
dnsPolicy: "ClusterFirst"
containers:
- name: fastchat-controller # fixed
image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
imagePullPolicy: IfNotPresent
env:
- name: CONTROLLER_HOST # fixed
value: "0.0.0.0"
- name: CONTROLLER_PORT # fixed
value: "21005"
- name: API_HOST # fixed
value: "0.0.0.0"
- name: API_PORT # fixed
value: "8000"
ports:
- containerPort: 21005
name: con-port
- containerPort: 8000
name: api-port
resources:
requests:
memory: 16Gi
cpu: 4
limits:
memory: 16Gi
cpu: 4
args: ["-m", "controller"]
restartPolicy: "Never"
---
# Service for the controller
apiVersion: v1
kind: Service
metadata:
name: bigdl-a1234bd-fschat-controller-service
spec:
# You may want to change this to match how services are exposed in your cluster
type: NodePort
selector:
fastchat-appid: a1234bd
fastchat-app-type: controller
ports:
- name: cont-port
protocol: TCP
port: 21005
targetPort: 21005
- name: api-port
protocol: TCP
port: 8000
targetPort: 8000
```
### Worker
We use the following yaml file for the worker deployment:
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: bigdl-fschat-a1234bd-worker-deployment
spec:
# Change this to the number you want
replicas: 1
selector:
matchLabels:
fastchat: worker
template:
metadata:
labels:
fastchat: worker
spec:
dnsPolicy: "ClusterFirst"
containers:
- name: fastchat-worker # fixed
image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
imagePullPolicy: IfNotPresent
env:
- name: CONTROLLER_HOST # fixed
value: bigdl-a1234bd-fschat-controller-service
- name: CONTROLLER_PORT # fixed
value: "21005"
- name: WORKER_HOST # fixed
valueFrom:
fieldRef:
fieldPath: status.podIP
- name: WORKER_PORT # fixed
value: "21841"
- name: MODEL_PATH # Change this
value: "/llm/models/vicuna-7b-v1.5-bigdl/"
- name: OMP_NUM_THREADS
value: "16"
resources:
requests:
memory: 32Gi
cpu: 16
limits:
memory: 32Gi
cpu: 16
args: ["-m", "worker"]
volumeMounts:
- name: llm-models
mountPath: /llm/models/
restartPolicy: "Always"
volumes:
- name: llm-models
hostPath:
path: /home/llm/models # change this in other envs
```
You may want to change the `MODEL_PATH` variable in the yaml. Also, please remember to change the volume path accordingly.
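To deploy, apply the manifests and check that the pods come up, for example (assuming the combined manifest is saved as `deployment.yaml`, the same file used by the delete script):
```bash
kubectl apply -f deployment.yaml
kubectl get pods -l fastchat=worker           # worker pods created by the Deployment
kubectl logs bigdl-fschat-a1234bd-controller  # controller logs
```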
### Testing
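The examples below assume the OpenAI-compatible API is reachable at `localhost:8000`. If you are testing from outside the cluster, one option (besides the NodePort exposed above) is to port-forward the controller service; a sketch, assuming the service name from the yaml:
```bash
kubectl port-forward service/bigdl-a1234bd-fschat-controller-service 8000:8000
```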
#### Using openai-python
First, install openai-python:
```bash
pip install --upgrade openai
```
Then, interact with the model `vicuna-7b-v1.5-bigdl`:
```python
import openai
openai.api_key = "EMPTY"
openai.api_base = "http://localhost:8000/v1"
model = "vicuna-7b-v1.5-bigdl"
prompt = "Once upon a time"
# create a completion
completion = openai.Completion.create(model=model, prompt=prompt, max_tokens=64)
# print the completion
print(prompt + completion.choices[0].text)
# create a chat completion
completion = openai.ChatCompletion.create(
model=model,
messages=[{"role": "user", "content": "Hello! What is your name?"}]
)
# print the completion
print(completion.choices[0].message.content)
```
#### cURL
cURL is another good tool for inspecting the output of the API.
In the following examples, you may need to change the service address to match your deployment.
List Models:
```bash
curl http://localhost:8000/v1/models
```
If you have `jq` installed, you can use it to format the output like this:
```bash
curl http://localhost:8000/v1/models | jq
```
Chat Completions:
```bash
curl http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "YOUR_MODEL",
"messages": [{"role": "user", "content": "Hello! What is your name?"}]
}'
```
Text Completions:
```bash
curl http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "YOUR_MODEL",
"prompt": "Once upon a time",
"max_tokens": 41,
"temperature": 0.5
}'
```
Embeddings:
```bash
curl http://localhost:8000/v1/embeddings \
-H "Content-Type: application/json" \
-d '{
"model": "YOUR_MODEL",
"input": "Hello world!"
}'
```

View file

@ -0,0 +1 @@
kubectl delete -f deployment.yaml

View file

@ -0,0 +1,109 @@
apiVersion: v1
kind: Pod
metadata:
name: bigdl-fschat-a1234bd-controller
labels:
fastchat-appid: a1234bd
fastchat-app-type: controller
spec:
dnsPolicy: "ClusterFirst"
containers:
- name: fastchat-controller # fixed
image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
imagePullPolicy: IfNotPresent
env:
- name: CONTROLLER_HOST # fixed
value: "0.0.0.0"
- name: CONTROLLER_PORT # fixed
value: "21005"
- name: API_HOST # fixed
value: "0.0.0.0"
- name: API_PORT # fixed
value: "8000"
ports:
- containerPort: 21005
name: con-port
- containerPort: 8000
name: api-port
resources:
requests:
memory: 16Gi
cpu: 4
limits:
memory: 16Gi
cpu: 4
args: ["-m", "controller"]
restartPolicy: "Never"
---
# Service for the controller
apiVersion: v1
kind: Service
metadata:
name: bigdl-a1234bd-fschat-controller-service
spec:
# You may want to change this to match how services are exposed in your cluster
type: NodePort
selector:
fastchat-appid: a1234bd
fastchat-app-type: controller
ports:
- name: cont-port
protocol: TCP
port: 21005
targetPort: 21005
- name: api-port
protocol: TCP
port: 8000
targetPort: 8000
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: bigdl-fschat-a1234bd-worker-deployment
spec:
# Change this to the number you want
replicas: 1
selector:
matchLabels:
fastchat: worker
template:
metadata:
labels:
fastchat: worker
spec:
dnsPolicy: "ClusterFirst"
containers:
- name: fastchat-worker # fixed
image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
imagePullPolicy: IfNotPresent
env:
- name: CONTROLLER_HOST # fixed
value: bigdl-a1234bd-fschat-controller-service
- name: CONTROLLER_PORT # fixed
value: "21005"
- name: WORKER_HOST # fixed
valueFrom:
fieldRef:
fieldPath: status.podIP
- name: WORKER_PORT # fixed
value: "21841"
- name: MODEL_PATH # Change this
value: "/llm/models/vicuna-7b-v1.5-bigdl/"
- name: OMP_NUM_THREADS
value: "16"
resources:
requests:
memory: 32Gi
cpu: 16
limits:
memory: 32Gi
cpu: 16
args: ["-m", "worker"]
volumeMounts:
- name: llm-models
mountPath: /llm/models/
restartPolicy: "Always"
volumes:
- name: llm-models
hostPath:
path: /home/llm/models # change this in other envs

View file

@ -38,12 +38,12 @@ subtrees:
title: "Key Features"
subtrees:
- entries:
- file: doc/LLM/Overview/KeyFeatures/optimize_model
- file: doc/LLM/Overview/KeyFeatures/transformers_style_api
subtrees:
- entries:
- file: doc/LLM/Overview/KeyFeatures/hugging_face_format
- file: doc/LLM/Overview/KeyFeatures/native_format
- file: doc/LLM/Overview/KeyFeatures/optimize_model
- file: doc/LLM/Overview/KeyFeatures/langchain_api
# - file: doc/LLM/Overview/KeyFeatures/cli
- file: doc/LLM/Overview/KeyFeatures/gpu_supports

View file

@ -3,12 +3,12 @@ BigDL-LLM Key Features
You may run the LLMs using ``bigdl-llm`` through one of the following APIs:
* `PyTorch API <./optimize_model.html>`_
* |transformers_style_api|_
* |hugging_face_transformers_format|_
* `Native Format <./native_format.html>`_
* `General PyTorch Model Supports <./langchain_api.html>`_
* `LangChain API <./langchain_api.html>`_
* `GPU Supports <./gpu_supports.html>`_

View file

@ -1,22 +1,27 @@
## General PyTorch Model Supports
## PyTorch API
You may apply BigDL-LLM optimizations on any Pytorch models, not only Hugging Face *Transformers* models for acceleration. With BigDL-LLM, PyTorch models (in FP16/BF16/FP32) can be optimized with low-bit quantizations (supported precisions include INT4/INT5/INT8).
In general, you only need one line, `optimize_model`, to easily optimize any loaded PyTorch model, regardless of the library or API you are using. With BigDL-LLM, PyTorch models (in FP16/BF16/FP32) can be optimized with low-bit quantization (supported precisions include INT4, INT5, INT8, etc.).
You can easily enable BigDL-LLM INT4 optimizations on any Pytorch models just as follows:
First, use any PyTorch APIs you like to load your model. To help you better understand the process, here we use [Hugging Face Transformers](https://huggingface.co/docs/transformers/index) library `LlamaForCausalLM` to load a popular model [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) as an example:
```python
# Create or load any Pytorch model
model = ...
# Create or load any Pytorch model, take Llama-2-7b-chat-hf as an example
from transformers import LlamaForCausalLM
model = LlamaForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', torch_dtype='auto', low_cpu_mem_usage=True)
```
# Add only two lines to enable BigDL-LLM INT4 optimizations on model
Then, you just need to call `optimize_model` to optimize the loaded model; INT4 optimization is applied to the model by default:
```python
from bigdl.llm import optimize_model
# With only one line to enable BigDL-LLM INT4 optimization
model = optimize_model(model)
```
After optimizing the model, you may straightly run the optimized model with no API changed and less inference latency.
After optimizing the model, BigDL-LLM does not require any change to the inference code. You can use any library to run the optimized model with very low latency.
```eval_rst
.. seealso::
See the examples for Hugging Face *Transformers* models `here <https://github.com/intel-analytics/BigDL/blob/main/python/llm/example/transformers/general_int4>`_. And examples for other general Pytorch models can be found `here <https://github.com/intel-analytics/BigDL/blob/main/python/llm/example/pytorch-model>`_.
* For more detailed usage of ``optimize_model``, please refer to the `API documentation <https://bigdl.readthedocs.io/en/latest/doc/PythonAPI/LLM/optimize.html>`_.
```

View file

@ -5,9 +5,11 @@
Install BigDL-LLM for CPU supports using pip through:
```bash
pip install bigdl-llm[all]
pip install --pre --upgrade bigdl-llm[all] # install the latest bigdl-llm nightly build with 'all' option
```
Please refer to [Environment Setup](#environment-setup) for more information.
```eval_rst
.. note::
@ -43,7 +45,7 @@ First we recommend using [Conda](https://docs.conda.io/en/latest/miniconda.html)
conda create -n llm python=3.9
conda activate llm
pip install bigdl-llm[all] # install bigdl-llm for CPU with 'all' option
pip install --pre --upgrade bigdl-llm[all] # install the latest bigdl-llm nightly build with 'all' option
```
Then, for running an LLM model with BigDL-LLM optimizations (taking `example.py` as an example):

View file

@ -5,9 +5,11 @@
Install BigDL-LLM for GPU supports using pip through:
```bash
pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu
pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu # install bigdl-llm for GPU
```
Please refer to [Environment Setup](#environment-setup) for more information.
```eval_rst
.. note::
@ -25,6 +27,12 @@ BigDL-LLM for GPU supports has been verified on:
* Intel Arc™ A-Series Graphics
* Intel Data Center GPU Flex Series
```eval_rst
.. note::
We currently support Ubuntu 20.04 or later. Windows support is in progress.
```
To apply Intel GPU acceleration, several steps of tool installation and environment preparation are required:
* Step 1: only Linux is supported for now; Ubuntu 22.04 is preferred.

View file

@ -32,8 +32,8 @@ BigDL-LLM
+++
:bdg-link:`PyTorch <./Overview/KeyFeatures/optimize_model.html>` |
:bdg-link:`transformers-style <./Overview/KeyFeatures/transformers_style_api.html>` |
:bdg-link:`Optimize Model <./Overview/KeyFeatures/optimize_model.html>` |
:bdg-link:`LangChain <./Overview/KeyFeatures/langchain_api.html>` |
:bdg-link:`GPU <./Overview/KeyFeatures/gpu_supports.html>`

View file

@ -4,6 +4,6 @@ BigDL-LLM API
.. toctree::
:maxdepth: 3
optimize.rst
transformers.rst
langchain.rst
optimize.rst

View file

@ -1,4 +1,4 @@
BigDL-LLM Optimize API
BigDL-LLM PyTorch API
=====================
llm.optimize

View file

@ -24,9 +24,10 @@ BigDL-LLM: low-Bit LLM library
============================================
Latest update
============================================
- **[New]** ``bigdl-llm`` now supports QLoRA finetuning on Intel GPU; see the example `here <https://github.com/intel-analytics/BigDL/tree/main/python/llm/example/gpu/qlora_finetuning>`_.
- ``bigdl-llm`` now supports Intel GPU (including Arc, Flex and MAX); see the latest GPU examples `here <https://github.com/intel-analytics/BigDL/tree/main/python/llm/example/gpu>`_.
- ``bigdl-llm`` tutorial is released `here <https://github.com/intel-analytics/bigdl-llm-tutorial>`_.
- Over 20 models have been verified on ``bigdl-llm``, including *LLaMA/LLaMA2, ChatGLM/ChatGLM2, MPT, Falcon, Dolly-v1/Dolly-v2, StarCoder, Whisper, InternLM, QWen, Baichuan, MOSS* and more; see the complete list `here <https://github.com/intel-analytics/BigDL/tree/main/python/llm/README.md#verified-models>`_.
- Over 20 models have been verified on ``bigdl-llm``, including *LLaMA/LLaMA2, ChatGLM/ChatGLM2, MPT, Falcon, Dolly, StarCoder, Whisper, InternLM, QWen, Baichuan, Aquila, MOSS* and more; see the complete list `here <https://github.com/intel-analytics/BigDL/tree/main/python/llm/README.md#verified-models>`_.
============================================

View file

@ -1,4 +1,4 @@
# Q-Lora (experimental support)
# Finetuning LLAMA Using Q-Lora (experimental support)
This example demonstrates how to finetune a llama2-7b model with BigDL-LLM 4-bit optimizations using [Intel GPUs](../README.md).
@ -7,7 +7,7 @@ To run this example with BigDL-LLM on Intel GPUs, we have some recommended requi
## Example: Finetune llama2-7b using qlora
This example is ported from [bnb-4bit-training](https://colab.research.google.com/drive/1VoYNfYDKcKRQRor98Zbf2-9VQTtGJ24k?usp=sharing)
This example is ported from [bnb-4bit-training](https://colab.research.google.com/drive/1VoYNfYDKcKRQRor98Zbf2-9VQTtGJ24k?usp=sharing). The `export_merged_model.py` is ported from [alpaca-lora](https://github.com/tloen/alpaca-lora/blob/main/export_hf_checkpoint.py).
### 1. Install
@ -26,13 +26,13 @@ pip install peft==0.5.0
source /opt/intel/oneapi/setvars.sh
```
### 3. Run
### 3. Finetune model
```
python ./qlora_finetuning.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH
```
### Sample Output
#### Sample Output
```log
{'loss': 1.6134, 'learning_rate': 0.0002, 'epoch': 0.03}
{'loss': 1.3038, 'learning_rate': 0.00017777777777777779, 'epoch': 0.06}
@ -47,4 +47,12 @@ python ./qlora_finetuning.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH
{'train_runtime': 225.8005, 'train_samples_per_second': 3.543, 'train_steps_per_second': 0.886, 'train_loss': 1.211241865158081, 'epoch': 0.32}
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [03:45<00:00, 1.13s/it]
TrainOutput(global_step=200, training_loss=1.211241865158081, metrics={'train_runtime': 225.8005, 'train_samples_per_second': 3.543, 'train_steps_per_second': 0.886, 'train_loss': 1.211241865158081, 'epoch': 0.32})
```
```
### 4. Merge the adapter into the original model
```
python ./export_merged_model.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH --adapter_path ./outputs/checkpoint-200 --output_path ./outputs/checkpoint-200-merged
```
Then you can use `./outputs/checkpoint-200-merged` as a normal Hugging Face Transformers model for inference.

View file

@ -0,0 +1,93 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This file is adapted from https://github.com/tloen/alpaca-lora/blob/main/export_hf_checkpoint.py
#
# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import torch
import transformers
from transformers import LlamaTokenizer # noqa: F402
from bigdl.llm.transformers.qlora import PeftModel
from bigdl.llm.transformers import AutoModelForCausalLM
import argparse
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Llama2 model')
parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-hf",
help='The huggingface repo id for the Llama2 (e.g. `meta-llama/Llama-2-7b-hf` and `meta-llama/Llama-2-13b-chat-hf`) to be downloaded'
', or the path to the huggingface checkpoint folder')
parser.add_argument('--adapter_path', type=str,)
parser.add_argument('--output_path', type=str,)
args = parser.parse_args()
base_model = model_path = args.repo_id_or_model_path
adapter_path = args.adapter_path
tokenizer = LlamaTokenizer.from_pretrained(base_model)
base_model = AutoModelForCausalLM.from_pretrained(
base_model,
# load_in_low_bit="nf4", # should load the original model
torch_dtype=torch.float16,
device_map={"": "cpu"},
)
first_weight = base_model.model.layers[0].self_attn.q_proj.weight
first_weight_old = first_weight.clone()
lora_model = PeftModel.from_pretrained(
base_model,
adapter_path,
device_map={"": "cpu"},
torch_dtype=torch.float16,
)
lora_weight = lora_model.base_model.model.model.layers[
0
].self_attn.q_proj.weight
assert torch.allclose(first_weight_old, first_weight)
# merge weights - new merging method from peft
lora_model = lora_model.merge_and_unload()
lora_model.train(False)
# did we do anything?
assert not torch.allclose(first_weight_old, first_weight)
lora_model_sd = lora_model.state_dict()
deloreanized_sd = {
k.replace("base_model.model.", ""): v
for k, v in lora_model_sd.items()
if "lora" not in k
}
base_model.save_pretrained(args.output_path, state_dict=deloreanized_sd)
tokenizer.save_pretrained(args.output_path)

View file

@ -45,8 +45,9 @@ if __name__ == "__main__":
data = load_dataset(dataset_path)
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
model = AutoModelForCausalLM.from_pretrained(model_path,
load_in_4bit=True,
load_in_low_bit="nf4",
optimize_model=False,
torch_dtype=torch.float16,
modules_to_not_convert=["lm_head"],)
model = model.to('xpu')
model.gradient_checkpointing_enable()
@ -71,7 +72,8 @@ if __name__ == "__main__":
warmup_steps=20,
max_steps=200,
learning_rate=2e-4,
fp16=False, # fp16 is not supported yet
save_steps=100,
fp16=True,
logging_steps=20,
output_dir="outputs",
optim="adamw_hf", # paged_adamw_8bit is not supported yet

View file

@ -47,7 +47,11 @@ function starcoder {
}
function chatglm {
command="$lib_dir/main-chatglm_vnni -t $threads -n $n_predict ${filteredArguments[*]}"
if [[ $(lscpu | grep "amx_int8") ]]; then
command="$lib_dir/main-chatglm_amx -t $threads -n $n_predict ${filteredArguments[*]}"
else
command="$lib_dir/main-chatglm_vnni -t $threads -n $n_predict ${filteredArguments[*]}"
fi
echo "$command"
eval "$command"
}

View file

@ -135,6 +135,7 @@ def convert_forward(m, target_m, new_forward):
def optimize(model):
from packaging import version
from bigdl.llm.transformers.models.llama import llama_attention_forward_4_31
from bigdl.llm.transformers.models.llama import llama_rms_norm_forward
from transformers.modeling_utils import PreTrainedModel
# All huggingface format models are inherited from `PreTrainedModel`
@ -149,11 +150,16 @@ def optimize(model):
model,
transformers.models.llama.modeling_llama.LlamaAttention,
llama_attention_forward_4_31,)
convert_forward(
model,
transformers.models.llama.modeling_llama.LlamaRMSNorm,
llama_rms_norm_forward,)
else:
# todo implement 4.28.0 ~ 4.30.2
pass
if "chatglm2" in model.config._name_or_path:
if "chatglm-18b" in model.config._name_or_path or "chatglm2" in model.config._name_or_path:
# chatglm-18b or chatglm2-6b
modeling_module_name = model.__class__.__module__
module = importlib.import_module(modeling_module_name)
from bigdl.llm.transformers.models.chatglm2 import chatglm2_attention_forward_8eb45c
@ -166,6 +172,7 @@ def optimize(model):
module.CoreAttention,
core_attn_forward_8eb45c)
elif "chatglm" in model.config._name_or_path:
# chatglm-6b
modeling_module_name = model.__class__.__module__
module = importlib.import_module(modeling_module_name)
from bigdl.llm.transformers.models.chatglm import chatglm_attention_forward
@ -280,4 +287,20 @@ def optimize(model):
module.InternLMAttention,
internlm_attention_forward
)
elif model.config.model_type == "qwen":
modeling_module_name = model.__class__.__module__
module = importlib.import_module(modeling_module_name)
from bigdl.llm.transformers.models.qwen import qwen_attention_forward
convert_forward(model,
module.QWenAttention,
qwen_attention_forward
)
elif model.config.model_type == "aquila":
modeling_module_name = model.__class__.__module__
module = importlib.import_module(modeling_module_name)
from bigdl.llm.transformers.models.aquila import aquila_attention_forward
convert_forward(model,
module.AquilaAttention,
aquila_attention_forward
)
return model

View file

@ -0,0 +1,157 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Some parts of this file is adapted from
# https://huggingface.co/BAAI/AquilaChat-7B/blob/main/modeling_aquila.py
#
# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from bigdl.llm.transformers.models.utils import extend_kv_cache, init_kv_cache, append_kv_cache
from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb
from bigdl.dllib.utils import log4Error
KV_CACHE_ALLOC_BLOCK_LENGTH = 256
def aquila_attention_forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)\
.view(bsz, q_len, self.num_heads, self.head_dim)\
.transpose(1, 2)
key_states = self.k_proj(hidden_states)\
.view(bsz, q_len, self.num_heads, self.head_dim)\
.transpose(1, 2)
value_states = self.v_proj(hidden_states)\
.view(bsz, q_len, self.num_heads, self.head_dim)\
.transpose(1, 2)
kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
kv_seq_len += past_key_value[0].shape[-2]
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states,
cos, sin, position_ids, "aquila")
# [bsz, nh, t, hd]
if past_key_value is not None:
# reuse k, v, self_attention
cache_k = past_key_value[0]
cache_v = past_key_value[1]
if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3):
# allocate new
new_cache_k, new_cache_v = extend_kv_cache(bsz,
self.num_heads, # Support GQA
self.head_dim,
cache_k.size(2),
kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH,
dtype=cache_k.dtype,
device=hidden_states.device)
new_cache_k[:] = cache_k
new_cache_v[:] = cache_v
cache_k = new_cache_k
cache_v = new_cache_v
key_states, value_states = append_kv_cache(cache_k, cache_v, key_states, value_states)
elif use_cache:
max_cache_length = kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH
new_key_states, new_value_states = init_kv_cache(bsz,
self.num_heads,
self.head_dim,
kv_seq_len,
max_cache_length,
dtype=key_states.dtype,
device=hidden_states.device)
new_key_states[:] = key_states
new_value_states[:] = value_states
key_states = new_key_states
value_states = new_value_states
past_key_value = (key_states, value_states) if use_cache else None
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
attn_weights = torch.clamp(attn_weights, min=-1024., max=1024.)
if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
log4Error.invalidInputError(
f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, "
f"but is {attn_weights.size()}"
)
if attention_mask is not None:
if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
log4Error.invalidInputError(
f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, "
f"but is {attention_mask.size()}"
)
attn_weights = attn_weights + attention_mask
attn_weights = torch.max(
attn_weights,
torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device)
)
# upcast attention to fp32
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32)\
.to(query_states.dtype)
attn_output = torch.matmul(attn_weights, value_states)
if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
log4Error.invalidInputError(
f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, "
f"but is {attn_output.size()}"
)
attn_output = attn_output.transpose(1, 2)
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value

View file

@ -39,6 +39,7 @@ import torch.nn.functional as F
from bigdl.llm.utils.common import invalidInputError
from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache
from bigdl.llm.transformers.models.utils import rotate_half, apply_rotary_pos_emb
from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
@ -57,6 +58,19 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
KV_CACHE_ALLOC_BLOCK_LENGTH = 256
def llama_rms_norm_forward(self, hidden_states):
if hidden_states.device.type == "xpu" and not (self.training and hidden_states.requires_grad):
hidden_states, _ = torch.ops.torch_ipex.rms_norm(hidden_states,
[self.weight.size(0)], self.weight)
else:
input_dtype = hidden_states.dtype
hidden_states = hidden_states.to(torch.float32)
variance = hidden_states.pow(2).mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
return self.weight * hidden_states.to(input_dtype)
return hidden_states
def llama_attention_forward_4_31(
self,
hidden_states: torch.Tensor,
@ -103,9 +117,20 @@ def llama_attention_forward_4_31(
kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
kv_seq_len += past_key_value[0].shape[-2]
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states,
cos, sin, position_ids, "llama")
use_fuse_rope = query_states.device.type == "xpu"
use_fuse_rope = use_fuse_rope and not (self.training and query_states.requires_grad)
use_fuse_rope = use_fuse_rope and self.config.rope_scaling is None
if use_fuse_rope:
query_states, key_states = apply_rotary_pos_emb_no_cache_xpu(query_states,
key_states,
position_ids,
"llama")
else:
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states,
cos, sin, position_ids, "llama")
if past_key_value is not None:
# reuse k, v, self_attention

View file

@ -0,0 +1,217 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Some parts of this file is adapted from
# https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/modeling_qwen.py
#
# Copyright (c) Alibaba Cloud.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
import importlib
import math
from typing import TYPE_CHECKING, Optional, Tuple, Union, Callable, List
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from transformers.utils import logging
try:
from einops import rearrange
except ImportError:
rearrange = None
from bigdl.llm.transformers.models.utils import extend_kv_cache, init_kv_cache, append_kv_cache
from bigdl.llm.utils.common import invalidInputError
apply_rotary_emb_func = None
flash_attn_unpadded_func = None
logger = logging.get_logger(__name__)
KV_CACHE_ALLOC_BLOCK_LENGTH = 256
def _rotate_half(x):
from einops import rearrange
x = rearrange(x, "... (j d) -> ... j d", j=2)
x1, x2 = x.unbind(dim=-2)
return torch.cat((-x2, x1), dim=-1)
def apply_rotary_pos_emb(t, freqs):
if apply_rotary_emb_func is not None:
t_ = t.float()
freqs = freqs.squeeze(0).squeeze(1)
cos = freqs[:, : freqs.shape[-1] // 2].cos()
sin = freqs[:, : freqs.shape[-1] // 2].sin()
output = apply_rotary_emb_func(t_, cos, sin).type_as(t)
return output
else:
rot_dim = freqs.shape[-1]
t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:]
t_ = t_.float()
t_pass_ = t_pass_.float()
t_ = (t_ * freqs.cos()) + (_rotate_half(t_) * freqs.sin())
return torch.cat((t_, t_pass_), dim=-1).type_as(t)
def qwen_attention_forward(
self,
hidden_states: Optional[Tuple[torch.FloatTensor]],
layer_past: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
):
mixed_x_layer = self.c_attn(hidden_states)
query, key, value = mixed_x_layer.split(self.split_size, dim=2)
query = self._split_heads(query, self.num_heads, self.head_dim)
key = self._split_heads(key, self.num_heads, self.head_dim)
value = self._split_heads(value, self.num_heads, self.head_dim)
kv_seq_len = hidden_states.size()[1]
if layer_past:
# layer past[0] shape: bs * seq_len * head_num * dim
kv_seq_len += layer_past[0].shape[1]
if (
self.use_dynamic_ntk
and kv_seq_len == hidden_states.size()[1]
and not self.training
):
context_value = math.log(kv_seq_len / self.seq_length, 2) + 1
ntk_alpha = 2 ** math.ceil(context_value) - 1
ntk_alpha = max(ntk_alpha, 1)
self._ntk_cached = ntk_alpha
else:
ntk_alpha = self._ntk_cached
rotary_pos_emb = self.rotary_emb(kv_seq_len, ntk_alpha=ntk_alpha).to(
hidden_states.device
)
if rotary_pos_emb is not None:
if isinstance(rotary_pos_emb, tuple):
rotary_pos_emb = rotary_pos_emb
else:
rotary_pos_emb = (rotary_pos_emb,) * 2
if rotary_pos_emb is not None:
q_pos_emb, k_pos_emb = rotary_pos_emb
# Slice the pos emb for current inference
cur_len = query.shape[1]
q_pos_emb = q_pos_emb[:, -cur_len:, :, :]
k_pos_emb = k_pos_emb[:, -cur_len:, :, :]
query = apply_rotary_pos_emb(query, q_pos_emb)
key = apply_rotary_pos_emb(key, k_pos_emb)
bsz, _, n_heads, head_dim = key.size()
if layer_past is not None:
# past_key, past_value = layer_past[0], layer_past[1]
# key = torch.cat((past_key, key), dim=1)
# value = torch.cat((past_value, value), dim=1)
cache_k = layer_past[0].transpose(1, 2)
cache_v = layer_past[1].transpose(1, 2)
if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3):
# allocate new
new_cache_k, new_cache_v = extend_kv_cache(bsz,
self.num_heads, # Support GQA
self.head_dim,
cache_k.size(2),
kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH,
dtype=cache_k.dtype,
device=hidden_states.device)
new_cache_k[:] = cache_k
new_cache_v[:] = cache_v
cache_k = new_cache_k
cache_v = new_cache_v
key_states, value_states = append_kv_cache(cache_k, cache_v,
key.transpose(1, 2), value.transpose(1, 2))
key = key_states.transpose(1, 2)
value = value_states.transpose(1, 2)
elif use_cache:
max_cache_length = kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH
new_key_states, new_value_states = init_kv_cache(bsz,
self.num_heads,
self.head_dim,
kv_seq_len,
max_cache_length,
dtype=key.dtype,
device=hidden_states.device)
new_key_states[:] = key.transpose(1, 2)
new_value_states[:] = value.transpose(1, 2)
key = new_key_states.transpose(1, 2)
value = new_value_states.transpose(1, 2)
if use_cache:
present = (key, value)
else:
present = None
if self.use_logn_attn and not self.training:
if self.logn_tensor.device != query.device or self.logn_tensor.dtype != query.dtype:
self.logn_tensor = self.logn_tensor.to(query.device).type_as(query)
seq_start = key.size(1) - query.size(1)
seq_end = key.size(1)
logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :]
query = query * logn_tensor.expand_as(query)
if (
self.use_flash_attn
and flash_attn_unpadded_func is not None
and not self.is_fp32
and query.is_cuda
):
q, k, v = query, key, value
context_layer = self.core_attention_flash(q, k, v)
context_layer = rearrange(
context_layer, "b s h d -> b s (h d)"
).contiguous()
else:
query = query.permute(0, 2, 1, 3)
key = key.permute(0, 2, 1, 3)
value = value.permute(0, 2, 1, 3)
attn_output, attn_weight = self._attn(
query, key, value, attention_mask, head_mask
)
context_layer = self._merge_heads(
attn_output, self.num_heads, self.head_dim
)
attn_output = self.c_proj(context_layer)
outputs = (attn_output, present)
if output_attentions:
if (
self.use_flash_attn
and flash_attn_unpadded_func is not None
and not self.is_fp32
):
invalidInputError("Cannot output attentions while using flash-attn")
else:
outputs += (attn_weight,)
return outputs

View file

@ -71,7 +71,7 @@ def rotate_every_two(x):
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, model_family):
if model_family in ["llama", "baichuan", "internlm"]:
if model_family in ["llama", "baichuan", "internlm", "aquila"]:
# The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
@ -97,3 +97,18 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, model_family):
else:
invalidInputError(False,
f"{model_family} is not supported.")
def apply_rotary_pos_emb_no_cache_xpu(q, k, position_ids, model_family):
if q.device.type != "xpu":
invalidInputError(False,
f"only xpu is supported in this function")
import linear_q4_0
q_embed = torch.empty(q.shape, dtype=q.dtype, device=q.device)
k_embed = torch.empty(k.shape, dtype=k.dtype, device=k.device)
if model_family in ["llama", "baichuan", "internlm", "aquila", "gpt_neox"]:
linear_q4_0.apply_rotary_embedding_half_qk(q, k, position_ids, q_embed, k_embed)
return q_embed, k_embed
else:
invalidInputError(False,
f"{model_family} is not supported.")

View file

@ -36,6 +36,7 @@ import torch
from bigdl.llm.transformers.low_bit_linear import LowBitLinear
from peft.tuners.lora import LoraLayer
from bigdl.llm.utils.common import invalidInputError
import functools
class LoraLowBitLinear(LowBitLinear, LoraLayer):
@ -94,13 +95,11 @@ class LoraLowBitLinear(LowBitLinear, LoraLayer):
return result
@staticmethod
def _create_new_module(lora_config, adapter_name, target, **kwargs):
bias = kwargs.pop("bias", False)
def _create_new_module(create_new_module_func, lora_config, adapter_name, target, **kwargs):
if isinstance(target, LowBitLinear):
low_bit_kwargs = kwargs.copy()
bias = low_bit_kwargs.pop("bias", False)
low_bit_kwargs.update(
{
"qtype": target.qtype,
@ -112,9 +111,7 @@ def _create_new_module(lora_config, adapter_name, target, **kwargs):
bias=bias,
**low_bit_kwargs)
else:
invalidInputError(False,
f"Target module {target} is not supported. "
f"Currently, only `LowBitLinear` are supported.")
new_module = create_new_module_func(lora_config, adapter_name, target, **kwargs)
return new_module
@ -124,7 +121,8 @@ from peft.tuners.lora import LoraModel
def get_peft_model(*args, **kwargs):
old_create_new_module = LoraModel._create_new_module
LoraModel._create_new_module = _create_new_module
LoraModel._create_new_module = staticmethod(functools.partial(_create_new_module,
old_create_new_module))
try:
from peft import get_peft_model as get_peft_model_original
model = get_peft_model_original(*args, **kwargs)
@ -181,7 +179,8 @@ class PeftModel:
def from_pretrained(*args,
**kwargs):
old_create_new_module = LoraModel._create_new_module
LoraModel._create_new_module = _create_new_module
LoraModel._create_new_module = staticmethod(functools.partial(_create_new_module,
old_create_new_module))
from peft import PeftModel
try:
model = PeftModel.from_pretrained(*args, **kwargs)