diff --git a/.github/workflows/manually_build.yml b/.github/workflows/manually_build.yml index e5fe603d..5b2cc7af 100644 --- a/.github/workflows/manually_build.yml +++ b/.github/workflows/manually_build.yml @@ -12,6 +12,7 @@ on: - all - bigdl-llm-xpu - bigdl-llm-cpu + - bigdl-llm-serving-cpu - bigdl-ppml-gramine-base - bigdl-ppml-trusted-bigdl-llm-gramine-base - bigdl-ppml-trusted-bigdl-llm-gramine-ref @@ -114,6 +115,32 @@ jobs: sudo docker push 10.239.45.10/arda/${image}:${TAG} sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG} + bigdl-llm-serving-cpu: + if: ${{ github.event.inputs.artifact == 'bigdl-llm-serving-cpu' || github.event.inputs.artifact == 'all' }} + runs-on: [self-hosted, Shire] + steps: + - uses: actions/checkout@v3 + - name: docker login + run: | + docker login -u ${DOCKERHUB_USERNAME} -p ${DOCKERHUB_PASSWORD} + - name: bigdl-llm-serving-cpu + run: | + echo "##############################################################" + echo "####### bigdl-llm-serving-cpu ########" + echo "##############################################################" + export image=intelanalytics/bigdl-llm-serving-cpu + cd docker/llm/serving/cpu/docker + sudo docker build \ + --no-cache=true \ + --build-arg http_proxy=${HTTP_PROXY} \ + --build-arg https_proxy=${HTTPS_PROXY} \ + --build-arg no_proxy=${NO_PROXY} \ + -t ${image}:${TAG} -f ./Dockerfile . + sudo docker push ${image}:${TAG} + sudo docker tag ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG} + sudo docker push 10.239.45.10/arda/${image}:${TAG} + sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG} + bigdl-ppml-gramine-base: if: ${{ github.event.inputs.artifact == 'bigdl-ppml-gramine-base' || github.event.inputs.artifact == 'all' }} runs-on: [self-hosted, Shire] diff --git a/README.md b/README.md index 0966576e..841d9bcd 100644 --- a/README.md +++ b/README.md @@ -9,12 +9,13 @@ **[`bigdl-llm`](python/llm)** is a library for running **LLM** (large language model) on Intel **XPU** (from *Laptop* to *GPU* to *Cloud*) using **INT4** with very low latency[^1] (for any **PyTorch** model). 
-> *It is built on top of the excellent work of [llama.cpp](https://github.com/ggerganov/llama.cpp), [gptq](https://github.com/IST-DASLab/gptq), [ggml](https://github.com/ggerganov/ggml), [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), [bitsandbytes](https://github.com/TimDettmers/bitsandbytes), [qlora](https://github.com/artidoro/qlora), [gptq_for_llama](https://github.com/qwopqwop200/GPTQ-for-LLaMa), [chatglm.cpp](https://github.com/li-plus/chatglm.cpp), [redpajama.cpp](https://github.com/togethercomputer/redpajama.cpp), [gptneox.cpp](https://github.com/byroneverson/gptneox.cpp), [bloomz.cpp](https://github.com/NouamaneTazi/bloomz.cpp/), etc.* +> *It is built on top of the excellent work of [llama.cpp](https://github.com/ggerganov/llama.cpp), [ggml](https://github.com/ggerganov/ggml), [gptq](https://github.com/IST-DASLab/gptq), [bitsandbytes](https://github.com/TimDettmers/bitsandbytes), [qlora](https://github.com/artidoro/qlora), [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), [gptq_for_llama](https://github.com/qwopqwop200/GPTQ-for-LLaMa), [chatglm.cpp](https://github.com/li-plus/chatglm.cpp), [redpajama.cpp](https://github.com/togethercomputer/redpajama.cpp), [gptneox.cpp](https://github.com/byroneverson/gptneox.cpp), [bloomz.cpp](https://github.com/NouamaneTazi/bloomz.cpp/), etc.* ### Latest update +- **[New]** `bigdl-llm` now supports QLoRA finetuning on Intel GPU; see the example [here](python/llm/example/gpu/qlora_finetuning). - `bigdl-llm` now supports Intel GPU (including Arc, Flex and MAX); see the the latest GPU examples [here](python/llm/example/gpu). - `bigdl-llm` tutorial is released [here](https://github.com/intel-analytics/bigdl-llm-tutorial). -- Over 20 models have been optimized/verified on `bigdl-llm`, including *LLaMA/LLaMA2, ChatGLM/ChatGLM2, MPT, Falcon, Dolly-v1/Dolly-v2, StarCoder, Whisper, InternLM, QWen, Baichuan, MOSS,* and more; see the complete list [here](python/llm/README.md#verified-models). +- Over 20 models have been optimized/verified on `bigdl-llm`, including *LLaMA/LLaMA2, ChatGLM/ChatGLM2, MPT, Falcon, Dolly, StarCoder, Whisper, InternLM, QWen, Baichuan, Aquila, MOSS,* and more; see the complete list [here](python/llm/README.md#verified-models). ### `bigdl-llm` Demos See the ***optimized performance*** of `chatglm2-6b` and `llama-2-13b-chat` models on 12th Gen Intel Core CPU and Intel Arc GPU below.
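The INT4 support highlighted in the README entries above boils down to the one-line `optimize_model` flow that this patch documents later in `optimize_model.md`. A minimal sketch for reference; the model id is illustrative, and gated checkpoints such as Llama 2 require Hugging Face access:

```python
from transformers import LlamaForCausalLM, LlamaTokenizer
from bigdl.llm import optimize_model

# Load any PyTorch model as usual...
model = LlamaForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf',
                                         torch_dtype='auto', low_cpu_mem_usage=True)
# ...then enable BigDL-LLM low-bit optimization with one extra line (INT4 by default).
model = optimize_model(model)

# Inference code does not change after optimization.
tokenizer = LlamaTokenizer.from_pretrained('meta-llama/Llama-2-7b-chat-hf')
inputs = tokenizer("Once upon a time", return_tensors="pt")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0]))
```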
diff --git a/docker/llm/serving/cpu/docker/Dockerfile b/docker/llm/serving/cpu/docker/Dockerfile index ede2b733..e058bed5 100644 --- a/docker/llm/serving/cpu/docker/Dockerfile +++ b/docker/llm/serving/cpu/docker/Dockerfile @@ -2,10 +2,13 @@ FROM intelanalytics/bigdl-llm-cpu:2.4.0-SNAPSHOT ARG http_proxy ARG https_proxy +ARG TINI_VERSION=v0.18.0 # Disable pip's cache behavior ARG PIP_NO_CACHE_DIR=false +ADD ./entrypoint.sh /opt/entrypoint.sh +ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /sbin/tini # Install Serving Dependencies RUN mkdir /llm && \ cd /llm && \ @@ -13,7 +16,11 @@ RUN mkdir /llm && \ cd FastChat && \ git checkout dev-2023-09-22 && \ pip3 install -e ".[model_worker,webui]" && \ - cd /llm + cd /llm && \ + chmod +x /opt/entrypoint.sh && \ + chmod +x /sbin/tini && \ + cp /sbin/tini /usr/bin/tini WORKDIR /llm/ +ENTRYPOINT [ "/opt/entrypoint.sh" ] \ No newline at end of file diff --git a/docker/llm/serving/cpu/docker/entrypoint.sh b/docker/llm/serving/cpu/docker/entrypoint.sh new file mode 100644 index 00000000..99370654 --- /dev/null +++ b/docker/llm/serving/cpu/docker/entrypoint.sh @@ -0,0 +1,200 @@ +#!/bin/bash + +usage() { + echo "Usage: $0 [-m --mode ] [-h --help]" + echo "-h: Print help message." + echo "Controller mode reads the following env:" + echo "CONTROLLER_HOST (default: localhost)." + echo "CONTROLLER_PORT (default: 21001)." + echo "API_HOST (default: localhost)." + echo "API_PORT (default: 8000)." + echo "Worker mode reads the following env:" + echo "CONTROLLER_HOST (default: localhost)." + echo "CONTROLLER_PORT (default: 21001)." + echo "WORKER_HOST (default: localhost)." + echo "WORKER_PORT (default: 21002)." + echo "MODEL_PATH (default: empty)." + exit 1 +} + +# Acquire correct core_nums if using cpuset-cpus, return -1 if file not exist +calculate_total_cores() { + local cpuset_file="/sys/fs/cgroup/cpuset/cpuset.cpus" + + if [[ -f "$cpuset_file" ]]; then + local cpuset_cpus=$(cat "$cpuset_file") + cpuset_cpus=$(echo "${cpuset_cpus}" | tr -d '\n') + + local total_cores=0 + IFS=',' read -ra cpu_list <<< "$cpuset_cpus" + for cpu in "${cpu_list[@]}"; do + if [[ $cpu =~ - ]]; then + # Range of CPUs + local start_cpu=$(echo "$cpu" | cut -d'-' -f1) + local end_cpu=$(echo "$cpu" | cut -d'-' -f2) + local range_cores=$((end_cpu - start_cpu + 1)) + total_cores=$((total_cores + range_cores)) + else + # Single CPU + total_cores=$((total_cores + 1)) + fi + done + + echo $total_cores + return + fi + # Kubernetes core-binding will use this file + cpuset_file="/sys/fs/cgroup/cpuset.cpus" + if [[ -f "$cpuset_file" ]]; then + local cpuset_cpus=$(cat "$cpuset_file") + cpuset_cpus=$(echo "${cpuset_cpus}" | tr -d '\n') + + local total_cores=0 + IFS=',' read -ra cpu_list <<< "$cpuset_cpus" + for cpu in "${cpu_list[@]}"; do + if [[ $cpu =~ - ]]; then + # Range of CPUs + local start_cpu=$(echo "$cpu" | cut -d'-' -f1) + local end_cpu=$(echo "$cpu" | cut -d'-' -f2) + local range_cores=$((end_cpu - start_cpu + 1)) + total_cores=$((total_cores + range_cores)) + else + # Single CPU + total_cores=$((total_cores + 1)) + fi + done + + echo $total_cores + return + else + echo -1 + return + fi +} + +# Default values +controller_host="localhost" +controller_port="21001" +api_host="localhost" +api_port="8000" +worker_host="localhost" +worker_port="21002" +model_path="" +mode="" +omp_num_threads="" +dispatch_method="shortest_queue" # shortest_queue or lottery + +# Update rootCA config if needed +update-ca-certificates + +# Remember the value of `OMP_NUM_THREADS`: +if [[ 
-n "${OMP_NUM_THREADS}" ]]; then + omp_num_threads="${OMP_NUM_THREADS}" +fi + +# We do not have any arguments, just run bash +if [ "$#" == 0 ]; then + echo "[INFO] no command is passed in" + echo "[INFO] enter pass-through mode" + exec /usr/bin/tini -s -- "bash" +else + # Parse command-line options + options=$(getopt -o "m:h" --long "mode:,help" -n "$0" -- "$@") + if [ $? != 0 ]; then + usage + fi + eval set -- "$options" + + while true; do + case "$1" in + -m|--mode) + mode="$2" + [[ $mode == "controller" || $mode == "worker" ]] || usage + shift 2 + ;; + -h|--help) + usage + ;; + --) + shift + break + ;; + *) + usage + ;; + esac + done + + if [[ -n $CONTROLLER_HOST ]]; then + controller_host=$CONTROLLER_HOST + fi + + if [[ -n $CONTROLLER_PORT ]]; then + controller_port=$CONTROLLER_PORT + fi + + if [[ -n $API_HOST ]]; then + api_host=$API_HOST + fi + + if [[ -n $API_PORT ]]; then + api_port=$API_PORT + fi + + if [[ -n $WORKER_HOST ]]; then + worker_host=$WORKER_HOST + fi + + if [[ -n $WORKER_PORT ]]; then + worker_port=$WORKER_PORT + fi + + if [[ -n $MODEL_PATH ]]; then + model_path=$MODEL_PATH + fi + + if [[ -n $DISPATCH_METHOD ]]; then + dispatch_method=$DISPATCH_METHOD + fi + + controller_address="http://$controller_host:$controller_port" + # Execute logic based on options + if [[ $mode == "controller" ]]; then + # Logic for controller mode + # Boot Controller + api_address="http://$api_host:$api_port" + echo "Controller address: $controller_address" + echo "OpenAI API address: $api_address" + python3 -m fastchat.serve.controller --host $controller_host --port $controller_port --dispatch-method $dispatch_method & + # Boot openai api server + python3 -m fastchat.serve.openai_api_server --host $api_host --port $api_port --controller-address $controller_address + else + # Logic for non-controller(worker) mode + worker_address="http://$worker_host:$worker_port" + # Apply optimizations from bigdl-nano + source bigdl-nano-init -t + # First check if user have set OMP_NUM_THREADS by themselves + if [[ -n "${omp_num_threads}" ]]; then + echo "Setting OMP_NUM_THREADS to its original value: $omp_num_threads" + export OMP_NUM_THREADS=$omp_num_threads + else + # Use calculate_total_cores to acquire cpuset settings + # Set OMP_NUM_THREADS to correct numbers + cores=$(calculate_total_cores) + if [[ $cores == -1 || $cores == 0 ]]; then + echo "Failed to obtain the number of cores, will use the default settings OMP_NUM_THREADS=$OMP_NUM_THREADS" + else + echo "Setting OMP_NUM_THREADS to $cores" + export OMP_NUM_THREADS=$cores + fi + fi + if [[ -z "${model_path}" ]]; then + echo "Please set env MODEL_PATH used for worker" + usage + fi + echo "Worker address: $worker_address" + echo "Controller address: $controller_address" + python3 -m fastchat.serve.model_worker --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address + fi +fi + diff --git a/docker/llm/serving/cpu/kubernetes/README.md b/docker/llm/serving/cpu/kubernetes/README.md new file mode 100644 index 00000000..b0027f12 --- /dev/null +++ b/docker/llm/serving/cpu/kubernetes/README.md @@ -0,0 +1,235 @@ +## Deployment bigdl-llm serving service in K8S environment + + +## Image + +To deploy BigDL-LLM-serving cpu in Kubernetes environment, please use this image: `intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT` + +## Before deployment + +### Models + +In this document, we will use `vicuna-7b-v1.5` as the deployment model. 
+ +After downloading the model, please rename it from `vicuna-7b-v1.5` to `vicuna-7b-v1.5-bigdl` so that `bigdl-llm` is used as the backend. The `bigdl-llm` backend is used whenever the model path contains `bigdl`; otherwise, the original transformers backend is used. + +You can download the model from [here](https://huggingface.co/lmsys/vicuna-7b-v1.5). + +### Kubernetes config + +We recommend setting up your Kubernetes cluster before deployment. Most importantly, please set the `cpu-management-policy` to `static` by following this [tutorial](https://kubernetes.io/docs/tasks/administer-cluster/cpu-management-policies/). It is also helpful to set the `topology management policy` to `single-numa-node`. + +### Machine config + +Turn hyper-threading off to ensure that only physical cores are used during deployment. + +## Deployment + +### Reminder on `OMP_NUM_THREADS` + +The image's entrypoint tries to set `OMP_NUM_THREADS` to the correct number by reading the cpuset configuration from the runtime. However, this only works correctly if the `core-binding` feature is enabled. If it is not, please set the `OMP_NUM_THREADS` environment variable manually in the yaml file. + + +### Controller + +We use the following yaml file for controller deployment: + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: bigdl-fschat-a1234bd-controller + labels: + fastchat-appid: a1234bd + fastchat-app-type: controller +spec: + dnsPolicy: "ClusterFirst" + containers: + - name: fastchat-controller # fixed + image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT + imagePullPolicy: IfNotPresent + env: + - name: CONTROLLER_HOST # fixed + value: "0.0.0.0" + - name: CONTROLLER_PORT # fixed + value: "21005" + - name: API_HOST # fixed + value: "0.0.0.0" + - name: API_PORT # fixed + value: "8000" + ports: + - containerPort: 21005 + name: con-port + - containerPort: 8000 + name: api-port + resources: + requests: + memory: 16Gi + cpu: 4 + limits: + memory: 16Gi + cpu: 4 + args: ["-m", "controller"] + restartPolicy: "Never" +--- +# Service for the controller +apiVersion: v1 +kind: Service +metadata: + name: bigdl-a1234bd-fschat-controller-service +spec: + # You may also want to change this to use the cluster's feature + type: NodePort + selector: + fastchat-appid: a1234bd + fastchat-app-type: controller + ports: + - name: cont-port + protocol: TCP + port: 21005 + targetPort: 21005 + - name: api-port + protocol: TCP + port: 8000 + targetPort: 8000 +``` + +### Worker + +We use the following yaml file for worker deployment: + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: bigdl-fschat-a1234bd-worker-deployment +spec: + # Change this to the number you want + replicas: 1 + selector: + matchLabels: + fastchat: worker + template: + metadata: + labels: + fastchat: worker + spec: + dnsPolicy: "ClusterFirst" + containers: + - name: fastchat-worker # fixed + image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT + imagePullPolicy: IfNotPresent + env: + - name: CONTROLLER_HOST # fixed + value: bigdl-a1234bd-fschat-controller-service + - name: CONTROLLER_PORT # fixed + value: "21005" + - name: WORKER_HOST # fixed + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: WORKER_PORT # fixed + value: "21841" + - name: MODEL_PATH # Change this + value: "/llm/models/vicuna-7b-v1.5-bigdl/" + - name: OMP_NUM_THREADS + value: "16" + resources: + requests: + memory: 32Gi + cpu: 16 + limits: + memory: 32Gi + cpu: 16 + args: ["-m", "worker"] + volumeMounts: + - name: llm-models + mountPath: /llm/models/ + restartPolicy: "Always" +
volumes: + - name: llm-models + hostPath: + path: /home/llm/models # change this in other envs +``` + +You may want to change the `MODEL_PATH` variable in the yaml. Also, please remember to change the volume path accordingly. + + +### Testing + +#### Using openai-python + +First, install openai-python: +```bash +pip install --upgrade openai +``` + +Then, interact with model vicuna-7b-v1.5-bigdl: +```python +import openai +openai.api_key = "EMPTY" +openai.api_base = "http://localhost:8000/v1" + +model = "vicuna-7b-v1.5-bigdl" +prompt = "Once upon a time" + +# create a completion +completion = openai.Completion.create(model=model, prompt=prompt, max_tokens=64) +# print the completion +print(prompt + completion.choices[0].text) + +# create a chat completion +completion = openai.ChatCompletion.create( + model=model, + messages=[{"role": "user", "content": "Hello! What is your name?"}] +) +# print the completion +print(completion.choices[0].message.content) +``` + +#### cURL +cURL is another good tool for observing the output of the api. + +For the following examples, you may also change the service deployment address. + +List Models: +```bash +curl http://localhost:8000/v1/models +``` + +If you have `jq` installed, you can use it to format the output like this: +```bash +curl http://localhost:8000/v1/models | jq +``` + +Chat Completions: +```bash +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "YOUR_MODEL", + "messages": [{"role": "user", "content": "Hello! What is your name?"}] + }' +``` + +Text Completions: +```bash +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "YOUR_MODEL", + "prompt": "Once upon a time", + "max_tokens": 41, + "temperature": 0.5 + }' +``` + +Embeddings: +```bash +curl http://localhost:8000/v1/embeddings \ + -H "Content-Type: application/json" \ + -d '{ + "model": "YOUR_MODEL", + "input": "Hello world!" 
+ }' +``` \ No newline at end of file diff --git a/docker/llm/serving/cpu/kubernetes/clean.sh b/docker/llm/serving/cpu/kubernetes/clean.sh new file mode 100644 index 00000000..d5d1729d --- /dev/null +++ b/docker/llm/serving/cpu/kubernetes/clean.sh @@ -0,0 +1 @@ +kubectl delete -f deployment.yaml \ No newline at end of file diff --git a/docker/llm/serving/cpu/kubernetes/deployment.yaml b/docker/llm/serving/cpu/kubernetes/deployment.yaml new file mode 100644 index 00000000..bd659fd4 --- /dev/null +++ b/docker/llm/serving/cpu/kubernetes/deployment.yaml @@ -0,0 +1,109 @@ +apiVersion: v1 +kind: Pod +metadata: + name: bigdl-fschat-a1234bd-controller + labels: + fastchat-appid: a1234bd + fastchat-app-type: controller +spec: + dnsPolicy: "ClusterFirst" + containers: + - name: fastchat-controller # fixed + image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT + imagePullPolicy: IfNotPresent + env: + - name: CONTROLLER_HOST # fixed + value: "0.0.0.0" + - name: CONTROLLER_PORT # fixed + value: "21005" + - name: API_HOST # fixed + value: "0.0.0.0" + - name: API_PORT # fixed + value: "8000" + ports: + - containerPort: 21005 + name: con-port + - containerPort: 8000 + name: api-port + resources: + requests: + memory: 16Gi + cpu: 4 + limits: + memory: 16Gi + cpu: 4 + args: ["-m", "controller"] + restartPolicy: "Never" +--- +# Service for the controller +apiVersion: v1 +kind: Service +metadata: + name: bigdl-a1234bd-fschat-controller-service +spec: + # You may also want to change this to use the cluster's feature + type: NodePort + selector: + fastchat-appid: a1234bd + fastchat-app-type: controller + ports: + - name: cont-port + protocol: TCP + port: 21005 + targetPort: 21005 + - name: api-port + protocol: TCP + port: 8000 + targetPort: 8000 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: bigdl-fschat-a1234bd-worker-deployment +spec: + # Change this to the number you want + replicas: 1 + selector: + matchLabels: + fastchat: worker + template: + metadata: + labels: + fastchat: worker + spec: + dnsPolicy: "ClusterFirst" + containers: + - name: fastchat-worker # fixed + image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT + imagePullPolicy: IfNotPresent + env: + - name: CONTROLLER_HOST # fixed + value: bigdl-a1234bd-fschat-controller-service + - name: CONTROLLER_PORT # fixed + value: "21005" + - name: WORKER_HOST # fixed + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: WORKER_PORT # fixed + value: "21841" + - name: MODEL_PATH # Change this + value: "/llm/models/vicuna-7b-v1.5-bigdl/" + - name: OMP_NUM_THREADS + value: "16" + resources: + requests: + memory: 32Gi + cpu: 16 + limits: + memory: 32Gi + cpu: 16 + args: ["-m", "worker"] + volumeMounts: + - name: llm-models + mountPath: /llm/models/ + restartPolicy: "Always" + volumes: + - name: llm-models + hostPath: + path: /home/llm/models # change this in other envs \ No newline at end of file diff --git a/docs/readthedocs/source/_toc.yml b/docs/readthedocs/source/_toc.yml index 9cba0641..094ebb4d 100644 --- a/docs/readthedocs/source/_toc.yml +++ b/docs/readthedocs/source/_toc.yml @@ -38,12 +38,12 @@ subtrees: title: "Key Features" subtrees: - entries: + - file: doc/LLM/Overview/KeyFeatures/optimize_model - file: doc/LLM/Overview/KeyFeatures/transformers_style_api subtrees: - entries: - file: doc/LLM/Overview/KeyFeatures/hugging_face_format - file: doc/LLM/Overview/KeyFeatures/native_format - - file: doc/LLM/Overview/KeyFeatures/optimize_model - file: doc/LLM/Overview/KeyFeatures/langchain_api # - file: 
doc/LLM/Overview/KeyFeatures/cli - file: doc/LLM/Overview/KeyFeatures/gpu_supports diff --git a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/index.rst b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/index.rst index 4914196b..823df5a1 100644 --- a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/index.rst +++ b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/index.rst @@ -3,12 +3,12 @@ BigDL-LLM Key Features You may run the LLMs using ``bigdl-llm`` through one of the following APIs: +* `PyTorch API <./optimize_model.html>`_ * |transformers_style_api|_ * |hugging_face_transformers_format|_ * `Native Format <./native_format.html>`_ -* `General PyTorch Model Supports <./langchain_api.html>`_ * `LangChain API <./langchain_api.html>`_ * `GPU Supports <./gpu_supports.html>`_ diff --git a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/optimize_model.md b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/optimize_model.md index eeb7a3c1..ac510688 100644 --- a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/optimize_model.md +++ b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/optimize_model.md @@ -1,22 +1,27 @@ -## General PyTorch Model Supports +## PyTorch API -You may apply BigDL-LLM optimizations on any Pytorch models, not only Hugging Face *Transformers* models for acceleration. With BigDL-LLM, PyTorch models (in FP16/BF16/FP32) can be optimized with low-bit quantizations (supported precisions include INT4/INT5/INT8). +In general, you just need one-line `optimize_model` to easily optimize any loaded PyTorch model, regardless of the library or API you are using. With BigDL-LLM, PyTorch models (in FP16/BF16/FP32) can be optimized with low-bit quantizations (supported precisions include INT4, INT5, INT8, etc). -You can easily enable BigDL-LLM INT4 optimizations on any Pytorch models just as follows: +First, use any PyTorch APIs you like to load your model. To help you better understand the process, here we use [Hugging Face Transformers](https://huggingface.co/docs/transformers/index) library `LlamaForCausalLM` to load a popular model [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) as an example: ```python -# Create or load any Pytorch model -model = ... +# Create or load any Pytorch model, take Llama-2-7b-chat-hf as an example +from transformers import LlamaForCausalLM +model = LlamaForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', torch_dtype='auto', low_cpu_mem_usage=True) +``` -# Add only two lines to enable BigDL-LLM INT4 optimizations on model +Then, just need to call `optimize_model` to optimize the loaded model and INT4 optimization is applied on model by default: +```python from bigdl.llm import optimize_model + +# With only one line to enable BigDL-LLM INT4 optimization model = optimize_model(model) ``` -After optimizing the model, you may straightly run the optimized model with no API changed and less inference latency. +After optimizing the model, BigDL-LLM does not require any change in the inference code. You can use any libraries to run the optimized model with very low latency. ```eval_rst .. seealso:: - See the examples for Hugging Face *Transformers* models `here `_. And examples for other general Pytorch models can be found `here `_. + * For more detailed usage of ``optimize_model``, please refer to the `API documentation `_. 
``` diff --git a/docs/readthedocs/source/doc/LLM/Overview/install_cpu.md b/docs/readthedocs/source/doc/LLM/Overview/install_cpu.md index 763fd09a..5c2642db 100644 --- a/docs/readthedocs/source/doc/LLM/Overview/install_cpu.md +++ b/docs/readthedocs/source/doc/LLM/Overview/install_cpu.md @@ -5,9 +5,11 @@ Install BigDL-LLM for CPU supports using pip through: ```bash -pip install bigdl-llm[all] +pip install --pre --upgrade bigdl-llm[all] # install the latest bigdl-llm nightly build with 'all' option ``` +Please refer to [Environment Setup](#environment-setup) for more information. + ```eval_rst .. note:: @@ -43,7 +45,7 @@ First we recommend using [Conda](https://docs.conda.io/en/latest/miniconda.html) conda create -n llm python=3.9 conda activate llm -pip install bigdl-llm[all] # install bigdl-llm for CPU with 'all' option +pip install --pre --upgrade bigdl-llm[all] # install the latest bigdl-llm nightly build with 'all' option ``` Then for running a LLM model with BigDL-LLM optimizations (taking an `example.py` an example): diff --git a/docs/readthedocs/source/doc/LLM/Overview/install_gpu.md b/docs/readthedocs/source/doc/LLM/Overview/install_gpu.md index 5429c150..0d36c39f 100644 --- a/docs/readthedocs/source/doc/LLM/Overview/install_gpu.md +++ b/docs/readthedocs/source/doc/LLM/Overview/install_gpu.md @@ -5,9 +5,11 @@ Install BigDL-LLM for GPU supports using pip through: ```bash -pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu +pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu # install bigdl-llm for GPU ``` +Please refer to [Environment Setup](#environment-setup) for more information. + ```eval_rst .. note:: @@ -25,6 +27,12 @@ BigDL-LLM for GPU supports has been verified on: * Intel Arc™ A-Series Graphics * Intel Data Center GPU Flex Series +```eval_rst +.. note:: + + We currently support the Ubuntu 20.04 operating system or later. Windows support is in progress. +``` + To apply Intel GPU acceleration, there're several steps for tools installation and environment preparation: * Step 1, only Linux system is supported now, Ubuntu 22.04 is prefered. diff --git a/docs/readthedocs/source/doc/LLM/index.rst b/docs/readthedocs/source/doc/LLM/index.rst index f18aa1ab..e13cb0aa 100644 --- a/docs/readthedocs/source/doc/LLM/index.rst +++ b/docs/readthedocs/source/doc/LLM/index.rst @@ -32,8 +32,8 @@ BigDL-LLM +++ + :bdg-link:`PyTorch <./Overview/KeyFeatures/optimize_model.html>` | :bdg-link:`transformers-style <./Overview/KeyFeatures/transformers_style_api.html>` | - :bdg-link:`Optimize Model <./Overview/KeyFeatures/optimize_model.html>` | :bdg-link:`LangChain <./Overview/KeyFeatures/langchain_api.html>` | :bdg-link:`GPU <./Overview/KeyFeatures/gpu_supports.html>` diff --git a/docs/readthedocs/source/doc/PythonAPI/LLM/index.rst b/docs/readthedocs/source/doc/PythonAPI/LLM/index.rst index ea8d4fc0..6d6e38e1 100644 --- a/docs/readthedocs/source/doc/PythonAPI/LLM/index.rst +++ b/docs/readthedocs/source/doc/PythonAPI/LLM/index.rst @@ -4,6 +4,6 @@ BigDL-LLM API ..
toctree:: :maxdepth: 3 + optimize.rst transformers.rst langchain.rst - optimize.rst diff --git a/docs/readthedocs/source/doc/PythonAPI/LLM/optimize.rst b/docs/readthedocs/source/doc/PythonAPI/LLM/optimize.rst index a6949247..01903ada 100644 --- a/docs/readthedocs/source/doc/PythonAPI/LLM/optimize.rst +++ b/docs/readthedocs/source/doc/PythonAPI/LLM/optimize.rst @@ -1,4 +1,4 @@ -BigDL-LLM Optimize API +BigDL-LLM PyTorch API ===================== llm.optimize diff --git a/docs/readthedocs/source/index.rst b/docs/readthedocs/source/index.rst index 6602890d..8a08d6cc 100644 --- a/docs/readthedocs/source/index.rst +++ b/docs/readthedocs/source/index.rst @@ -24,9 +24,10 @@ BigDL-LLM: low-Bit LLM library ============================================ Latest update ============================================ +- **[New]** ``bigdl-llm`` now supports QLoRA finetuning on Intel GPU; see the example `here `_. - ``bigdl-llm`` now supports Intel GPU (including Arc, Flex and MAX); see the the latest GPU examples `here `_. - ``bigdl-llm`` tutorial is released `here `_. -- Over 20 models have been verified on ``bigdl-llm``, including *LLaMA/LLaMA2, ChatGLM/ChatGLM2, MPT, Falcon, Dolly-v1/Dolly-v2, StarCoder, Whisper, InternLM, QWen, Baichuan, MOSS* and more; see the complete list `here `_. +- Over 20 models have been verified on ``bigdl-llm``, including *LLaMA/LLaMA2, ChatGLM/ChatGLM2, MPT, Falcon, Dolly, StarCoder, Whisper, InternLM, QWen, Baichuan, Aquila, MOSS* and more; see the complete list `here `_. ============================================ diff --git a/python/llm/example/gpu/qlora_finetuning/README.md b/python/llm/example/gpu/qlora_finetuning/README.md index 7e14656c..7b98b1b6 100644 --- a/python/llm/example/gpu/qlora_finetuning/README.md +++ b/python/llm/example/gpu/qlora_finetuning/README.md @@ -1,4 +1,4 @@ -# Q-Lora (experimental support) +# Finetuning LLaMA Using QLoRA (experimental support) This example demonstrates how to finetune a llama2-7b model use Big-LLM 4bit optimizations using [Intel GPUs](../README.md). @@ -7,7 +7,7 @@ To run this example with BigDL-LLM on Intel GPUs, we have some recommended requi ## Example: Finetune llama2-7b using qlora -This example is ported from [bnb-4bit-training](https://colab.research.google.com/drive/1VoYNfYDKcKRQRor98Zbf2-9VQTtGJ24k?usp=sharing) +This example is ported from [bnb-4bit-training](https://colab.research.google.com/drive/1VoYNfYDKcKRQRor98Zbf2-9VQTtGJ24k?usp=sharing). The `export_merged_model.py` is ported from [alpaca-lora](https://github.com/tloen/alpaca-lora/blob/main/export_hf_checkpoint.py). ### 1. Install @@ -26,13 +26,13 @@ pip install peft==0.5.0 source /opt/intel/oneapi/setvars.sh ``` -### 3. Run +### 3.
Finetune model ``` python ./qlora_finetuning.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH ``` -### Sample Output +#### Sample Output ```log {'loss': 1.6134, 'learning_rate': 0.0002, 'epoch': 0.03} {'loss': 1.3038, 'learning_rate': 0.00017777777777777779, 'epoch': 0.06} @@ -47,4 +47,12 @@ python ./qlora_finetuning.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH {'train_runtime': 225.8005, 'train_samples_per_second': 3.543, 'train_steps_per_second': 0.886, 'train_loss': 1.211241865158081, 'epoch': 0.32} 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [03:45<00:00, 1.13s/it] TrainOutput(global_step=200, training_loss=1.211241865158081, metrics={'train_runtime': 225.8005, 'train_samples_per_second': 3.543, 'train_steps_per_second': 0.886, 'train_loss': 1.211241865158081, 'epoch': 0.32}) -``` \ No newline at end of file +``` + +### 4. Merge the adapter into the original model + +``` +python ./export_merged_model.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH --adapter_path ./outputs/checkpoint-200 --output_path ./outputs/checkpoint-200-merged +``` + +Then you can use `./outputs/checkpoint-200-merged` as a normal huggingface transformer model to do inference. diff --git a/python/llm/example/gpu/qlora_finetuning/export_merged_model.py b/python/llm/example/gpu/qlora_finetuning/export_merged_model.py new file mode 100644 index 00000000..1cf3c2ff --- /dev/null +++ b/python/llm/example/gpu/qlora_finetuning/export_merged_model.py @@ -0,0 +1,93 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This file is adapted from https://github.com/tloen/alpaca-lora/blob/main/export_hf_checkpoint.py +# +# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +import torch +import transformers +from transformers import LlamaTokenizer # noqa: F402 +from bigdl.llm.transformers.qlora import PeftModel +from bigdl.llm.transformers import AutoModelForCausalLM +import argparse + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Llama2 model') + parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-hf", + help='The huggingface repo id for the Llama2 (e.g. 
`meta-llama/Llama-2-7b-hf` and `meta-llama/Llama-2-13b-chat-hf`) to be downloaded' + ', or the path to the huggingface checkpoint folder') + parser.add_argument('--adapter_path', type=str,) + parser.add_argument('--output_path', type=str,) + + args = parser.parse_args() + base_model = model_path = args.repo_id_or_model_path + adapter_path = args.adapter_path + tokenizer = LlamaTokenizer.from_pretrained(base_model) + + base_model = AutoModelForCausalLM.from_pretrained( + base_model, + # load_in_low_bit="nf4", # should load the orignal model + torch_dtype=torch.float16, + device_map={"": "cpu"}, + ) + + first_weight = base_model.model.layers[0].self_attn.q_proj.weight + first_weight_old = first_weight.clone() + + lora_model = PeftModel.from_pretrained( + base_model, + adapter_path, + device_map={"": "cpu"}, + torch_dtype=torch.float16, + ) + + lora_weight = lora_model.base_model.model.model.layers[ + 0 + ].self_attn.q_proj.weight + + assert torch.allclose(first_weight_old, first_weight) + + # merge weights - new merging method from peft + lora_model = lora_model.merge_and_unload() + + lora_model.train(False) + + # did we do anything? + assert not torch.allclose(first_weight_old, first_weight) + + lora_model_sd = lora_model.state_dict() + deloreanized_sd = { + k.replace("base_model.model.", ""): v + for k, v in lora_model_sd.items() + if "lora" not in k + } + + base_model.save_pretrained(args.output_path, state_dict=deloreanized_sd) + tokenizer.save_pretrained(args.output_path) diff --git a/python/llm/example/gpu/qlora_finetuning/qlora_finetuning.py b/python/llm/example/gpu/qlora_finetuning/qlora_finetuning.py index 6531b483..85b5642e 100644 --- a/python/llm/example/gpu/qlora_finetuning/qlora_finetuning.py +++ b/python/llm/example/gpu/qlora_finetuning/qlora_finetuning.py @@ -45,8 +45,9 @@ if __name__ == "__main__": data = load_dataset(dataset_path) data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) model = AutoModelForCausalLM.from_pretrained(model_path, - load_in_4bit=True, + load_in_low_bit="nf4", optimize_model=False, + torch_dtype=torch.float16, modules_to_not_convert=["lm_head"],) model = model.to('xpu') model.gradient_checkpointing_enable() @@ -71,7 +72,8 @@ if __name__ == "__main__": warmup_steps=20, max_steps=200, learning_rate=2e-4, - fp16=False, # fp16 is not supported yet + save_steps=100, + fp16=True, logging_steps=20, output_dir="outputs", optim="adamw_hf", # paged_adamw_8bit is not supported yet diff --git a/python/llm/src/bigdl/llm/cli/llm-cli b/python/llm/src/bigdl/llm/cli/llm-cli index 07562747..a145c09a 100755 --- a/python/llm/src/bigdl/llm/cli/llm-cli +++ b/python/llm/src/bigdl/llm/cli/llm-cli @@ -47,7 +47,11 @@ function starcoder { } function chatglm { - command="$lib_dir/main-chatglm_vnni -t $threads -n $n_predict ${filteredArguments[*]}" + if [[ $(lscpu | grep "amx_int8") ]]; then + command="$lib_dir/main-chatglm_amx -t $threads -n $n_predict ${filteredArguments[*]}" + else + command="$lib_dir/main-chatglm_vnni -t $threads -n $n_predict ${filteredArguments[*]}" + fi echo "$command" eval "$command" } diff --git a/python/llm/src/bigdl/llm/transformers/convert.py b/python/llm/src/bigdl/llm/transformers/convert.py index 07f929c3..b0bc581d 100644 --- a/python/llm/src/bigdl/llm/transformers/convert.py +++ b/python/llm/src/bigdl/llm/transformers/convert.py @@ -135,6 +135,7 @@ def convert_forward(m, target_m, new_forward): def optimize(model): from packaging import version from bigdl.llm.transformers.models.llama import llama_attention_forward_4_31 + from 
bigdl.llm.transformers.models.llama import llama_rms_norm_forward from transformers.modeling_utils import PreTrainedModel # All huggingface format models are inherited from `PreTrainedModel` @@ -149,11 +150,16 @@ def optimize(model): model, transformers.models.llama.modeling_llama.LlamaAttention, llama_attention_forward_4_31,) + convert_forward( + model, + transformers.models.llama.modeling_llama.LlamaRMSNorm, + llama_rms_norm_forward,) else: # todo implement 4.28.0 ~ 4.30.2 pass - if "chatglm2" in model.config._name_or_path: + if "chatglm-18b" in model.config._name_or_path or "chatglm2" in model.config._name_or_path: + # chatglm-18b or chatglm2-6b modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) from bigdl.llm.transformers.models.chatglm2 import chatglm2_attention_forward_8eb45c @@ -166,6 +172,7 @@ def optimize(model): module.CoreAttention, core_attn_forward_8eb45c) elif "chatglm" in model.config._name_or_path: + # chatglm-6b modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) from bigdl.llm.transformers.models.chatglm import chatglm_attention_forward @@ -280,4 +287,20 @@ def optimize(model): module.InternLMAttention, internlm_attention_forward ) + elif model.config.model_type == "qwen": + modeling_module_name = model.__class__.__module__ + module = importlib.import_module(modeling_module_name) + from bigdl.llm.transformers.models.qwen import qwen_attention_forward + convert_forward(model, + module.QWenAttention, + qwen_attention_forward + ) + elif model.config.model_type == "aquila": + modeling_module_name = model.__class__.__module__ + module = importlib.import_module(modeling_module_name) + from bigdl.llm.transformers.models.aquila import aquila_attention_forward + convert_forward(model, + module.AquilaAttention, + aquila_attention_forward + ) return model diff --git a/python/llm/src/bigdl/llm/transformers/models/aquila.py b/python/llm/src/bigdl/llm/transformers/models/aquila.py new file mode 100644 index 00000000..84abb6b8 --- /dev/null +++ b/python/llm/src/bigdl/llm/transformers/models/aquila.py @@ -0,0 +1,157 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Some parts of this file is adapted from +# https://huggingface.co/BAAI/AquilaChat-7B/blob/main/modeling_aquila.py +# +# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from bigdl.llm.transformers.models.utils import extend_kv_cache, init_kv_cache, append_kv_cache +from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb +from bigdl.dllib.utils import log4Error + +KV_CACHE_ALLOC_BLOCK_LENGTH = 256 + + +def aquila_attention_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states)\ + .view(bsz, q_len, self.num_heads, self.head_dim)\ + .transpose(1, 2) + key_states = self.k_proj(hidden_states)\ + .view(bsz, q_len, self.num_heads, self.head_dim)\ + .transpose(1, 2) + value_states = self.v_proj(hidden_states)\ + .view(bsz, q_len, self.num_heads, self.head_dim)\ + .transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, + cos, sin, position_ids, "aquila") + # [bsz, nh, t, hd] + + if past_key_value is not None: + # reuse k, v, self_attention + cache_k = past_key_value[0] + cache_v = past_key_value[1] + if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3): + # allocate new + new_cache_k, new_cache_v = extend_kv_cache(bsz, + self.num_heads, # Support GQA + self.head_dim, + cache_k.size(2), + kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH, + dtype=cache_k.dtype, + device=hidden_states.device) + new_cache_k[:] = cache_k + new_cache_v[:] = cache_v + cache_k = new_cache_k + cache_v = new_cache_v + + key_states, value_states = append_kv_cache(cache_k, cache_v, key_states, value_states) + + elif use_cache: + max_cache_length = kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH + new_key_states, new_value_states = init_kv_cache(bsz, + self.num_heads, + self.head_dim, + kv_seq_len, + max_cache_length, + dtype=key_states.dtype, + device=hidden_states.device) + new_key_states[:] = key_states + new_value_states[:] = value_states + key_states = new_key_states + value_states = new_value_states + + past_key_value = (key_states, value_states) if use_cache else None + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + attn_weights = torch.clamp(attn_weights, min=-1024., max=1024.) 
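+    # The clamp above keeps the raw attention scores within a numerically safe range;
+    # the checks below validate their shape before the mask is added and softmax is applied.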
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + log4Error.invalidInputError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, " + f"but is {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + log4Error.invalidInputError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, " + f"but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + attn_weights = torch.max( + attn_weights, + torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device) + ) + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32)\ + .to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + log4Error.invalidInputError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, " + f"but is {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value diff --git a/python/llm/src/bigdl/llm/transformers/models/llama.py b/python/llm/src/bigdl/llm/transformers/models/llama.py index 51ddb2ee..7953670a 100644 --- a/python/llm/src/bigdl/llm/transformers/models/llama.py +++ b/python/llm/src/bigdl/llm/transformers/models/llama.py @@ -39,6 +39,7 @@ import torch.nn.functional as F from bigdl.llm.utils.common import invalidInputError from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache from bigdl.llm.transformers.models.utils import rotate_half, apply_rotary_pos_emb +from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: @@ -57,6 +58,19 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: KV_CACHE_ALLOC_BLOCK_LENGTH = 256 +def llama_rms_norm_forward(self, hidden_states): + if hidden_states.device.type == "xpu" and not (self.training and hidden_states.requires_grad): + hidden_states, _ = torch.ops.torch_ipex.rms_norm(hidden_states, + [self.weight.size(0)], self.weight) + else: + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + return hidden_states + + def llama_attention_forward_4_31( self, hidden_states: torch.Tensor, @@ -103,9 +117,20 @@ def llama_attention_forward_4_31( kv_seq_len = key_states.shape[-2] if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, - cos, sin, position_ids, "llama") + + use_fuse_rope = query_states.device.type == "xpu" + use_fuse_rope = use_fuse_rope and not (self.training and query_states.requires_grad) + use_fuse_rope = use_fuse_rope and self.config.rope_scaling is None + + if use_fuse_rope: + query_states, key_states = apply_rotary_pos_emb_no_cache_xpu(query_states, + key_states, + position_ids, + "llama") + else: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + 
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, + cos, sin, position_ids, "llama") if past_key_value is not None: # reuse k, v, self_attention diff --git a/python/llm/src/bigdl/llm/transformers/models/qwen.py b/python/llm/src/bigdl/llm/transformers/models/qwen.py new file mode 100644 index 00000000..ed2c3e51 --- /dev/null +++ b/python/llm/src/bigdl/llm/transformers/models/qwen.py @@ -0,0 +1,217 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Some parts of this file is adapted from +# https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/modeling_qwen.py +# +# Copyright (c) Alibaba Cloud. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# + +import importlib +import math +from typing import TYPE_CHECKING, Optional, Tuple, Union, Callable, List + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from transformers.utils import logging + +try: + from einops import rearrange +except ImportError: + rearrange = None + +from bigdl.llm.transformers.models.utils import extend_kv_cache, init_kv_cache, append_kv_cache +from bigdl.llm.utils.common import invalidInputError + +apply_rotary_emb_func = None + +flash_attn_unpadded_func = None + +logger = logging.get_logger(__name__) + +KV_CACHE_ALLOC_BLOCK_LENGTH = 256 + + +def _rotate_half(x): + from einops import rearrange + + x = rearrange(x, "... (j d) -> ... 
j d", j=2) + x1, x2 = x.unbind(dim=-2) + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(t, freqs): + if apply_rotary_emb_func is not None: + t_ = t.float() + freqs = freqs.squeeze(0).squeeze(1) + cos = freqs[:, : freqs.shape[-1] // 2].cos() + sin = freqs[:, : freqs.shape[-1] // 2].sin() + output = apply_rotary_emb_func(t_, cos, sin).type_as(t) + return output + else: + rot_dim = freqs.shape[-1] + t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:] + t_ = t_.float() + t_pass_ = t_pass_.float() + t_ = (t_ * freqs.cos()) + (_rotate_half(t_) * freqs.sin()) + return torch.cat((t_, t_pass_), dim=-1).type_as(t) + + +def qwen_attention_forward( + self, + hidden_states: Optional[Tuple[torch.FloatTensor]], + layer_past: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, +): + mixed_x_layer = self.c_attn(hidden_states) + query, key, value = mixed_x_layer.split(self.split_size, dim=2) + + query = self._split_heads(query, self.num_heads, self.head_dim) + key = self._split_heads(key, self.num_heads, self.head_dim) + value = self._split_heads(value, self.num_heads, self.head_dim) + + kv_seq_len = hidden_states.size()[1] + + if layer_past: + # layer past[0] shape: bs * seq_len * head_num * dim + kv_seq_len += layer_past[0].shape[1] + if ( + self.use_dynamic_ntk + and kv_seq_len == hidden_states.size()[1] + and not self.training + ): + context_value = math.log(kv_seq_len / self.seq_length, 2) + 1 + ntk_alpha = 2 ** math.ceil(context_value) - 1 + ntk_alpha = max(ntk_alpha, 1) + self._ntk_cached = ntk_alpha + else: + ntk_alpha = self._ntk_cached + rotary_pos_emb = self.rotary_emb(kv_seq_len, ntk_alpha=ntk_alpha).to( + hidden_states.device + ) + + if rotary_pos_emb is not None: + if isinstance(rotary_pos_emb, tuple): + rotary_pos_emb = rotary_pos_emb + else: + rotary_pos_emb = (rotary_pos_emb,) * 2 + + if rotary_pos_emb is not None: + q_pos_emb, k_pos_emb = rotary_pos_emb + # Slice the pos emb for current inference + cur_len = query.shape[1] + q_pos_emb = q_pos_emb[:, -cur_len:, :, :] + k_pos_emb = k_pos_emb[:, -cur_len:, :, :] + query = apply_rotary_pos_emb(query, q_pos_emb) + key = apply_rotary_pos_emb(key, k_pos_emb) + + bsz, _, n_heads, head_dim = key.size() + + if layer_past is not None: + # past_key, past_value = layer_past[0], layer_past[1] + # key = torch.cat((past_key, key), dim=1) + # value = torch.cat((past_value, value), dim=1) + cache_k = layer_past[0].transpose(1, 2) + cache_v = layer_past[1].transpose(1, 2) + if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3): + # allocate new + new_cache_k, new_cache_v = extend_kv_cache(bsz, + self.num_heads, # Support GQA + self.head_dim, + cache_k.size(2), + kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH, + dtype=cache_k.dtype, + device=hidden_states.device) + new_cache_k[:] = cache_k + new_cache_v[:] = cache_v + cache_k = new_cache_k + cache_v = new_cache_v + + key_states, value_states = append_kv_cache(cache_k, cache_v, + key.transpose(1, 2), value.transpose(1, 2)) + key = key_states.transpose(1, 2) + value = value_states.transpose(1, 2) + elif use_cache: + max_cache_length = kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH + new_key_states, new_value_states = init_kv_cache(bsz, + self.num_heads, + self.head_dim, + kv_seq_len, + max_cache_length, + 
dtype=key.dtype, + device=hidden_states.device) + new_key_states[:] = key.transpose(1, 2) + new_value_states[:] = value.transpose(1, 2) + key = new_key_states.transpose(1, 2) + value = new_value_states.transpose(1, 2) + + if use_cache: + present = (key, value) + else: + present = None + + if self.use_logn_attn and not self.training: + if self.logn_tensor.device != query.device or self.logn_tensor.dtype != query.dtype: + self.logn_tensor = self.logn_tensor.to(query.device).type_as(query) + seq_start = key.size(1) - query.size(1) + seq_end = key.size(1) + logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :] + query = query * logn_tensor.expand_as(query) + + if ( + self.use_flash_attn + and flash_attn_unpadded_func is not None + and not self.is_fp32 + and query.is_cuda + ): + q, k, v = query, key, value + context_layer = self.core_attention_flash(q, k, v) + + context_layer = rearrange( + context_layer, "b s h d -> b s (h d)" + ).contiguous() + else: + query = query.permute(0, 2, 1, 3) + key = key.permute(0, 2, 1, 3) + value = value.permute(0, 2, 1, 3) + attn_output, attn_weight = self._attn( + query, key, value, attention_mask, head_mask + ) + context_layer = self._merge_heads( + attn_output, self.num_heads, self.head_dim + ) + + attn_output = self.c_proj(context_layer) + outputs = (attn_output, present) + if output_attentions: + if ( + self.use_flash_attn + and flash_attn_unpadded_func is not None + and not self.is_fp32 + ): + invalidInputError("Cannot output attentions while using flash-attn") + else: + outputs += (attn_weight,) + + return outputs diff --git a/python/llm/src/bigdl/llm/transformers/models/utils.py b/python/llm/src/bigdl/llm/transformers/models/utils.py index 4489b268..1aed301f 100644 --- a/python/llm/src/bigdl/llm/transformers/models/utils.py +++ b/python/llm/src/bigdl/llm/transformers/models/utils.py @@ -71,7 +71,7 @@ def rotate_every_two(x): def apply_rotary_pos_emb(q, k, cos, sin, position_ids, model_family): - if model_family in ["llama", "baichuan", "internlm"]: + if model_family in ["llama", "baichuan", "internlm", "aquila"]: # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. 
cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] @@ -97,3 +97,18 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, model_family): else: invalidInputError(False, f"{model_family} is not supported.") + + +def apply_rotary_pos_emb_no_cache_xpu(q, k, position_ids, model_family): + if q.device.type != "xpu": + invalidInputError(False, + f"only xpu is supported in this function") + import linear_q4_0 + q_embed = torch.empty(q.shape, dtype=q.dtype, device=q.device) + k_embed = torch.empty(k.shape, dtype=k.dtype, device=k.device) + if model_family in ["llama", "baichuan", "internlm", "aquila", "gpt_neox"]: + linear_q4_0.apply_rotary_embedding_half_qk(q, k, position_ids, q_embed, k_embed) + return q_embed, k_embed + else: + invalidInputError(False, + f"{model_family} is not supported.") diff --git a/python/llm/src/bigdl/llm/transformers/qlora.py b/python/llm/src/bigdl/llm/transformers/qlora.py index d2728f08..2b074105 100644 --- a/python/llm/src/bigdl/llm/transformers/qlora.py +++ b/python/llm/src/bigdl/llm/transformers/qlora.py @@ -36,6 +36,7 @@ import torch from bigdl.llm.transformers.low_bit_linear import LowBitLinear from peft.tuners.lora import LoraLayer from bigdl.llm.utils.common import invalidInputError +import functools class LoraLowBitLinear(LowBitLinear, LoraLayer): @@ -94,13 +95,11 @@ class LoraLowBitLinear(LowBitLinear, LoraLayer): return result -@staticmethod -def _create_new_module(lora_config, adapter_name, target, **kwargs): - - bias = kwargs.pop("bias", False) +def _create_new_module(create_new_module_func, lora_config, adapter_name, target, **kwargs): if isinstance(target, LowBitLinear): low_bit_kwargs = kwargs.copy() + bias = low_bit_kwargs.pop("bias", False) low_bit_kwargs.update( { "qtype": target.qtype, @@ -112,9 +111,7 @@ def _create_new_module(lora_config, adapter_name, target, **kwargs): bias=bias, **low_bit_kwargs) else: - invalidInputError(False, - f"Target module {target} is not supported. " - f"Currently, only `LowBitLinear` are supported.") + new_module = create_new_module_func(lora_config, adapter_name, target, **kwargs) return new_module @@ -124,7 +121,8 @@ from peft.tuners.lora import LoraModel def get_peft_model(*args, **kwargs): old_create_new_module = LoraModel._create_new_module - LoraModel._create_new_module = _create_new_module + LoraModel._create_new_module = staticmethod(functools.partial(_create_new_module, + old_create_new_module)) try: from peft import get_peft_model as get_peft_model_original model = get_peft_model_original(*args, **kwargs) @@ -181,7 +179,8 @@ class PeftModel: def from_pretrained(*args, **kwargs): old_create_new_module = LoraModel._create_new_module - LoraModel._create_new_module = _create_new_module + LoraModel._create_new_module = staticmethod(functools.partial(_create_new_module, + old_create_new_module)) from peft import PeftModel try: model = PeftModel.from_pretrained(*args, **kwargs)
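The `qlora.py` change above follows a wrap-and-delegate pattern: the original static factory is captured, bound as the first argument of the replacement via `functools.partial`, and used as the fallback for every target that is not a `LowBitLinear`. A minimal, self-contained sketch of the same pattern, with hypothetical names rather than the real peft/BigDL classes:

```python
import functools


class LoraModelSketch:
    """Hypothetical stand-in for peft's LoraModel, used only to illustrate the patch."""

    @staticmethod
    def _create_new_module(lora_config, adapter_name, target, **kwargs):
        return f"default module for {target!r}"


def _create_low_bit_module(create_new_module_func, lora_config, adapter_name, target, **kwargs):
    # Handle the special target ourselves; delegate everything else to the original factory.
    if target == "low-bit-linear":
        return "LoraLowBitLinear(...)"
    return create_new_module_func(lora_config, adapter_name, target, **kwargs)


# Capture the original factory, then swap in the wrapped one (bound via functools.partial).
old_create_new_module = LoraModelSketch._create_new_module
LoraModelSketch._create_new_module = staticmethod(
    functools.partial(_create_low_bit_module, old_create_new_module))
try:
    print(LoraModelSketch._create_new_module(None, "default", "low-bit-linear"))
    print(LoraModelSketch._create_new_module(None, "default", "nn.Linear"))
finally:
    # Restore the original factory so the patch stays scoped to this call.
    LoraModelSketch._create_new_module = old_create_new_module
```

Wrapping the partial in `staticmethod` mirrors the descriptor type of the attribute being replaced, and capturing `old_create_new_module` before the swap is what makes the scoped fallback possible.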