Merge remote-tracking branch 'upstream/main'

This commit is contained in:
Wang 2023-10-07 09:53:52 +08:00
commit 4aee952b10
26 changed files with 1181 additions and 42 deletions

View file

@ -12,6 +12,7 @@ on:
- all
- bigdl-llm-xpu
- bigdl-llm-cpu
- bigdl-llm-serving-cpu
- bigdl-ppml-gramine-base
- bigdl-ppml-trusted-bigdl-llm-gramine-base
- bigdl-ppml-trusted-bigdl-llm-gramine-ref
@ -114,6 +115,32 @@ jobs:
sudo docker push 10.239.45.10/arda/${image}:${TAG}
sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
bigdl-llm-serving-cpu:
if: ${{ github.event.inputs.artifact == 'bigdl-llm-serving-cpu' || github.event.inputs.artifact == 'all' }}
runs-on: [self-hosted, Shire]
steps:
- uses: actions/checkout@v3
- name: docker login
run: |
docker login -u ${DOCKERHUB_USERNAME} -p ${DOCKERHUB_PASSWORD}
- name: bigdl-llm-serving-cpu
run: |
echo "##############################################################"
echo "####### bigdl-llm-serving-cpu ########"
echo "##############################################################"
export image=intelanalytics/bigdl-llm-serving-cpu
cd docker/llm/serving/cpu/docker
sudo docker build \
--no-cache=true \
--build-arg http_proxy=${HTTP_PROXY} \
--build-arg https_proxy=${HTTPS_PROXY} \
--build-arg no_proxy=${NO_PROXY} \
-t ${image}:${TAG} -f ./Dockerfile .
sudo docker push ${image}:${TAG}
sudo docker tag ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
sudo docker push 10.239.45.10/arda/${image}:${TAG}
sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
bigdl-ppml-gramine-base:
if: ${{ github.event.inputs.artifact == 'bigdl-ppml-gramine-base' || github.event.inputs.artifact == 'all' }}
runs-on: [self-hosted, Shire]

View file

@ -9,12 +9,13 @@
**[`bigdl-llm`](python/llm)** is a library for running **LLM** (large language model) on Intel **XPU** (from *Laptop* to *GPU* to *Cloud*) using **INT4** with very low latency[^1] (for any **PyTorch** model).
> *It is built on top of the excellent work of [llama.cpp](https://github.com/ggerganov/llama.cpp), [gptq](https://github.com/IST-DASLab/gptq), [ggml](https://github.com/ggerganov/ggml), [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), [bitsandbytes](https://github.com/TimDettmers/bitsandbytes), [qlora](https://github.com/artidoro/qlora), [gptq_for_llama](https://github.com/qwopqwop200/GPTQ-for-LLaMa), [chatglm.cpp](https://github.com/li-plus/chatglm.cpp), [redpajama.cpp](https://github.com/togethercomputer/redpajama.cpp), [gptneox.cpp](https://github.com/byroneverson/gptneox.cpp), [bloomz.cpp](https://github.com/NouamaneTazi/bloomz.cpp/), etc.*
> *It is built on top of the excellent work of [llama.cpp](https://github.com/ggerganov/llama.cpp), [ggml](https://github.com/ggerganov/ggml), [gptq](https://github.com/IST-DASLab/gptq), [bitsandbytes](https://github.com/TimDettmers/bitsandbytes), [qlora](https://github.com/artidoro/qlora), [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), [gptq_for_llama](https://github.com/qwopqwop200/GPTQ-for-LLaMa), [chatglm.cpp](https://github.com/li-plus/chatglm.cpp), [redpajama.cpp](https://github.com/togethercomputer/redpajama.cpp), [gptneox.cpp](https://github.com/byroneverson/gptneox.cpp), [bloomz.cpp](https://github.com/NouamaneTazi/bloomz.cpp/), etc.*
### Latest update
- **[New]** `bigdl-llm` now supports QLoRA finetuning on Intel GPU; see the example [here](python/llm/example/gpu/qlora_finetuning).
- `bigdl-llm` now supports Intel GPU (including Arc, Flex and MAX); see the latest GPU examples [here](python/llm/example/gpu).
- `bigdl-llm` tutorial is released [here](https://github.com/intel-analytics/bigdl-llm-tutorial).
- Over 20 models have been optimized/verified on `bigdl-llm`, including *LLaMA/LLaMA2, ChatGLM/ChatGLM2, MPT, Falcon, Dolly-v1/Dolly-v2, StarCoder, Whisper, InternLM, QWen, Baichuan, MOSS,* and more; see the complete list [here](python/llm/README.md#verified-models).
- Over 20 models have been optimized/verified on `bigdl-llm`, including *LLaMA/LLaMA2, ChatGLM/ChatGLM2, MPT, Falcon, Dolly, StarCoder, Whisper, InternLM, QWen, Baichuan, Aquila, MOSS,* and more; see the complete list [here](python/llm/README.md#verified-models).
### `bigdl-llm` Demos
See the ***optimized performance*** of `chatglm2-6b` and `llama-2-13b-chat` models on 12th Gen Intel Core CPU and Intel Arc GPU below.

View file

@ -2,10 +2,13 @@ FROM intelanalytics/bigdl-llm-cpu:2.4.0-SNAPSHOT
ARG http_proxy
ARG https_proxy
ARG TINI_VERSION=v0.18.0
# Disable pip's cache behavior
ARG PIP_NO_CACHE_DIR=false
ADD ./entrypoint.sh /opt/entrypoint.sh
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /sbin/tini
# Install Serving Dependencies
RUN mkdir /llm && \
cd /llm && \
@ -13,7 +16,11 @@ RUN mkdir /llm && \
cd FastChat && \
git checkout dev-2023-09-22 && \
pip3 install -e ".[model_worker,webui]" && \
cd /llm
cd /llm && \
chmod +x /opt/entrypoint.sh && \
chmod +x /sbin/tini && \
cp /sbin/tini /usr/bin/tini
WORKDIR /llm/
ENTRYPOINT [ "/opt/entrypoint.sh" ]

View file

@ -0,0 +1,200 @@
#!/bin/bash
usage() {
echo "Usage: $0 [-m --mode <controller|worker>] [-h --help]"
echo "-h: Print help message."
echo "Controller mode reads the following env:"
echo "CONTROLLER_HOST (default: localhost)."
echo "CONTROLLER_PORT (default: 21001)."
echo "API_HOST (default: localhost)."
echo "API_PORT (default: 8000)."
echo "Worker mode reads the following env:"
echo "CONTROLLER_HOST (default: localhost)."
echo "CONTROLLER_PORT (default: 21001)."
echo "WORKER_HOST (default: localhost)."
echo "WORKER_PORT (default: 21002)."
echo "MODEL_PATH (default: empty)."
exit 1
}
# Acquire the correct number of cores when cpuset-cpus is used; return -1 if the file does not exist
calculate_total_cores() {
local cpuset_file="/sys/fs/cgroup/cpuset/cpuset.cpus"
if [[ -f "$cpuset_file" ]]; then
local cpuset_cpus=$(cat "$cpuset_file")
cpuset_cpus=$(echo "${cpuset_cpus}" | tr -d '\n')
local total_cores=0
IFS=',' read -ra cpu_list <<< "$cpuset_cpus"
for cpu in "${cpu_list[@]}"; do
if [[ $cpu =~ - ]]; then
# Range of CPUs
local start_cpu=$(echo "$cpu" | cut -d'-' -f1)
local end_cpu=$(echo "$cpu" | cut -d'-' -f2)
local range_cores=$((end_cpu - start_cpu + 1))
total_cores=$((total_cores + range_cores))
else
# Single CPU
total_cores=$((total_cores + 1))
fi
done
echo $total_cores
return
fi
# Kubernetes core-binding will use this file
cpuset_file="/sys/fs/cgroup/cpuset.cpus"
if [[ -f "$cpuset_file" ]]; then
local cpuset_cpus=$(cat "$cpuset_file")
cpuset_cpus=$(echo "${cpuset_cpus}" | tr -d '\n')
local total_cores=0
IFS=',' read -ra cpu_list <<< "$cpuset_cpus"
for cpu in "${cpu_list[@]}"; do
if [[ $cpu =~ - ]]; then
# Range of CPUs
local start_cpu=$(echo "$cpu" | cut -d'-' -f1)
local end_cpu=$(echo "$cpu" | cut -d'-' -f2)
local range_cores=$((end_cpu - start_cpu + 1))
total_cores=$((total_cores + range_cores))
else
# Single CPU
total_cores=$((total_cores + 1))
fi
done
echo $total_cores
return
else
echo -1
return
fi
}
# Default values
controller_host="localhost"
controller_port="21001"
api_host="localhost"
api_port="8000"
worker_host="localhost"
worker_port="21002"
model_path=""
mode=""
omp_num_threads=""
dispatch_method="shortest_queue" # shortest_queue or lottery
# Update rootCA config if needed
update-ca-certificates
# Remember the value of `OMP_NUM_THREADS`:
if [[ -n "${OMP_NUM_THREADS}" ]]; then
omp_num_threads="${OMP_NUM_THREADS}"
fi
# If no arguments are passed in, just run bash
if [ "$#" == 0 ]; then
echo "[INFO] no command is passed in"
echo "[INFO] enter pass-through mode"
exec /usr/bin/tini -s -- "bash"
else
# Parse command-line options
options=$(getopt -o "m:h" --long "mode:,help" -n "$0" -- "$@")
if [ $? != 0 ]; then
usage
fi
eval set -- "$options"
while true; do
case "$1" in
-m|--mode)
mode="$2"
[[ $mode == "controller" || $mode == "worker" ]] || usage
shift 2
;;
-h|--help)
usage
;;
--)
shift
break
;;
*)
usage
;;
esac
done
if [[ -n $CONTROLLER_HOST ]]; then
controller_host=$CONTROLLER_HOST
fi
if [[ -n $CONTROLLER_PORT ]]; then
controller_port=$CONTROLLER_PORT
fi
if [[ -n $API_HOST ]]; then
api_host=$API_HOST
fi
if [[ -n $API_PORT ]]; then
api_port=$API_PORT
fi
if [[ -n $WORKER_HOST ]]; then
worker_host=$WORKER_HOST
fi
if [[ -n $WORKER_PORT ]]; then
worker_port=$WORKER_PORT
fi
if [[ -n $MODEL_PATH ]]; then
model_path=$MODEL_PATH
fi
if [[ -n $DISPATCH_METHOD ]]; then
dispatch_method=$DISPATCH_METHOD
fi
controller_address="http://$controller_host:$controller_port"
# Execute logic based on options
if [[ $mode == "controller" ]]; then
# Logic for controller mode
# Boot Controller
api_address="http://$api_host:$api_port"
echo "Controller address: $controller_address"
echo "OpenAI API address: $api_address"
python3 -m fastchat.serve.controller --host $controller_host --port $controller_port --dispatch-method $dispatch_method &
# Boot openai api server
python3 -m fastchat.serve.openai_api_server --host $api_host --port $api_port --controller-address $controller_address
else
# Logic for non-controller(worker) mode
worker_address="http://$worker_host:$worker_port"
# Apply optimizations from bigdl-nano
source bigdl-nano-init -t
# First check whether the user has set OMP_NUM_THREADS themselves
if [[ -n "${omp_num_threads}" ]]; then
echo "Setting OMP_NUM_THREADS to its original value: $omp_num_threads"
export OMP_NUM_THREADS=$omp_num_threads
else
# Use calculate_total_cores to acquire cpuset settings
# Set OMP_NUM_THREADS to correct numbers
cores=$(calculate_total_cores)
if [[ $cores == -1 || $cores == 0 ]]; then
echo "Failed to obtain the number of cores, will use the default settings OMP_NUM_THREADS=$OMP_NUM_THREADS"
else
echo "Setting OMP_NUM_THREADS to $cores"
export OMP_NUM_THREADS=$cores
fi
fi
if [[ -z "${model_path}" ]]; then
echo "Please set env MODEL_PATH used for worker"
usage
fi
echo "Worker address: $worker_address"
echo "Controller address: $controller_address"
python3 -m fastchat.serve.model_worker --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address
fi
fi

View file

@ -0,0 +1,235 @@
## Deploying the bigdl-llm serving service in a K8s environment
## Image
To deploy the BigDL-LLM serving service (CPU) in a Kubernetes environment, please use this image: `intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT`
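If the image is not already present on your nodes, you can pull it ahead of time (optional; with the `IfNotPresent` pull policy used in the manifests below, Kubernetes will also pull it on demand):
```bash
docker pull intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
```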
## Before deployment
### Models
In this document, we will use `vicuna-7b-v1.5` as the deployment model.
After downloading the model, please rename it from `vicuna-7b-v1.5` to `vicuna-7b-v1.5-bigdl` so that `bigdl-llm` is used as the backend. The `bigdl-llm` backend is selected whenever the model path contains `bigdl`; otherwise, the original transformers backend is used.
You can download the model from [here](https://huggingface.co/lmsys/vicuna-7b-v1.5).
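One way to do this with git (a sketch; any download method works, this one requires `git-lfs`):
```bash
# Download vicuna-7b-v1.5 from Hugging Face and rename the directory so that
# its path contains "bigdl", which selects the bigdl-llm backend.
git lfs install
git clone https://huggingface.co/lmsys/vicuna-7b-v1.5
mv vicuna-7b-v1.5 vicuna-7b-v1.5-bigdl
```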
### Kubernetes config
We recommend setting up your Kubernetes cluster before deployment. Most importantly, please set the `cpu-management-policy` to `static` by following this [tutorial](https://kubernetes.io/docs/tasks/administer-cluster/cpu-management-policies/). It is also recommended to set the `topology management policy` to `single-numa-node`.
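For reference, a rough sketch of the node-level changes involved (illustrative only; please follow the linked tutorial for the exact procedure, and note that the static CPU manager also requires reserving some CPUs for the system, e.g. via `--reserved-cpus` or `--kube-reserved`):
```bash
# Changing the CPU manager policy requires removing the old state file first.
sudo rm -f /var/lib/kubelet/cpu_manager_state
# Example kubelet flags (normally set in the kubelet config file or systemd unit):
#   --cpu-manager-policy=static
#   --topology-manager-policy=single-numa-node
#   --reserved-cpus=0
sudo systemctl restart kubelet
```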
### Machine config
Turn hyper-threading off so that only physical cores are used during deployment.
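A quick way to verify whether hyper-threading is active on a node (disabling it is typically done in the BIOS and depends on your platform):
```bash
# "Thread(s) per core: 1" means only physical cores are exposed.
lscpu | grep "Thread(s) per core"
# On recent kernels, 0 means SMT is disabled.
cat /sys/devices/system/cpu/smt/active
```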
## Deployment
### Reminder on `OMP_NUM_THREADS`
The entrypoint of the image will try to set `OMP_NUM_THREADS` to the correct value by reading the cpuset configuration from the container runtime. However, this only works correctly if the `core-binding` feature is enabled. If it is not, please set the `OMP_NUM_THREADS` environment variable manually in the yaml file.
### Controller
We use the following yaml file for controller deployment:
```yaml
apiVersion: v1
kind: Pod
metadata:
name: bigdl-fschat-a1234bd-controller
labels:
fastchat-appid: a1234bd
fastchat-app-type: controller
spec:
dnsPolicy: "ClusterFirst"
containers:
- name: fastchat-controller # fixed
image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
imagePullPolicy: IfNotPresent
env:
- name: CONTROLLER_HOST # fixed
value: "0.0.0.0"
- name: CONTROLLER_PORT # fixed
value: "21005"
- name: API_HOST # fixed
value: "0.0.0.0"
- name: API_PORT # fixed
value: "8000"
ports:
- containerPort: 21005
name: con-port
- containerPort: 8000
name: api-port
resources:
requests:
memory: 16Gi
cpu: 4
limits:
memory: 16Gi
cpu: 4
args: ["-m", "controller"]
restartPolicy: "Never"
---
# Service for the controller
apiVersion: v1
kind: Service
metadata:
name: bigdl-a1234bd-fschat-controller-service
spec:
# You may want to change this to match how services are exposed in your cluster
type: NodePort
selector:
fastchat-appid: a1234bd
fastchat-app-type: controller
ports:
- name: cont-port
protocol: TCP
port: 21005
targetPort: 21005
- name: api-port
protocol: TCP
port: 8000
targetPort: 8000
```
### Worker
We use the following yaml file for the worker deployment:
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: bigdl-fschat-a1234bd-worker-deployment
spec:
# Change this to the number you want
replicas: 1
selector:
matchLabels:
fastchat: worker
template:
metadata:
labels:
fastchat: worker
spec:
dnsPolicy: "ClusterFirst"
containers:
- name: fastchat-worker # fixed
image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
imagePullPolicy: IfNotPresent
env:
- name: CONTROLLER_HOST # fixed
value: bigdl-a1234bd-fschat-controller-service
- name: CONTROLLER_PORT # fixed
value: "21005"
- name: WORKER_HOST # fixed
valueFrom:
fieldRef:
fieldPath: status.podIP
- name: WORKER_PORT # fixed
value: "21841"
- name: MODEL_PATH # Change this
value: "/llm/models/vicuna-7b-v1.5-bigdl/"
- name: OMP_NUM_THREADS
value: "16"
resources:
requests:
memory: 32Gi
cpu: 16
limits:
memory: 32Gi
cpu: 16
args: ["-m", "worker"]
volumeMounts:
- name: llm-models
mountPath: /llm/models/
restartPolicy: "Always"
volumes:
- name: llm-models
hostPath:
path: /home/llm/models # change this in other envs
```
You may want to change the `MODEL_PATH` variable in the yaml. Also, please remember to change the volume path accordingly.
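To deploy, apply the manifests and check that the pods come up, for example (assuming the combined manifest is saved as `deployment.yaml`, the same file used by the delete script):
```bash
kubectl apply -f deployment.yaml
kubectl get pods -l fastchat=worker           # worker pods created by the Deployment
kubectl logs bigdl-fschat-a1234bd-controller  # controller logs
```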
### Testing
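The examples below assume the OpenAI-compatible API is reachable at `localhost:8000`. If you are testing from outside the cluster, one option (besides the NodePort exposed above) is to port-forward the controller service; a sketch, assuming the service name from the yaml:
```bash
kubectl port-forward service/bigdl-a1234bd-fschat-controller-service 8000:8000
```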
#### Using openai-python
First, install openai-python:
```bash
pip install --upgrade openai
```
Then, interact with the model `vicuna-7b-v1.5-bigdl`:
```python
import openai
openai.api_key = "EMPTY"
openai.api_base = "http://localhost:8000/v1"
model = "vicuna-7b-v1.5-bigdl"
prompt = "Once upon a time"
# create a completion
completion = openai.Completion.create(model=model, prompt=prompt, max_tokens=64)
# print the completion
print(prompt + completion.choices[0].text)
# create a chat completion
completion = openai.ChatCompletion.create(
model=model,
messages=[{"role": "user", "content": "Hello! What is your name?"}]
)
# print the completion
print(completion.choices[0].message.content)
```
#### cURL
cURL is another good tool for inspecting the output of the API.
In the following examples, you may need to change the service address to match your deployment.
List Models:
```bash
curl http://localhost:8000/v1/models
```
If you have `jq` installed, you can use it to format the output like this:
```bash
curl http://localhost:8000/v1/models | jq
```
Chat Completions:
```bash
curl http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "YOUR_MODEL",
"messages": [{"role": "user", "content": "Hello! What is your name?"}]
}'
```
Text Completions:
```bash
curl http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "YOUR_MODEL",
"prompt": "Once upon a time",
"max_tokens": 41,
"temperature": 0.5
}'
```
Embeddings:
```bash
curl http://localhost:8000/v1/embeddings \
-H "Content-Type: application/json" \
-d '{
"model": "YOUR_MODEL",
"input": "Hello world!"
}'
```

View file

@ -0,0 +1 @@
kubectl delete -f deployment.yaml

View file

@ -0,0 +1,109 @@
apiVersion: v1
kind: Pod
metadata:
name: bigdl-fschat-a1234bd-controller
labels:
fastchat-appid: a1234bd
fastchat-app-type: controller
spec:
dnsPolicy: "ClusterFirst"
containers:
- name: fastchat-controller # fixed
image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
imagePullPolicy: IfNotPresent
env:
- name: CONTROLLER_HOST # fixed
value: "0.0.0.0"
- name: CONTROLLER_PORT # fixed
value: "21005"
- name: API_HOST # fixed
value: "0.0.0.0"
- name: API_PORT # fixed
value: "8000"
ports:
- containerPort: 21005
name: con-port
- containerPort: 8000
name: api-port
resources:
requests:
memory: 16Gi
cpu: 4
limits:
memory: 16Gi
cpu: 4
args: ["-m", "controller"]
restartPolicy: "Never"
---
# Service for the controller
apiVersion: v1
kind: Service
metadata:
name: bigdl-a1234bd-fschat-controller-service
spec:
# You may want to change this to match how services are exposed in your cluster
type: NodePort
selector:
fastchat-appid: a1234bd
fastchat-app-type: controller
ports:
- name: cont-port
protocol: TCP
port: 21005
targetPort: 21005
- name: api-port
protocol: TCP
port: 8000
targetPort: 8000
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: bigdl-fschat-a1234bd-worker-deployment
spec:
# Change this to the number you want
replicas: 1
selector:
matchLabels:
fastchat: worker
template:
metadata:
labels:
fastchat: worker
spec:
dnsPolicy: "ClusterFirst"
containers:
- name: fastchat-worker # fixed
image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
imagePullPolicy: IfNotPresent
env:
- name: CONTROLLER_HOST # fixed
value: bigdl-a1234bd-fschat-controller-service
- name: CONTROLLER_PORT # fixed
value: "21005"
- name: WORKER_HOST # fixed
valueFrom:
fieldRef:
fieldPath: status.podIP
- name: WORKER_PORT # fixed
value: "21841"
- name: MODEL_PATH # Change this
value: "/llm/models/vicuna-7b-v1.5-bigdl/"
- name: OMP_NUM_THREADS
value: "16"
resources:
requests:
memory: 32Gi
cpu: 16
limits:
memory: 32Gi
cpu: 16
args: ["-m", "worker"]
volumeMounts:
- name: llm-models
mountPath: /llm/models/
restartPolicy: "Always"
volumes:
- name: llm-models
hostPath:
path: /home/llm/models # change this in other envs

View file

@ -38,12 +38,12 @@ subtrees:
title: "Key Features"
subtrees:
- entries:
- file: doc/LLM/Overview/KeyFeatures/optimize_model
- file: doc/LLM/Overview/KeyFeatures/transformers_style_api
subtrees:
- entries:
- file: doc/LLM/Overview/KeyFeatures/hugging_face_format
- file: doc/LLM/Overview/KeyFeatures/native_format
- file: doc/LLM/Overview/KeyFeatures/optimize_model
- file: doc/LLM/Overview/KeyFeatures/langchain_api
# - file: doc/LLM/Overview/KeyFeatures/cli
- file: doc/LLM/Overview/KeyFeatures/gpu_supports

View file

@ -3,12 +3,12 @@ BigDL-LLM Key Features
You may run the LLMs using ``bigdl-llm`` through one of the following APIs:
* `PyTorch API <./optimize_model.html>`_
* |transformers_style_api|_
* |hugging_face_transformers_format|_
* `Native Format <./native_format.html>`_
* `General PyTorch Model Supports <./langchain_api.html>`_
* `LangChain API <./langchain_api.html>`_
* `GPU Supports <./gpu_supports.html>`_

View file

@ -1,22 +1,27 @@
## General PyTorch Model Supports
## PyTorch API
You may apply BigDL-LLM optimizations on any Pytorch models, not only Hugging Face *Transformers* models for acceleration. With BigDL-LLM, PyTorch models (in FP16/BF16/FP32) can be optimized with low-bit quantizations (supported precisions include INT4/INT5/INT8).
In general, you only need one line, `optimize_model`, to easily optimize any loaded PyTorch model, regardless of the library or API you are using. With BigDL-LLM, PyTorch models (in FP16/BF16/FP32) can be optimized with low-bit quantization (supported precisions include INT4, INT5, INT8, etc.).
You can easily enable BigDL-LLM INT4 optimizations on any Pytorch models just as follows:
First, use any PyTorch APIs you like to load your model. To help you better understand the process, here we use [Hugging Face Transformers](https://huggingface.co/docs/transformers/index) library `LlamaForCausalLM` to load a popular model [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) as an example:
```python
# Create or load any Pytorch model
model = ...
# Create or load any Pytorch model, take Llama-2-7b-chat-hf as an example
from transformers import LlamaForCausalLM
model = LlamaForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', torch_dtype='auto', low_cpu_mem_usage=True)
```
# Add only two lines to enable BigDL-LLM INT4 optimizations on model
Then, you just need to call `optimize_model` to optimize the loaded model; INT4 optimization is applied to the model by default:
```python
from bigdl.llm import optimize_model
# With only one line to enable BigDL-LLM INT4 optimization
model = optimize_model(model)
```
After optimizing the model, you may straightly run the optimized model with no API changed and less inference latency.
After optimizing the model, BigDL-LLM does not require any change to the inference code. You can use any library to run the optimized model with very low latency.
```eval_rst
.. seealso::
See the examples for Hugging Face *Transformers* models `here <https://github.com/intel-analytics/BigDL/blob/main/python/llm/example/transformers/general_int4>`_. And examples for other general Pytorch models can be found `here <https://github.com/intel-analytics/BigDL/blob/main/python/llm/example/pytorch-model>`_.
* For more detailed usage of ``optimize_model``, please refer to the `API documentation <https://bigdl.readthedocs.io/en/latest/doc/PythonAPI/LLM/optimize.html>`_.
```

View file

@ -5,9 +5,11 @@
Install BigDL-LLM for CPU supports using pip through:
```bash
pip install bigdl-llm[all]
pip install --pre --upgrade bigdl-llm[all] # install the latest bigdl-llm nightly build with 'all' option
```
Please refer to [Environment Setup](#environment-setup) for more information.
```eval_rst
.. note::
@ -43,7 +45,7 @@ First we recommend using [Conda](https://docs.conda.io/en/latest/miniconda.html)
conda create -n llm python=3.9
conda activate llm
pip install bigdl-llm[all] # install bigdl-llm for CPU with 'all' option
pip install --pre --upgrade bigdl-llm[all] # install the latest bigdl-llm nightly build with 'all' option
```
Then, for running an LLM model with BigDL-LLM optimizations (taking `example.py` as an example):

View file

@ -5,9 +5,11 @@
Install BigDL-LLM for GPU supports using pip through:
```bash
pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu
pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu # install bigdl-llm for GPU
```
Please refer to [Environment Setup](#environment-setup) for more information.
```eval_rst
.. note::
@ -25,6 +27,12 @@ BigDL-LLM for GPU supports has been verified on:
* Intel Arc™ A-Series Graphics
* Intel Data Center GPU Flex Series
```eval_rst
.. note::
We currently support Ubuntu 20.04 or later. Windows support is in progress.
```
To apply Intel GPU acceleration, several steps of tool installation and environment preparation are required:
* Step 1: only Linux is supported for now; Ubuntu 22.04 is preferred.

View file

@ -32,8 +32,8 @@ BigDL-LLM
+++
:bdg-link:`PyTorch <./Overview/KeyFeatures/optimize_model.html>` |
:bdg-link:`transformers-style <./Overview/KeyFeatures/transformers_style_api.html>` |
:bdg-link:`Optimize Model <./Overview/KeyFeatures/optimize_model.html>` |
:bdg-link:`LangChain <./Overview/KeyFeatures/langchain_api.html>` |
:bdg-link:`GPU <./Overview/KeyFeatures/gpu_supports.html>`

View file

@ -4,6 +4,6 @@ BigDL-LLM API
.. toctree::
:maxdepth: 3
optimize.rst
transformers.rst
langchain.rst
optimize.rst

View file

@ -1,4 +1,4 @@
BigDL-LLM Optimize API
BigDL-LLM PyTorch API
=====================
llm.optimize

View file

@ -24,9 +24,10 @@ BigDL-LLM: low-Bit LLM library
============================================
Latest update
============================================
- **[New]** ``bigdl-llm`` now supports QLoRA finetuning on Intel GPU; see the example `here <https://github.com/intel-analytics/BigDL/tree/main/python/llm/example/gpu/qlora_finetuning>`_.
- ``bigdl-llm`` now supports Intel GPU (including Arc, Flex and MAX); see the latest GPU examples `here <https://github.com/intel-analytics/BigDL/tree/main/python/llm/example/gpu>`_.
- ``bigdl-llm`` tutorial is released `here <https://github.com/intel-analytics/bigdl-llm-tutorial>`_.
- Over 20 models have been verified on ``bigdl-llm``, including *LLaMA/LLaMA2, ChatGLM/ChatGLM2, MPT, Falcon, Dolly-v1/Dolly-v2, StarCoder, Whisper, InternLM, QWen, Baichuan, MOSS* and more; see the complete list `here <https://github.com/intel-analytics/BigDL/tree/main/python/llm/README.md#verified-models>`_.
- Over 20 models have been verified on ``bigdl-llm``, including *LLaMA/LLaMA2, ChatGLM/ChatGLM2, MPT, Falcon, Dolly, StarCoder, Whisper, InternLM, QWen, Baichuan, Aquila, MOSS* and more; see the complete list `here <https://github.com/intel-analytics/BigDL/tree/main/python/llm/README.md#verified-models>`_.
============================================

View file

@ -1,4 +1,4 @@
# Q-Lora (experimental support)
# Finetuning LLAMA Using Q-Lora (experimental support)
This example demonstrates how to finetune a llama2-7b model with BigDL-LLM 4-bit optimizations using [Intel GPUs](../README.md).
@ -7,7 +7,7 @@ To run this example with BigDL-LLM on Intel GPUs, we have some recommended requi
## Example: Finetune llama2-7b using qlora
This example is ported from [bnb-4bit-training](https://colab.research.google.com/drive/1VoYNfYDKcKRQRor98Zbf2-9VQTtGJ24k?usp=sharing)
This example is ported from [bnb-4bit-training](https://colab.research.google.com/drive/1VoYNfYDKcKRQRor98Zbf2-9VQTtGJ24k?usp=sharing). The `export_merged_model.py` is ported from [alpaca-lora](https://github.com/tloen/alpaca-lora/blob/main/export_hf_checkpoint.py).
### 1. Install
@ -26,13 +26,13 @@ pip install peft==0.5.0
source /opt/intel/oneapi/setvars.sh
```
### 3. Run
### 3. Finetune model
```
python ./qlora_finetuning.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH
```
### Sample Output
#### Sample Output
```log
{'loss': 1.6134, 'learning_rate': 0.0002, 'epoch': 0.03}
{'loss': 1.3038, 'learning_rate': 0.00017777777777777779, 'epoch': 0.06}
@ -47,4 +47,12 @@ python ./qlora_finetuning.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH
{'train_runtime': 225.8005, 'train_samples_per_second': 3.543, 'train_steps_per_second': 0.886, 'train_loss': 1.211241865158081, 'epoch': 0.32}
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [03:45<00:00, 1.13s/it]
TrainOutput(global_step=200, training_loss=1.211241865158081, metrics={'train_runtime': 225.8005, 'train_samples_per_second': 3.543, 'train_steps_per_second': 0.886, 'train_loss': 1.211241865158081, 'epoch': 0.32})
```
```
### 4. Merge the adapter into the original model
```
python ./export_merged_model.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH --adapter_path ./outputs/checkpoint-200 --output_path ./outputs/checkpoint-200-merged
```
Then you can use `./outputs/checkpoint-200-merged` as a normal Hugging Face Transformers model for inference.

View file

@ -0,0 +1,93 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This file is adapted from https://github.com/tloen/alpaca-lora/blob/main/export_hf_checkpoint.py
#
# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import torch
import transformers
from transformers import LlamaTokenizer # noqa: F402
from bigdl.llm.transformers.qlora import PeftModel
from bigdl.llm.transformers import AutoModelForCausalLM
import argparse
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Llama2 model')
parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-hf",
help='The huggingface repo id for the Llama2 (e.g. `meta-llama/Llama-2-7b-hf` and `meta-llama/Llama-2-13b-chat-hf`) to be downloaded'
', or the path to the huggingface checkpoint folder')
parser.add_argument('--adapter_path', type=str,)
parser.add_argument('--output_path', type=str,)
args = parser.parse_args()
base_model = model_path = args.repo_id_or_model_path
adapter_path = args.adapter_path
tokenizer = LlamaTokenizer.from_pretrained(base_model)
base_model = AutoModelForCausalLM.from_pretrained(
base_model,
# load_in_low_bit="nf4", # should load the original model
torch_dtype=torch.float16,
device_map={"": "cpu"},
)
first_weight = base_model.model.layers[0].self_attn.q_proj.weight
first_weight_old = first_weight.clone()
lora_model = PeftModel.from_pretrained(
base_model,
adapter_path,
device_map={"": "cpu"},
torch_dtype=torch.float16,
)
lora_weight = lora_model.base_model.model.model.layers[
0
].self_attn.q_proj.weight
assert torch.allclose(first_weight_old, first_weight)
# merge weights - new merging method from peft
lora_model = lora_model.merge_and_unload()
lora_model.train(False)
# did we do anything?
assert not torch.allclose(first_weight_old, first_weight)
lora_model_sd = lora_model.state_dict()
deloreanized_sd = {
k.replace("base_model.model.", ""): v
for k, v in lora_model_sd.items()
if "lora" not in k
}
base_model.save_pretrained(args.output_path, state_dict=deloreanized_sd)
tokenizer.save_pretrained(args.output_path)

View file

@ -45,8 +45,9 @@ if __name__ == "__main__":
data = load_dataset(dataset_path)
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
model = AutoModelForCausalLM.from_pretrained(model_path,
load_in_4bit=True,
load_in_low_bit="nf4",
optimize_model=False,
torch_dtype=torch.float16,
modules_to_not_convert=["lm_head"],)
model = model.to('xpu')
model.gradient_checkpointing_enable()
@ -71,7 +72,8 @@ if __name__ == "__main__":
warmup_steps=20,
max_steps=200,
learning_rate=2e-4,
fp16=False, # fp16 is not supported yet
save_steps=100,
fp16=True,
logging_steps=20,
output_dir="outputs",
optim="adamw_hf", # paged_adamw_8bit is not supported yet

View file

@ -47,7 +47,11 @@ function starcoder {
}
function chatglm {
command="$lib_dir/main-chatglm_vnni -t $threads -n $n_predict ${filteredArguments[*]}"
if [[ $(lscpu | grep "amx_int8") ]]; then
command="$lib_dir/main-chatglm_amx -t $threads -n $n_predict ${filteredArguments[*]}"
else
command="$lib_dir/main-chatglm_vnni -t $threads -n $n_predict ${filteredArguments[*]}"
fi
echo "$command"
eval "$command"
}

View file

@ -135,6 +135,7 @@ def convert_forward(m, target_m, new_forward):
def optimize(model):
from packaging import version
from bigdl.llm.transformers.models.llama import llama_attention_forward_4_31
from bigdl.llm.transformers.models.llama import llama_rms_norm_forward
from transformers.modeling_utils import PreTrainedModel
# All huggingface format models are inherited from `PreTrainedModel`
@ -149,11 +150,16 @@ def optimize(model):
model,
transformers.models.llama.modeling_llama.LlamaAttention,
llama_attention_forward_4_31,)
convert_forward(
model,
transformers.models.llama.modeling_llama.LlamaRMSNorm,
llama_rms_norm_forward,)
else:
# todo implement 4.28.0 ~ 4.30.2
pass
if "chatglm2" in model.config._name_or_path:
if "chatglm-18b" in model.config._name_or_path or "chatglm2" in model.config._name_or_path:
# chatglm-18b or chatglm2-6b
modeling_module_name = model.__class__.__module__
module = importlib.import_module(modeling_module_name)
from bigdl.llm.transformers.models.chatglm2 import chatglm2_attention_forward_8eb45c
@ -166,6 +172,7 @@ def optimize(model):
module.CoreAttention,
core_attn_forward_8eb45c)
elif "chatglm" in model.config._name_or_path:
# chatglm-6b
modeling_module_name = model.__class__.__module__
module = importlib.import_module(modeling_module_name)
from bigdl.llm.transformers.models.chatglm import chatglm_attention_forward
@ -280,4 +287,20 @@ def optimize(model):
module.InternLMAttention,
internlm_attention_forward
)
elif model.config.model_type == "qwen":
modeling_module_name = model.__class__.__module__
module = importlib.import_module(modeling_module_name)
from bigdl.llm.transformers.models.qwen import qwen_attention_forward
convert_forward(model,
module.QWenAttention,
qwen_attention_forward
)
elif model.config.model_type == "aquila":
modeling_module_name = model.__class__.__module__
module = importlib.import_module(modeling_module_name)
from bigdl.llm.transformers.models.aquila import aquila_attention_forward
convert_forward(model,
module.AquilaAttention,
aquila_attention_forward
)
return model

View file

@ -0,0 +1,157 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Some parts of this file is adapted from
# https://huggingface.co/BAAI/AquilaChat-7B/blob/main/modeling_aquila.py
#
# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from bigdl.llm.transformers.models.utils import extend_kv_cache, init_kv_cache, append_kv_cache
from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb
from bigdl.dllib.utils import log4Error
KV_CACHE_ALLOC_BLOCK_LENGTH = 256
def aquila_attention_forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)\
.view(bsz, q_len, self.num_heads, self.head_dim)\
.transpose(1, 2)
key_states = self.k_proj(hidden_states)\
.view(bsz, q_len, self.num_heads, self.head_dim)\
.transpose(1, 2)
value_states = self.v_proj(hidden_states)\
.view(bsz, q_len, self.num_heads, self.head_dim)\
.transpose(1, 2)
kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
kv_seq_len += past_key_value[0].shape[-2]
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states,
cos, sin, position_ids, "aquila")
# [bsz, nh, t, hd]
if past_key_value is not None:
# reuse k, v, self_attention
cache_k = past_key_value[0]
cache_v = past_key_value[1]
if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3):
# allocate new
new_cache_k, new_cache_v = extend_kv_cache(bsz,
self.num_heads, # Support GQA
self.head_dim,
cache_k.size(2),
kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH,
dtype=cache_k.dtype,
device=hidden_states.device)
new_cache_k[:] = cache_k
new_cache_v[:] = cache_v
cache_k = new_cache_k
cache_v = new_cache_v
key_states, value_states = append_kv_cache(cache_k, cache_v, key_states, value_states)
elif use_cache:
max_cache_length = kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH
new_key_states, new_value_states = init_kv_cache(bsz,
self.num_heads,
self.head_dim,
kv_seq_len,
max_cache_length,
dtype=key_states.dtype,
device=hidden_states.device)
new_key_states[:] = key_states
new_value_states[:] = value_states
key_states = new_key_states
value_states = new_value_states
past_key_value = (key_states, value_states) if use_cache else None
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
attn_weights = torch.clamp(attn_weights, min=-1024., max=1024.)
if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
log4Error.invalidInputError(
f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, "
f"but is {attn_weights.size()}"
)
if attention_mask is not None:
if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
log4Error.invalidInputError(
f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, "
f"but is {attention_mask.size()}"
)
attn_weights = attn_weights + attention_mask
attn_weights = torch.max(
attn_weights,
torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device)
)
# upcast attention to fp32
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32)\
.to(query_states.dtype)
attn_output = torch.matmul(attn_weights, value_states)
if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
log4Error.invalidInputError(
f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, "
f"but is {attn_output.size()}"
)
attn_output = attn_output.transpose(1, 2)
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value

View file

@ -39,6 +39,7 @@ import torch.nn.functional as F
from bigdl.llm.utils.common import invalidInputError
from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache
from bigdl.llm.transformers.models.utils import rotate_half, apply_rotary_pos_emb
from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
@ -57,6 +58,19 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
KV_CACHE_ALLOC_BLOCK_LENGTH = 256
def llama_rms_norm_forward(self, hidden_states):
if hidden_states.device.type == "xpu" and not (self.training and hidden_states.requires_grad):
hidden_states, _ = torch.ops.torch_ipex.rms_norm(hidden_states,
[self.weight.size(0)], self.weight)
else:
input_dtype = hidden_states.dtype
hidden_states = hidden_states.to(torch.float32)
variance = hidden_states.pow(2).mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
return self.weight * hidden_states.to(input_dtype)
return hidden_states
def llama_attention_forward_4_31(
self,
hidden_states: torch.Tensor,
@ -103,9 +117,20 @@ def llama_attention_forward_4_31(
kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
kv_seq_len += past_key_value[0].shape[-2]
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states,
cos, sin, position_ids, "llama")
use_fuse_rope = query_states.device.type == "xpu"
use_fuse_rope = use_fuse_rope and not (self.training and query_states.requires_grad)
use_fuse_rope = use_fuse_rope and self.config.rope_scaling is None
if use_fuse_rope:
query_states, key_states = apply_rotary_pos_emb_no_cache_xpu(query_states,
key_states,
position_ids,
"llama")
else:
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states,
cos, sin, position_ids, "llama")
if past_key_value is not None:
# reuse k, v, self_attention

View file

@ -0,0 +1,217 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Some parts of this file is adapted from
# https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/modeling_qwen.py
#
# Copyright (c) Alibaba Cloud.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
import importlib
import math
from typing import TYPE_CHECKING, Optional, Tuple, Union, Callable, List
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from transformers.utils import logging
try:
from einops import rearrange
except ImportError:
rearrange = None
from bigdl.llm.transformers.models.utils import extend_kv_cache, init_kv_cache, append_kv_cache
from bigdl.llm.utils.common import invalidInputError
apply_rotary_emb_func = None
flash_attn_unpadded_func = None
logger = logging.get_logger(__name__)
KV_CACHE_ALLOC_BLOCK_LENGTH = 256
def _rotate_half(x):
from einops import rearrange
x = rearrange(x, "... (j d) -> ... j d", j=2)
x1, x2 = x.unbind(dim=-2)
return torch.cat((-x2, x1), dim=-1)
def apply_rotary_pos_emb(t, freqs):
if apply_rotary_emb_func is not None:
t_ = t.float()
freqs = freqs.squeeze(0).squeeze(1)
cos = freqs[:, : freqs.shape[-1] // 2].cos()
sin = freqs[:, : freqs.shape[-1] // 2].sin()
output = apply_rotary_emb_func(t_, cos, sin).type_as(t)
return output
else:
rot_dim = freqs.shape[-1]
t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:]
t_ = t_.float()
t_pass_ = t_pass_.float()
t_ = (t_ * freqs.cos()) + (_rotate_half(t_) * freqs.sin())
return torch.cat((t_, t_pass_), dim=-1).type_as(t)
def qwen_attention_forward(
self,
hidden_states: Optional[Tuple[torch.FloatTensor]],
layer_past: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
):
mixed_x_layer = self.c_attn(hidden_states)
query, key, value = mixed_x_layer.split(self.split_size, dim=2)
query = self._split_heads(query, self.num_heads, self.head_dim)
key = self._split_heads(key, self.num_heads, self.head_dim)
value = self._split_heads(value, self.num_heads, self.head_dim)
kv_seq_len = hidden_states.size()[1]
if layer_past:
# layer past[0] shape: bs * seq_len * head_num * dim
kv_seq_len += layer_past[0].shape[1]
if (
self.use_dynamic_ntk
and kv_seq_len == hidden_states.size()[1]
and not self.training
):
context_value = math.log(kv_seq_len / self.seq_length, 2) + 1
ntk_alpha = 2 ** math.ceil(context_value) - 1
ntk_alpha = max(ntk_alpha, 1)
self._ntk_cached = ntk_alpha
else:
ntk_alpha = self._ntk_cached
rotary_pos_emb = self.rotary_emb(kv_seq_len, ntk_alpha=ntk_alpha).to(
hidden_states.device
)
if rotary_pos_emb is not None:
if isinstance(rotary_pos_emb, tuple):
rotary_pos_emb = rotary_pos_emb
else:
rotary_pos_emb = (rotary_pos_emb,) * 2
if rotary_pos_emb is not None:
q_pos_emb, k_pos_emb = rotary_pos_emb
# Slice the pos emb for current inference
cur_len = query.shape[1]
q_pos_emb = q_pos_emb[:, -cur_len:, :, :]
k_pos_emb = k_pos_emb[:, -cur_len:, :, :]
query = apply_rotary_pos_emb(query, q_pos_emb)
key = apply_rotary_pos_emb(key, k_pos_emb)
bsz, _, n_heads, head_dim = key.size()
if layer_past is not None:
# past_key, past_value = layer_past[0], layer_past[1]
# key = torch.cat((past_key, key), dim=1)
# value = torch.cat((past_value, value), dim=1)
cache_k = layer_past[0].transpose(1, 2)
cache_v = layer_past[1].transpose(1, 2)
if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3):
# allocate new
new_cache_k, new_cache_v = extend_kv_cache(bsz,
self.num_heads, # Support GQA
self.head_dim,
cache_k.size(2),
kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH,
dtype=cache_k.dtype,
device=hidden_states.device)
new_cache_k[:] = cache_k
new_cache_v[:] = cache_v
cache_k = new_cache_k
cache_v = new_cache_v
key_states, value_states = append_kv_cache(cache_k, cache_v,
key.transpose(1, 2), value.transpose(1, 2))
key = key_states.transpose(1, 2)
value = value_states.transpose(1, 2)
elif use_cache:
max_cache_length = kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH
new_key_states, new_value_states = init_kv_cache(bsz,
self.num_heads,
self.head_dim,
kv_seq_len,
max_cache_length,
dtype=key.dtype,
device=hidden_states.device)
new_key_states[:] = key.transpose(1, 2)
new_value_states[:] = value.transpose(1, 2)
key = new_key_states.transpose(1, 2)
value = new_value_states.transpose(1, 2)
if use_cache:
present = (key, value)
else:
present = None
if self.use_logn_attn and not self.training:
if self.logn_tensor.device != query.device or self.logn_tensor.dtype != query.dtype:
self.logn_tensor = self.logn_tensor.to(query.device).type_as(query)
seq_start = key.size(1) - query.size(1)
seq_end = key.size(1)
logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :]
query = query * logn_tensor.expand_as(query)
if (
self.use_flash_attn
and flash_attn_unpadded_func is not None
and not self.is_fp32
and query.is_cuda
):
q, k, v = query, key, value
context_layer = self.core_attention_flash(q, k, v)
context_layer = rearrange(
context_layer, "b s h d -> b s (h d)"
).contiguous()
else:
query = query.permute(0, 2, 1, 3)
key = key.permute(0, 2, 1, 3)
value = value.permute(0, 2, 1, 3)
attn_output, attn_weight = self._attn(
query, key, value, attention_mask, head_mask
)
context_layer = self._merge_heads(
attn_output, self.num_heads, self.head_dim
)
attn_output = self.c_proj(context_layer)
outputs = (attn_output, present)
if output_attentions:
if (
self.use_flash_attn
and flash_attn_unpadded_func is not None
and not self.is_fp32
):
invalidInputError("Cannot output attentions while using flash-attn")
else:
outputs += (attn_weight,)
return outputs

View file

@ -71,7 +71,7 @@ def rotate_every_two(x):
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, model_family):
if model_family in ["llama", "baichuan", "internlm"]:
if model_family in ["llama", "baichuan", "internlm", "aquila"]:
# The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
@ -97,3 +97,18 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, model_family):
else:
invalidInputError(False,
f"{model_family} is not supported.")
def apply_rotary_pos_emb_no_cache_xpu(q, k, position_ids, model_family):
if q.device.type != "xpu":
invalidInputError(False,
f"only xpu is supported in this function")
import linear_q4_0
q_embed = torch.empty(q.shape, dtype=q.dtype, device=q.device)
k_embed = torch.empty(k.shape, dtype=k.dtype, device=k.device)
if model_family in ["llama", "baichuan", "internlm", "aquila", "gpt_neox"]:
linear_q4_0.apply_rotary_embedding_half_qk(q, k, position_ids, q_embed, k_embed)
return q_embed, k_embed
else:
invalidInputError(False,
f"{model_family} is not supported.")

View file

@ -36,6 +36,7 @@ import torch
from bigdl.llm.transformers.low_bit_linear import LowBitLinear
from peft.tuners.lora import LoraLayer
from bigdl.llm.utils.common import invalidInputError
import functools
class LoraLowBitLinear(LowBitLinear, LoraLayer):
@ -94,13 +95,11 @@ class LoraLowBitLinear(LowBitLinear, LoraLayer):
return result
@staticmethod
def _create_new_module(lora_config, adapter_name, target, **kwargs):
bias = kwargs.pop("bias", False)
def _create_new_module(create_new_module_func, lora_config, adapter_name, target, **kwargs):
if isinstance(target, LowBitLinear):
low_bit_kwargs = kwargs.copy()
bias = low_bit_kwargs.pop("bias", False)
low_bit_kwargs.update(
{
"qtype": target.qtype,
@ -112,9 +111,7 @@ def _create_new_module(lora_config, adapter_name, target, **kwargs):
bias=bias,
**low_bit_kwargs)
else:
invalidInputError(False,
f"Target module {target} is not supported. "
f"Currently, only `LowBitLinear` are supported.")
new_module = create_new_module_func(lora_config, adapter_name, target, **kwargs)
return new_module
@ -124,7 +121,8 @@ from peft.tuners.lora import LoraModel
def get_peft_model(*args, **kwargs):
old_create_new_module = LoraModel._create_new_module
LoraModel._create_new_module = _create_new_module
LoraModel._create_new_module = staticmethod(functools.partial(_create_new_module,
old_create_new_module))
try:
from peft import get_peft_model as get_peft_model_original
model = get_peft_model_original(*args, **kwargs)
@ -181,7 +179,8 @@ class PeftModel:
def from_pretrained(*args,
**kwargs):
old_create_new_module = LoraModel._create_new_module
LoraModel._create_new_module = _create_new_module
LoraModel._create_new_module = staticmethod(functools.partial(_create_new_module,
old_create_new_module))
from peft import PeftModel
try:
model = PeftModel.from_pretrained(*args, **kwargs)