Merge remote-tracking branch 'upstream/main'
Commit 4aee952b10: 26 changed files with 1181 additions and 42 deletions

27 .github/workflows/manually_build.yml (vendored)
@ -12,6 +12,7 @@ on:
|
|||
- all
|
||||
- bigdl-llm-xpu
|
||||
- bigdl-llm-cpu
|
||||
- bigdl-llm-serving-cpu
|
||||
- bigdl-ppml-gramine-base
|
||||
- bigdl-ppml-trusted-bigdl-llm-gramine-base
|
||||
- bigdl-ppml-trusted-bigdl-llm-gramine-ref
|
||||
|
|
@ -114,6 +115,32 @@ jobs:
|
|||
sudo docker push 10.239.45.10/arda/${image}:${TAG}
|
||||
sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
|
||||
|
||||
bigdl-llm-serving-cpu:
|
||||
if: ${{ github.event.inputs.artifact == 'bigdl-llm-serving-cpu' || github.event.inputs.artifact == 'all' }}
|
||||
runs-on: [self-hosted, Shire]
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: docker login
|
||||
run: |
|
||||
docker login -u ${DOCKERHUB_USERNAME} -p ${DOCKERHUB_PASSWORD}
|
||||
- name: bigdl-llm-serving-cpu
|
||||
run: |
|
||||
echo "##############################################################"
|
||||
echo "####### bigdl-llm-serving-cpu ########"
|
||||
echo "##############################################################"
|
||||
export image=intelanalytics/bigdl-llm-serving-cpu
|
||||
cd docker/llm/serving/cpu/docker
|
||||
sudo docker build \
|
||||
--no-cache=true \
|
||||
--build-arg http_proxy=${HTTP_PROXY} \
|
||||
--build-arg https_proxy=${HTTPS_PROXY} \
|
||||
--build-arg no_proxy=${NO_PROXY} \
|
||||
-t ${image}:${TAG} -f ./Dockerfile .
|
||||
sudo docker push ${image}:${TAG}
|
||||
sudo docker tag ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
|
||||
sudo docker push 10.239.45.10/arda/${image}:${TAG}
|
||||
sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
|
||||
|
||||
bigdl-ppml-gramine-base:
|
||||
if: ${{ github.event.inputs.artifact == 'bigdl-ppml-gramine-base' || github.event.inputs.artifact == 'all' }}
|
||||
runs-on: [self-hosted, Shire]
|
||||
|
|
|
|||
|
|
@ -9,12 +9,13 @@
|
|||
|
||||
**[`bigdl-llm`](python/llm)** is a library for running **LLM** (large language model) on Intel **XPU** (from *Laptop* to *GPU* to *Cloud*) using **INT4** with very low latency[^1] (for any **PyTorch** model).
|
||||
|
||||
> *It is built on top of the excellent work of [llama.cpp](https://github.com/ggerganov/llama.cpp), [gptq](https://github.com/IST-DASLab/gptq), [ggml](https://github.com/ggerganov/ggml), [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), [bitsandbytes](https://github.com/TimDettmers/bitsandbytes), [qlora](https://github.com/artidoro/qlora), [gptq_for_llama](https://github.com/qwopqwop200/GPTQ-for-LLaMa), [chatglm.cpp](https://github.com/li-plus/chatglm.cpp), [redpajama.cpp](https://github.com/togethercomputer/redpajama.cpp), [gptneox.cpp](https://github.com/byroneverson/gptneox.cpp), [bloomz.cpp](https://github.com/NouamaneTazi/bloomz.cpp/), etc.*
|
||||
> *It is built on top of the excellent work of [llama.cpp](https://github.com/ggerganov/llama.cpp), [ggml](https://github.com/ggerganov/ggml), [gptq](https://github.com/IST-DASLab/gptq), [bitsandbytes](https://github.com/TimDettmers/bitsandbytes), [qlora](https://github.com/artidoro/qlora), [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), [gptq_for_llama](https://github.com/qwopqwop200/GPTQ-for-LLaMa), [chatglm.cpp](https://github.com/li-plus/chatglm.cpp), [redpajama.cpp](https://github.com/togethercomputer/redpajama.cpp), [gptneox.cpp](https://github.com/byroneverson/gptneox.cpp), [bloomz.cpp](https://github.com/NouamaneTazi/bloomz.cpp/), etc.*
|
||||
|
||||
### Latest update
|
||||
- **[New]** `bigdl-llm` now supports QLoRA finetuning on Intel GPU; see the example [here](python/llm/example/gpu/qlora_finetuning).
|
||||
- `bigdl-llm` now supports Intel GPU (including Arc, Flex and MAX); see the latest GPU examples [here](python/llm/example/gpu).
|
||||
- `bigdl-llm` tutorial is released [here](https://github.com/intel-analytics/bigdl-llm-tutorial).
|
||||
- Over 20 models have been optimized/verified on `bigdl-llm`, including *LLaMA/LLaMA2, ChatGLM/ChatGLM2, MPT, Falcon, Dolly-v1/Dolly-v2, StarCoder, Whisper, InternLM, QWen, Baichuan, MOSS,* and more; see the complete list [here](python/llm/README.md#verified-models).
|
||||
- Over 20 models have been optimized/verified on `bigdl-llm`, including *LLaMA/LLaMA2, ChatGLM/ChatGLM2, MPT, Falcon, Dolly, StarCoder, Whisper, InternLM, QWen, Baichuan, Aquila, MOSS,* and more; see the complete list [here](python/llm/README.md#verified-models).
|
||||
|
||||
### `bigdl-llm` Demos
|
||||
See the ***optimized performance*** of `chatglm2-6b` and `llama-2-13b-chat` models on 12th Gen Intel Core CPU and Intel Arc GPU below.
|
||||
|
|
|
|||
|
|
@ -2,10 +2,13 @@ FROM intelanalytics/bigdl-llm-cpu:2.4.0-SNAPSHOT
|
|||
|
||||
ARG http_proxy
|
||||
ARG https_proxy
|
||||
ARG TINI_VERSION=v0.18.0
|
||||
|
||||
# Disable pip's cache behavior
|
||||
ARG PIP_NO_CACHE_DIR=false
|
||||
|
||||
ADD ./entrypoint.sh /opt/entrypoint.sh
|
||||
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /sbin/tini
|
||||
# Install Serving Dependencies
|
||||
RUN mkdir /llm && \
|
||||
cd /llm && \
|
||||
|
|
@ -13,7 +16,11 @@ RUN mkdir /llm && \
|
|||
cd FastChat && \
|
||||
git checkout dev-2023-09-22 && \
|
||||
pip3 install -e ".[model_worker,webui]" && \
|
||||
cd /llm
|
||||
cd /llm && \
|
||||
chmod +x /opt/entrypoint.sh && \
|
||||
chmod +x /sbin/tini && \
|
||||
cp /sbin/tini /usr/bin/tini
|
||||
|
||||
|
||||
WORKDIR /llm/
|
||||
ENTRYPOINT [ "/opt/entrypoint.sh" ]
|
||||
200 docker/llm/serving/cpu/docker/entrypoint.sh (new file)
@ -0,0 +1,200 @@
|
|||
#!/bin/bash
|
||||
|
||||
usage() {
|
||||
echo "Usage: $0 [-m --mode <controller|worker>] [-h --help]"
|
||||
echo "-h: Print help message."
|
||||
echo "Controller mode reads the following env:"
|
||||
echo "CONTROLLER_HOST (default: localhost)."
|
||||
echo "CONTROLLER_PORT (default: 21001)."
|
||||
echo "API_HOST (default: localhost)."
|
||||
echo "API_PORT (default: 8000)."
|
||||
echo "Worker mode reads the following env:"
|
||||
echo "CONTROLLER_HOST (default: localhost)."
|
||||
echo "CONTROLLER_PORT (default: 21001)."
|
||||
echo "WORKER_HOST (default: localhost)."
|
||||
echo "WORKER_PORT (default: 21002)."
|
||||
echo "MODEL_PATH (default: empty)."
|
||||
exit 1
|
||||
}
|
||||
|
||||
# Acquire correct core_nums if using cpuset-cpus, return -1 if file not exist
|
||||
calculate_total_cores() {
|
||||
local cpuset_file="/sys/fs/cgroup/cpuset/cpuset.cpus"
|
||||
|
||||
if [[ -f "$cpuset_file" ]]; then
|
||||
local cpuset_cpus=$(cat "$cpuset_file")
|
||||
cpuset_cpus=$(echo "${cpuset_cpus}" | tr -d '\n')
|
||||
|
||||
local total_cores=0
|
||||
IFS=',' read -ra cpu_list <<< "$cpuset_cpus"
|
||||
for cpu in "${cpu_list[@]}"; do
|
||||
if [[ $cpu =~ - ]]; then
|
||||
# Range of CPUs
|
||||
local start_cpu=$(echo "$cpu" | cut -d'-' -f1)
|
||||
local end_cpu=$(echo "$cpu" | cut -d'-' -f2)
|
||||
local range_cores=$((end_cpu - start_cpu + 1))
|
||||
total_cores=$((total_cores + range_cores))
|
||||
else
|
||||
# Single CPU
|
||||
total_cores=$((total_cores + 1))
|
||||
fi
|
||||
done
|
||||
|
||||
echo $total_cores
|
||||
return
|
||||
fi
|
||||
# Kubernetes core-binding will use this file
|
||||
cpuset_file="/sys/fs/cgroup/cpuset.cpus"
|
||||
if [[ -f "$cpuset_file" ]]; then
|
||||
local cpuset_cpus=$(cat "$cpuset_file")
|
||||
cpuset_cpus=$(echo "${cpuset_cpus}" | tr -d '\n')
|
||||
|
||||
local total_cores=0
|
||||
IFS=',' read -ra cpu_list <<< "$cpuset_cpus"
|
||||
for cpu in "${cpu_list[@]}"; do
|
||||
if [[ $cpu =~ - ]]; then
|
||||
# Range of CPUs
|
||||
local start_cpu=$(echo "$cpu" | cut -d'-' -f1)
|
||||
local end_cpu=$(echo "$cpu" | cut -d'-' -f2)
|
||||
local range_cores=$((end_cpu - start_cpu + 1))
|
||||
total_cores=$((total_cores + range_cores))
|
||||
else
|
||||
# Single CPU
|
||||
total_cores=$((total_cores + 1))
|
||||
fi
|
||||
done
|
||||
|
||||
echo $total_cores
|
||||
return
|
||||
else
|
||||
echo -1
|
||||
return
|
||||
fi
|
||||
}
|
||||
|
||||
# Default values
|
||||
controller_host="localhost"
|
||||
controller_port="21001"
|
||||
api_host="localhost"
|
||||
api_port="8000"
|
||||
worker_host="localhost"
|
||||
worker_port="21002"
|
||||
model_path=""
|
||||
mode=""
|
||||
omp_num_threads=""
|
||||
dispatch_method="shortest_queue" # shortest_queue or lottery
|
||||
|
||||
# Update rootCA config if needed
|
||||
update-ca-certificates
|
||||
|
||||
# Remember the value of `OMP_NUM_THREADS`:
|
||||
if [[ -n "${OMP_NUM_THREADS}" ]]; then
|
||||
omp_num_threads="${OMP_NUM_THREADS}"
|
||||
fi
|
||||
|
||||
# We do not have any arguments, just run bash
|
||||
if [ "$#" == 0 ]; then
|
||||
echo "[INFO] no command is passed in"
|
||||
echo "[INFO] enter pass-through mode"
|
||||
exec /usr/bin/tini -s -- "bash"
|
||||
else
|
||||
# Parse command-line options
|
||||
options=$(getopt -o "m:h" --long "mode:,help" -n "$0" -- "$@")
|
||||
if [ $? != 0 ]; then
|
||||
usage
|
||||
fi
|
||||
eval set -- "$options"
|
||||
|
||||
while true; do
|
||||
case "$1" in
|
||||
-m|--mode)
|
||||
mode="$2"
|
||||
[[ $mode == "controller" || $mode == "worker" ]] || usage
|
||||
shift 2
|
||||
;;
|
||||
-h|--help)
|
||||
usage
|
||||
;;
|
||||
--)
|
||||
shift
|
||||
break
|
||||
;;
|
||||
*)
|
||||
usage
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
if [[ -n $CONTROLLER_HOST ]]; then
|
||||
controller_host=$CONTROLLER_HOST
|
||||
fi
|
||||
|
||||
if [[ -n $CONTROLLER_PORT ]]; then
|
||||
controller_port=$CONTROLLER_PORT
|
||||
fi
|
||||
|
||||
if [[ -n $API_HOST ]]; then
|
||||
api_host=$API_HOST
|
||||
fi
|
||||
|
||||
if [[ -n $API_PORT ]]; then
|
||||
api_port=$API_PORT
|
||||
fi
|
||||
|
||||
if [[ -n $WORKER_HOST ]]; then
|
||||
worker_host=$WORKER_HOST
|
||||
fi
|
||||
|
||||
if [[ -n $WORKER_PORT ]]; then
|
||||
worker_port=$WORKER_PORT
|
||||
fi
|
||||
|
||||
if [[ -n $MODEL_PATH ]]; then
|
||||
model_path=$MODEL_PATH
|
||||
fi
|
||||
|
||||
if [[ -n $DISPATCH_METHOD ]]; then
|
||||
dispatch_method=$DISPATCH_METHOD
|
||||
fi
|
||||
|
||||
controller_address="http://$controller_host:$controller_port"
|
||||
# Execute logic based on options
|
||||
if [[ $mode == "controller" ]]; then
|
||||
# Logic for controller mode
|
||||
# Boot Controller
|
||||
api_address="http://$api_host:$api_port"
|
||||
echo "Controller address: $controller_address"
|
||||
echo "OpenAI API address: $api_address"
|
||||
python3 -m fastchat.serve.controller --host $controller_host --port $controller_port --dispatch-method $dispatch_method &
|
||||
# Boot openai api server
|
||||
python3 -m fastchat.serve.openai_api_server --host $api_host --port $api_port --controller-address $controller_address
|
||||
else
|
||||
# Logic for non-controller(worker) mode
|
||||
worker_address="http://$worker_host:$worker_port"
|
||||
# Apply optimizations from bigdl-nano
|
||||
source bigdl-nano-init -t
|
||||
# First check if user have set OMP_NUM_THREADS by themselves
|
||||
if [[ -n "${omp_num_threads}" ]]; then
|
||||
echo "Setting OMP_NUM_THREADS to its original value: $omp_num_threads"
|
||||
export OMP_NUM_THREADS=$omp_num_threads
|
||||
else
|
||||
# Use calculate_total_cores to acquire cpuset settings
|
||||
# Set OMP_NUM_THREADS to correct numbers
|
||||
cores=$(calculate_total_cores)
|
||||
if [[ $cores == -1 || $cores == 0 ]]; then
|
||||
echo "Failed to obtain the number of cores, will use the default settings OMP_NUM_THREADS=$OMP_NUM_THREADS"
|
||||
else
|
||||
echo "Setting OMP_NUM_THREADS to $cores"
|
||||
export OMP_NUM_THREADS=$cores
|
||||
fi
|
||||
fi
|
||||
if [[ -z "${model_path}" ]]; then
|
||||
echo "Please set env MODEL_PATH used for worker"
|
||||
usage
|
||||
fi
|
||||
echo "Worker address: $worker_address"
|
||||
echo "Controller address: $controller_address"
|
||||
python3 -m fastchat.serve.model_worker --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address
|
||||
fi
|
||||
fi
|
||||
|
||||
235 docker/llm/serving/cpu/kubernetes/README.md (new file)
@ -0,0 +1,235 @@
|
|||
## Deploying the bigdl-llm serving service in a K8s environment
|
||||
|
||||
|
||||
## Image
|
||||
|
||||
To deploy BigDL-LLM serving (CPU) in a Kubernetes environment, please use this image: `intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT`
|
||||
|
||||
## Before deployment
|
||||
|
||||
### Models
|
||||
|
||||
In this document, we will use `vicuna-7b-v1.5` as the deployment model.
|
||||
|
||||
After downloading the model, please rename the folder from `vicuna-7b-v1.5` to `vicuna-7b-v1.5-bigdl` so that `bigdl-llm` is used as the backend. The `bigdl-llm` backend is selected whenever the model path contains `bigdl`; otherwise, the original Transformers backend is used.
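As a reference, a minimal sketch of the rename step (the paths below are placeholders for wherever you store models):

```python
import os

# Placeholder paths; adjust them to your model directory.
src = "/llm/models/vicuna-7b-v1.5"
dst = "/llm/models/vicuna-7b-v1.5-bigdl"  # "bigdl" in the path selects the bigdl-llm backend

if os.path.isdir(src) and not os.path.exists(dst):
    os.rename(src, dst)
```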
|
||||
|
||||
You can download the model from [here](https://huggingface.co/lmsys/vicuna-7b-v1.5).
|
||||
|
||||
### Kubernetes config
|
||||
|
||||
We recommend setting up your Kubernetes cluster before deployment. Most importantly, please set the `cpu-management-policy` to `static` by following this [tutorial](https://kubernetes.io/docs/tasks/administer-cluster/cpu-management-policies/). It is also recommended to set the `topology management policy` to `single-numa-node`.
|
||||
|
||||
### Machine config
|
||||
|
||||
Turn hyper-threading off to ensure that only physical cores are used during deployment.
|
||||
|
||||
## Deployment
|
||||
|
||||
### Reminder on `OMP_NUM_THREADS`
|
||||
|
||||
The entrypoint of the image will try to set `OMP_NUM_THREADS` to the correct number by reading configs from the runtime. However, this only works correctly if the `core-binding` feature is enabled. If it is not, please set the environment variable `OMP_NUM_THREADS` manually in the yaml file.
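For reference, the entrypoint counts the cores allowed by the container's cpuset; a rough Python sketch of that logic is shown below (the image actually does this in bash at startup, and the file paths are the standard cgroup v1/v2 locations):

```python
import os

def count_cpuset_cores() -> int:
    """Count the cores listed in the cgroup cpuset file, e.g. "0-15" or "0,2,4-7"."""
    for path in ("/sys/fs/cgroup/cpuset/cpuset.cpus",  # cgroup v1
                 "/sys/fs/cgroup/cpuset.cpus"):        # cgroup v2 (used by Kubernetes core binding)
        if os.path.isfile(path):
            spec = open(path).read().strip()
            if not spec:
                break
            total = 0
            for part in spec.split(","):
                if "-" in part:                        # a range of CPUs, e.g. "4-7"
                    start, end = map(int, part.split("-"))
                    total += end - start + 1
                else:                                  # a single CPU
                    total += 1
            return total
    return -1  # unknown; set OMP_NUM_THREADS manually in this case

print(count_cpuset_cores())
```

If this returns -1 or 0 in your environment, set `OMP_NUM_THREADS` explicitly as described above.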
|
||||
|
||||
|
||||
### Controller
|
||||
|
||||
We use the following yaml file for controller deployment:
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: bigdl-fschat-a1234bd-controller
|
||||
labels:
|
||||
fastchat-appid: a1234bd
|
||||
fastchat-app-type: controller
|
||||
spec:
|
||||
dnsPolicy: "ClusterFirst"
|
||||
containers:
|
||||
- name: fastchat-controller # fixed
|
||||
image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
|
||||
imagePullPolicy: IfNotPresent
|
||||
env:
|
||||
- name: CONTROLLER_HOST # fixed
|
||||
value: "0.0.0.0"
|
||||
- name: CONTROLLER_PORT # fixed
|
||||
value: "21005"
|
||||
- name: API_HOST # fixed
|
||||
value: "0.0.0.0"
|
||||
- name: API_PORT # fixed
|
||||
value: "8000"
|
||||
ports:
|
||||
- containerPort: 21005
|
||||
name: con-port
|
||||
- containerPort: 8000
|
||||
name: api-port
|
||||
resources:
|
||||
requests:
|
||||
memory: 16Gi
|
||||
cpu: 4
|
||||
limits:
|
||||
memory: 16Gi
|
||||
cpu: 4
|
||||
args: ["-m", "controller"]
|
||||
restartPolicy: "Never"
|
||||
---
|
||||
# Service for the controller
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: bigdl-a1234bd-fschat-controller-service
|
||||
spec:
|
||||
# You may also want to change this to use the cluster's feature
|
||||
type: NodePort
|
||||
selector:
|
||||
fastchat-appid: a1234bd
|
||||
fastchat-app-type: controller
|
||||
ports:
|
||||
- name: cont-port
|
||||
protocol: TCP
|
||||
port: 21005
|
||||
targetPort: 21005
|
||||
- name: api-port
|
||||
protocol: TCP
|
||||
port: 8000
|
||||
targetPort: 8000
|
||||
```
|
||||
|
||||
### Worker
|
||||
|
||||
We use the following yaml file for the worker deployment:
|
||||
|
||||
```yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: bigdl-fschat-a1234bd-worker-deployment
|
||||
spec:
|
||||
# Change this to the number you want
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
fastchat: worker
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
fastchat: worker
|
||||
spec:
|
||||
dnsPolicy: "ClusterFirst"
|
||||
containers:
|
||||
- name: fastchat-worker # fixed
|
||||
image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
|
||||
imagePullPolicy: IfNotPresent
|
||||
env:
|
||||
- name: CONTROLLER_HOST # fixed
|
||||
value: bigdl-a1234bd-fschat-controller-service
|
||||
- name: CONTROLLER_PORT # fixed
|
||||
value: "21005"
|
||||
- name: WORKER_HOST # fixed
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: status.podIP
|
||||
- name: WORKER_PORT # fixed
|
||||
value: "21841"
|
||||
- name: MODEL_PATH # Change this
|
||||
value: "/llm/models/vicuna-7b-v1.5-bigdl/"
|
||||
- name: OMP_NUM_THREADS
|
||||
value: "16"
|
||||
resources:
|
||||
requests:
|
||||
memory: 32Gi
|
||||
cpu: 16
|
||||
limits:
|
||||
memory: 32Gi
|
||||
cpu: 16
|
||||
args: ["-m", "worker"]
|
||||
volumeMounts:
|
||||
- name: llm-models
|
||||
mountPath: /llm/models/
|
||||
restartPolicy: "Always"
|
||||
volumes:
|
||||
- name: llm-models
|
||||
hostPath:
|
||||
path: /home/llm/models # change this in other envs
|
||||
```
|
||||
|
||||
You may want to change the `MODEL_PATH` variable in the yaml. Also, please remember to change the volume path accordingly.
|
||||
|
||||
|
||||
### Testing
|
||||
|
||||
#### Using openai-python
|
||||
|
||||
First, install openai-python:
|
||||
```bash
|
||||
pip install --upgrade openai
|
||||
```
|
||||
|
||||
Then, interact with the model `vicuna-7b-v1.5-bigdl`:
|
||||
```python
|
||||
import openai
|
||||
openai.api_key = "EMPTY"
|
||||
openai.api_base = "http://localhost:8000/v1"
|
||||
|
||||
model = "vicuna-7b-v1.5-bigdl"
|
||||
prompt = "Once upon a time"
|
||||
|
||||
# create a completion
|
||||
completion = openai.Completion.create(model=model, prompt=prompt, max_tokens=64)
|
||||
# print the completion
|
||||
print(prompt + completion.choices[0].text)
|
||||
|
||||
# create a chat completion
|
||||
completion = openai.ChatCompletion.create(
|
||||
model=model,
|
||||
messages=[{"role": "user", "content": "Hello! What is your name?"}]
|
||||
)
|
||||
# print the completion
|
||||
print(completion.choices[0].message.content)
|
||||
```
|
||||
|
||||
#### cURL
|
||||
cURL is another good tool for inspecting the output of the API.
|
||||
|
||||
For the following examples, you may also change the service deployment address.
|
||||
|
||||
List Models:
|
||||
```bash
|
||||
curl http://localhost:8000/v1/models
|
||||
```
|
||||
|
||||
If you have `jq` installed, you can use it to format the output like this:
|
||||
```bash
|
||||
curl http://localhost:8000/v1/models | jq
|
||||
```
|
||||
|
||||
Chat Completions:
|
||||
```bash
|
||||
curl http://localhost:8000/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "YOUR_MODEL",
|
||||
"messages": [{"role": "user", "content": "Hello! What is your name?"}]
|
||||
}'
|
||||
```
|
||||
|
||||
Text Completions:
|
||||
```bash
|
||||
curl http://localhost:8000/v1/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "YOUR_MODEL",
|
||||
"prompt": "Once upon a time",
|
||||
"max_tokens": 41,
|
||||
"temperature": 0.5
|
||||
}'
|
||||
```
|
||||
|
||||
Embeddings:
|
||||
```bash
|
||||
curl http://localhost:8000/v1/embeddings \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "YOUR_MODEL",
|
||||
"input": "Hello world!"
|
||||
}'
|
||||
```
|
||||
1 docker/llm/serving/cpu/kubernetes/clean.sh (new file)
@ -0,0 +1 @@
|
|||
kubectl delete -f deployment.yaml
|
||||
109 docker/llm/serving/cpu/kubernetes/deployment.yaml (new file)
@ -0,0 +1,109 @@
|
|||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: bigdl-fschat-a1234bd-controller
|
||||
labels:
|
||||
fastchat-appid: a1234bd
|
||||
fastchat-app-type: controller
|
||||
spec:
|
||||
dnsPolicy: "ClusterFirst"
|
||||
containers:
|
||||
- name: fastchat-controller # fixed
|
||||
image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
|
||||
imagePullPolicy: IfNotPresent
|
||||
env:
|
||||
- name: CONTROLLER_HOST # fixed
|
||||
value: "0.0.0.0"
|
||||
- name: CONTROLLER_PORT # fixed
|
||||
value: "21005"
|
||||
- name: API_HOST # fixed
|
||||
value: "0.0.0.0"
|
||||
- name: API_PORT # fixed
|
||||
value: "8000"
|
||||
ports:
|
||||
- containerPort: 21005
|
||||
name: con-port
|
||||
- containerPort: 8000
|
||||
name: api-port
|
||||
resources:
|
||||
requests:
|
||||
memory: 16Gi
|
||||
cpu: 4
|
||||
limits:
|
||||
memory: 16Gi
|
||||
cpu: 4
|
||||
args: ["-m", "controller"]
|
||||
restartPolicy: "Never"
|
||||
---
|
||||
# Service for the controller
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: bigdl-a1234bd-fschat-controller-service
|
||||
spec:
|
||||
# You may also want to change this to use the cluster's feature
|
||||
type: NodePort
|
||||
selector:
|
||||
fastchat-appid: a1234bd
|
||||
fastchat-app-type: controller
|
||||
ports:
|
||||
- name: cont-port
|
||||
protocol: TCP
|
||||
port: 21005
|
||||
targetPort: 21005
|
||||
- name: api-port
|
||||
protocol: TCP
|
||||
port: 8000
|
||||
targetPort: 8000
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: bigdl-fschat-a1234bd-worker-deployment
|
||||
spec:
|
||||
# Change this to the number you want
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
fastchat: worker
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
fastchat: worker
|
||||
spec:
|
||||
dnsPolicy: "ClusterFirst"
|
||||
containers:
|
||||
- name: fastchat-worker # fixed
|
||||
image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
|
||||
imagePullPolicy: IfNotPresent
|
||||
env:
|
||||
- name: CONTROLLER_HOST # fixed
|
||||
value: bigdl-a1234bd-fschat-controller-service
|
||||
- name: CONTROLLER_PORT # fixed
|
||||
value: "21005"
|
||||
- name: WORKER_HOST # fixed
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: status.podIP
|
||||
- name: WORKER_PORT # fixed
|
||||
value: "21841"
|
||||
- name: MODEL_PATH # Change this
|
||||
value: "/llm/models/vicuna-7b-v1.5-bigdl/"
|
||||
- name: OMP_NUM_THREADS
|
||||
value: "16"
|
||||
resources:
|
||||
requests:
|
||||
memory: 32Gi
|
||||
cpu: 16
|
||||
limits:
|
||||
memory: 32Gi
|
||||
cpu: 16
|
||||
args: ["-m", "worker"]
|
||||
volumeMounts:
|
||||
- name: llm-models
|
||||
mountPath: /llm/models/
|
||||
restartPolicy: "Always"
|
||||
volumes:
|
||||
- name: llm-models
|
||||
hostPath:
|
||||
path: /home/llm/models # change this in other envs
|
||||
|
|
@ -38,12 +38,12 @@ subtrees:
|
|||
title: "Key Features"
|
||||
subtrees:
|
||||
- entries:
|
||||
- file: doc/LLM/Overview/KeyFeatures/optimize_model
|
||||
- file: doc/LLM/Overview/KeyFeatures/transformers_style_api
|
||||
subtrees:
|
||||
- entries:
|
||||
- file: doc/LLM/Overview/KeyFeatures/hugging_face_format
|
||||
- file: doc/LLM/Overview/KeyFeatures/native_format
|
||||
- file: doc/LLM/Overview/KeyFeatures/optimize_model
|
||||
- file: doc/LLM/Overview/KeyFeatures/langchain_api
|
||||
# - file: doc/LLM/Overview/KeyFeatures/cli
|
||||
- file: doc/LLM/Overview/KeyFeatures/gpu_supports
|
||||
|
|
|
|||
|
|
@ -3,12 +3,12 @@ BigDL-LLM Key Features
|
|||
|
||||
You may run the LLMs using ``bigdl-llm`` through one of the following APIs:
|
||||
|
||||
* `PyTorch API <./optimize_model.html>`_
|
||||
* |transformers_style_api|_
|
||||
|
||||
* |hugging_face_transformers_format|_
|
||||
* `Native Format <./native_format.html>`_
|
||||
|
||||
* `General PyTorch Model Supports <./langchain_api.html>`_
|
||||
* `LangChain API <./langchain_api.html>`_
|
||||
* `GPU Supports <./gpu_supports.html>`_
|
||||
|
||||
|
|
|
|||
|
|
@ -1,22 +1,27 @@
|
|||
## General PyTorch Model Supports
|
||||
## PyTorch API
|
||||
|
||||
You may apply BigDL-LLM optimizations on any Pytorch models, not only Hugging Face *Transformers* models for acceleration. With BigDL-LLM, PyTorch models (in FP16/BF16/FP32) can be optimized with low-bit quantizations (supported precisions include INT4/INT5/INT8).
|
||||
In general, a single call to `optimize_model` is all you need to optimize any loaded PyTorch model, regardless of the library or API you are using. With BigDL-LLM, PyTorch models (in FP16/BF16/FP32) can be optimized with low-bit quantizations (supported precisions include INT4, INT5, INT8, etc.).
|
||||
|
||||
You can easily enable BigDL-LLM INT4 optimizations on any Pytorch models just as follows:
|
||||
First, use any PyTorch APIs you like to load your model. To help you better understand the process, here we use [Hugging Face Transformers](https://huggingface.co/docs/transformers/index) library `LlamaForCausalLM` to load a popular model [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) as an example:
|
||||
|
||||
```python
|
||||
# Create or load any Pytorch model
|
||||
model = ...
|
||||
# Create or load any Pytorch model, take Llama-2-7b-chat-hf as an example
|
||||
from transformers import LlamaForCausalLM
|
||||
model = LlamaForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', torch_dtype='auto', low_cpu_mem_usage=True)
|
||||
```
|
||||
|
||||
# Add only two lines to enable BigDL-LLM INT4 optimizations on model
|
||||
Then, simply call `optimize_model` on the loaded model; INT4 optimization is applied by default:
|
||||
```python
|
||||
from bigdl.llm import optimize_model
|
||||
|
||||
# With only one line to enable BigDL-LLM INT4 optimization
|
||||
model = optimize_model(model)
|
||||
```
|
||||
|
||||
After optimizing the model, you may straightly run the optimized model with no API changed and less inference latency.
|
||||
After optimizing the model, BigDL-LLM requires no changes to your inference code. You can use any library to run the optimized model with very low latency.
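For example, a minimal generation sketch with the optimized model (this assumes the Llama 2 model loaded above; the prompt and generation settings are arbitrary):

```python
from transformers import LlamaTokenizer

# `model` is the optimized model from above; generate() works exactly as before optimization.
tokenizer = LlamaTokenizer.from_pretrained('meta-llama/Llama-2-7b-chat-hf')
input_ids = tokenizer("What is AI?", return_tensors="pt").input_ids
output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```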
|
||||
|
||||
```eval_rst
|
||||
.. seealso::
|
||||
|
||||
See the examples for Hugging Face *Transformers* models `here <https://github.com/intel-analytics/BigDL/blob/main/python/llm/example/transformers/general_int4>`_. And examples for other general Pytorch models can be found `here <https://github.com/intel-analytics/BigDL/blob/main/python/llm/example/pytorch-model>`_.
|
||||
* For more detailed usage of ``optimize_model``, please refer to the `API documentation <https://bigdl.readthedocs.io/en/latest/doc/PythonAPI/LLM/optimize.html>`_.
|
||||
```
|
||||
|
|
|
|||
|
|
@ -5,9 +5,11 @@
|
|||
Install BigDL-LLM for CPU support using pip:
|
||||
|
||||
```bash
|
||||
pip install bigdl-llm[all]
|
||||
pip install --pre --upgrade bigdl-llm[all] # install the latest bigdl-llm nightly build with 'all' option
|
||||
```
|
||||
|
||||
Please refer to [Environment Setup](#environment-setup) for more information.
|
||||
|
||||
```eval_rst
|
||||
.. note::
|
||||
|
||||
|
|
@ -43,7 +45,7 @@ First we recommend using [Conda](https://docs.conda.io/en/latest/miniconda.html)
|
|||
conda create -n llm python=3.9
|
||||
conda activate llm
|
||||
|
||||
pip install bigdl-llm[all] # install bigdl-llm for CPU with 'all' option
|
||||
pip install --pre --upgrade bigdl-llm[all] # install the latest bigdl-llm nightly build with 'all' option
|
||||
```
|
||||
|
||||
Then, to run an LLM with BigDL-LLM optimizations (taking `example.py` as an example):
|
||||
|
|
|
|||
|
|
@ -5,9 +5,11 @@
|
|||
Install BigDL-LLM for GPU support using pip:
|
||||
|
||||
```bash
|
||||
pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu
|
||||
pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu # install bigdl-llm for GPU
|
||||
```
|
||||
|
||||
Please refer to [Environment Setup](#environment-setup) for more information.
|
||||
|
||||
```eval_rst
|
||||
.. note::
|
||||
|
||||
|
|
@ -25,6 +27,12 @@ BigDL-LLM for GPU supports has been verified on:
|
|||
* Intel Arc™ A-Series Graphics
|
||||
* Intel Data Center GPU Flex Series
|
||||
|
||||
```eval_rst
|
||||
.. note::
|
||||
|
||||
We currently support Ubuntu 20.04 or later. Windows support is in progress.
|
||||
```
|
||||
|
||||
To apply Intel GPU acceleration, there are several steps for tool installation and environment preparation:
|
||||
|
||||
* Step 1: only Linux is supported for now; Ubuntu 22.04 is preferred.
|
||||
|
|
|
|||
|
|
@ -32,8 +32,8 @@ BigDL-LLM
|
|||
|
||||
+++
|
||||
|
||||
:bdg-link:`PyTorch <./Overview/KeyFeatures/optimize_model.html>` |
|
||||
:bdg-link:`transformers-style <./Overview/KeyFeatures/transformers_style_api.html>` |
|
||||
:bdg-link:`Optimize Model <./Overview/KeyFeatures/optimize_model.html>` |
|
||||
:bdg-link:`LangChain <./Overview/KeyFeatures/langchain_api.html>` |
|
||||
:bdg-link:`GPU <./Overview/KeyFeatures/gpu_supports.html>`
|
||||
|
||||
|
|
|
|||
|
|
@ -4,6 +4,6 @@ BigDL-LLM API
|
|||
.. toctree::
|
||||
:maxdepth: 3
|
||||
|
||||
optimize.rst
|
||||
transformers.rst
|
||||
langchain.rst
|
||||
optimize.rst
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
BigDL-LLM Optimize API
|
||||
BigDL-LLM PyTorch API
|
||||
=====================
|
||||
|
||||
llm.optimize
|
||||
|
|
|
|||
|
|
@ -24,9 +24,10 @@ BigDL-LLM: low-Bit LLM library
|
|||
============================================
|
||||
Latest update
|
||||
============================================
|
||||
- **[New]** ``bigdl-llm`` now supports QLoRA finetuning on Intel GPU; see the example `here <https://github.com/intel-analytics/BigDL/tree/main/python/llm/example/gpu/qlora_finetuning>`_.
|
||||
- ``bigdl-llm`` now supports Intel GPU (including Arc, Flex and MAX); see the latest GPU examples `here <https://github.com/intel-analytics/BigDL/tree/main/python/llm/example/gpu>`_.
|
||||
- ``bigdl-llm`` tutorial is released `here <https://github.com/intel-analytics/bigdl-llm-tutorial>`_.
|
||||
- Over 20 models have been verified on ``bigdl-llm``, including *LLaMA/LLaMA2, ChatGLM/ChatGLM2, MPT, Falcon, Dolly-v1/Dolly-v2, StarCoder, Whisper, InternLM, QWen, Baichuan, MOSS* and more; see the complete list `here <https://github.com/intel-analytics/BigDL/tree/main/python/llm/README.md#verified-models>`_.
|
||||
- Over 20 models have been verified on ``bigdl-llm``, including *LLaMA/LLaMA2, ChatGLM/ChatGLM2, MPT, Falcon, Dolly, StarCoder, Whisper, InternLM, QWen, Baichuan, Aquila, MOSS* and more; see the complete list `here <https://github.com/intel-analytics/BigDL/tree/main/python/llm/README.md#verified-models>`_.
|
||||
|
||||
|
||||
============================================
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
# Q-Lora (experimental support)
|
||||
# Finetuning LLAMA Using Q-Lora (experimental support)
|
||||
|
||||
This example demonstrates how to finetune a llama2-7b model using BigDL-LLM 4-bit optimizations on [Intel GPUs](../README.md).
|
||||
|
||||
|
|
@ -7,7 +7,7 @@ To run this example with BigDL-LLM on Intel GPUs, we have some recommended requi
|
|||
|
||||
## Example: Finetune llama2-7b using qlora
|
||||
|
||||
This example is ported from [bnb-4bit-training](https://colab.research.google.com/drive/1VoYNfYDKcKRQRor98Zbf2-9VQTtGJ24k?usp=sharing)
|
||||
This example is ported from [bnb-4bit-training](https://colab.research.google.com/drive/1VoYNfYDKcKRQRor98Zbf2-9VQTtGJ24k?usp=sharing). The `export_merged_model.py` is ported from [alpaca-lora](https://github.com/tloen/alpaca-lora/blob/main/export_hf_checkpoint.py).
|
||||
|
||||
### 1. Install
|
||||
|
||||
|
|
@ -26,13 +26,13 @@ pip install peft==0.5.0
|
|||
source /opt/intel/oneapi/setvars.sh
|
||||
```
|
||||
|
||||
### 3. Run
|
||||
### 3. Finetune model
|
||||
|
||||
```
|
||||
python ./qlora_finetuning.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH
|
||||
```
|
||||
|
||||
### Sample Output
|
||||
#### Sample Output
|
||||
```log
|
||||
{'loss': 1.6134, 'learning_rate': 0.0002, 'epoch': 0.03}
|
||||
{'loss': 1.3038, 'learning_rate': 0.00017777777777777779, 'epoch': 0.06}
|
||||
|
|
@ -47,4 +47,12 @@ python ./qlora_finetuning.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH
|
|||
{'train_runtime': 225.8005, 'train_samples_per_second': 3.543, 'train_steps_per_second': 0.886, 'train_loss': 1.211241865158081, 'epoch': 0.32}
|
||||
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [03:45<00:00, 1.13s/it]
|
||||
TrainOutput(global_step=200, training_loss=1.211241865158081, metrics={'train_runtime': 225.8005, 'train_samples_per_second': 3.543, 'train_steps_per_second': 0.886, 'train_loss': 1.211241865158081, 'epoch': 0.32})
|
||||
```
|
||||
```
|
||||
|
||||
### 4. Merge the adapter into the original model
|
||||
|
||||
```
|
||||
python ./export_merged_model.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH --adapter_path ./outputs/checkpoint-200 --output_path ./outputs/checkpoint-200-merged
|
||||
```
|
||||
|
||||
Then you can use `./outputs/checkpoint-200-merged` as a normal Hugging Face Transformers model for inference.
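For instance, a minimal sketch of loading the merged checkpoint with plain Hugging Face Transformers (the paths and prompt are illustrative):

```python
import torch
from transformers import AutoModelForCausalLM, LlamaTokenizer

# Load the merged checkpoint produced above like any other Hugging Face model.
merged_path = "./outputs/checkpoint-200-merged"
tokenizer = LlamaTokenizer.from_pretrained(merged_path)
model = AutoModelForCausalLM.from_pretrained(merged_path, torch_dtype=torch.float16)

inputs = tokenizer("What is AI?", return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```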
|
||||
|
|
|
|||
|
|
@ -0,0 +1,93 @@
|
|||
#
|
||||
# Copyright 2016 The BigDL Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# This file is adapted from https://github.com/tloen/alpaca-lora/blob/main/export_hf_checkpoint.py
|
||||
#
|
||||
# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
|
||||
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import os
|
||||
|
||||
import torch
|
||||
import transformers
|
||||
from transformers import LlamaTokenizer # noqa: F402
|
||||
from bigdl.llm.transformers.qlora import PeftModel
|
||||
from bigdl.llm.transformers import AutoModelForCausalLM
|
||||
import argparse
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Llama2 model')
|
||||
parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-hf",
|
||||
help='The huggingface repo id for the Llama2 (e.g. `meta-llama/Llama-2-7b-hf` and `meta-llama/Llama-2-13b-chat-hf`) to be downloaded'
|
||||
', or the path to the huggingface checkpoint folder')
|
||||
parser.add_argument('--adapter_path', type=str,)
|
||||
parser.add_argument('--output_path', type=str,)
|
||||
|
||||
args = parser.parse_args()
|
||||
base_model = model_path = args.repo_id_or_model_path
|
||||
adapter_path = args.adapter_path
|
||||
tokenizer = LlamaTokenizer.from_pretrained(base_model)
|
||||
|
||||
base_model = AutoModelForCausalLM.from_pretrained(
|
||||
base_model,
|
||||
# load_in_low_bit="nf4", # should load the original model
|
||||
torch_dtype=torch.float16,
|
||||
device_map={"": "cpu"},
|
||||
)
|
||||
|
||||
first_weight = base_model.model.layers[0].self_attn.q_proj.weight
|
||||
first_weight_old = first_weight.clone()
|
||||
|
||||
lora_model = PeftModel.from_pretrained(
|
||||
base_model,
|
||||
adapter_path,
|
||||
device_map={"": "cpu"},
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
|
||||
lora_weight = lora_model.base_model.model.model.layers[
|
||||
0
|
||||
].self_attn.q_proj.weight
|
||||
|
||||
assert torch.allclose(first_weight_old, first_weight)
|
||||
|
||||
# merge weights - new merging method from peft
|
||||
lora_model = lora_model.merge_and_unload()
|
||||
|
||||
lora_model.train(False)
|
||||
|
||||
# did we do anything?
|
||||
assert not torch.allclose(first_weight_old, first_weight)
|
||||
|
||||
lora_model_sd = lora_model.state_dict()
|
||||
deloreanized_sd = {
|
||||
k.replace("base_model.model.", ""): v
|
||||
for k, v in lora_model_sd.items()
|
||||
if "lora" not in k
|
||||
}
|
||||
|
||||
base_model.save_pretrained(args.output_path, state_dict=deloreanized_sd)
|
||||
tokenizer.save_pretrained(args.output_path)
|
||||
|
|
@ -45,8 +45,9 @@ if __name__ == "__main__":
|
|||
data = load_dataset(dataset_path)
|
||||
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
|
||||
model = AutoModelForCausalLM.from_pretrained(model_path,
|
||||
load_in_4bit=True,
|
||||
load_in_low_bit="nf4",
|
||||
optimize_model=False,
|
||||
torch_dtype=torch.float16,
|
||||
modules_to_not_convert=["lm_head"],)
|
||||
model = model.to('xpu')
|
||||
model.gradient_checkpointing_enable()
|
||||
|
|
@ -71,7 +72,8 @@ if __name__ == "__main__":
|
|||
warmup_steps=20,
|
||||
max_steps=200,
|
||||
learning_rate=2e-4,
|
||||
fp16=False, # fp16 is not supported yet
|
||||
save_steps=100,
|
||||
fp16=True,
|
||||
logging_steps=20,
|
||||
output_dir="outputs",
|
||||
optim="adamw_hf", # paged_adamw_8bit is not supported yet
|
||||
|
|
|
|||
|
|
@ -47,7 +47,11 @@ function starcoder {
|
|||
}
|
||||
|
||||
function chatglm {
|
||||
command="$lib_dir/main-chatglm_vnni -t $threads -n $n_predict ${filteredArguments[*]}"
|
||||
if [[ $(lscpu | grep "amx_int8") ]]; then
|
||||
command="$lib_dir/main-chatglm_amx -t $threads -n $n_predict ${filteredArguments[*]}"
|
||||
else
|
||||
command="$lib_dir/main-chatglm_vnni -t $threads -n $n_predict ${filteredArguments[*]}"
|
||||
fi
|
||||
echo "$command"
|
||||
eval "$command"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -135,6 +135,7 @@ def convert_forward(m, target_m, new_forward):
|
|||
def optimize(model):
|
||||
from packaging import version
|
||||
from bigdl.llm.transformers.models.llama import llama_attention_forward_4_31
|
||||
from bigdl.llm.transformers.models.llama import llama_rms_norm_forward
|
||||
from transformers.modeling_utils import PreTrainedModel
|
||||
|
||||
# All huggingface format models are inherited from `PreTrainedModel`
|
||||
|
|
@ -149,11 +150,16 @@ def optimize(model):
|
|||
model,
|
||||
transformers.models.llama.modeling_llama.LlamaAttention,
|
||||
llama_attention_forward_4_31,)
|
||||
convert_forward(
|
||||
model,
|
||||
transformers.models.llama.modeling_llama.LlamaRMSNorm,
|
||||
llama_rms_norm_forward,)
|
||||
else:
|
||||
# todo implement 4.28.0 ~ 4.30.2
|
||||
pass
|
||||
|
||||
if "chatglm2" in model.config._name_or_path:
|
||||
if "chatglm-18b" in model.config._name_or_path or "chatglm2" in model.config._name_or_path:
|
||||
# chatglm-18b or chatglm2-6b
|
||||
modeling_module_name = model.__class__.__module__
|
||||
module = importlib.import_module(modeling_module_name)
|
||||
from bigdl.llm.transformers.models.chatglm2 import chatglm2_attention_forward_8eb45c
|
||||
|
|
@ -166,6 +172,7 @@ def optimize(model):
|
|||
module.CoreAttention,
|
||||
core_attn_forward_8eb45c)
|
||||
elif "chatglm" in model.config._name_or_path:
|
||||
# chatglm-6b
|
||||
modeling_module_name = model.__class__.__module__
|
||||
module = importlib.import_module(modeling_module_name)
|
||||
from bigdl.llm.transformers.models.chatglm import chatglm_attention_forward
|
||||
|
|
@ -280,4 +287,20 @@ def optimize(model):
|
|||
module.InternLMAttention,
|
||||
internlm_attention_forward
|
||||
)
|
||||
elif model.config.model_type == "qwen":
|
||||
modeling_module_name = model.__class__.__module__
|
||||
module = importlib.import_module(modeling_module_name)
|
||||
from bigdl.llm.transformers.models.qwen import qwen_attention_forward
|
||||
convert_forward(model,
|
||||
module.QWenAttention,
|
||||
qwen_attention_forward
|
||||
)
|
||||
elif model.config.model_type == "aquila":
|
||||
modeling_module_name = model.__class__.__module__
|
||||
module = importlib.import_module(modeling_module_name)
|
||||
from bigdl.llm.transformers.models.aquila import aquila_attention_forward
|
||||
convert_forward(model,
|
||||
module.AquilaAttention,
|
||||
aquila_attention_forward
|
||||
)
|
||||
return model
|
||||
|
|
|
|||
157 python/llm/src/bigdl/llm/transformers/models/aquila.py (new file)
@ -0,0 +1,157 @@
|
|||
#
|
||||
# Copyright 2016 The BigDL Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Some parts of this file is adapted from
|
||||
# https://huggingface.co/BAAI/AquilaChat-7B/blob/main/modeling_aquila.py
|
||||
#
|
||||
# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
||||
# and OPT implementations in this library. It has been modified from its
|
||||
# original forms to accommodate minor architectural differences compared
|
||||
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
import torch.utils.checkpoint
|
||||
from torch import nn
|
||||
|
||||
from bigdl.llm.transformers.models.utils import extend_kv_cache, init_kv_cache, append_kv_cache
|
||||
from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb
|
||||
from bigdl.dllib.utils import log4Error
|
||||
|
||||
KV_CACHE_ALLOC_BLOCK_LENGTH = 256
|
||||
|
||||
|
||||
def aquila_attention_forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
past_key_value: Optional[Tuple[torch.Tensor]] = None,
|
||||
output_attentions: bool = False,
|
||||
use_cache: bool = False,
|
||||
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
||||
bsz, q_len, _ = hidden_states.size()
|
||||
|
||||
query_states = self.q_proj(hidden_states)\
|
||||
.view(bsz, q_len, self.num_heads, self.head_dim)\
|
||||
.transpose(1, 2)
|
||||
key_states = self.k_proj(hidden_states)\
|
||||
.view(bsz, q_len, self.num_heads, self.head_dim)\
|
||||
.transpose(1, 2)
|
||||
value_states = self.v_proj(hidden_states)\
|
||||
.view(bsz, q_len, self.num_heads, self.head_dim)\
|
||||
.transpose(1, 2)
|
||||
|
||||
kv_seq_len = key_states.shape[-2]
|
||||
if past_key_value is not None:
|
||||
kv_seq_len += past_key_value[0].shape[-2]
|
||||
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
|
||||
query_states, key_states = apply_rotary_pos_emb(query_states, key_states,
|
||||
cos, sin, position_ids, "aquila")
|
||||
# [bsz, nh, t, hd]
|
||||
|
||||
if past_key_value is not None:
|
||||
# reuse k, v, self_attention
|
||||
cache_k = past_key_value[0]
|
||||
cache_v = past_key_value[1]
|
||||
if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3):
|
||||
# allocate new
|
||||
new_cache_k, new_cache_v = extend_kv_cache(bsz,
|
||||
self.num_heads, # Support GQA
|
||||
self.head_dim,
|
||||
cache_k.size(2),
|
||||
kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH,
|
||||
dtype=cache_k.dtype,
|
||||
device=hidden_states.device)
|
||||
new_cache_k[:] = cache_k
|
||||
new_cache_v[:] = cache_v
|
||||
cache_k = new_cache_k
|
||||
cache_v = new_cache_v
|
||||
|
||||
key_states, value_states = append_kv_cache(cache_k, cache_v, key_states, value_states)
|
||||
|
||||
elif use_cache:
|
||||
max_cache_length = kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH
|
||||
new_key_states, new_value_states = init_kv_cache(bsz,
|
||||
self.num_heads,
|
||||
self.head_dim,
|
||||
kv_seq_len,
|
||||
max_cache_length,
|
||||
dtype=key_states.dtype,
|
||||
device=hidden_states.device)
|
||||
new_key_states[:] = key_states
|
||||
new_value_states[:] = value_states
|
||||
key_states = new_key_states
|
||||
value_states = new_value_states
|
||||
|
||||
past_key_value = (key_states, value_states) if use_cache else None
|
||||
|
||||
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
|
||||
|
||||
attn_weights = torch.clamp(attn_weights, min=-1024., max=1024.)
|
||||
if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
|
||||
log4Error.invalidInputError(
|
||||
f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, "
|
||||
f"but is {attn_weights.size()}"
|
||||
)
|
||||
|
||||
if attention_mask is not None:
|
||||
if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
|
||||
log4Error.invalidInputError(
|
||||
f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, "
|
||||
f"but is {attention_mask.size()}"
|
||||
)
|
||||
attn_weights = attn_weights + attention_mask
|
||||
attn_weights = torch.max(
|
||||
attn_weights,
|
||||
torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device)
|
||||
)
|
||||
|
||||
# upcast attention to fp32
|
||||
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32)\
|
||||
.to(query_states.dtype)
|
||||
attn_output = torch.matmul(attn_weights, value_states)
|
||||
|
||||
if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
|
||||
log4Error.invalidInputError(
|
||||
f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, "
|
||||
f"but is {attn_output.size()}"
|
||||
)
|
||||
|
||||
attn_output = attn_output.transpose(1, 2)
|
||||
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
|
||||
|
||||
attn_output = self.o_proj(attn_output)
|
||||
|
||||
if not output_attentions:
|
||||
attn_weights = None
|
||||
|
||||
return attn_output, attn_weights, past_key_value
|
||||
|
|
@ -39,6 +39,7 @@ import torch.nn.functional as F
|
|||
from bigdl.llm.utils.common import invalidInputError
|
||||
from bigdl.llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache
|
||||
from bigdl.llm.transformers.models.utils import rotate_half, apply_rotary_pos_emb
|
||||
from bigdl.llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu
|
||||
|
||||
|
||||
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
|
||||
|
|
@ -57,6 +58,19 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
|
|||
KV_CACHE_ALLOC_BLOCK_LENGTH = 256
|
||||
|
||||
|
||||
def llama_rms_norm_forward(self, hidden_states):
|
||||
if hidden_states.device.type == "xpu" and not (self.training and hidden_states.requires_grad):
|
||||
hidden_states, _ = torch.ops.torch_ipex.rms_norm(hidden_states,
|
||||
[self.weight.size(0)], self.weight)
|
||||
else:
|
||||
input_dtype = hidden_states.dtype
|
||||
hidden_states = hidden_states.to(torch.float32)
|
||||
variance = hidden_states.pow(2).mean(-1, keepdim=True)
|
||||
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
|
||||
return self.weight * hidden_states.to(input_dtype)
|
||||
return hidden_states
|
||||
|
||||
|
||||
def llama_attention_forward_4_31(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
|
|
@ -103,9 +117,20 @@ def llama_attention_forward_4_31(
|
|||
kv_seq_len = key_states.shape[-2]
|
||||
if past_key_value is not None:
|
||||
kv_seq_len += past_key_value[0].shape[-2]
|
||||
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
|
||||
query_states, key_states = apply_rotary_pos_emb(query_states, key_states,
|
||||
cos, sin, position_ids, "llama")
|
||||
|
||||
use_fuse_rope = query_states.device.type == "xpu"
|
||||
use_fuse_rope = use_fuse_rope and not (self.training and query_states.requires_grad)
|
||||
use_fuse_rope = use_fuse_rope and self.config.rope_scaling is None
|
||||
|
||||
if use_fuse_rope:
|
||||
query_states, key_states = apply_rotary_pos_emb_no_cache_xpu(query_states,
|
||||
key_states,
|
||||
position_ids,
|
||||
"llama")
|
||||
else:
|
||||
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
|
||||
query_states, key_states = apply_rotary_pos_emb(query_states, key_states,
|
||||
cos, sin, position_ids, "llama")
|
||||
|
||||
if past_key_value is not None:
|
||||
# reuse k, v, self_attention
|
||||
|
|
|
|||
217 python/llm/src/bigdl/llm/transformers/models/qwen.py (new file)
@ -0,0 +1,217 @@
|
|||
#
|
||||
# Copyright 2016 The BigDL Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Some parts of this file is adapted from
|
||||
# https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/modeling_qwen.py
|
||||
#
|
||||
# Copyright (c) Alibaba Cloud.
|
||||
#
|
||||
# This source code is licensed under the license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
#
|
||||
|
||||
import importlib
|
||||
import math
|
||||
from typing import TYPE_CHECKING, Optional, Tuple, Union, Callable, List
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import torch.utils.checkpoint
|
||||
from transformers.utils import logging
|
||||
|
||||
try:
|
||||
from einops import rearrange
|
||||
except ImportError:
|
||||
rearrange = None
|
||||
|
||||
from bigdl.llm.transformers.models.utils import extend_kv_cache, init_kv_cache, append_kv_cache
|
||||
from bigdl.llm.utils.common import invalidInputError
|
||||
|
||||
apply_rotary_emb_func = None
|
||||
|
||||
flash_attn_unpadded_func = None
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
KV_CACHE_ALLOC_BLOCK_LENGTH = 256
|
||||
|
||||
|
||||
def _rotate_half(x):
|
||||
from einops import rearrange
|
||||
|
||||
x = rearrange(x, "... (j d) -> ... j d", j=2)
|
||||
x1, x2 = x.unbind(dim=-2)
|
||||
return torch.cat((-x2, x1), dim=-1)
|
||||
|
||||
|
||||
def apply_rotary_pos_emb(t, freqs):
|
||||
if apply_rotary_emb_func is not None:
|
||||
t_ = t.float()
|
||||
freqs = freqs.squeeze(0).squeeze(1)
|
||||
cos = freqs[:, : freqs.shape[-1] // 2].cos()
|
||||
sin = freqs[:, : freqs.shape[-1] // 2].sin()
|
||||
output = apply_rotary_emb_func(t_, cos, sin).type_as(t)
|
||||
return output
|
||||
else:
|
||||
rot_dim = freqs.shape[-1]
|
||||
t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:]
|
||||
t_ = t_.float()
|
||||
t_pass_ = t_pass_.float()
|
||||
t_ = (t_ * freqs.cos()) + (_rotate_half(t_) * freqs.sin())
|
||||
return torch.cat((t_, t_pass_), dim=-1).type_as(t)
|
||||
|
||||
|
||||
def qwen_attention_forward(
|
||||
self,
|
||||
hidden_states: Optional[Tuple[torch.FloatTensor]],
|
||||
layer_past: Optional[Tuple[torch.Tensor]] = None,
|
||||
attention_mask: Optional[torch.FloatTensor] = None,
|
||||
head_mask: Optional[torch.FloatTensor] = None,
|
||||
encoder_hidden_states: Optional[torch.Tensor] = None,
|
||||
encoder_attention_mask: Optional[torch.FloatTensor] = None,
|
||||
output_attentions: Optional[bool] = False,
|
||||
use_cache: Optional[bool] = False,
|
||||
):
|
||||
mixed_x_layer = self.c_attn(hidden_states)
|
||||
query, key, value = mixed_x_layer.split(self.split_size, dim=2)
|
||||
|
||||
query = self._split_heads(query, self.num_heads, self.head_dim)
|
||||
key = self._split_heads(key, self.num_heads, self.head_dim)
|
||||
value = self._split_heads(value, self.num_heads, self.head_dim)
|
||||
|
||||
kv_seq_len = hidden_states.size()[1]
|
||||
|
||||
if layer_past:
|
||||
# layer past[0] shape: bs * seq_len * head_num * dim
|
||||
kv_seq_len += layer_past[0].shape[1]
|
||||
if (
|
||||
self.use_dynamic_ntk
|
||||
and kv_seq_len == hidden_states.size()[1]
|
||||
and not self.training
|
||||
):
|
||||
context_value = math.log(kv_seq_len / self.seq_length, 2) + 1
|
||||
ntk_alpha = 2 ** math.ceil(context_value) - 1
|
||||
ntk_alpha = max(ntk_alpha, 1)
|
||||
self._ntk_cached = ntk_alpha
|
||||
else:
|
||||
ntk_alpha = self._ntk_cached
|
||||
rotary_pos_emb = self.rotary_emb(kv_seq_len, ntk_alpha=ntk_alpha).to(
|
||||
hidden_states.device
|
||||
)
|
||||
|
||||
if rotary_pos_emb is not None:
|
||||
if isinstance(rotary_pos_emb, tuple):
|
||||
rotary_pos_emb = rotary_pos_emb
|
||||
else:
|
||||
rotary_pos_emb = (rotary_pos_emb,) * 2
|
||||
|
||||
if rotary_pos_emb is not None:
|
||||
q_pos_emb, k_pos_emb = rotary_pos_emb
|
||||
# Slice the pos emb for current inference
|
||||
cur_len = query.shape[1]
|
||||
q_pos_emb = q_pos_emb[:, -cur_len:, :, :]
|
||||
k_pos_emb = k_pos_emb[:, -cur_len:, :, :]
|
||||
query = apply_rotary_pos_emb(query, q_pos_emb)
|
||||
key = apply_rotary_pos_emb(key, k_pos_emb)
|
||||
|
||||
bsz, _, n_heads, head_dim = key.size()
|
||||
|
||||
if layer_past is not None:
|
||||
# past_key, past_value = layer_past[0], layer_past[1]
|
||||
# key = torch.cat((past_key, key), dim=1)
|
||||
# value = torch.cat((past_value, value), dim=1)
|
||||
cache_k = layer_past[0].transpose(1, 2)
|
||||
cache_v = layer_past[1].transpose(1, 2)
|
||||
if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3):
|
||||
# allocate new
|
||||
new_cache_k, new_cache_v = extend_kv_cache(bsz,
|
||||
self.num_heads, # Support GQA
|
||||
self.head_dim,
|
||||
cache_k.size(2),
|
||||
kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH,
|
||||
dtype=cache_k.dtype,
|
||||
device=hidden_states.device)
|
||||
new_cache_k[:] = cache_k
|
||||
new_cache_v[:] = cache_v
|
||||
cache_k = new_cache_k
|
||||
cache_v = new_cache_v
|
||||
|
||||
key_states, value_states = append_kv_cache(cache_k, cache_v,
|
||||
key.transpose(1, 2), value.transpose(1, 2))
|
||||
key = key_states.transpose(1, 2)
|
||||
value = value_states.transpose(1, 2)
|
||||
elif use_cache:
|
||||
max_cache_length = kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH
|
||||
new_key_states, new_value_states = init_kv_cache(bsz,
|
||||
self.num_heads,
|
||||
self.head_dim,
|
||||
kv_seq_len,
|
||||
max_cache_length,
|
||||
dtype=key.dtype,
|
||||
device=hidden_states.device)
|
||||
new_key_states[:] = key.transpose(1, 2)
|
||||
new_value_states[:] = value.transpose(1, 2)
|
||||
key = new_key_states.transpose(1, 2)
|
||||
value = new_value_states.transpose(1, 2)
|
||||
|
||||
if use_cache:
|
||||
present = (key, value)
|
||||
else:
|
||||
present = None
|
||||
|
||||
if self.use_logn_attn and not self.training:
|
||||
if self.logn_tensor.device != query.device or self.logn_tensor.dtype != query.dtype:
|
||||
self.logn_tensor = self.logn_tensor.to(query.device).type_as(query)
|
||||
seq_start = key.size(1) - query.size(1)
|
||||
seq_end = key.size(1)
|
||||
logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :]
|
||||
query = query * logn_tensor.expand_as(query)
|
||||
|
||||
if (
|
||||
self.use_flash_attn
|
||||
and flash_attn_unpadded_func is not None
|
||||
and not self.is_fp32
|
||||
and query.is_cuda
|
||||
):
|
||||
q, k, v = query, key, value
|
||||
context_layer = self.core_attention_flash(q, k, v)
|
||||
|
||||
context_layer = rearrange(
|
||||
context_layer, "b s h d -> b s (h d)"
|
||||
).contiguous()
|
||||
else:
|
||||
query = query.permute(0, 2, 1, 3)
|
||||
key = key.permute(0, 2, 1, 3)
|
||||
value = value.permute(0, 2, 1, 3)
|
||||
attn_output, attn_weight = self._attn(
|
||||
query, key, value, attention_mask, head_mask
|
||||
)
|
||||
context_layer = self._merge_heads(
|
||||
attn_output, self.num_heads, self.head_dim
|
||||
)
|
||||
|
||||
attn_output = self.c_proj(context_layer)
|
||||
outputs = (attn_output, present)
|
||||
if output_attentions:
|
||||
if (
|
||||
self.use_flash_attn
|
||||
and flash_attn_unpadded_func is not None
|
||||
and not self.is_fp32
|
||||
):
|
||||
invalidInputError("Cannot output attentions while using flash-attn")
|
||||
else:
|
||||
outputs += (attn_weight,)
|
||||
|
||||
return outputs
|
||||
|
|
@ -71,7 +71,7 @@ def rotate_every_two(x):
|
|||
|
||||
|
||||
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, model_family):
|
||||
if model_family in ["llama", "baichuan", "internlm"]:
|
||||
if model_family in ["llama", "baichuan", "internlm", "aquila"]:
|
||||
# The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
|
||||
cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
|
||||
sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
|
||||
|
|
@ -97,3 +97,18 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, model_family):
|
|||
else:
|
||||
invalidInputError(False,
|
||||
f"{model_family} is not supported.")
|
||||
|
||||
|
||||
def apply_rotary_pos_emb_no_cache_xpu(q, k, position_ids, model_family):
|
||||
if q.device.type != "xpu":
|
||||
invalidInputError(False,
|
||||
f"only xpu is supported in this function")
|
||||
import linear_q4_0
|
||||
q_embed = torch.empty(q.shape, dtype=q.dtype, device=q.device)
|
||||
k_embed = torch.empty(k.shape, dtype=k.dtype, device=k.device)
|
||||
if model_family in ["llama", "baichuan", "internlm", "aquila", "gpt_neox"]:
|
||||
linear_q4_0.apply_rotary_embedding_half_qk(q, k, position_ids, q_embed, k_embed)
|
||||
return q_embed, k_embed
|
||||
else:
|
||||
invalidInputError(False,
|
||||
f"{model_family} is not supported.")
|
||||
|
|
|
|||
|
|
@ -36,6 +36,7 @@ import torch
|
|||
from bigdl.llm.transformers.low_bit_linear import LowBitLinear
|
||||
from peft.tuners.lora import LoraLayer
|
||||
from bigdl.llm.utils.common import invalidInputError
|
||||
import functools
|
||||
|
||||
|
||||
class LoraLowBitLinear(LowBitLinear, LoraLayer):
|
||||
|
|
@ -94,13 +95,11 @@ class LoraLowBitLinear(LowBitLinear, LoraLayer):
|
|||
return result
|
||||
|
||||
|
||||
@staticmethod
|
||||
def _create_new_module(lora_config, adapter_name, target, **kwargs):
|
||||
|
||||
bias = kwargs.pop("bias", False)
|
||||
def _create_new_module(create_new_module_func, lora_config, adapter_name, target, **kwargs):
|
||||
|
||||
if isinstance(target, LowBitLinear):
|
||||
low_bit_kwargs = kwargs.copy()
|
||||
bias = low_bit_kwargs.pop("bias", False)
|
||||
low_bit_kwargs.update(
|
||||
{
|
||||
"qtype": target.qtype,
|
||||
|
|
@ -112,9 +111,7 @@ def _create_new_module(lora_config, adapter_name, target, **kwargs):
|
|||
bias=bias,
|
||||
**low_bit_kwargs)
|
||||
else:
|
||||
invalidInputError(False,
|
||||
f"Target module {target} is not supported. "
|
||||
f"Currently, only `LowBitLinear` are supported.")
|
||||
new_module = create_new_module_func(lora_config, adapter_name, target, **kwargs)
|
||||
|
||||
return new_module
|
||||
|
||||
|
|
@ -124,7 +121,8 @@ from peft.tuners.lora import LoraModel
|
|||
|
||||
def get_peft_model(*args, **kwargs):
|
||||
old_create_new_module = LoraModel._create_new_module
|
||||
LoraModel._create_new_module = _create_new_module
|
||||
LoraModel._create_new_module = staticmethod(functools.partial(_create_new_module,
|
||||
old_create_new_module))
|
||||
try:
|
||||
from peft import get_peft_model as get_peft_model_original
|
||||
model = get_peft_model_original(*args, **kwargs)
|
||||
|
|
@ -181,7 +179,8 @@ class PeftModel:
|
|||
def from_pretrained(*args,
|
||||
**kwargs):
|
||||
old_create_new_module = LoraModel._create_new_module
|
||||
LoraModel._create_new_module = _create_new_module
|
||||
LoraModel._create_new_module = staticmethod(functools.partial(_create_new_module,
|
||||
old_create_new_module))
|
||||
from peft import PeftModel
|
||||
try:
|
||||
model = PeftModel.from_pretrained(*args, **kwargs)
|
||||