diff --git a/docker/llm/finetune/lora/README.md b/docker/llm/finetune/lora/README.md
index 59d0fa0f..98b694cf 100644
--- a/docker/llm/finetune/lora/README.md
+++ b/docker/llm/finetune/lora/README.md
@@ -109,3 +109,4 @@ Example responce:
 ```json
 {"quote_list":{"bigdl-lora-finetuning-job-worker-0":"BAACAIEAAAAAAA...","bigdl-lora-finetuning-job-worker-1":"BAACAIEAAAAAAA...","launcher":"BAACAIEAAAAAA..."}}
 ```
+
diff --git a/docker/llm/finetune/lora/cpu/README.md b/docker/llm/finetune/lora/cpu/README.md
new file mode 100644
index 00000000..8eb8e486
--- /dev/null
+++ b/docker/llm/finetune/lora/cpu/README.md
@@ -0,0 +1,57 @@
+## Run BF16-Optimized Lora Finetuning on Kubernetes with OneCCL
+
+[Alpaca Lora](https://github.com/tloen/alpaca-lora/tree/main) uses [low-rank adaptation](https://arxiv.org/pdf/2106.09685.pdf) to speed up the finetuning of the base model [Llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b), and aims to reproduce the standard Alpaca, a general-purpose finetuned LLM. It is built on top of Hugging Face transformers with a PyTorch backend, which natively requires a number of expensive GPUs and significant training time.
+
+By contrast, BigDL provides a CPU optimization to accelerate the LoRA finetuning of Llama2-7b by means of mixed-precision and distributed training. Specifically, [Intel OneCCL](https://www.intel.com/content/www/us/en/developer/tools/oneapi/oneccl.html), an available Hugging Face backend, speeds up the PyTorch computation with the BF16 data type on CPUs, while [Intel MPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/mpi-library.html) enables parallel processing on Kubernetes.
+
+The architecture is illustrated below:
+
+![image](https://github.com/Jasonzzt/BigDL/assets/60865256/b66416bc-ad07-49af-8cb0-8967dffb5f58)
+
+As shown above, BigDL implements its MPI training with the [Kubeflow MPI operator](https://github.com/kubeflow/mpi-operator/tree/master), which encapsulates the deployment as an MPIJob CRD and helps users handle the construction of an MPI worker cluster on Kubernetes, including public key distribution, SSH connection, and log collection.
+
+Now, let's deploy a LoRA finetuning job to create an LLM from Llama2-7b.
+
+**Note: Please make sure you already have an available Kubernetes infrastructure and NFS shared storage, and install the [Helm CLI](https://helm.sh/docs/helm/helm_install/) for Kubernetes job submission.**
+
+### 1. Install Kubeflow MPI Operator
+
+Follow the instructions [here](https://github.com/kubeflow/mpi-operator/tree/master#installation) to install the Kubeflow MPI operator in your Kubernetes cluster, which will listen for and serve the MPIJob requests submitted in the following steps.
+
+### 2. Download Image, Base Model and Finetuning Data
+
+Follow the instructions [here](https://github.com/intel-analytics/BigDL/tree/main/docker/llm/finetune/lora/docker#prepare-bigdl-image-for-lora-finetuning) to prepare the BigDL LoRA finetuning image in your cluster.
+
+As finetuning starts from a base model, first download the [Llama2-7b model from the public download site of Hugging Face](https://huggingface.co/meta-llama/Llama-2-7b). Then, download the [cleaned Alpaca data](https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_cleaned_archive.json), which covers a wide range of general knowledge and has already been cleaned. Next, move the downloaded files to a shared directory on your NFS server.
+
+### 3. Deploy through Helm Chart
+
+You can edit and experiment with different parameters in `./kubernetes/values.yaml` to improve finetuning performance and accuracy. For example, you can adjust `trainerNum` and `cpuPerPod` according to the number of nodes and CPU cores in your cluster to make full use of these resources, and different `microBatchSize` values result in different training speed and loss (note that `microBatchSize` × `trainerNum` should not exceed 128, as it is the global batch size), as in the sample configuration below.
+
+**Note: `dataSubPath` and `modelSubPath` need to match the names of the data and model files under the NFS directory in step 2.**
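+
+As an illustration, a `values.yaml` along the lines below could be used for an 8-worker run; the keys and default values come from this chart, while the NFS server IP and path are placeholders to be replaced with your own:
+
+```yaml
+imageName: intelanalytics/bigdl-llm-finetune-cpu:2.4.0-SNAPSHOT
+trainerNum: 8                 # number of MPI worker pods
+microBatchSize: 8             # per-worker batch size; microBatchSize x trainerNum should not exceed 128
+nfsServerIp: 10.0.0.2         # placeholder: your NFS server IP
+nfsPath: /ppml/nfs/finetune   # placeholder: a shared folder exported by the NFS server
+dataSubPath: alpaca_data_cleaned_archive.json   # data file name under nfsPath
+modelSubPath: llama-7b-hf     # model folder name under nfsPath
+ompNumThreads: 14
+cpuPerPod: 42                 # CPU cores allocated to each worker pod
+```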
+
+After preparing the parameters in `./kubernetes/values.yaml`, submit the job as below:
+
+```bash
+cd ./kubernetes
+helm install bigdl-lora-finetuning .
+```
+
+### 4. Check Deployment
+```bash
+kubectl get all -n bigdl-lora-finetuning # you will see launcher and worker pods running
+```
+
+### 5. Check Finetuning Process
+
+After a successful deployment, you can find a launcher pod, enter it, and check the logs collected from all workers.
+
+```bash
+kubectl get all -n bigdl-lora-finetuning # you will see a launcher pod
+kubectl exec -it <launcher_pod_name> -n bigdl-lora-finetuning -- bash # enter the launcher pod
+cat launcher.log # display logs collected from all workers
+```
+
+From the log, you can see whether the finetuning process has been invoked successfully in all MPI worker pods, and a progress bar with the finetuning speed and estimated time will be shown after some data preprocessing steps (which may take quite a while).
+
+The fine-tuned model is written by worker 0 (the pod holding rank 0), so you can find the model output inside that pod and copy it to the host with tools like `kubectl cp` or `scp`.
diff --git a/docker/llm/finetune/lora/cpu/docker/Dockerfile b/docker/llm/finetune/lora/cpu/docker/Dockerfile
new file mode 100644
index 00000000..d9a568bf
--- /dev/null
+++ b/docker/llm/finetune/lora/cpu/docker/Dockerfile
@@ -0,0 +1,52 @@
+ARG http_proxy
+ARG https_proxy
+
+FROM mpioperator/intel as builder
+
+ARG http_proxy
+ARG https_proxy
+ENV PIP_NO_CACHE_DIR=false
+ADD ./requirements.txt /ppml/requirements.txt
+
+RUN mkdir /ppml/data && mkdir /ppml/model && \
+# install pytorch 2.0.1
+    apt-get update && \
+    apt-get install -y python3-pip python3.9-dev python3-wheel git software-properties-common && \
+    pip3 install --upgrade pip && \
+    pip install torch==2.0.1 && \
+# install ipex and oneccl
+    pip install intel_extension_for_pytorch==2.0.100 && \
+    pip install oneccl_bind_pt -f https://developer.intel.com/ipex-whl-stable && \
+# install transformers etc.
+    cd /ppml && \
+    git clone https://github.com/huggingface/transformers.git && \
+    cd transformers && \
+    git reset --hard 057e1d74733f52817dc05b673a340b4e3ebea08c && \
+    pip install . && \
+    pip install -r /ppml/requirements.txt && \
+# install python
+    add-apt-repository ppa:deadsnakes/ppa -y && \
+    apt-get install -y python3.9 && \
+    rm /usr/bin/python3 && \
+    ln -s /usr/bin/python3.9 /usr/bin/python3 && \
+    ln -s /usr/bin/python3 /usr/bin/python && \
+    pip install --no-cache requests argparse cryptography==3.3.2 urllib3 && \
+    pip install --upgrade requests && \
+    pip install setuptools==58.4.0 && \
+# Install OpenSSH for MPI to communicate between containers
+    apt-get install -y --no-install-recommends openssh-client openssh-server && \
+    mkdir -p /var/run/sshd && \
+# Allow OpenSSH to talk to containers without asking for confirmation
+# by disabling StrictHostKeyChecking.
+# mpi-operator mounts the .ssh folder from a Secret. For that to work, we need +# to disable UserKnownHostsFile to avoid write permissions. +# Disabling StrictModes avoids directory and files read permission checks. + sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ + echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ + sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config + +ADD ./bigdl-lora-finetuing-entrypoint.sh /ppml/bigdl-lora-finetuing-entrypoint.sh +ADD ./lora_finetune.py /ppml/lora_finetune.py + +RUN chown -R mpiuser /ppml +USER mpiuser diff --git a/docker/llm/finetune/lora/docker/README.md b/docker/llm/finetune/lora/cpu/docker/README.md similarity index 55% rename from docker/llm/finetune/lora/docker/README.md rename to docker/llm/finetune/lora/cpu/docker/README.md index e1ca3557..e988f8f0 100644 --- a/docker/llm/finetune/lora/docker/README.md +++ b/docker/llm/finetune/lora/cpu/docker/README.md @@ -3,7 +3,7 @@ You can download directly from Dockerhub like: ```bash -docker pull intelanalytics/bigdl-lora-finetuning:2.4.0-SNAPSHOT +docker pull intelanalytics/bigdl-llm-finetune-cpu:2.4.0-SNAPSHOT ``` Or build the image from source: @@ -13,8 +13,8 @@ export HTTP_PROXY=your_http_proxy export HTTPS_PROXY=your_https_proxy docker build \ - --build-arg HTTP_PROXY=${HTTP_PROXY} \ - --build-arg HTTPS_PROXY=${HTTPS_PROXY} \ - -t intelanalytics/bigdl-lora-finetuning:2.4.0-SNAPSHOT \ + --build-arg http_proxy=${HTTP_PROXY} \ + --build-arg https_proxy=${HTTPS_PROXY} \ + -t intelanalytics/bigdl-llm-finetune-cpu:2.4.0-SNAPSHOT \ -f ./Dockerfile . ``` diff --git a/docker/llm/finetune/lora/docker/bigdl-lora-finetuing-entrypoint.sh b/docker/llm/finetune/lora/cpu/docker/bigdl-lora-finetuing-entrypoint.sh similarity index 100% rename from docker/llm/finetune/lora/docker/bigdl-lora-finetuing-entrypoint.sh rename to docker/llm/finetune/lora/cpu/docker/bigdl-lora-finetuing-entrypoint.sh diff --git a/docker/llm/finetune/lora/docker/lora_finetune.py b/docker/llm/finetune/lora/cpu/docker/lora_finetune.py similarity index 100% rename from docker/llm/finetune/lora/docker/lora_finetune.py rename to docker/llm/finetune/lora/cpu/docker/lora_finetune.py diff --git a/docker/llm/finetune/lora/docker/requirements.txt b/docker/llm/finetune/lora/cpu/docker/requirements.txt similarity index 100% rename from docker/llm/finetune/lora/docker/requirements.txt rename to docker/llm/finetune/lora/cpu/docker/requirements.txt diff --git a/docker/llm/finetune/lora/kubernetes/Chart.yaml b/docker/llm/finetune/lora/cpu/kubernetes/Chart.yaml similarity index 100% rename from docker/llm/finetune/lora/kubernetes/Chart.yaml rename to docker/llm/finetune/lora/cpu/kubernetes/Chart.yaml diff --git a/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-job.yaml b/docker/llm/finetune/lora/cpu/kubernetes/templates/bigdl-lora-finetuning-job.yaml similarity index 98% rename from docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-job.yaml rename to docker/llm/finetune/lora/cpu/kubernetes/templates/bigdl-lora-finetuning-job.yaml index 4c22b068..4b425b9c 100644 --- a/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-job.yaml +++ b/docker/llm/finetune/lora/cpu/kubernetes/templates/bigdl-lora-finetuning-job.yaml @@ -1,4 +1,3 @@ -{{- if eq .Values.TEEMode "native" }} apiVersion: kubeflow.org/v2beta1 kind: MPIJob metadata: @@ -90,4 +89,3 @@ spec: - name: nfs-storage persistentVolumeClaim: claimName: nfs-pvc -{{- end }} diff --git 
a/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-namespace.yaml b/docker/llm/finetune/lora/cpu/kubernetes/templates/bigdl-lora-finetuning-namespace.yaml similarity index 100% rename from docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-namespace.yaml rename to docker/llm/finetune/lora/cpu/kubernetes/templates/bigdl-lora-finetuning-namespace.yaml diff --git a/docker/llm/finetune/lora/kubernetes/templates/nfs-pv.yaml b/docker/llm/finetune/lora/cpu/kubernetes/templates/nfs-pv.yaml similarity index 100% rename from docker/llm/finetune/lora/kubernetes/templates/nfs-pv.yaml rename to docker/llm/finetune/lora/cpu/kubernetes/templates/nfs-pv.yaml diff --git a/docker/llm/finetune/lora/kubernetes/templates/nfs-pvc.yaml b/docker/llm/finetune/lora/cpu/kubernetes/templates/nfs-pvc.yaml similarity index 100% rename from docker/llm/finetune/lora/kubernetes/templates/nfs-pvc.yaml rename to docker/llm/finetune/lora/cpu/kubernetes/templates/nfs-pvc.yaml diff --git a/docker/llm/finetune/lora/kubernetes/values.yaml b/docker/llm/finetune/lora/cpu/kubernetes/values.yaml similarity index 55% rename from docker/llm/finetune/lora/kubernetes/values.yaml rename to docker/llm/finetune/lora/cpu/kubernetes/values.yaml index 92df0493..92a5f5e0 100644 --- a/docker/llm/finetune/lora/kubernetes/values.yaml +++ b/docker/llm/finetune/lora/cpu/kubernetes/values.yaml @@ -1,15 +1,9 @@ -imageName: intelanalytics/bigdl-lora-finetuning:2.4.0-SNAPSHOT +imageName: intelanalytics/bigdl-llm-finetune-cpu:2.4.0-SNAPSHOT trainerNum: 8 microBatchSize: 8 -TEEMode: tdx # tdx or native nfsServerIp: your_nfs_server_ip nfsPath: a_nfs_shared_folder_path_on_the_server dataSubPath: alpaca_data_cleaned_archive.json # a subpath of the data file under nfs directory modelSubPath: llama-7b-hf # a subpath of the model file (dir) under nfs directory ompNumThreads: 14 cpuPerPod: 42 -attestionApiServicePort: 9870 - -enableTLS: false # true or false -base64ServerCrt: "your_base64_format_server_crt" -base64ServerKey: "your_base64_format_server_key" diff --git a/docker/llm/finetune/lora/docker/Dockerfile b/docker/llm/finetune/lora/docker/Dockerfile deleted file mode 100644 index b9591afc..00000000 --- a/docker/llm/finetune/lora/docker/Dockerfile +++ /dev/null @@ -1,83 +0,0 @@ -ARG HTTP_PROXY -ARG HTTPS_PROXY - -FROM mpioperator/intel as builder - -ARG HTTP_PROXY -ARG HTTPS_PROXY -ADD ./requirements.txt /ppml/requirements.txt - -RUN mkdir /ppml/data && mkdir /ppml/model && mkdir /ppml/output && \ -# install pytorch 2.0.1 - export http_proxy=$HTTP_PROXY && \ - export https_proxy=$HTTPS_PROXY && \ - apt-get update && \ -# Basic dependencies and DCAP - apt-get update && \ - apt install -y build-essential apt-utils wget git sudo vim && \ - mkdir -p /opt/intel/ && \ - cd /opt/intel && \ - wget https://download.01.org/intel-sgx/sgx-dcap/1.16/linux/distro/ubuntu20.04-server/sgx_linux_x64_sdk_2.19.100.3.bin && \ - chmod a+x ./sgx_linux_x64_sdk_2.19.100.3.bin && \ - printf "no\n/opt/intel\n"|./sgx_linux_x64_sdk_2.19.100.3.bin && \ - . 
/opt/intel/sgxsdk/environment && \ - cd /opt/intel && \ - wget https://download.01.org/intel-sgx/sgx-dcap/1.16/linux/distro/ubuntu20.04-server/sgx_debian_local_repo.tgz && \ - tar xzf sgx_debian_local_repo.tgz && \ - echo 'deb [trusted=yes arch=amd64] file:///opt/intel/sgx_debian_local_repo focal main' | tee /etc/apt/sources.list.d/intel-sgx.list && \ - wget -qO - https://download.01.org/intel-sgx/sgx_repo/ubuntu/intel-sgx-deb.key | apt-key add - && \ - env DEBIAN_FRONTEND=noninteractive apt-get update && apt install -y libsgx-enclave-common-dev libsgx-qe3-logic libsgx-pce-logic libsgx-ae-qe3 libsgx-ae-qve libsgx-urts libsgx-dcap-ql libsgx-dcap-default-qpl libsgx-dcap-quote-verify-dev libsgx-dcap-ql-dev libsgx-dcap-default-qpl-dev libsgx-ra-network libsgx-ra-uefi libtdx-attest libtdx-attest-dev && \ - apt-get install -y python3-pip python3.9-dev python3-wheel && \ - pip3 install --upgrade pip && \ - pip install torch==2.0.1 && \ -# install ipex and oneccl - pip install intel_extension_for_pytorch==2.0.100 && \ - pip install oneccl_bind_pt -f https://developer.intel.com/ipex-whl-stable && \ -# install transformers etc. - cd /ppml && \ - apt-get update && \ - apt-get install -y git && \ - git clone https://github.com/huggingface/transformers.git && \ - cd transformers && \ - git reset --hard 057e1d74733f52817dc05b673a340b4e3ebea08c && \ - pip install . && \ - pip install -r /ppml/requirements.txt && \ -# install python - env DEBIAN_FRONTEND=noninteractive apt-get update && \ - apt install software-properties-common -y && \ - add-apt-repository ppa:deadsnakes/ppa -y && \ - apt-get install -y python3.9 && \ - rm /usr/bin/python3 && \ - ln -s /usr/bin/python3.9 /usr/bin/python3 && \ - ln -s /usr/bin/python3 /usr/bin/python && \ - apt-get install -y python3-pip python3.9-dev python3-wheel && \ - pip install --upgrade pip && \ - pip install --no-cache requests argparse cryptography==3.3.2 urllib3 && \ - pip install --upgrade requests && \ - pip install setuptools==58.4.0 && \ -# Install OpenSSH for MPI to communicate between containers - apt-get install -y --no-install-recommends openssh-client openssh-server && \ - mkdir -p /var/run/sshd && \ -# Allow OpenSSH to talk to containers without asking for confirmation -# by disabling StrictHostKeyChecking. -# mpi-operator mounts the .ssh folder from a Secret. For that to work, we need -# to disable UserKnownHostsFile to avoid write permissions. -# Disabling StrictModes avoids directory and files read permission checks. 
- sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ - echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ - sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ - echo 'port=4050' | tee /etc/tdx-attest.conf && \ - pip install flask && \ - echo "mpiuser ALL = NOPASSWD:SETENV: /opt/intel/oneapi/mpi/2021.9.0/bin/mpirun\nmpiuser ALL = NOPASSWD:SETENV: /usr/bin/python" > /etc/sudoers.d/mpivisudo && \ - chmod 440 /etc/sudoers.d/mpivisudo - -ADD ./bigdl_aa.py /ppml/bigdl_aa.py -ADD ./quote_generator.py /ppml/quote_generator.py -ADD ./worker_quote_generate.py /ppml/worker_quote_generate.py -ADD ./get_worker_quote.sh /ppml/get_worker_quote.sh - -ADD ./bigdl-lora-finetuing-entrypoint.sh /ppml/bigdl-lora-finetuing-entrypoint.sh -ADD ./lora_finetune.py /ppml/lora_finetune.py - -RUN chown -R mpiuser /ppml -USER mpiuser diff --git a/docker/llm/finetune/lora/docker/bigdl_aa.py b/docker/llm/finetune/lora/docker/bigdl_aa.py deleted file mode 100644 index d848fd65..00000000 --- a/docker/llm/finetune/lora/docker/bigdl_aa.py +++ /dev/null @@ -1,58 +0,0 @@ -import quote_generator -from flask import Flask, request -from configparser import ConfigParser -import ssl, os -import base64 -import requests -import subprocess - -app = Flask(__name__) - -@app.route('/gen_quote', methods=['POST']) -def gen_quote(): - data = request.get_json() - user_report_data = data.get('user_report_data') - try: - quote_b = quote_generator.generate_tdx_quote(user_report_data) - quote = base64.b64encode(quote_b).decode('utf-8') - return {'quote': quote} - except Exception as e: - return {'quote': "quote generation failed: %s" % (e)} - -@app.route('/attest', methods=['POST']) -def get_cluster_quote_list(): - data = request.get_json() - user_report_data = data.get('user_report_data') - quote_list = [] - - try: - quote_b = quote_generator.generate_tdx_quote(user_report_data) - quote = base64.b64encode(quote_b).decode("utf-8") - quote_list.append(("launcher", quote)) - except Exception as e: - quote_list.append("launcher", "quote generation failed: %s" % (e)) - - command = "sudo -u mpiuser -E bash /ppml/get_worker_quote.sh %s" % (user_report_data) - output = subprocess.check_output(command, shell=True) - - with open("/ppml/output/quote.log", "r") as quote_file: - for line in quote_file: - line = line.strip() - if line: - parts = line.split(":") - if len(parts) == 2: - quote_list.append((parts[0].strip(), parts[1].strip())) - return {"quote_list": dict(quote_list)} - -if __name__ == '__main__': - print("BigDL-AA: Agent Started.") - port = int(os.environ.get('ATTESTATION_API_SERVICE_PORT')) - enable_tls = os.environ.get('ENABLE_TLS') - if enable_tls == 'true': - context = ssl.SSLContext(ssl.PROTOCOL_TLS) - context.load_cert_chain(certfile='/ppml/keys/server.crt', keyfile='/ppml/keys/server.key') - # https_key_store_token = os.environ.get('HTTPS_KEY_STORE_TOKEN') - # context.load_cert_chain(certfile='/ppml/keys/server.crt', keyfile='/ppml/keys/server.key', password=https_key_store_token) - app.run(host='0.0.0.0', port=port, ssl_context=context) - else: - app.run(host='0.0.0.0', port=port) diff --git a/docker/llm/finetune/lora/docker/get_worker_quote.sh b/docker/llm/finetune/lora/docker/get_worker_quote.sh deleted file mode 100644 index 157e7a67..00000000 --- a/docker/llm/finetune/lora/docker/get_worker_quote.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash -set -x -source /opt/intel/oneapi/setvars.sh -export CCL_WORKER_COUNT=$WORLD_SIZE -export CCL_WORKER_AFFINITY=auto -export 
SAVE_PATH="/ppml/output" - -mpirun \ - -n $WORLD_SIZE \ - -ppn 1 \ - -f /home/mpiuser/hostfile \ - -iface eth0 \ - -genv OMP_NUM_THREADS=$OMP_NUM_THREADS \ - -genv KMP_AFFINITY="granularity=fine,none" \ - -genv KMP_BLOCKTIME=1 \ - -genv TF_ENABLE_ONEDNN_OPTS=1 \ - sudo -E python /ppml/worker_quote_generate.py --user_report_data $1 > $SAVE_PATH/quote.log 2>&1 \ No newline at end of file diff --git a/docker/llm/finetune/lora/docker/quote_generator.py b/docker/llm/finetune/lora/docker/quote_generator.py deleted file mode 100644 index 5aff15fb..00000000 --- a/docker/llm/finetune/lora/docker/quote_generator.py +++ /dev/null @@ -1,88 +0,0 @@ -# -# Copyright 2016 The BigDL Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import ctypes -import base64 -import os - -def generate_tdx_quote(user_report_data): - # Define the uuid data structure - TDX_UUID_SIZE = 16 - class TdxUuid(ctypes.Structure): - _fields_ = [("d", ctypes.c_uint8 * TDX_UUID_SIZE)] - - # Define the report data structure - TDX_REPORT_DATA_SIZE = 64 - class TdxReportData(ctypes.Structure): - _fields_ = [("d", ctypes.c_uint8 * TDX_REPORT_DATA_SIZE)] - - # Define the report structure - TDX_REPORT_SIZE = 1024 - class TdxReport(ctypes.Structure): - _fields_ = [("d", ctypes.c_uint8 * TDX_REPORT_SIZE)] - - # Load the library - tdx_attest = ctypes.cdll.LoadLibrary("/usr/lib/x86_64-linux-gnu/libtdx_attest.so.1") - - # Set the argument and return types for the function - tdx_attest.tdx_att_get_report.argtypes = [ctypes.POINTER(TdxReportData), ctypes.POINTER(TdxReport)] - tdx_attest.tdx_att_get_report.restype = ctypes.c_uint16 - - tdx_attest.tdx_att_get_quote.argtypes = [ctypes.POINTER(TdxReportData), ctypes.POINTER(TdxUuid), ctypes.c_uint32, ctypes.POINTER(TdxUuid), ctypes.POINTER(ctypes.POINTER(ctypes.c_uint8)), ctypes.POINTER(ctypes.c_uint32), ctypes.c_uint32] - tdx_attest.tdx_att_get_quote.restype = ctypes.c_uint16 - - - # Call the function and check the return code - byte_array_data = bytearray(user_report_data.ljust(64)[:64], "utf-8").replace(b' ', b'\x00') - report_data = TdxReportData() - report_data.d = (ctypes.c_uint8 * 64).from_buffer(byte_array_data) - report = TdxReport() - result = tdx_attest.tdx_att_get_report(ctypes.byref(report_data), ctypes.byref(report)) - if result != 0: - print("Error: " + hex(result)) - - att_key_id_list = None - list_size = 0 - att_key_id = TdxUuid() - p_quote = ctypes.POINTER(ctypes.c_uint8)() - quote_size = ctypes.c_uint32() - flags = 0 - - result = tdx_attest.tdx_att_get_quote(ctypes.byref(report_data), att_key_id_list, list_size, ctypes.byref(att_key_id), ctypes.byref(p_quote), ctypes.byref(quote_size), flags) - - if result != 0: - print("Error: " + hex(result)) - else: - quote = ctypes.string_at(p_quote, quote_size.value) - return quote - -def generate_gramine_quote(user_report_data): - USER_REPORT_PATH = "/dev/attestation/user_report_data" - QUOTE_PATH = "/dev/attestation/quote" - if not os.path.isfile(USER_REPORT_PATH): - print(f"File {USER_REPORT_PATH} not found.") - return "" - if not 
os.path.isfile(QUOTE_PATH): - print(f"File {QUOTE_PATH} not found.") - return "" - with open(USER_REPORT_PATH, 'w') as out: - out.write(user_report_data) - with open(QUOTE_PATH, "rb") as f: - quote = f.read() - return quote - -if __name__ == "__main__": - print(generate_tdx_quote("ppml")) \ No newline at end of file diff --git a/docker/llm/finetune/lora/docker/worker_quote_generate.py b/docker/llm/finetune/lora/docker/worker_quote_generate.py deleted file mode 100644 index 389a95d0..00000000 --- a/docker/llm/finetune/lora/docker/worker_quote_generate.py +++ /dev/null @@ -1,20 +0,0 @@ -import quote_generator -import argparse -import ssl, os -import base64 -import requests - -parser = argparse.ArgumentParser() -parser.add_argument("--user_report_data", type=str, default="ppml") - -args = parser.parse_args() - -host = os.environ.get('HYDRA_BSTRAP_LOCALHOST').split('.')[0] -user_report_data = args.user_report_data -try: - quote_b = quote_generator.generate_tdx_quote(user_report_data) - quote = base64.b64encode(quote_b).decode('utf-8') -except Exception as e: - quote = "quote generation failed: %s" % (e) - -print("%s: %s"%(host, quote)) \ No newline at end of file diff --git a/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-tdx-job.yaml b/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-tdx-job.yaml deleted file mode 100644 index ed00ea45..00000000 --- a/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-tdx-job.yaml +++ /dev/null @@ -1,162 +0,0 @@ -{{- if eq .Values.TEEMode "tdx" }} -apiVersion: kubeflow.org/v2beta1 -kind: MPIJob -metadata: - name: bigdl-lora-finetuning-job - namespace: bigdl-lora-finetuning -spec: - slotsPerWorker: 1 - runPolicy: - cleanPodPolicy: Running - sshAuthMountPath: /home/mpiuser/.ssh - mpiImplementation: Intel - mpiReplicaSpecs: - Launcher: - replicas: 1 - template: - spec: - volumes: - - name: nfs-storage - persistentVolumeClaim: - claimName: nfs-pvc - - name: dev - hostPath: - path: /dev - {{- if eq .Values.enableTLS true }} - - name: ssl-keys - secret: - secretName: ssl-keys - {{- end }} - runtimeClassName: kata-qemu-tdx - containers: - - image: {{ .Values.imageName }} - name: bigdl-ppml-finetuning-launcher - securityContext: - runAsUser: 0 - privileged: true - command: ["/bin/sh", "-c"] - args: - - | - nohup python /ppml/bigdl_aa.py > /ppml/bigdl_aa.log 2>&1 & - sudo -E -u mpiuser bash /ppml/bigdl-lora-finetuing-entrypoint.sh - env: - - name: WORKER_ROLE - value: "launcher" - - name: WORLD_SIZE - value: "{{ .Values.trainerNum }}" - - name: MICRO_BATCH_SIZE - value: "{{ .Values.microBatchSize }}" - - name: MASTER_PORT - value: "42679" - - name: MASTER_ADDR - value: "bigdl-lora-finetuning-job-worker-0.bigdl-lora-finetuning-job-worker" - - name: DATA_SUB_PATH - value: "{{ .Values.dataSubPath }}" - - name: OMP_NUM_THREADS - value: "{{ .Values.ompNumThreads }}" - - name: LOCAL_POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: HF_DATASETS_CACHE - value: "/ppml/output/cache" - - name: ATTESTATION_API_SERVICE_PORT - value: "{{ .Values.attestionApiServicePort }}" - - name: ENABLE_TLS - value: "{{ .Values.enableTLS }}" - volumeMounts: - - name: nfs-storage - subPath: {{ .Values.modelSubPath }} - mountPath: /ppml/model - - name: nfs-storage - subPath: {{ .Values.dataSubPath }} - mountPath: "/ppml/data/{{ .Values.dataSubPath }}" - - name: dev - mountPath: /dev - {{- if eq .Values.enableTLS true }} - - name: ssl-keys - mountPath: /ppml/keys - {{- end }} - Worker: - replicas: {{ .Values.trainerNum }} 
- template: - spec: - runtimeClassName: kata-qemu-tdx - containers: - - image: {{ .Values.imageName }} - name: bigdl-ppml-finetuning-worker - securityContext: - runAsUser: 0 - privileged: true - command: ["/bin/sh", "-c"] - args: - - | - chown nobody /home/mpiuser/.ssh/id_rsa & - sudo -E -u mpiuser bash /ppml/bigdl-lora-finetuing-entrypoint.sh - env: - - name: WORKER_ROLE - value: "trainer" - - name: WORLD_SIZE - value: "{{ .Values.trainerNum }}" - - name: MICRO_BATCH_SIZE - value: "{{ .Values.microBatchSize }}" - - name: MASTER_PORT - value: "42679" - - name: MASTER_ADDR - value: "bigdl-lora-finetuning-job-worker-0.bigdl-lora-finetuning-job-worker" - - name: LOCAL_POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - volumeMounts: - - name: nfs-storage - subPath: {{ .Values.modelSubPath }} - mountPath: /ppml/model - - name: nfs-storage - subPath: {{ .Values.dataSubPath }} - mountPath: "/ppml/data/{{ .Values.dataSubPath }}" - - name: dev - mountPath: /dev - resources: - requests: - cpu: {{ .Values.cpuPerPod }} - limits: - cpu: {{ .Values.cpuPerPod }} - volumes: - - name: nfs-storage - persistentVolumeClaim: - claimName: nfs-pvc - - name: dev - hostPath: - path: /dev ---- -apiVersion: v1 -kind: Service -metadata: - name: bigdl-lora-finetuning-launcher-attestation-api-service - namespace: bigdl-lora-finetuning -spec: - selector: - job-name: bigdl-lora-finetuning-job-launcher - training.kubeflow.org/job-name: bigdl-lora-finetuning-job - training.kubeflow.org/job-role: launcher - ports: - - name: launcher-attestation-api-service-port - protocol: TCP - port: {{ .Values.attestionApiServicePort }} - targetPort: {{ .Values.attestionApiServicePort }} - type: ClusterIP ---- -{{- if eq .Values.enableTLS true }} -apiVersion: v1 -kind: Secret -metadata: - name: ssl-keys - namespace: bigdl-lora-finetuning -type: Opaque -data: - server.crt: {{ .Values.base64ServerCrt }} - server.key: {{ .Values.base64ServerKey }} -{{- end }} - -{{- end }}