From 05ea0ecd70cb33c1a39814e98fcc09d9b35825e7 Mon Sep 17 00:00:00 2001 From: ZehuaCao <47251317+Romanticoseu@users.noreply.github.com> Date: Tue, 16 Jan 2024 11:32:54 +0800 Subject: [PATCH] add pv for llm-serving k8s deployment (#9906) --- docker/llm/serving/cpu/kubernetes/README.md | 31 +++++++++++++++++-- .../serving/cpu/kubernetes/deployment.yaml | 19 ++++++++++-- .../llm/serving/cpu/kubernetes/models-pv.yaml | 15 +++++++++ 3 files changed, 61 insertions(+), 4 deletions(-) create mode 100644 docker/llm/serving/cpu/kubernetes/models-pv.yaml diff --git a/docker/llm/serving/cpu/kubernetes/README.md b/docker/llm/serving/cpu/kubernetes/README.md index e6a2970c..06237c3c 100644 --- a/docker/llm/serving/cpu/kubernetes/README.md +++ b/docker/llm/serving/cpu/kubernetes/README.md @@ -35,6 +35,27 @@ The entrypoint of the image will try to set `OMP_NUM_THREADS` to the correct num If you want to use the vllm AsyncLLMEngine for serving, you should set the args -w vllm_worker in worker part of deployment.yaml. +### PersistentVolume +We use the following yaml file for PersistentVolume deployment: +```yaml +apiVersion: v1 +kind: PersistentVolume +metadata: + name: models-pv + labels: + app: models +spec: + capacity: + storage: 10Gi # Modify according to model size + accessModes: + - ReadWriteMany + storageClassName: models + nfs: + path: YOUR_NFS_PATH + server: YOUR_NFS_SERVER + +``` +Then you should upload the model to `YOUR_NFS_PATH`. ### Controller @@ -154,8 +175,8 @@ spec: restartPolicy: "Always" volumes: - name: llm-models - hostPath: - path: /home/llm/models # change this in other envs + persistentVolumeClaim: + claimName: models-pvc ``` You may want to change the `MODEL_PATH` variable in the yaml. Also, please remember to change the volume path accordingly. @@ -200,6 +221,12 @@ print(completion.choices[0].message.content) #### cURL cURL is another good tool for observing the output of the api.
+Before using cURL, you should set your `http_proxy` and `https_proxy` to empty +```bash +export http_proxy= +export https_proxy= +``` + For the following examples, you may also change the service deployment address. List Models: diff --git a/docker/llm/serving/cpu/kubernetes/deployment.yaml b/docker/llm/serving/cpu/kubernetes/deployment.yaml index c73bc886..7fca1a32 100644 --- a/docker/llm/serving/cpu/kubernetes/deployment.yaml +++ b/docker/llm/serving/cpu/kubernetes/deployment.yaml @@ -1,4 +1,19 @@ apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: models-pvc +spec: + accessModes: + - ReadWriteMany + storageClassName: models + resources: + requests: + storage: 10Gi + # selector: # Optional + # matchLabels: + # app: models +--- +apiVersion: v1 kind: Pod metadata: name: bigdl-fschat-a1234bd-controller @@ -105,5 +120,5 @@ spec: restartPolicy: "Always" volumes: - name: llm-models - hostPath: - path: /home/llm/models # change this in other envs \ No newline at end of file + persistentVolumeClaim: + claimName: models-pvc \ No newline at end of file diff --git a/docker/llm/serving/cpu/kubernetes/models-pv.yaml b/docker/llm/serving/cpu/kubernetes/models-pv.yaml new file mode 100644 index 00000000..e8eed0f7 --- /dev/null +++ b/docker/llm/serving/cpu/kubernetes/models-pv.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: models-pv + labels: + app: models +spec: + capacity: + storage: 10Gi # Modify according to model size + accessModes: + - ReadWriteMany + storageClassName: models + nfs: + path: YOUR_NFS_PATH + server: YOUR_NFS_SERVER