From 05ea0ecd70cb33c1a39814e98fcc09d9b35825e7 Mon Sep 17 00:00:00 2001 From: ZehuaCao <47251317+Romanticoseu@users.noreply.github.com> Date: Tue, 16 Jan 2024 11:32:54 +0800 Subject: [PATCH] add pv for llm-serving k8s deployment (#9906) --- docker/llm/serving/cpu/kubernetes/README.md | 31 +++++++++++++++++-- .../serving/cpu/kubernetes/deployment.yaml | 19 ++++++++++-- .../llm/serving/cpu/kubernetes/models-pv.yaml | 15 +++++++++ 3 files changed, 61 insertions(+), 4 deletions(-) create mode 100644 docker/llm/serving/cpu/kubernetes/models-pv.yaml diff --git a/docker/llm/serving/cpu/kubernetes/README.md b/docker/llm/serving/cpu/kubernetes/README.md index e6a2970c..06237c3c 100644 --- a/docker/llm/serving/cpu/kubernetes/README.md +++ b/docker/llm/serving/cpu/kubernetes/README.md @@ -35,6 +35,27 @@ The entrypoint of the image will try to set `OMP_NUM_THREADS` to the correct num If you want to use the vllm AsyncLLMEngine for serving, you should set the args -w vllm_worker in worker part of deployment.yaml. +### PersistentVolume +We use the following yaml file for PersistentVolume deployment: +```yaml +apiVersion: v1 +kind: PersistentVolume +metadata: + name: models-pv + labels: + app: models +spec: + capacity: + storage: 10Gi # Modify according to model size + accessModes: + - ReadWriteMany + storageClassName: models + nfs: + path: YOUR_NFS_PATH + server: YOUR_NFS_SERVER + +``` +Then you should upload the model to `YOUR_NFS_PATH`. ### Controller @@ -154,8 +175,8 @@ spec: restartPolicy: "Always" volumes: - name: llm-models - hostPath: - path: /home/llm/models # change this in other envs + persistentVolumeClaim: + claimName: models-pvc ``` You may want to change the `MODEL_PATH` variable in the yaml. Also, please remember to change the volume path accordingly. @@ -200,6 +221,12 @@ print(completion.choices[0].message.content) #### cURL cURL is another good tool for observing the output of the api.
+Before using cURL, you should set your `http_proxy` and `https_proxy` to empty +```bash +export http_proxy= +export https_proxy= +``` + For the following examples, you may also change the service deployment address. List Models: diff --git a/docker/llm/serving/cpu/kubernetes/deployment.yaml b/docker/llm/serving/cpu/kubernetes/deployment.yaml index c73bc886..7fca1a32 100644 --- a/docker/llm/serving/cpu/kubernetes/deployment.yaml +++ b/docker/llm/serving/cpu/kubernetes/deployment.yaml @@ -1,4 +1,19 @@ apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: models-pvc +spec: + accessModes: + - ReadWriteMany + storageClassName: models + resources: + requests: + storage: 10Gi + # selector: # Optional + # matchLabels: + # app: models +--- +apiVersion: v1 kind: Pod metadata: name: bigdl-fschat-a1234bd-controller @@ -105,5 +120,5 @@ spec: restartPolicy: "Always" volumes: - name: llm-models - hostPath: - path: /home/llm/models # change this in other envs \ No newline at end of file + persistentVolumeClaim: + claimName: models-pvc \ No newline at end of file diff --git a/docker/llm/serving/cpu/kubernetes/models-pv.yaml b/docker/llm/serving/cpu/kubernetes/models-pv.yaml new file mode 100644 index 00000000..e8eed0f7 --- /dev/null +++ b/docker/llm/serving/cpu/kubernetes/models-pv.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: models-pv + labels: + app: models +spec: + capacity: + storage: 10Gi # Modify according to model size + accessModes: + - ReadWriteMany + storageClassName: models + nfs: + path: YOUR_NFS_PATH + server: YOUR_NFS_SERVER