diff --git a/docker/llm/serving/cpu/docker/entrypoint.sh b/docker/llm/serving/cpu/docker/entrypoint.sh
index 7fd1e5ab..87a691c7 100644
--- a/docker/llm/serving/cpu/docker/entrypoint.sh
+++ b/docker/llm/serving/cpu/docker/entrypoint.sh
@@ -76,6 +76,7 @@ calculate_total_cores() {
 # Default values
 controller_host="localhost"
 controller_port="21001"
+gradio_port="8002"
 api_host="localhost"
 api_port="8000"
 worker_host="localhost"
@@ -155,6 +156,10 @@ else
     api_port=$API_PORT
   fi
 
+  if [[ -n $GRADIO_PORT ]]; then
+    gradio_port=$GRADIO_PORT
+  fi
+
   if [[ -n $WORKER_HOST ]]; then
     worker_host=$WORKER_HOST
   fi
@@ -185,7 +190,9 @@ else
   echo "OpenAI API address: $api_address"
   python3 -m fastchat.serve.controller --host $controller_host --port $controller_port --dispatch-method $dispatch_method &
   # Boot openai api server
-  python3 -m fastchat.serve.openai_api_server --host $api_host --port $api_port --controller-address $controller_address
+  python3 -m fastchat.serve.openai_api_server --host $api_host --port $api_port --controller-address $controller_address &
+  # Boot gradio web server
+  python3 -m fastchat.serve.gradio_web_server --host $controller_host --port $gradio_port --controller-url $controller_address --model-list-mode reload
 else
   # Logic for non-controller(worker) mode
   worker_address="http://$worker_host:$worker_port"
diff --git a/docker/llm/serving/cpu/kubernetes/README.md b/docker/llm/serving/cpu/kubernetes/README.md
index 06237c3c..5c08e00d 100644
--- a/docker/llm/serving/cpu/kubernetes/README.md
+++ b/docker/llm/serving/cpu/kubernetes/README.md
@@ -1,6 +1,5 @@
 ## Deployment bigdl-llm serving service in K8S environment
-
 
 ## Image
 To deploy BigDL-LLM-serving cpu in Kubernetes environment, please use this image: `intelanalytics/bigdl-llm-serving-cpu:2.5.0-SNAPSHOT`
 
@@ -15,7 +14,7 @@ After downloading the model, please change name from `vicuna-7b-v1.5` to `vicuna
 
 You can download the model from [here](https://huggingface.co/lmsys/vicuna-7b-v1.5).
 
-For ChatGLM models, users do not need to add `bigdl` into model path. We have already used the `BigDL-LLM` backend for this model. 
+For ChatGLM models, users do not need to add `bigdl` to the model path. We have already used the `BigDL-LLM` backend for this model.
 
 ### Kubernetes config
 
@@ -36,7 +35,9 @@ The entrypoint of the image will try to set `OMP_NUM_THREADS` to the correct num
 If you want to use the vllm AsyncLLMEngine for serving, you should set the args -w vllm_worker in worker part of deployment.yaml.
 
 ### PersistentVolume
+
 We use the following yaml file for PersistentVolume deployment:
+
 ```yaml
 apiVersion: v1
 kind: PersistentVolume
@@ -55,6 +56,7 @@ spec:
     server: YOUR_NFS_SERVER
 ```
+
 Then you should upload model to `YOUR_NFS_PATH`
 
 ### Controller
 
@@ -84,6 +86,8 @@ spec:
           value: "0.0.0.0"
         - name: API_PORT # fixed
           value: "8000"
+        - name: GRADIO_PORT # you can change this port
+          value: "8002"
         ports:
         - containerPort: 21005
           name: con-port
@@ -181,21 +185,32 @@ spec:
 You may want to change the `MODEL_PATH` variable in the yaml.
 Also, please remember to change the volume path accordingly.
 
+### Use the Gradio web UI
+
+The web UI port is set through the `GRADIO_PORT` environment variable in `deployment.yaml`. To reach it from your host, forward the port with this command:
+
+```bash
+kubectl port-forward bigdl-fschat-a1234bd-controller --address 0.0.0.0 8002:8002
+```
+
+Then visit http://YOUR_HOST_IP:8002 to access the web UI.
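+
+As a quick sanity check (a minimal sketch; it only assumes the port-forward above is still running and that the Gradio root page responds once the server has started), you can confirm the web UI is reachable before opening a browser:
+
+```bash
+# Expect an HTTP 200 status code once the Gradio web server is up behind the port-forward
+curl -s -o /dev/null -w "%{http_code}\n" http://localhost:8002/
+```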
+
 ### Testing
 
 #### Check pod ip and port mappings
 
-If you need to access the serving on host , you can use `kubectl get nodes -o wide` to get internal ip and `kubectl get service` to get port mappings. 
+If you need to access the serving on the host, you can use `kubectl get nodes -o wide` to get the internal IP and `kubectl get service` to get the port mappings.
 
 #### Using openai-python
 
 First, install openai-python:
+
 ```bash
 pip install --upgrade openai
 ```
 
 Then, interact with model vicuna-7b-v1.5-bigdl:
+
 ```python
 import openai
 openai.api_key = "EMPTY"
@@ -219,9 +234,11 @@ print(completion.choices[0].message.content)
 ```
 
 #### cURL
+
 cURL is another good tool for observing the output of the api.
 
 Before using cURL, you should set your `http_proxy` and `https_proxy` to empty
+
 ```bash
 export http_proxy=
 export https_proxy=
@@ -230,16 +247,19 @@ export https_proxy=
 For the following examples, you may also change the service deployment address.
 
 List Models:
+
 ```bash
 curl http://localhost:8000/v1/models
 ```
 
 If you have `jq` installed, you can use it to format the output like this:
+
 ```bash
 curl http://localhost:8000/v1/models | jq
 ```
 
 Chat Completions:
+
 ```bash
 curl http://localhost:8000/v1/chat/completions \
   -H "Content-Type: application/json" \
@@ -250,6 +270,7 @@ curl http://localhost:8000/v1/chat/completions \
 ```
 
 Text Completions:
+
 ```bash
 curl http://localhost:8000/v1/completions \
   -H "Content-Type: application/json" \
@@ -262,6 +283,7 @@ curl http://localhost:8000/v1/completions \
 ```
 
 Embeddings:
+
 ```bash
 curl http://localhost:8000/v1/embeddings \
   -H "Content-Type: application/json" \
@@ -269,4 +291,4 @@ curl http://localhost:8000/v1/embeddings \
     "model": "YOUR_MODEL",
     "input": "Hello world!"
   }'
-```
\ No newline at end of file
+```
diff --git a/docker/llm/serving/cpu/kubernetes/deployment.yaml b/docker/llm/serving/cpu/kubernetes/deployment.yaml
index 7fca1a32..1c58f811 100644
--- a/docker/llm/serving/cpu/kubernetes/deployment.yaml
+++ b/docker/llm/serving/cpu/kubernetes/deployment.yaml
@@ -35,6 +35,8 @@ spec:
           value: "0.0.0.0"
         - name: API_PORT # fixed
           value: "8000"
+        - name: GRADIO_PORT # you can change this port
+          value: "8002"
         ports:
         - containerPort: 21005
           name: con-port