diff --git a/docker/llm/inference/cpu/docker/Dockerfile b/docker/llm/inference/cpu/docker/Dockerfile
index 1e5143f9..9299f898 100644
--- a/docker/llm/inference/cpu/docker/Dockerfile
+++ b/docker/llm/inference/cpu/docker/Dockerfile
@@ -2,6 +2,7 @@ FROM ubuntu:20.04
 
 ARG http_proxy
 ARG https_proxy
+ARG PIP_NO_CACHE_DIR=false
 
 # Install PYTHON 3.9
 RUN env DEBIAN_FRONTEND=noninteractive apt-get update && \
@@ -12,8 +13,12 @@ RUN env DEBIAN_FRONTEND=noninteractive apt-get update && \
     ln -s /usr/bin/python3.9 /usr/bin/python3 && \
     ln -s /usr/bin/python3 /usr/bin/python && \
     apt-get install -y python3-pip python3.9-dev python3-wheel python3.9-distutils && \
-    pip3 install --no-cache --upgrade requests argparse urllib3 && \
-    pip3 install --pre --upgrade bigdl-llm[all] && \
-    pip3 install --pre --upgrade bigdl-nano
+    curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
+    # Installing FastChat from source requires PEP 660 support (pip >= 21.3)
+    python3 get-pip.py && \
+    rm get-pip.py && \
+    pip install --upgrade requests argparse urllib3 && \
+    pip install --pre --upgrade bigdl-llm[all] && \
+    pip install --pre --upgrade bigdl-nano
 
-ENTRYPOINT ["/bin/bash"]
\ No newline at end of file
+ENTRYPOINT ["/bin/bash"]
diff --git a/docker/llm/inference/xpu/docker/Dockerfile b/docker/llm/inference/xpu/docker/Dockerfile
index 1a88d64c..0366c933 100644
--- a/docker/llm/inference/xpu/docker/Dockerfile
+++ b/docker/llm/inference/xpu/docker/Dockerfile
@@ -5,6 +5,9 @@ ENV https_proxy $HTTP_PROXY
 
 ENV TZ=Asia/Shanghai
 
+# Disable pip's cache behavior
+ARG PIP_NO_CACHE_DIR=false
+
 RUN apt-get update && \
     apt-get install -y curl wget git gnupg gpg-agent && \
     wget -qO - https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg && \
@@ -20,8 +23,12 @@ RUN apt-get update && \
     ln -s /usr/bin/python3.9 /usr/bin/python3 && \
     ln -s /usr/bin/python3 /usr/bin/python && \
     apt-get install -y python3-pip python3.9-dev python3-wheel python3.9-distutils && \
-    pip3 install --no-cache --upgrade requests argparse urllib3 && \
-    pip3 install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu && \
+    curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
+    # Installing FastChat from source requires PEP 660 support (pip >= 21.3)
+    python3 get-pip.py && \
+    rm get-pip.py && \
+    pip install --upgrade requests argparse urllib3 && \
+    pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu && \
     # Install opencl-related repos
     apt-get update && \
     apt-get install -y intel-opencl-icd intel-level-zero-gpu level-zero level-zero-dev
diff --git a/docker/llm/inference/xpu/docker/README.md b/docker/llm/inference/xpu/docker/README.md
index b7a83e96..49fbdfba 100644
--- a/docker/llm/inference/xpu/docker/README.md
+++ b/docker/llm/inference/xpu/docker/README.md
@@ -12,7 +12,7 @@ docker build \
 
 ### Use the image for doing xpu inference
 
-To map the `xpu` into the cotainer, you need to specify `--device=/dev/dri` when booting the container.
+To map the `xpu` into the container, you need to specify `--device=/dev/dri` when booting the container.
 
 An example could be:
 ```bash
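For reference, a launch command for the inference xpu image might look like the sketch below, mirroring the serving README added later in this diff. The image tag is taken from the serving Dockerfile's `FROM` line; `CONTAINER_NAME` is a placeholder, not part of this change:

```bash
#!/bin/bash
# Hypothetical launch script for the xpu inference image.
export DOCKER_IMAGE=intelanalytics/bigdl-llm-xpu:2.4.0-SNAPSHOT

# --device=/dev/dri maps the Intel GPU (the `xpu`) into the container
sudo docker run -itd \
        --net=host \
        --device=/dev/dri \
        --memory="32G" \
        --name=CONTAINER_NAME \
        --shm-size="16g" \
        $DOCKER_IMAGE
```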
diff --git a/docker/llm/serving/cpu/docker/Dockerfile b/docker/llm/serving/cpu/docker/Dockerfile
new file mode 100644
index 00000000..ede2b733
--- /dev/null
+++ b/docker/llm/serving/cpu/docker/Dockerfile
@@ -0,0 +1,19 @@
+FROM intelanalytics/bigdl-llm-cpu:2.4.0-SNAPSHOT
+
+ARG http_proxy
+ARG https_proxy
+
+# Disable pip's cache behavior
+ARG PIP_NO_CACHE_DIR=false
+
+# Install Serving Dependencies
+RUN mkdir /llm && \
+    cd /llm && \
+    git clone https://github.com/analytics-zoo/FastChat.git && \
+    cd FastChat && \
+    git checkout dev-2023-09-22 && \
+    pip3 install -e ".[model_worker,webui]" && \
+    cd /llm
+
+
+WORKDIR /llm/
diff --git a/docker/llm/serving/cpu/docker/README.md b/docker/llm/serving/cpu/docker/README.md
new file mode 100644
index 00000000..6024a859
--- /dev/null
+++ b/docker/llm/serving/cpu/docker/README.md
@@ -0,0 +1,35 @@
+## Build/Use BigDL-LLM-serving cpu image
+
+### Build Image
+```bash
+docker build \
+  --build-arg http_proxy=.. \
+  --build-arg https_proxy=.. \
+  --build-arg no_proxy=.. \
+  --rm --no-cache -t intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT .
+```
+
+
+### Use the image for doing cpu serving
+
+
+You can use the following bash script to start the container. Note that the CPU configuration below is tuned for Xeon CPUs; change it accordingly if you are not using a Xeon CPU.
+
+```bash
+#!/bin/bash
+export DOCKER_IMAGE=intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
+
+sudo docker run -itd \
+        --net=host \
+        --cpuset-cpus="0-47" \
+        --cpuset-mems="0" \
+        --memory="32G" \
+        --name=CONTAINER_NAME \
+        --shm-size="16g" \
+        $DOCKER_IMAGE
+```
+
+
+After the container is booted, you can get into the container through `docker exec`.
+
+To run model serving using `BigDL-LLM` as the backend, you can refer to this [document](https://github.com/intel-analytics/BigDL/tree/main/python/llm/src/bigdl/llm/serving).
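The linked serving document describes the full workflow; as a rough sketch, a FastChat deployment inside the container typically runs three processes. The entry points below are upstream FastChat modules installed by `pip3 install -e ".[model_worker,webui]"`; the analytics-zoo fork pinned at `dev-2023-09-22` may differ, and the model path is a placeholder:

```bash
#!/bin/bash
# Sketch of a FastChat serving session inside the container (assumptions:
# upstream FastChat entry points; placeholder model path).
# 1. The controller coordinates workers.
python3 -m fastchat.serve.controller --host 0.0.0.0 &

# 2. A model worker loads the model and registers with the controller.
python3 -m fastchat.serve.model_worker \
        --model-path /llm/models/vicuna-7b-v1.5 \
        --device cpu &

# 3. The Gradio web UI (from the [webui] extra) serves user traffic.
python3 -m fastchat.serve.gradio_web_server --host 0.0.0.0
```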
diff --git a/docker/llm/serving/xpu/docker/Dockerfile b/docker/llm/serving/xpu/docker/Dockerfile
new file mode 100644
index 00000000..28bca6fc
--- /dev/null
+++ b/docker/llm/serving/xpu/docker/Dockerfile
@@ -0,0 +1,19 @@
+FROM intelanalytics/bigdl-llm-xpu:2.4.0-SNAPSHOT
+
+ARG http_proxy
+ARG https_proxy
+
+# Disable pip's cache behavior
+ARG PIP_NO_CACHE_DIR=false
+
+# Install Serving Dependencies
+RUN mkdir /llm && \
+    cd /llm && \
+    git clone https://github.com/analytics-zoo/FastChat.git && \
+    cd FastChat && \
+    git checkout dev-2023-09-22 && \
+    pip3 install -e ".[model_worker,webui]" && \
+    cd /llm
+
+
+WORKDIR /llm/
diff --git a/docker/llm/serving/xpu/docker/README.md b/docker/llm/serving/xpu/docker/README.md
new file mode 100644
index 00000000..5a2dcd39
--- /dev/null
+++ b/docker/llm/serving/xpu/docker/README.md
@@ -0,0 +1,45 @@
+## Build/Use BigDL-LLM-serving xpu image
+
+### Build Image
+```bash
+docker build \
+  --build-arg http_proxy=.. \
+  --build-arg https_proxy=.. \
+  --build-arg no_proxy=.. \
+  --rm --no-cache -t intelanalytics/bigdl-llm-serving-xpu:2.4.0-SNAPSHOT .
+```
+
+
+### Use the image for doing xpu serving
+
+
+To map the `xpu` into the container, you need to specify `--device=/dev/dri` when booting the container.
+
+An example could be:
+```bash
+#!/bin/bash
+export DOCKER_IMAGE=intelanalytics/bigdl-llm-serving-xpu:2.4.0-SNAPSHOT
+
+sudo docker run -itd \
+        --net=host \
+        --device=/dev/dri \
+        --memory="32G" \
+        --name=CONTAINER_NAME \
+        --shm-size="16g" \
+        $DOCKER_IMAGE
+```
+
+
+After the container is booted, you can get into the container through `docker exec`.
+
+To verify the device is successfully mapped into the container, run `sycl-ls` to check the result. On a machine with an Arc A770, a sample output is:
+
+```bash
+root@arda-arc12:/# sycl-ls
+[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device 1.2 [2023.16.7.0.21_160000]
+[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i9-13900K 3.0 [2023.16.7.0.21_160000]
+[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics 3.0 [23.17.26241.33]
+[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26241]
+```
+
+To run model serving using `BigDL-LLM` as the backend, you can refer to this [document](https://github.com/intel-analytics/BigDL/tree/main/python/llm/src/bigdl/llm/serving).
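Besides `sycl-ls`, you can sanity-check that the GPU is reachable from the Python stack itself. A minimal sketch, assuming `bigdl-llm[xpu]` pulled in `intel_extension_for_pytorch` as in the inference Dockerfile:

```bash
#!/bin/bash
# Run inside the container. Importing intel_extension_for_pytorch registers
# the torch.xpu backend; expect "True" and the Arc device name on success.
python3 -c "import torch, intel_extension_for_pytorch; print(torch.xpu.is_available())"
python3 -c "import torch, intel_extension_for_pytorch; print(torch.xpu.get_device_name(0))"
```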