Create serving images (#9048)
* Finished & Tested
* Install latest pip from base images
* Add blank line
* Delete unused comment
* Fix typos
parent b4a1266ef0
commit cc84ed70b3
7 changed files with 138 additions and 7 deletions
@@ -2,6 +2,7 @@ FROM ubuntu:20.04
 ARG http_proxy
 ARG https_proxy
+ARG PIP_NO_CACHE_DIR=false

 # Install PYTHON 3.9
 RUN env DEBIAN_FRONTEND=noninteractive apt-get update && \
@@ -12,8 +13,12 @@ RUN env DEBIAN_FRONTEND=noninteractive apt-get update && \
     ln -s /usr/bin/python3.9 /usr/bin/python3 && \
     ln -s /usr/bin/python3 /usr/bin/python && \
     apt-get install -y python3-pip python3.9-dev python3-wheel python3.9-distutils && \
-    pip3 install --no-cache --upgrade requests argparse urllib3 && \
-    pip3 install --pre --upgrade bigdl-llm[all] && \
-    pip3 install --pre --upgrade bigdl-nano
+    curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
+    # Install FastChat from source requires PEP 660 support
+    python3 get-pip.py && \
+    rm get-pip.py && \
+    pip install --upgrade requests argparse urllib3 && \
+    pip install --pre --upgrade bigdl-llm[all] && \
+    pip install --pre --upgrade bigdl-nano

 ENTRYPOINT ["/bin/bash"]
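The get-pip.py bootstrap above is there because editable installs of FastChat need PEP 660 support, which requires pip >= 21.3. A minimal sanity check, assuming an image built from this Dockerfile:

```bash
# Sketch: confirm the bootstrapped pip is new enough for PEP 660 editable installs
pip --version
python3 -c "import pip; assert tuple(map(int, pip.__version__.split('.')[:2])) >= (21, 3), pip.__version__"
```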
@@ -5,6 +5,9 @@ ENV https_proxy $HTTP_PROXY
 ENV TZ=Asia/Shanghai

+# Disable pip's cache behavior
+ARG PIP_NO_CACHE_DIR=false
+
 RUN apt-get update && \
     apt-get install -y curl wget git gnupg gpg-agent && \
     wget -qO - https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg && \
@@ -20,8 +23,12 @@ RUN apt-get update && \
     ln -s /usr/bin/python3.9 /usr/bin/python3 && \
     ln -s /usr/bin/python3 /usr/bin/python && \
     apt-get install -y python3-pip python3.9-dev python3-wheel python3.9-distutils && \
-    pip3 install --no-cache --upgrade requests argparse urllib3 && \
-    pip3 install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu && \
+    curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
+    # Install FastChat from source requires PEP 660 support
+    python3 get-pip.py && \
+    rm get-pip.py && \
+    pip install --upgrade requests argparse urllib3 && \
+    pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu && \
     # Install opencl-related repos
     apt-get update && \
     apt-get install -y intel-opencl-icd intel-level-zero-gpu level-zero level-zero-dev
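As an aside, `bigdl-llm[xpu]` pulls Intel Extension for PyTorch from the wheel index above, so one hedged way to smoke-test the built image (tag taken from the serving Dockerfile below; a bash entrypoint is assumed) is:

```bash
# Sketch: check that the xpu wheel stack is importable inside the image
sudo docker run --rm --device=/dev/dri intelanalytics/bigdl-llm-xpu:2.4.0-SNAPSHOT \
  -c "python3 -c 'import intel_extension_for_pytorch as ipex; print(ipex.__version__)'"
```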
@@ -12,7 +12,7 @@ docker build \
 ### Use the image for doing xpu inference

-To map the `xpu` into the cotainer, you need to specify `--device=/dev/dri` when booting the container.
+To map the `xpu` into the container, you need to specify `--device=/dev/dri` when booting the container.

 An example could be:
 ```bash
docker/llm/serving/cpu/docker/Dockerfile (new file, 19 lines)
@@ -0,0 +1,19 @@
FROM intelanalytics/bigdl-llm-cpu:2.4.0-SNAPSHOT

ARG http_proxy
ARG https_proxy

# Disable pip's cache behavior
ARG PIP_NO_CACHE_DIR=false

# Install Serving Dependencies
RUN mkdir /llm && \
    cd /llm && \
    git clone https://github.com/analytics-zoo/FastChat.git && \
    cd FastChat && \
    git checkout dev-2023-09-22 && \
    pip3 install -e ".[model_worker,webui]" && \
    cd /llm

WORKDIR /llm/
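A quick smoke test for this image, as a sketch (the base image's bash entrypoint is assumed; the tag is the one built in the README below):

```bash
# Sketch: verify the editable FastChat install resolves inside the serving image
sudo docker run --rm intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT \
  -c "python3 -c 'import fastchat; print(fastchat.__version__)'"
```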
docker/llm/serving/cpu/docker/README.md (new file, 35 lines)
@@ -0,0 +1,35 @@
## Build/Use BigDL-LLM-serving cpu image

### Build Image
```bash
docker build \
  --build-arg http_proxy=.. \
  --build-arg https_proxy=.. \
  --build-arg no_proxy=.. \
  --rm --no-cache -t intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT .
```

### Use the image for doing cpu serving

You can use the following bash script to start the container. Please note that the CPU configuration below is tailored for Xeon CPUs; change it accordingly if you are not using a Xeon CPU.

```bash
#!/bin/bash
export DOCKER_IMAGE=intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT

sudo docker run -itd \
    --net=host \
    --cpuset-cpus="0-47" \
    --cpuset-mems="0" \
    --memory="32G" \
    --name=CONTAINER_NAME \
    --shm-size="16g" \
    $DOCKER_IMAGE
```

After the container is booted, you can get into the container through `docker exec`.
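For example, reusing the container name from the script above:

```bash
sudo docker exec -it CONTAINER_NAME bash
```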
To run model-serving using `BigDL-LLM` as backend, you can refer to this [document](https://github.com/intel-analytics/BigDL/tree/main/python/llm/src/bigdl/llm/serving).
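For orientation, a stock FastChat startup inside the container looks like the sketch below; the exact BigDL-LLM worker flags are described in the linked document, and the model path here is purely illustrative:

```bash
# Sketch: standard FastChat three-process serving setup
# (model path is a placeholder; see the linked document for BigDL-LLM specifics)
python3 -m fastchat.serve.controller --host 0.0.0.0 --port 21001 &
python3 -m fastchat.serve.model_worker \
  --model-path lmsys/vicuna-7b-v1.5 \
  --controller-address http://localhost:21001 &
python3 -m fastchat.serve.gradio_web_server
```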
docker/llm/serving/xpu/docker/Dockerfile (new file, 19 lines)
@@ -0,0 +1,19 @@
FROM intelanalytics/bigdl-llm-xpu:2.4.0-SNAPSHOT

ARG http_proxy
ARG https_proxy

# Disable pip's cache behavior
ARG PIP_NO_CACHE_DIR=false

# Install Serving Dependencies
RUN mkdir /llm && \
    cd /llm && \
    git clone https://github.com/analytics-zoo/FastChat.git && \
    cd FastChat && \
    git checkout dev-2023-09-22 && \
    pip3 install -e ".[model_worker,webui]" && \
    cd /llm

WORKDIR /llm/
docker/llm/serving/xpu/docker/README.md (new file, 46 lines)
@@ -0,0 +1,46 @@
## Build/Use BigDL-LLM-serving xpu image

### Build Image
```bash
docker build \
  --build-arg http_proxy=.. \
  --build-arg https_proxy=.. \
  --build-arg no_proxy=.. \
  --rm --no-cache -t intelanalytics/bigdl-llm-serving-xpu:2.4.0-SNAPSHOT .
```

### Use the image for doing xpu serving

To map the `xpu` into the container, you need to specify `--device=/dev/dri` when booting the container.

An example could be:
```bash
#!/bin/bash
export DOCKER_IMAGE=intelanalytics/bigdl-llm-serving-xpu:2.4.0-SNAPSHOT

sudo docker run -itd \
    --net=host \
    --device=/dev/dri \
    --memory="32G" \
    --name=CONTAINER_NAME \
    --shm-size="16g" \
    $DOCKER_IMAGE
```

After the container is booted, you can get into the container through `docker exec`.

To verify that the device is successfully mapped into the container, run `sycl-ls` and check the result. On a machine with an Arc A770, sample output is:

```bash
root@arda-arc12:/# sycl-ls
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device 1.2 [2023.16.7.0.21_160000]
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i9-13900K 3.0 [2023.16.7.0.21_160000]
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics 3.0 [23.17.26241.33]
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26241]
```
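A complementary check from Python, as a sketch: the xpu image ships Intel Extension for PyTorch (assumed from the `bigdl-llm[xpu]` install above), which registers the `torch.xpu` backend:

```bash
# Sketch: confirm PyTorch sees the mapped GPU from inside the container
python3 -c "import torch; import intel_extension_for_pytorch as ipex; print(torch.xpu.is_available())"
```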
To run model-serving using `BigDL-LLM` as backend, you can refer to this [document](https://github.com/intel-analytics/BigDL/tree/main/python/llm/src/bigdl/llm/serving).