LLM: Add llm inference_cpp_xpu_docker (#10933)
* test_cpp_docker * update * update * update * update * add sudo * update nodejs version * no need npm * remove blinker * new cpp docker * restore * add line * add manually_build * update and add mtl * update for workdir llm * add benchmark part * update readme * update 1024-128 * update readme * update * fix * update * update * update readme too * update readme * no change * update dir_name * update readme
Parent: 4053a6ef94 · Commit: 86cec80b51
13 changed files with 495 additions and 8 deletions
31 .github/workflows/manually_build.yml (vendored)
@@ -12,6 +12,7 @@ on:
        - all
        - ipex-llm-cpu
        - ipex-llm-xpu
+       - ipex-llm-inference-cpp-xpu
        - ipex-llm-serving-cpu
        - ipex-llm-serving-xpu
        - ipex-llm-finetune-lora-cpu
@@ -193,6 +194,36 @@ jobs:
        sudo docker push ${image}:latest
        sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG} ${image}:latest

+  ipex-llm-inference-cpp-xpu:
+    if: ${{ inputs.artifact == 'ipex-llm-inference-cpp-xpu' || inputs.artifact == 'all' }}
+    runs-on: [self-hosted, Shire]
+
+    steps:
+    - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # actions/checkout@v3
+    - name: docker login
+      run: |
+        docker login -u ${DOCKERHUB_USERNAME} -p ${DOCKERHUB_PASSWORD}
+    - name: ipex-llm-inference-cpp-xpu
+      run: |
+        echo "##############################################################"
+        echo "####### ipex-llm-inference-cpp-xpu ########"
+        echo "##############################################################"
+        export image=intelanalytics/ipex-llm-inference-cpp-xpu
+        cd docker/llm/inference-cpp/
+        sudo docker build \
+          --no-cache=true \
+          --build-arg http_proxy=${HTTP_PROXY} \
+          --build-arg https_proxy=${HTTPS_PROXY} \
+          --build-arg no_proxy=${NO_PROXY} \
+          -t ${image}:${TAG} -f ./Dockerfile .
+        sudo docker push ${image}:${TAG}
+        sudo docker tag ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
+        sudo docker push 10.239.45.10/arda/${image}:${TAG}
+        # tag 'latest'
+        sudo docker tag ${image}:${TAG} ${image}:latest
+        sudo docker push ${image}:latest
+        sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG} ${image}:latest
+
  ipex-llm-cpu:
    if: ${{ inputs.artifact == 'ipex-llm-cpu' || inputs.artifact == 'all' }}
    runs-on: [self-hosted, Shire]
14 .github/workflows/manually_build_for_testing.yml (vendored)
@@ -18,7 +18,7 @@ on:
        - ipex-llm-finetune-qlora-cpu
        - ipex-llm-finetune-xpu
        - ipex-llm-xpu
-       - ipex-llm-cpp-xpu
+       - ipex-llm-inference-cpp-xpu
        - ipex-llm-cpu
        - ipex-llm-serving-xpu
        - ipex-llm-serving-cpu
@@ -147,8 +147,8 @@ jobs:
        sudo docker push 10.239.45.10/arda/${image}:${TAG}
        sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}

-  ipex-llm-cpp-xpu:
-    if: ${{ github.event.inputs.artifact == 'ipex-llm-cpp-xpu' || github.event.inputs.artifact == 'all' }}
+  ipex-llm-inference-cpp-xpu:
+    if: ${{ github.event.inputs.artifact == 'ipex-llm-inference-cpp-xpu' || github.event.inputs.artifact == 'all' }}
    runs-on: [self-hosted, Shire]

    steps:
@@ -158,13 +158,13 @@ jobs:
    - name: docker login
      run: |
        docker login -u ${DOCKERHUB_USERNAME} -p ${DOCKERHUB_PASSWORD}
-   - name: ipex-llm-cpp-xpu
+   - name: ipex-llm-inference-cpp-xpu
      run: |
        echo "##############################################################"
-       echo "####### ipex-llm-cpp-xpu ########"
+       echo "####### ipex-llm-inference-cpp-xpu ########"
        echo "##############################################################"
-       export image=intelanalytics/ipex-llm-cpp-xpu
-       cd docker/llm/cpp/
+       export image=intelanalytics/ipex-llm-inference-cpp-xpu
+       cd docker/llm/inference-cpp/
        sudo docker build \
          --no-cache=true \
          --build-arg http_proxy=${HTTP_PROXY} \
65 docker/llm/inference-cpp/Dockerfile (new file)
@@ -0,0 +1,65 @@
FROM intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04

ARG http_proxy
ARG https_proxy

ENV TZ=Asia/Shanghai
ENV PYTHONUNBUFFERED=1

# Disable pip's cache behavior
ARG PIP_NO_CACHE_DIR=false

RUN curl -fsSL https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " > /etc/apt/sources.list.d/oneAPI.list && \
    apt-get update && \
    apt-get install -y curl wget git gnupg gpg-agent sudo && \
    wget -qO - https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg && \
    echo 'deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc' | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
    rm /etc/apt/sources.list.d/intel-graphics.list && \
    # Install PYTHON 3.11 and IPEX-LLM[xpu]
    ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone && \
    env DEBIAN_FRONTEND=noninteractive apt-get update && \
    apt install software-properties-common libunwind8-dev vim less -y && \
    add-apt-repository ppa:deadsnakes/ppa -y && \
    apt-get install -y python3.11 git curl wget && \
    rm /usr/bin/python3 && \
    ln -s /usr/bin/python3.11 /usr/bin/python3 && \
    ln -s /usr/bin/python3 /usr/bin/python && \
    apt-get install -y python3-pip python3.11-dev python3-wheel python3.11-distutils && \
    curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
    python3 get-pip.py && \
    rm get-pip.py && \
    pip install --upgrade requests argparse urllib3 && \
    pip install --pre --upgrade ipex-llm[cpp] && \
    # Fix Trivy CVE Issues
    pip install transformers==4.36.2 && \
    pip install transformers_stream_generator einops tiktoken && \
    # Install opencl-related repos
    apt-get update && \
    apt-get install -y intel-opencl-icd intel-level-zero-gpu=1.3.26241.33-647~22.04 level-zero level-zero-dev --allow-downgrades && \
    # install nodejs and npm and get webui
    apt purge nodejs -y && \
    apt purge libnode-dev -y && \
    apt autoremove -y && \
    apt clean -y && \
    curl -sL https://deb.nodesource.com/setup_18.x | sudo -E bash - && \
    apt install -y nodejs && \
    mkdir -p /llm/scripts && cd /llm && \
    git clone https://github.com/open-webui/open-webui.git && \
    cd /llm/open-webui/ && \
    cp -RPp .env.example .env && \
    # Build frontend
    npm i && \
    npm run build && \
    # Install Dependencies
    cd ./backend && \
    # remove blinker to avoid error
    find /usr/lib/python3/dist-packages/ -name 'blinker*' -exec rm -rf {} + && \
    pip install -r requirements.txt -U

COPY ./start-llama-cpp.sh /llm/scripts/start-llama-cpp.sh
COPY ./start-ollama.sh /llm/scripts/start-ollama.sh
COPY ./start-open-webui.sh /llm/scripts/start-open-webui.sh
COPY ./benchmark_llama-cpp.sh /llm/scripts/benchmark_llama-cpp.sh

WORKDIR /llm/
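For local testing, a minimal sketch of building this image outside CI, mirroring the build step in the workflow above (the `local` tag is a placeholder, and the proxy arguments are only needed behind a proxy):

```bash
# Sketch: build the inference-cpp image from the repository root.
cd docker/llm/inference-cpp/
docker build \
  --build-arg http_proxy=${HTTP_PROXY} \
  --build-arg https_proxy=${HTTPS_PROXY} \
  --build-arg no_proxy=${NO_PROXY} \
  -t intelanalytics/ipex-llm-inference-cpp-xpu:local \
  -f ./Dockerfile .
```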
165 docker/llm/inference-cpp/README.md (new file)
@@ -0,0 +1,165 @@
## Run llama.cpp/Ollama/open-webui with Docker on Intel GPU

### Install Docker

1. Linux Installation

Follow the instructions in this [guide](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/DockerGuides/docker_windows_gpu.html#linux) to install Docker on Linux.

2. Windows Installation

For Windows installation, refer to this [guide](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/DockerGuides/docker_windows_gpu.html#install-docker-desktop-for-windows).

#### Setting up Docker on Windows
If you want to run this image on Windows, please refer to [this document](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/docker_windows_gpu.html#install-docker-on-windows) to set up Docker on Windows, then run the steps below in WSL Ubuntu. You also need to enable `--net=host`; follow [this guide](https://docs.docker.com/network/drivers/host/#docker-desktop) so that you can easily access the services running in the container. The [v6.1.x WSL kernel](https://learn.microsoft.com/en-us/community/content/wsl-user-msft-kernel-v6#1---building-the-microsoft-linux-kernel-v61x) is recommended; otherwise you may hit a blocking issue before the model is loaded onto the GPU.
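A quick way to confirm which WSL kernel you are on before starting the container (a hedged sketch; assumes a default WSL 2 Ubuntu distribution):

```bash
# Inside the WSL Ubuntu shell: print the running kernel version.
# A version starting with 6.1 (or newer) matches the recommendation above.
uname -r
```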

### Pull the latest image
```bash
# This image will be updated every day
docker pull intelanalytics/ipex-llm-inference-cpp-xpu:latest
```

### Start Docker Container

To map the `xpu` into the container, you need to specify `--device=/dev/dri` when booting the container. Select the device type you are running on (Max, Flex, Arc, or iGPU), and change `/path/to/models` to the host directory that holds your models. `bench_model` is used for quick benchmarking; if you want to benchmark, make sure that model file is located under `/path/to/models`.

A Linux example could be:
```bash
#!/bin/bash
export DOCKER_IMAGE=intelanalytics/ipex-llm-inference-cpp-xpu:latest
export CONTAINER_NAME=ipex-llm-inference-cpp-xpu-container
sudo docker run -itd \
    --net=host \
    --device=/dev/dri \
    -v /path/to/models:/models \
    -e no_proxy=localhost,127.0.0.1 \
    --memory="32G" \
    --name=$CONTAINER_NAME \
    -e bench_model="mistral-7b-v0.1.Q4_0.gguf" \
    -e DEVICE=Arc \
    --shm-size="16g" \
    $DOCKER_IMAGE
```

A Windows example could be:
```bash
#!/bin/bash
export DOCKER_IMAGE=intelanalytics/ipex-llm-inference-cpp-xpu:latest
export CONTAINER_NAME=ipex-llm-inference-cpp-xpu-container
sudo docker run -itd \
    --net=host \
    --device=/dev/dri \
    --privileged \
    -v /path/to/models:/models \
    -v /usr/lib/wsl:/usr/lib/wsl \
    -e no_proxy=localhost,127.0.0.1 \
    --memory="32G" \
    --name=$CONTAINER_NAME \
    -e bench_model="mistral-7b-v0.1.Q4_0.gguf" \
    -e DEVICE=Arc \
    --shm-size="16g" \
    $DOCKER_IMAGE
```

After the container is booted, you can get into the container through `docker exec`.

```bash
docker exec -it ipex-llm-inference-cpp-xpu-container /bin/bash
```

To verify that the device is successfully mapped into the container, run `sycl-ls` to check the result. On a machine with an Arc A770, a sample output is:

```bash
root@arda-arc12:/# sycl-ls
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device 1.2 [2023.16.7.0.21_160000]
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i9-13900K 3.0 [2023.16.7.0.21_160000]
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics 3.0 [23.17.26241.33]
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26241]
```

### Quick benchmark for llama.cpp

Note that performance in a Windows WSL Docker container is a little slower than on the Windows host; this is caused by the WSL kernel implementation.

```bash
bash /llm/scripts/benchmark_llama-cpp.sh

# benchmark results
llama_print_timings: load time = xxx ms
llama_print_timings: sample time = xxx ms / xxx runs ( xxx ms per token, xxx tokens per second)
llama_print_timings: prompt eval time = xxx ms / xxx tokens ( xxx ms per token, xxx tokens per second)
llama_print_timings: eval time = xxx ms / 128 runs ( xxx ms per token, xxx tokens per second)
llama_print_timings: total time = xxx ms / xxx tokens
```

### Running llama.cpp inference with IPEX-LLM on Intel GPU

```bash
cd /llm/scripts/
# set the recommended environment variables
source ipex-llm-init --gpu --device $DEVICE
# mount the models and change the model_path in `start-llama-cpp.sh`
bash start-llama-cpp.sh
```

Please refer to this [documentation](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/llama_cpp_quickstart.html) for more details.


### Running Ollama serving with IPEX-LLM on Intel GPU

Ollama runs in the background; you can see its log in `/llm/ollama/ollama.log`.
```bash
cd /llm/scripts/
# set the recommended environment variables
source ipex-llm-init --gpu --device $DEVICE
bash start-ollama.sh # ctrl+c to exit
```
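Once the server is up, a quick hedged check that Ollama is listening on its default port; this uses Ollama's standard REST endpoint for listing local models and is not part of the scripts in this commit:

```bash
# List the models known to the running Ollama server; an empty "models" list
# simply means nothing has been created or pulled yet.
curl http://localhost:11434/api/tags
```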

#### Run Ollama models (interactive)

```bash
cd /llm/ollama
# create a file named Modelfile with the following content
FROM /models/mistral-7b-v0.1.Q4_0.gguf
TEMPLATE [INST] {{ .Prompt }} [/INST]
PARAMETER num_predict 64

# create the example model and run it on the console
./ollama create example -f Modelfile
./ollama run example
```

#### Pull models from Ollama to serve

```bash
cd /llm/ollama
./ollama pull llama2
```

Use curl to test:
```bash
curl http://localhost:11434/api/generate -d '
{
   "model": "llama2",
   "prompt": "What is AI?",
   "stream": false
}'
```

Please refer to this [documentation](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/ollama_quickstart.html#pull-model) for more details.


### Running Open WebUI with Intel GPU

Start Ollama and load the model first, then use Open WebUI to chat. If you have difficulty accessing the Hugging Face repositories, you may use a mirror, e.g. add `export HF_ENDPOINT=https://hf-mirror.com` before running `bash start.sh`.
```bash
cd /llm/scripts/
bash start-open-webui.sh
# INFO: Uvicorn running on http://0.0.0.0:8080 (Press CTRL+C to quit)
```
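If you do need the mirror mentioned above, a small hedged variant of the same start sequence (the mirror URL comes straight from the note above; whether you need it depends on your network):

```bash
# Optional: route Hugging Face downloads through a mirror before starting Open WebUI.
export HF_ENDPOINT=https://hf-mirror.com
cd /llm/scripts/
bash start-open-webui.sh
```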

For how to log in and other guidance, please refer to this [documentation](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/open_webui_with_ollama_quickstart.html) for more details.
27 docker/llm/inference-cpp/benchmark_llama-cpp.sh (new file)
@@ -0,0 +1,27 @@
# init llama-cpp first
mkdir -p /llm/llama-cpp
cd /llm/llama-cpp
init-llama-cpp

# change the model_path to run
if [[ "$DEVICE" == "Arc" || "$DEVICE" == "ARC" ]]; then
    source ipex-llm-init -g --device Arc
    python run.py
elif [[ "$DEVICE" == "Flex" || "$DEVICE" == "FLEX" ]]; then
    source ipex-llm-init -g --device Flex
    python run.py
elif [[ "$DEVICE" == "Max" || "$DEVICE" == "MAX" ]]; then
    source ipex-llm-init -g --device Max
    python run.py
else
    echo "Invalid DEVICE specified."
fi
model="/models/"$bench_model

promt_1024_128="It is done, and submitted. You can play 'Survival of the Tastiest' on Android, and on the web. Playing on the web works, but you have to simulate multiple touch for table moving and that can be a bit confusing. There is a lot I'd like to talk about. I will go through every topic, insted of making the typical what went right/wrong list. Concept Working over the theme was probably one of the hardest tasks which I had to face. Originally, I had an idea of what kind of game I wanted to develop, gameplay wise - something with a lot of enemies/actors, simple graphics, maybe set in space, controlled from a top-down view. I was confident that I could fit any theme around it. In the end, the problem with a theme like 'Evolution' in a game is that evolution is unassisted. It happens through several seemingly random mutations over time, with the most apt permutation surviving. This genetic car simulator is, in my opinion, a great example of actual evolution of a species facing a challenge. But is it a game? In a game, you need to control something to reach an objective. That control goes against what evolution is supposed to be like. If you allow the user to pick how to evolve something, it's not evolution anymore - it's the equivalent of intelligent design, the fable invented by creationists to combat the idea of evolution. Being agnostic and a Pastafarian, that's not something that rubbed me the right way. Hence, my biggest dillema when deciding what to create was not with what I wanted to create, but with what I did not. I didn't want to create an 'intelligent design' simulator and wrongly call it evolution. This is a problem, of course, every other contestant also had to face. And judging by the entries submitted, not many managed to work around it. I'd say the only real solution was through the use of artificial selection, somehow. So far, I haven't seen any entry using this at its core gameplay. Alas, this is just a fun competition and after a while I decided not to be as strict with the game idea, and allowed myself to pick whatever I thought would work out. My initial idea was to create something where humanity tried to evolve to a next level, but had some kind of foe trying to stop them from doing so. I kind of had this image of human souls flying in space towards a monolith or a space baby (all based in 2001: A Space Odyssey of course) but I couldn't think of compelling (read: serious) mechanics for that. Borgs were my next inspiration, as their whole hypothesis fit pretty well into the evolution theme. But how to make it work? Are you the borg, or fighting the Borg? The third and final idea came to me through my girlfriend, who somehow gave me the idea of making something about the evolution of Pasta. The more I thought about it the more it sounded like it would work, so I decided to go with it. Conversations with my inspiring co-worker Roushey (who also created the 'Mechanical Underdogs' signature logo for my intros) further matured the concept, as it involved into the idea of having individual pieces of pasta flying around and trying to evolve until they became all-powerful. A secondary idea here was that the game would work to explain how the Flying Spaghetti Monster came to exist - by evolving from a normal dinner table. So the idea evolved more or less into this: you are sitting a table. You have your own plate, with is your 'base'. There are 5 other guests at the table, each with their own plate. Your plate can spawn little pieces of pasta. 
You do so by 'ordering' them through a menu. Some pastas are better than others; some are faster, some are stronger. They have varying 'costs', which are debited from your credits (you start with a number of credits). Once spawned, your pastas start flying around. Their instinct is to fly to other plates, in order to conquer them (the objective of the game is having your pasta conquer all the plates on the table). But they are really autonomous, so after being spawned, you have no control over your pasta (think DotA or LoL creeps). Your pasta doesn't like other people's pasta, so if they meet, they shoot sauce at each other until one dies. You get credits for other pastas your own pasta kill. Once a pasta is in vicinity of a plate"

# warm-up two times
./main -m $model -n 128 --prompt "${promt_1024_128}" -t 8 -e -ngl 999 --color --ctx-size 1024 --no-mmap --temp 0
./main -m $model -n 128 --prompt "${promt_1024_128}" -t 8 -e -ngl 999 --color --ctx-size 1024 --no-mmap --temp 0

./main -m $model -n 128 --prompt "${promt_1024_128}" -t 8 -e -ngl 999 --color --ctx-size 1024 --no-mmap --temp 0
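A hedged usage sketch for this script inside the container: `DEVICE` and `bench_model` are normally injected with `-e` at `docker run` time (see the README above), but they can also be set by hand before invoking the script.

```bash
# Assumes the gguf file sits under the mounted /models directory.
export DEVICE=Arc
export bench_model="mistral-7b-v0.1.Q4_0.gguf"
bash /llm/scripts/benchmark_llama-cpp.sh
```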
8 docker/llm/inference-cpp/start-llama-cpp.sh (new file)
@@ -0,0 +1,8 @@
# init llama-cpp first
mkdir -p /llm/llama-cpp
cd /llm/llama-cpp
init-llama-cpp

# change the model_path to run
model="/models/mistral-7b-v0.1.Q4_0.gguf"
./main -m $model -n 32 --prompt "What is AI?" -t 8 -e -ngl 999 --color
9 docker/llm/inference-cpp/start-ollama.sh (new file)
@@ -0,0 +1,9 @@
# init ollama first
mkdir -p /llm/ollama
cd /llm/ollama
init-ollama
export OLLAMA_NUM_GPU=999
export ZES_ENABLE_SYSMAN=1

# start ollama service
(./ollama serve > ollama.log) &
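Since the serve process is backgrounded with its output redirected, a hedged sketch for following the log and stopping the server afterwards (the process match pattern is an assumption and may differ depending on how the binary was symlinked):

```bash
# Follow the server log written by the script above.
tail -f /llm/ollama/ollama.log

# Stop the background server when finished.
pkill -f "ollama serve"
```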
2 docker/llm/inference-cpp/start-open-webui.sh (new file)
@@ -0,0 +1,2 @@
cd /llm/open-webui/backend
bash start.sh > open-webui.log
@@ -61,4 +61,4 @@ RUN curl -fsSL https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-P
    pip install omegaconf && \
    chmod +x /llm/benchmark.sh

-WORKDIR /llm/
+WORKDIR /llm/
@@ -65,6 +65,9 @@
                <a href="doc/LLM/Quickstart/deepspeed_autotp_fastapi_quickstart.html">Run IPEX-LLM serving on Multiple Intel GPUs
                using DeepSpeed AutoTP and FastApi</a>
            </li>
+           <li>
+               <a href="doc/LLM/Quickstart/docker_cpp_xpu_quickstart.html">Run llama.cpp/Ollama/open-webui with Docker on Intel GPU</a>
+           </li>
        </ul>
    </li>
    <li>
@@ -41,6 +41,7 @@ subtrees:
          - file: doc/LLM/Quickstart/fastchat_quickstart
          - file: doc/LLM/Quickstart/axolotl_quickstart
          - file: doc/LLM/Quickstart/deepspeed_autotp_fastapi_quickstart
+         - file: doc/LLM/Quickstart/docker_cpp_xpu_quickstart
      - file: doc/LLM/Overview/KeyFeatures/index
        title: "Key Features"
        subtrees:
@@ -0,0 +1,175 @@
## Run llama.cpp/Ollama/open-webui with Docker on Intel GPU

## Quick Start

### Install Docker

1. Linux Installation

Follow the instructions in this [guide](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/DockerGuides/docker_windows_gpu.html#linux) to install Docker on Linux.

2. Windows Installation

For Windows installation, refer to this [guide](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/DockerGuides/docker_windows_gpu.html#install-docker-desktop-for-windows).

#### Setting up Docker on Windows
If you want to run this image on Windows, please refer to [this document](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/docker_windows_gpu.html#install-docker-on-windows) to set up Docker on Windows, and run the steps below in WSL Ubuntu. You also need to enable `--net=host`; follow [this guide](https://docs.docker.com/network/drivers/host/#docker-desktop) so that you can easily access the services running in the container. The [v6.1.x WSL kernel](https://learn.microsoft.com/en-us/community/content/wsl-user-msft-kernel-v6#1---building-the-microsoft-linux-kernel-v61x) is recommended; otherwise you may hit a blocking issue before the model is loaded onto the GPU.

### Pull the latest image
```bash
# This image will be updated every day
docker pull intelanalytics/ipex-llm-inference-cpp-xpu:latest
```

### Start Docker Container

```eval_rst
.. tabs::
   .. tab:: Linux

      To map the `xpu` into the container, you need to specify `--device=/dev/dri` when booting the container. Select the device type you are running on (Max, Flex, Arc, or iGPU), and change `/path/to/models` to the host directory that holds your models. `bench_model` is used for quick benchmarking; if you want to benchmark, make sure that model file is located under `/path/to/models`.

      .. code-block:: bash

         #!/bin/bash
         export DOCKER_IMAGE=intelanalytics/ipex-llm-inference-cpp-xpu:latest
         export CONTAINER_NAME=ipex-llm-inference-cpp-xpu-container
         sudo docker run -itd \
             --net=host \
             --device=/dev/dri \
             -v /path/to/models:/models \
             -e no_proxy=localhost,127.0.0.1 \
             --memory="32G" \
             --name=$CONTAINER_NAME \
             -e bench_model="mistral-7b-v0.1.Q4_0.gguf" \
             -e DEVICE=Arc \
             --shm-size="16g" \
             $DOCKER_IMAGE

   .. tab:: Windows

      To map the `xpu` into the container, you need to specify `--device=/dev/dri` when booting the container, and change `/path/to/models` to the host directory that holds your models. Then add `--privileged` and map `/usr/lib/wsl` into the container.

      .. code-block:: bash

         #!/bin/bash
         export DOCKER_IMAGE=intelanalytics/ipex-llm-inference-cpp-xpu:latest
         export CONTAINER_NAME=ipex-llm-inference-cpp-xpu-container
         sudo docker run -itd \
             --net=host \
             --device=/dev/dri \
             --privileged \
             -v /path/to/models:/models \
             -v /usr/lib/wsl:/usr/lib/wsl \
             -e no_proxy=localhost,127.0.0.1 \
             --memory="32G" \
             --name=$CONTAINER_NAME \
             -e bench_model="mistral-7b-v0.1.Q4_0.gguf" \
             -e DEVICE=Arc \
             --shm-size="16g" \
             $DOCKER_IMAGE

```

After the container is booted, you can get into the container through `docker exec`.

```bash
docker exec -it ipex-llm-inference-cpp-xpu-container /bin/bash
```

To verify that the device is successfully mapped into the container, run `sycl-ls` to check the result. On a machine with an Arc A770, a sample output is:

```bash
root@arda-arc12:/# sycl-ls
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device 1.2 [2023.16.7.0.21_160000]
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i9-13900K 3.0 [2023.16.7.0.21_160000]
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics 3.0 [23.17.26241.33]
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26241]
```

### Quick benchmark for llama.cpp

Note that performance in a Windows WSL Docker container is a little slower than on the Windows host; this is caused by the WSL kernel implementation.

```bash
bash /llm/scripts/benchmark_llama-cpp.sh

# benchmark results
llama_print_timings: load time = xxx ms
llama_print_timings: sample time = xxx ms / xxx runs ( xxx ms per token, xxx tokens per second)
llama_print_timings: prompt eval time = xxx ms / xxx tokens ( xxx ms per token, xxx tokens per second)
llama_print_timings: eval time = xxx ms / 128 runs ( xxx ms per token, xxx tokens per second)
llama_print_timings: total time = xxx ms / xxx tokens
```

### Running llama.cpp inference with IPEX-LLM on Intel GPU

```bash
cd /llm/scripts/
# set the recommended environment variables
source ipex-llm-init --gpu --device $DEVICE
# mount the models and change the model_path in `start-llama-cpp.sh`
bash start-llama-cpp.sh
```

Please refer to this [documentation](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/llama_cpp_quickstart.html) for more details.


### Running Ollama serving with IPEX-LLM on Intel GPU

Ollama runs in the background; you can see its log in `/llm/ollama/ollama.log`.
```bash
cd /llm/scripts/
# set the recommended environment variables
source ipex-llm-init --gpu --device $DEVICE
bash start-ollama.sh # ctrl+c to exit
```

#### Run Ollama models (interactive)

```bash
cd /llm/ollama
# create a file named Modelfile with the following content
FROM /models/mistral-7b-v0.1.Q4_0.gguf
TEMPLATE [INST] {{ .Prompt }} [/INST]
PARAMETER num_predict 64

# create the example model and run it on the console
./ollama create example -f Modelfile
./ollama run example
```
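Since the Modelfile content above has to end up in a file on disk, here is a hedged sketch of creating it non-interactively with a heredoc; the file name and model path simply mirror the example above:

```bash
cd /llm/ollama
# Write the example Modelfile shown above to disk.
cat > Modelfile <<'EOF'
FROM /models/mistral-7b-v0.1.Q4_0.gguf
TEMPLATE [INST] {{ .Prompt }} [/INST]
PARAMETER num_predict 64
EOF

./ollama create example -f Modelfile
./ollama run example
```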

#### Pull models from Ollama to serve

```bash
cd /llm/ollama
./ollama pull llama2
```

Use curl to test:
```bash
curl http://localhost:11434/api/generate -d '
{
   "model": "llama2",
   "prompt": "What is AI?",
   "stream": false
}'
```

Please refer to this [documentation](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/ollama_quickstart.html#pull-model) for more details.


### Running Open WebUI with Intel GPU

Start Ollama and load the model first, then use Open WebUI to chat. If you have difficulty accessing the Hugging Face repositories, you may use a mirror, e.g. add `export HF_ENDPOINT=https://hf-mirror.com` before running `bash start.sh`.
```bash
cd /llm/scripts/
bash start-open-webui.sh
# INFO: Uvicorn running on http://0.0.0.0:8080 (Press CTRL+C to quit)
```

For how to log in and other guidance, please refer to this [documentation](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/open_webui_with_ollama_quickstart.html) for more details.
@@ -26,6 +26,7 @@ This section includes efficient guide to show you how to:
* `Run IPEX-LLM Serving with FastChat <./fastchat_quickstart.html>`_
* `Finetune LLM with Axolotl on Intel GPU <./axolotl_quickstart.html>`_
* `Run IPEX-LLM serving on Multiple Intel GPUs using DeepSpeed AutoTP and FastApi <./deepspeed_autotp_fastapi_quickstart.html>`
+* `Run llama.cpp/Ollama/open-webui with Docker on Intel GPU <./docker_cpp_xpu_quickstart.html>`


.. |bigdl_llm_migration_guide| replace:: ``bigdl-llm`` Migration Guide