From 77b04efcc5ec37d1aafe2e0481078a31c500ad07 Mon Sep 17 00:00:00 2001
From: Ch1y0q <qiyue2001@gmail.com>
Date: Fri, 30 Aug 2024 09:26:47 +0800
Subject: [PATCH] add notes for `SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS`
 (#11936)

* add notes for `SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS`

* also update other quickstart
---
 docs/mddocs/Quickstart/continue_quickstart.md            | 3 ++-
 docs/mddocs/Quickstart/fastchat_quickstart.md            | 6 ++++++
 docs/mddocs/Quickstart/install_linux_gpu.md              | 8 ++++++--
 .../Quickstart/llama3_llamacpp_ollama_quickstart.md      | 9 +++++++++
 docs/mddocs/Quickstart/llama_cpp_quickstart.md           | 5 +++++
 docs/mddocs/Quickstart/ollama_quickstart.md              | 6 ++++++
 docs/mddocs/Quickstart/vLLM_quickstart.md                | 3 ++-
 7 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/docs/mddocs/Quickstart/continue_quickstart.md b/docs/mddocs/Quickstart/continue_quickstart.md
index d3feb289..239af91f 100644
--- a/docs/mddocs/Quickstart/continue_quickstart.md
+++ b/docs/mddocs/Quickstart/continue_quickstart.md
@@ -33,11 +33,12 @@ Visit [Run Ollama with IPEX-LLM on Intel GPU](./ollama_quickstart.md), and follo
 > If the `Continue` plugin is not installed on the same machine where Ollama is running (which means `Continue` needs to connect to a remote Ollama service), you must configure the Ollama service to accept connections from any IP address. To achieve this, set or export the environment variable `OLLAMA_HOST=0.0.0.0` before executing the command `ollama serve`. 
 
 > [!TIP]
-> If your local LLM is running on Intel Arc™ A-Series Graphics with Linux OS (Kernel 6.2), it is recommended to additionaly set the following environment variable for optimal performance before executing `ollama serve`:
+> If your local LLM is running on Intel Arc™ A-Series Graphics with Linux OS (Kernel 6.2), setting the following environment variable before starting the service may potentially improve performance.
 >
 > ```bash
 > export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 > ```
+> The environment variable `SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS` determines the usage of immediate command lists for task submission to the GPU. While this mode typically enhances performance, exceptions may occur. Please consider experimenting with and without this environment variable for best performance. For more details, you can refer to [this article](https://www.intel.com/content/www/us/en/developer/articles/guide/level-zero-immediate-command-lists.html).
 
 ### 2. Pull and Prepare the Model
 
diff --git a/docs/mddocs/Quickstart/fastchat_quickstart.md b/docs/mddocs/Quickstart/fastchat_quickstart.md
index 43145739..abae5ccd 100644
--- a/docs/mddocs/Quickstart/fastchat_quickstart.md
+++ b/docs/mddocs/Quickstart/fastchat_quickstart.md
@@ -60,6 +60,7 @@ python3 -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path REPO_ID_OR_YOU
 # Available low_bit format including sym_int4, sym_int8, fp16 etc.
 source /opt/intel/oneapi/setvars.sh
 export USE_XETLA=OFF
+# [optional] under most circumstances, the following environment variable may improve performance, but sometimes this may also cause performance degradation
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 
 python3 -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --low-bit "sym_int4" --trust-remote-code --device "xpu"
@@ -87,6 +88,7 @@ python3 -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7
 source /opt/intel/oneapi/setvars.sh
 export ENABLE_SDP_FUSION=1
 export SYCL_CACHE_PERSISTENT=1
+# [optional] under most circumstances, the following environment variable may improve performance, but sometimes this may also cause performance degradation
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 python3 -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "fp16" --trust-remote-code --device "xpu" --speculative
 ```
@@ -117,10 +119,14 @@ python3 -m ipex_llm.serving.fastchat.vllm_worker --model-path REPO_ID_OR_YOUR_MO
 # On GPU
 source /opt/intel/oneapi/setvars.sh
 export USE_XETLA=OFF
+# [optional] under most circumstances, the following environment variable may improve performance, but sometimes this may also cause performance degradation
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 python3 -m ipex_llm.serving.fastchat.vllm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --device xpu --load-in-low-bit "sym_int4" --enforce-eager
 ```
 
+> [!NOTE]
+> The environment variable `SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS` determines the usage of immediate command lists for task submission to the GPU. While this mode typically enhances performance, exceptions may occur. Please consider experimenting with and without this environment variable for best performance. For more details, you can refer to [this article](https://www.intel.com/content/www/us/en/developer/articles/guide/level-zero-immediate-command-lists.html).
+
 #### Launch multiple workers
 
 Sometimes we may want to start multiple workers for the best performance.  For running in CPU, you may want to seperate multiple workers in different sockets.  Assuming each socket have 48 physicall cores, then you may want to start two workers using the following example:
diff --git a/docs/mddocs/Quickstart/install_linux_gpu.md b/docs/mddocs/Quickstart/install_linux_gpu.md
index 6723ebe1..88574d8d 100644
--- a/docs/mddocs/Quickstart/install_linux_gpu.md
+++ b/docs/mddocs/Quickstart/install_linux_gpu.md
@@ -242,8 +242,9 @@ To use GPU acceleration on Linux, several environment variables are required or
 
   # Recommended Environment Variables for optimal performance
   export USE_XETLA=OFF
-  export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
   export SYCL_CACHE_PERSISTENT=1
+  # [optional] under most circumstances, the following environment variable may improve performance, but sometimes this may also cause performance degradation
+  export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
   ```
 
 - For **Intel Data Center GPU Max**:
@@ -257,9 +258,10 @@ To use GPU acceleration on Linux, several environment variables are required or
 
   # Recommended Environment Variables for optimal performance
   export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
-  export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
   export SYCL_CACHE_PERSISTENT=1
   export ENABLE_SDP_FUSION=1
+  # [optional] under most circumstances, the following environment variable may improve performance, but sometimes this may also cause performance degradation
+  export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
   ```
 
   Please note that `libtcmalloc.so` can be installed by `conda install -c conda-forge -y gperftools=2.10`
@@ -267,6 +269,8 @@ To use GPU acceleration on Linux, several environment variables are required or
 > [!NOTE]
 > Please refer to [this guide](../Overview/install_gpu.md#runtime-configuration-1) for more details regarding runtime configuration.
 
+> [!NOTE]
+> The environment variable `SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS` determines the usage of immediate command lists for task submission to the GPU. While this mode typically enhances performance, exceptions may occur. Please consider experimenting with and without this environment variable for best performance. For more details, you can refer to [this article](https://www.intel.com/content/www/us/en/developer/articles/guide/level-zero-immediate-command-lists.html).
 
 ## A Quick Example
 
diff --git a/docs/mddocs/Quickstart/llama3_llamacpp_ollama_quickstart.md b/docs/mddocs/Quickstart/llama3_llamacpp_ollama_quickstart.md
index 40fcd31f..ee76c7e8 100644
--- a/docs/mddocs/Quickstart/llama3_llamacpp_ollama_quickstart.md
+++ b/docs/mddocs/Quickstart/llama3_llamacpp_ollama_quickstart.md
@@ -51,6 +51,7 @@ To use GPU acceleration, several environment variables are required or recommend
   ```bash
   source /opt/intel/oneapi/setvars.sh
   export SYCL_CACHE_PERSISTENT=1
+  # [optional] under most circumstances, the following environment variable may improve performance, but sometimes this may also cause performance degradation
   export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
   # [optional] if you want to run on single GPU, use below command to limit GPU may improve performance
   export ONEAPI_DEVICE_SELECTOR=level_zero:0
@@ -62,12 +63,16 @@ To use GPU acceleration, several environment variables are required or recommend
 
   ```cmd
   set SYCL_CACHE_PERSISTENT=1
+  rem under most circumstances, the following environment variable may improve performance, but sometimes this may also cause performance degradation
   set SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
   ```
 
 > [!TIP]
 > When your machine has multi GPUs and you want to run on one of them, you need to set `ONEAPI_DEVICE_SELECTOR=level_zero:[gpu_id]`, here `[gpu_id]` varies based on your requirement. For more details, you can refer to [this section](../Overview/KeyFeatures/multi_gpus_selection.md#2-oneapi-device-selector).
 
+> [!NOTE]
+> The environment variable `SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS` determines the usage of immediate command lists for task submission to the GPU. While this mode typically enhances performance, exceptions may occur. Please consider experimenting with and without this environment variable for best performance. For more details, you can refer to [this article](https://www.intel.com/content/www/us/en/developer/articles/guide/level-zero-immediate-command-lists.html).
+
 ##### Run llama3
 
 Under your current directory, exceuting below command to do inference with Llama3:
@@ -131,6 +136,7 @@ Launch the Ollama service:
   export OLLAMA_NUM_GPU=999
   source /opt/intel/oneapi/setvars.sh
   export SYCL_CACHE_PERSISTENT=1
+  # [optional] under most circumstances, the following environment variable may improve performance, but sometimes this may also cause performance degradation
   export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
   # [optional] if you want to run on single GPU, use below command to limit GPU may improve performance
   export ONEAPI_DEVICE_SELECTOR=level_zero:0
@@ -147,6 +153,7 @@ Launch the Ollama service:
   set ZES_ENABLE_SYSMAN=1
   set OLLAMA_NUM_GPU=999
   set SYCL_CACHE_PERSISTENT=1
+  rem under most circumstances, the following environment variable may improve performance, but sometimes this may also cause performance degradation
   set SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 
   ollama serve
@@ -160,6 +167,8 @@ Launch the Ollama service:
 > [!TIP]
 > When your machine has multi GPUs and you want to run on one of them, you need to set `ONEAPI_DEVICE_SELECTOR=level_zero:[gpu_id]`, here `[gpu_id]` varies based on your requirement. For more details, you can refer to [this section](../Overview/KeyFeatures/multi_gpus_selection.md#2-oneapi-device-selector).
 
+> [!NOTE]
+> The environment variable `SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS` determines the usage of immediate command lists for task submission to the GPU. While this mode typically enhances performance, exceptions may occur. Please consider experimenting with and without this environment variable for best performance. For more details, you can refer to [this article](https://www.intel.com/content/www/us/en/developer/articles/guide/level-zero-immediate-command-lists.html).
 
 ##### 2.2.2 Using Ollama Run Llama3
 
diff --git a/docs/mddocs/Quickstart/llama_cpp_quickstart.md b/docs/mddocs/Quickstart/llama_cpp_quickstart.md
index a08dbe0a..dbbe6d3d 100644
--- a/docs/mddocs/Quickstart/llama_cpp_quickstart.md
+++ b/docs/mddocs/Quickstart/llama_cpp_quickstart.md
@@ -115,6 +115,7 @@ To use GPU acceleration, several environment variables are required or recommend
   ```bash
   source /opt/intel/oneapi/setvars.sh
   export SYCL_CACHE_PERSISTENT=1
+  # [optional] under most circumstances, the following environment variable may improve performance, but sometimes this may also cause performance degradation
   export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
   # [optional] if you want to run on single GPU, use below command to limit GPU may improve performance
   export ONEAPI_DEVICE_SELECTOR=level_zero:0
@@ -126,12 +127,16 @@ To use GPU acceleration, several environment variables are required or recommend
 
   ```cmd
   set SYCL_CACHE_PERSISTENT=1
+  rem under most circumstances, the following environment variable may improve performance, but sometimes this may also cause performance degradation
   set SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
   ```
 
 > [!TIP]
 > When your machine has multi GPUs and you want to run on one of them, you need to set `ONEAPI_DEVICE_SELECTOR=level_zero:[gpu_id]`, here `[gpu_id]` varies based on your requirement. For more details, you can refer to [this section](../Overview/KeyFeatures/multi_gpus_selection.md#2-oneapi-device-selector).
 
+> [!NOTE]
+> The environment variable `SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS` determines the usage of immediate command lists for task submission to the GPU. While this mode typically enhances performance, exceptions may occur. Please consider experimenting with and without this environment variable for best performance. For more details, you can refer to [this article](https://www.intel.com/content/www/us/en/developer/articles/guide/level-zero-immediate-command-lists.html).
+
 ### 3. Example: Running community GGUF models with IPEX-LLM
 
 Here we provide a simple example to show how to run a community GGUF model with IPEX-LLM.
diff --git a/docs/mddocs/Quickstart/ollama_quickstart.md b/docs/mddocs/Quickstart/ollama_quickstart.md
index 221873a9..520e170d 100644
--- a/docs/mddocs/Quickstart/ollama_quickstart.md
+++ b/docs/mddocs/Quickstart/ollama_quickstart.md
@@ -72,6 +72,7 @@ You may launch the Ollama service as below:
   export ZES_ENABLE_SYSMAN=1
   source /opt/intel/oneapi/setvars.sh
   export SYCL_CACHE_PERSISTENT=1
+  # [optional] under most circumstances, the following environment variable may improve performance, but sometimes this may also cause performance degradation
   export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
   # [optional] if you want to run on single GPU, use below command to limit GPU may improve performance
   export ONEAPI_DEVICE_SELECTOR=level_zero:0
@@ -88,6 +89,7 @@ You may launch the Ollama service as below:
   set no_proxy=localhost,127.0.0.1
   set ZES_ENABLE_SYSMAN=1
   set SYCL_CACHE_PERSISTENT=1
+  rem under most circumstances, the following environment variable may improve performance, but sometimes this may also cause performance degradation
   set SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 
   ollama serve
@@ -102,6 +104,10 @@ You may launch the Ollama service as below:
 > [!TIP]
 > When your machine has multi GPUs and you want to run on one of them, you need to set `ONEAPI_DEVICE_SELECTOR=level_zero:[gpu_id]`, here `[gpu_id]` varies based on your requirement. For more details, you can refer to [this section](../Overview/KeyFeatures/multi_gpus_selection.md#2-oneapi-device-selector).
 
+> [!NOTE]
+> The environment variable `SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS` determines the usage of immediate command lists for task submission to the GPU. While this mode typically enhances performance, exceptions may occur. Please consider experimenting with and without this environment variable for best performance. For more details, you can refer to [this article](https://www.intel.com/content/www/us/en/developer/articles/guide/level-zero-immediate-command-lists.html).
+
+
 The console will display messages similar to the following:
 
 <a href="https://llm-assets.readthedocs.io/en/latest/_images/ollama_serve.png" target="_blank">
diff --git a/docs/mddocs/Quickstart/vLLM_quickstart.md b/docs/mddocs/Quickstart/vLLM_quickstart.md
index 764b35c1..0982ba6b 100644
--- a/docs/mddocs/Quickstart/vLLM_quickstart.md
+++ b/docs/mddocs/Quickstart/vLLM_quickstart.md
@@ -171,11 +171,12 @@ Below shows an example output using `Qwen1.5-7B-Chat` with low-bit format `sym_i
 </a>
 
 > [!TIP]
-> If your local LLM is running on Intel Arc™ A-Series Graphics with Linux OS (Kernel 6.2), it is recommended to additionaly set the following environment variable for optimal performance before starting the service:
+> If your local LLM is running on Intel Arc™ A-Series Graphics with Linux OS (Kernel 6.2), setting the following environment variable before starting the service may potentially improve performance.
 >
 > ```bash
 > export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 > ```
+> The environment variable `SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS` determines the usage of immediate command lists for task submission to the GPU. While this mode typically enhances performance, exceptions may occur. Please consider experimenting with and without this environment variable for best performance. For more details, you can refer to [this article](https://www.intel.com/content/www/us/en/developer/articles/guide/level-zero-immediate-command-lists.html).
 
 ### 4. About Tensor Parallel