From 0cfdd399e71fe0211d3c063abbc5070e766c534f Mon Sep 17 00:00:00 2001 From: Guancheng Fu <110874468+gc-fu@users.noreply.github.com> Date: Thu, 24 Apr 2025 10:21:17 +0800 Subject: [PATCH] Update README.md (#13104) --- python/llm/example/GPU/vLLM-Serving/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/llm/example/GPU/vLLM-Serving/README.md b/python/llm/example/GPU/vLLM-Serving/README.md index 34a54ba4..b29da5a4 100644 --- a/python/llm/example/GPU/vLLM-Serving/README.md +++ b/python/llm/example/GPU/vLLM-Serving/README.md @@ -64,6 +64,9 @@ pip install ray export USE_XETLA=OFF export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2 export SYCL_CACHE_PERSISTENT=1 + +# If you are using woq_int4, be sure to set up the following environment variable based on the cards you want to use: +export ONEAPI_DEVICE_SELECTOR=level_zero:0,1,2,3 # In case of four cards ``` ### 3. Offline inference/Service