diff --git a/docker/llm/serving/xpu/docker/Dockerfile b/docker/llm/serving/xpu/docker/Dockerfile
index 3f20fe55..85bb5446 100644
--- a/docker/llm/serving/xpu/docker/Dockerfile
+++ b/docker/llm/serving/xpu/docker/Dockerfile
@@ -10,6 +10,7 @@ ENV PYTHONUNBUFFERED=1
 # Disable pip's cache behavior
 ARG PIP_NO_CACHE_DIR=false
 ADD ./gradio_web_server.patch /tmp/gradio_web_server.patch
+ADD ./oneccl-binding.patch /tmp/oneccl-binding.patch
 
 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
     echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
@@ -57,6 +58,7 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
     bash oneccl_wks_installer_2024.0.0.5.1.sh && \
     git clone https://github.com/intel/torch-ccl -b v2.1.300+xpu && \
     cd torch-ccl && \
+    patch -p1 < /tmp/oneccl-binding.patch && \
     USE_SYSTEM_ONECCL=ON COMPUTE_BACKEND=dpcpp python setup.py install && \
     apt-get update && \
     apt-get install -y --no-install-recommends libfabric-dev wrk libaio-dev numactl && \
diff --git a/docker/llm/serving/xpu/docker/oneccl-binding.patch b/docker/llm/serving/xpu/docker/oneccl-binding.patch
index 4b8410dc..6df5e850 100644
--- a/docker/llm/serving/xpu/docker/oneccl-binding.patch
+++ b/docker/llm/serving/xpu/docker/oneccl-binding.patch
@@ -1,14 +1,36 @@
 diff --git a/src/gpu/dpcpp_ccl.cpp b/src/gpu/dpcpp_ccl.cpp
-index 3bd8087..c5b5ce3 100644
+index bb1f236..289d490 100644
 --- a/src/gpu/dpcpp_ccl.cpp
 +++ b/src/gpu/dpcpp_ccl.cpp
-@@ -689,7 +689,8 @@ c10::intrusive_ptr XPUCCLStubs::allreduce_(std::v
+@@ -80,6 +80,20 @@ int get_sync_only(int init_value = 0) {
+   return tmp_sync_only;
+ }
+ 
++// Returns the value of the environment variable `var_name`, or "0" when it is
++// unset. The lookup happens once, on the first call; the result is then reused.
++// NOTE: the cache is a single slot and is NOT keyed on `var_name`, so every
++// later call returns the first call's value. Use it for one variable only.
++char* get_cached_env(const char* var_name) {
++    // Writable buffer: a string literal cannot legally convert to `char*`.
++    static char unset_value[] = "0";
++    // Function-local static: initialised exactly once, thread-safely (C++11).
++    static char* const cached_value = [var_name]() -> char* {
++        char* value = std::getenv(var_name);
++        return value ? value : unset_value;
++    }();
++    return cached_value;
++}
+ 
+ #define CCL_KERNEL_SUBMIT(cmd, q) \
+ ({bool profile_barrier = (xpu::is_profiler_enabled()); \
+@@ -759,6 +773,10 @@ c10::intrusive_ptr XPUCCLStubs::allreduce_impl(st
                             stream,
                             attr), stream.get_native());
            });
--          // printf("Use One CCL allreduce.\n");
-+          stream.get_native().wait();
-+          // printf("Use One CCL allreduce.\n");
++          const char* env_value = get_cached_env("IPEX_LLM_CCL_ENABLE_NATIVE_WAIT");
++          if (env_value && std::string(env_value) == "1") {
++            stream.get_native().wait();
++          }
+           // printf("Use One CCL allreduce.\n");
            return ret_evt;
          },
-         c10d::OpType::ALLREDUCE);