Update 1ccl_for_multi_arc.patch (#13199)
This commit is contained in:
		
							parent
							
								
									bb50cd0881
								
							
						
					
					
						commit
						3accc31b86
					
				
					 1 changed files with 30 additions and 21 deletions
				
			
		| 
						 | 
					@ -1,7 +1,7 @@
 | 
				
			||||||
From dfe1851b59df6859829b447353307b7c916ccee0 Mon Sep 17 00:00:00 2001
 | 
					From c205e7d25a4b0a57214b0cebec0b0b30ae4c9d0f Mon Sep 17 00:00:00 2001
 | 
				
			||||||
From: junhansh <junhan.shi@intel.com>
 | 
					From: YongZhuIntel <yong.zhu@intel.com>
 | 
				
			||||||
Date: Mon, 28 Apr 2025 23:33:11 +0800
 | 
					Date: Tue, 6 May 2025 13:06:06 +0800
 | 
				
			||||||
Subject: [PATCH] oneccl for Arc770 V2025.0.0.6.7
 | 
					Subject: [PATCH] oneccl for Arc770 V2025.0.0.6.8
 | 
				
			||||||
 | 
					
 | 
				
			||||||
allreduce optimization with LL256 for Arc770 dGPU
 | 
					allreduce optimization with LL256 for Arc770 dGPU
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -23,17 +23,19 @@ Revert "optimize req_workgroup calculate" for hang issue
 | 
				
			||||||
This reverts commit 20bfd0e0a37f93dfb8bb9c092cd5a0b35e868bfa.
 | 
					This reverts commit 20bfd0e0a37f93dfb8bb9c092cd5a0b35e868bfa.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
fix_fdset_buffer_overflow_issue
 | 
					fix_fdset_buffer_overflow_issue
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					control usm and p2p by env variable
 | 
				
			||||||
---
 | 
					---
 | 
				
			||||||
 src/CMakeLists.txt               |   2 +
 | 
					 src/CMakeLists.txt               |   2 +
 | 
				
			||||||
 src/coll/coll.cpp                |  30 +-
 | 
					 src/coll/coll.cpp                |  30 +-
 | 
				
			||||||
 src/coll/coll_param.cpp          |   1 +
 | 
					 src/coll/coll_param.cpp          |   1 +
 | 
				
			||||||
 src/coll/selection/selection.cpp |   5 +
 | 
					 src/coll/selection/selection.cpp |   5 +
 | 
				
			||||||
 src/common/env/env.cpp           |   1 +
 | 
					 src/common/env/env.cpp           |   2 +
 | 
				
			||||||
 src/common/env/env.hpp           |   1 +
 | 
					 src/common/env/env.hpp           |   2 +
 | 
				
			||||||
 src/common/env/vars.hpp          |   1 +
 | 
					 src/common/env/vars.hpp          |   2 +
 | 
				
			||||||
 src/dg2/dg2_allreduce.cpp        | 640 +++++++++++++++++++++++++++++++
 | 
					 src/dg2/dg2_allreduce.cpp        | 644 +++++++++++++++++++++++++++++++
 | 
				
			||||||
 src/dg2/dg2_allreduce.hpp        |  13 +
 | 
					 src/dg2/dg2_allreduce.hpp        |  13 +
 | 
				
			||||||
 9 files changed, 691 insertions(+), 3 deletions(-)
 | 
					 9 files changed, 698 insertions(+), 3 deletions(-)
 | 
				
			||||||
 create mode 100644 src/dg2/dg2_allreduce.cpp
 | 
					 create mode 100644 src/dg2/dg2_allreduce.cpp
 | 
				
			||||||
 create mode 100644 src/dg2/dg2_allreduce.hpp
 | 
					 create mode 100644 src/dg2/dg2_allreduce.hpp
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -137,47 +139,50 @@ index 5deae74..f4d9302 100644
 | 
				
			||||||
         LOG_WARN("Applying topo algorithm, but device family is not recognized");
 | 
					         LOG_WARN("Applying topo algorithm, but device family is not recognized");
 | 
				
			||||||
 #ifndef CCL_BF16_GPU_TRUNCATE
 | 
					 #ifndef CCL_BF16_GPU_TRUNCATE
 | 
				
			||||||
diff --git a/src/common/env/env.cpp b/src/common/env/env.cpp
 | 
					diff --git a/src/common/env/env.cpp b/src/common/env/env.cpp
 | 
				
			||||||
index 11413cf..b00641c 100644
 | 
					index 11413cf..0c3f7d8 100644
 | 
				
			||||||
--- a/src/common/env/env.cpp
 | 
					--- a/src/common/env/env.cpp
 | 
				
			||||||
+++ b/src/common/env/env.cpp
 | 
					+++ b/src/common/env/env.cpp
 | 
				
			||||||
@@ -468,6 +468,7 @@ void env_data::parse() {
 | 
					@@ -468,6 +468,8 @@ void env_data::parse() {
 | 
				
			||||||
     }
 | 
					     }
 | 
				
			||||||
     p.env_2_enum(CCL_STAGING_BUFFER, staging_buffer_names, staging_buffer);
 | 
					     p.env_2_enum(CCL_STAGING_BUFFER, staging_buffer_names, staging_buffer);
 | 
				
			||||||
     p.env_2_type(CCL_OP_SYNC, enable_op_sync);
 | 
					     p.env_2_type(CCL_OP_SYNC, enable_op_sync);
 | 
				
			||||||
+    p.env_2_type(CCL_DG2_ALLREDUCE, enable_dg2_allreduce);
 | 
					+    p.env_2_type(CCL_DG2_ALLREDUCE, enable_dg2_allreduce);
 | 
				
			||||||
 | 
					+    p.env_2_type(CCL_DG2_USM, enable_dg2_usm);
 | 
				
			||||||
 
 | 
					 
 | 
				
			||||||
     p.env_2_type(CCL_CHUNK_COUNT, chunk_count);
 | 
					     p.env_2_type(CCL_CHUNK_COUNT, chunk_count);
 | 
				
			||||||
     CCL_THROW_IF_NOT(chunk_count >= 1, "incorrect ", CCL_CHUNK_COUNT, " ", chunk_count);
 | 
					     CCL_THROW_IF_NOT(chunk_count >= 1, "incorrect ", CCL_CHUNK_COUNT, " ", chunk_count);
 | 
				
			||||||
diff --git a/src/common/env/env.hpp b/src/common/env/env.hpp
 | 
					diff --git a/src/common/env/env.hpp b/src/common/env/env.hpp
 | 
				
			||||||
index baff33f..538d8e5 100644
 | 
					index baff33f..a5785e9 100644
 | 
				
			||||||
--- a/src/common/env/env.hpp
 | 
					--- a/src/common/env/env.hpp
 | 
				
			||||||
+++ b/src/common/env/env.hpp
 | 
					+++ b/src/common/env/env.hpp
 | 
				
			||||||
@@ -177,6 +177,7 @@ public:
 | 
					@@ -177,6 +177,8 @@ public:
 | 
				
			||||||
     bool enable_strict_order;
 | 
					     bool enable_strict_order;
 | 
				
			||||||
     ccl_staging_buffer staging_buffer;
 | 
					     ccl_staging_buffer staging_buffer;
 | 
				
			||||||
     bool enable_op_sync;
 | 
					     bool enable_op_sync;
 | 
				
			||||||
+    int enable_dg2_allreduce;
 | 
					+    int enable_dg2_allreduce;
 | 
				
			||||||
 | 
					+    int enable_dg2_usm;
 | 
				
			||||||
 
 | 
					 
 | 
				
			||||||
     size_t chunk_count;
 | 
					     size_t chunk_count;
 | 
				
			||||||
     size_t min_chunk_size;
 | 
					     size_t min_chunk_size;
 | 
				
			||||||
diff --git a/src/common/env/vars.hpp b/src/common/env/vars.hpp
 | 
					diff --git a/src/common/env/vars.hpp b/src/common/env/vars.hpp
 | 
				
			||||||
index 73dcf77..84ab518 100644
 | 
					index 73dcf77..a1bc4ca 100644
 | 
				
			||||||
--- a/src/common/env/vars.hpp
 | 
					--- a/src/common/env/vars.hpp
 | 
				
			||||||
+++ b/src/common/env/vars.hpp
 | 
					+++ b/src/common/env/vars.hpp
 | 
				
			||||||
@@ -579,6 +579,7 @@ constexpr const char* CCL_BUFFER_CACHE = "CCL_BUFFER_CACHE";
 | 
					@@ -579,6 +579,8 @@ constexpr const char* CCL_BUFFER_CACHE = "CCL_BUFFER_CACHE";
 | 
				
			||||||
 constexpr const char* CCL_STRICT_ORDER = "CCL_STRICT_ORDER";
 | 
					 constexpr const char* CCL_STRICT_ORDER = "CCL_STRICT_ORDER";
 | 
				
			||||||
 constexpr const char* CCL_STAGING_BUFFER = "CCL_STAGING_BUFFER";
 | 
					 constexpr const char* CCL_STAGING_BUFFER = "CCL_STAGING_BUFFER";
 | 
				
			||||||
 constexpr const char* CCL_OP_SYNC = "CCL_OP_SYNC";
 | 
					 constexpr const char* CCL_OP_SYNC = "CCL_OP_SYNC";
 | 
				
			||||||
+constexpr const char* CCL_DG2_ALLREDUCE = "CCL_DG2_ALLREDUCE";
 | 
					+constexpr const char* CCL_DG2_ALLREDUCE = "CCL_DG2_ALLREDUCE";
 | 
				
			||||||
 | 
					+constexpr const char* CCL_DG2_USM = "CCL_DG2_USM";
 | 
				
			||||||
 
 | 
					 
 | 
				
			||||||
 constexpr const char* CCL_CHUNK_COUNT = "CCL_CHUNK_COUNT";
 | 
					 constexpr const char* CCL_CHUNK_COUNT = "CCL_CHUNK_COUNT";
 | 
				
			||||||
 constexpr const char* CCL_MIN_CHUNK_SIZE = "CCL_MIN_CHUNK_SIZE";
 | 
					 constexpr const char* CCL_MIN_CHUNK_SIZE = "CCL_MIN_CHUNK_SIZE";
 | 
				
			||||||
diff --git a/src/dg2/dg2_allreduce.cpp b/src/dg2/dg2_allreduce.cpp
 | 
					diff --git a/src/dg2/dg2_allreduce.cpp b/src/dg2/dg2_allreduce.cpp
 | 
				
			||||||
new file mode 100644
 | 
					new file mode 100644
 | 
				
			||||||
index 0000000..73e114b
 | 
					index 0000000..84f84b4
 | 
				
			||||||
--- /dev/null
 | 
					--- /dev/null
 | 
				
			||||||
+++ b/src/dg2/dg2_allreduce.cpp
 | 
					+++ b/src/dg2/dg2_allreduce.cpp
 | 
				
			||||||
@@ -0,0 +1,640 @@
 | 
					@@ -0,0 +1,644 @@
 | 
				
			||||||
+#include <fcntl.h>
 | 
					+#include <fcntl.h>
 | 
				
			||||||
+#include <unistd.h>
 | 
					+#include <unistd.h>
 | 
				
			||||||
+#include <sys/un.h>
 | 
					+#include <sys/un.h>
 | 
				
			||||||
| 
						 | 
					@ -355,7 +360,7 @@ index 0000000..73e114b
 | 
				
			||||||
+        if (ret <= 0) {
 | 
					+        if (ret <= 0) {
 | 
				
			||||||
+	   std::cerr << "poll failed: " << strerror(errno) << "\n";
 | 
					+	   std::cerr << "poll failed: " << strerror(errno) << "\n";
 | 
				
			||||||
+	   break;
 | 
					+	   break;
 | 
				
			||||||
+        }
 | 
					+	}
 | 
				
			||||||
+
 | 
					+
 | 
				
			||||||
+        if (pfd.revents & POLL_IN) {
 | 
					+        if (pfd.revents & POLL_IN) {
 | 
				
			||||||
+           int peer_rank;
 | 
					+           int peer_rank;
 | 
				
			||||||
| 
						 | 
					@ -382,7 +387,8 @@ index 0000000..73e114b
 | 
				
			||||||
+
 | 
					+
 | 
				
			||||||
+void create_shared_buf(void *send_buf, void *recv_buf, size_t byte_count)
 | 
					+void create_shared_buf(void *send_buf, void *recv_buf, size_t byte_count)
 | 
				
			||||||
+{
 | 
					+{
 | 
				
			||||||
+    printf("-----> current rank: %d, world size: %d, byte_count: %lu\n", world_rank, world_size, byte_count);
 | 
					+    bool is_p2p = ccl::global_data::env().enable_dg2_usm ? false : true;
 | 
				
			||||||
 | 
					+    printf("-----> current rank: %d, world size: %d, byte_count: %lu,is_p2p:%d\n", world_rank, world_size, byte_count,is_p2p);
 | 
				
			||||||
+
 | 
					+
 | 
				
			||||||
+    pthread_t tid;
 | 
					+    pthread_t tid;
 | 
				
			||||||
+    char sock_path[64];
 | 
					+    char sock_path[64];
 | 
				
			||||||
| 
						 | 
					@ -395,7 +401,10 @@ index 0000000..73e114b
 | 
				
			||||||
+    pthread_create(&tid, nullptr, thread_func, &world_rank);
 | 
					+    pthread_create(&tid, nullptr, thread_func, &world_rank);
 | 
				
			||||||
+
 | 
					+
 | 
				
			||||||
+    size_t buf_size = LL256_BUF_SIZE;
 | 
					+    size_t buf_size = LL256_BUF_SIZE;
 | 
				
			||||||
+    host_buf = sycl::aligned_alloc_device(getpagesize(), buf_size, q);
 | 
					+    if(is_p2p)
 | 
				
			||||||
 | 
					+        host_buf = sycl::aligned_alloc_device(getpagesize(), buf_size, q);
 | 
				
			||||||
 | 
					+    else
 | 
				
			||||||
 | 
					+        host_buf = sycl::aligned_alloc_host(getpagesize(), buf_size, q);
 | 
				
			||||||
+
 | 
					+
 | 
				
			||||||
+    host_bufs[world_rank] = host_buf;
 | 
					+    host_bufs[world_rank] = host_buf;
 | 
				
			||||||
+
 | 
					+
 | 
				
			||||||
| 
						 | 
					@ -838,5 +847,5 @@ index 0000000..0506445
 | 
				
			||||||
+
 | 
					+
 | 
				
			||||||
+void dg2_clear();
 | 
					+void dg2_clear();
 | 
				
			||||||
-- 
 | 
					-- 
 | 
				
			||||||
2.34.1
 | 
					2.25.1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in a new issue