From 1c6bcb9c265b248325ef50ae1a52066bfbaa0239 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Wed, 11 Jan 2023 13:30:29 +0800
Subject: [PATCH] fix(ci): close the collective comm test

GitOrigin-RevId: de74c8eac133d4bc9923c671d86e8606746e7400
---
 src/opr-mm/test/collective_comm.cpp | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/opr-mm/test/collective_comm.cpp b/src/opr-mm/test/collective_comm.cpp
index 2f93e4b5b..ab42c4eef 100644
--- a/src/opr-mm/test/collective_comm.cpp
+++ b/src/opr-mm/test/collective_comm.cpp
@@ -27,6 +27,19 @@ SymbolVarArray make_reduce_scatter_sum_output(const SymbolVarArray& inputs) {
     return opr::Split::make(rdc, opr::Split::Options::make_average(0, inputs.size()));
 }
 
+// These tests can deadlock because of a cuda/nccl interaction:
+// 1. MemcpyAsync waits for the nccl kernel to finish so that command queue slots
+// are freed; 2. the nccl kernel waits for the peer nccl kernel to launch; 3. the
+// peer nccl kernel must call host register before launching; 4. host register
+// waits for a lock; 5. that lock is held by MemcpyAsync, so the cycle deadlocks.
+// In the current distributed training scenario, collective communication runs
+// across multiple processes rather than inside a single process, so the problem
+// does not occur in practice. After discussion, we decided to disable these tests
+// for now; distributed training is covered by the python tests instead.
+
+// TODO: re-enable the collective communication tests
+#if 0
+
 TEST(TestOprCollectiveComm, AllReduce) {
     REQUIRE_GPU(2);
 
@@ -1779,3 +1792,5 @@ TEST(TestOprCollectiveComm, AllToAllWithGradThisNodeOnly) {
     MGB_ASSERT_TENSOR_EQ(host_expect_grad0, host_grad0);
     MGB_ASSERT_TENSOR_EQ(host_expect_grad1, host_grad1);
 }
+
+#endif
-- 
GitLab
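
For reference, below is a minimal standalone sketch of the lock cycle described in the comment above. It uses plain std::thread/std::mutex stand-ins (driver_lock, memcpy_async, peer_launch, peer_kernel_launched are all hypothetical names chosen for illustration) rather than real CUDA/NCCL or MegEngine APIs, and it uses a timed wait so it reports the cycle instead of hanging:

// Illustrative only: a sketch of the lock cycle from the comment above, under
// the assumption that the blocked resources can be modeled as one mutex plus a
// condition variable. Not CUDA/NCCL or MegEngine code.
#include <chrono>
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>

std::mutex driver_lock;             // stands in for the lock taken by MemcpyAsync
std::condition_variable cv;
std::mutex cv_mutex;
bool peer_kernel_launched = false;  // set once the "peer nccl kernel" is running

// Thread 1: "MemcpyAsync" holds driver_lock while waiting for the peer kernel,
// mirroring step 1 (wait for the nccl kernel) and step 5 (lock held).
void memcpy_async() {
    std::lock_guard<std::mutex> hold(driver_lock);
    std::unique_lock<std::mutex> lk(cv_mutex);
    if (!cv.wait_for(lk, std::chrono::seconds(2),
                     [] { return peer_kernel_launched; })) {
        std::puts("memcpy_async: timed out waiting for peer kernel -> deadlock cycle");
    }
}

// Thread 2: the "peer nccl kernel" cannot launch until "host register" takes
// driver_lock (steps 3-4); it blocks because thread 1 still holds that lock.
void peer_launch() {
    if (!driver_lock.try_lock()) {  // host register needs driver_lock
        std::puts("peer_launch: host register blocked on driver_lock");
        return;                     // the kernel is never launched
    }
    {
        std::lock_guard<std::mutex> lk(cv_mutex);
        peer_kernel_launched = true;
    }
    cv.notify_one();
    driver_lock.unlock();
}

int main() {
    std::thread t1(memcpy_async);
    std::this_thread::sleep_for(std::chrono::milliseconds(100));  // let t1 grab the lock
    std::thread t2(peer_launch);
    t1.join();
    t2.join();
}

When the two sides run as separate processes, as in the actual distributed training setup, they no longer share this in-process lock, which is why the problem is not expected to occur outside these single-process tests.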