diff --git a/src/opr-mm/test/collective_comm.cpp b/src/opr-mm/test/collective_comm.cpp
index 2f93e4b5bbb27453a3e6943944aca3e28716853a..ab42c4eef0dee28e1ff35de68a1d8c2b8fabed12 100644
--- a/src/opr-mm/test/collective_comm.cpp
+++ b/src/opr-mm/test/collective_comm.cpp
@@ -27,6 +27,19 @@ SymbolVarArray make_reduce_scatter_sum_output(const SymbolVarArray& inputs) {
     return opr::Split::make(rdc, opr::Split::Options::make_average(0, inputs.size()));
 }
 
+// There is a deadlock problem in these tests, caused by cuda/nccl:
+// 1. MemcpyAsync waits for the nccl kernel to finish executing in order to free
+// command queue slots; 2. the nccl kernel waits for the peer nccl kernel to launch;
+// 3. the peer nccl kernel needs to call host register before launching; 4. host
+// register waits for a lock; 5. the lock is held by MemcpyAsync, so a deadlock occurs.
+// However, in the current distributed training scenario, collective communication runs
+// across multiple processes rather than within a single process, so this problem does
+// not occur. After discussion, we decided to disable these tests for now. Distributed
+// training is covered by the python tests instead.
+
+// TODO: reopen the collective communication tests
+#if 0
+
 TEST(TestOprCollectiveComm, AllReduce) {
     REQUIRE_GPU(2);
 
@@ -1779,3 +1792,5 @@ TEST(TestOprCollectiveComm, AllToAllWithGradThisNodeOnly) {
     MGB_ASSERT_TENSOR_EQ(host_expect_grad0, host_grad0);
     MGB_ASSERT_TENSOR_EQ(host_expect_grad1, host_grad1);
 }
+
+#endif
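For illustration, below is a minimal, hypothetical sketch of the circular wait described in the new comment. It does not use CUDA or NCCL at all: a `std::mutex` stands in for the lock from steps 4 and 5, and plain threads stand in for MemcpyAsync, the local nccl kernel, and the peer rank. All names (`driver_lock`, `kernel_finished`, `peer_launched`) are invented for this example; this is not the actual cuda/nccl code path.

```cpp
// Hypothetical model of the circular wait: MemcpyAsync -> local kernel ->
// peer kernel launch -> host register -> lock held by MemcpyAsync.
#include <atomic>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <mutex>
#include <thread>

using namespace std::chrono_literals;

std::mutex driver_lock;                    // the lock from steps 4/5
std::atomic<bool> kernel_finished{false};  // local "nccl kernel" completion
std::atomic<bool> peer_launched{false};    // peer "nccl kernel" launched

int main() {
    // 1. "MemcpyAsync": takes the lock, then waits for the local kernel to
    //    finish (it needs a free command queue slot).
    std::thread memcpy_async([] {
        std::lock_guard<std::mutex> lg(driver_lock);
        while (!kernel_finished) std::this_thread::sleep_for(1ms);
    });

    // 2. Local "nccl kernel": cannot finish before the peer kernel launches.
    std::thread local_kernel([] {
        while (!peer_launched) std::this_thread::sleep_for(1ms);
        kernel_finished = true;
    });

    // 3./4. Peer rank: must "host register" (take the lock) before launching
    //       its kernel, but the lock is held by "MemcpyAsync".
    std::thread peer_rank([] {
        std::lock_guard<std::mutex> lg(driver_lock);  // blocks forever here
        peer_launched = true;
    });

    // Watchdog: in a single process the cycle never resolves, so report the
    // deadlock and bail out (the blocked threads are abandoned on purpose in
    // this sketch; std::exit skips their destructors).
    std::this_thread::sleep_for(500ms);
    if (!kernel_finished) {
        std::puts("deadlock: the lock held by MemcpyAsync is needed by the peer");
        std::exit(0);
    }
    memcpy_async.join();
    local_kernel.join();
    peer_rank.join();
    return 0;
}
```

In this toy model the three threads form exactly the 1-5 cycle from the comment, and the watchdog always fires. Splitting the ranks into separate processes gives each one its own "driver lock", which is why the multi-process python tests for distributed training are not affected.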