From 1c6bcb9c265b248325ef50ae1a52066bfbaa0239 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Wed, 11 Jan 2023 13:30:29 +0800
Subject: [PATCH] fix(ci): close the collective comm test

GitOrigin-RevId: de74c8eac133d4bc9923c671d86e8606746e7400
---
 src/opr-mm/test/collective_comm.cpp | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/opr-mm/test/collective_comm.cpp b/src/opr-mm/test/collective_comm.cpp
index 2f93e4b5b..ab42c4eef 100644
--- a/src/opr-mm/test/collective_comm.cpp
+++ b/src/opr-mm/test/collective_comm.cpp
@@ -27,6 +27,19 @@ SymbolVarArray make_reduce_scatter_sum_output(const SymbolVarArray& inputs) {
     return opr::Split::make(rdc, opr::Split::Options::make_average(0, inputs.size()));
 }
 
+// These tests can deadlock because of a cuda/nccl interaction:
+// 1. MemcpyAsync waits for the nccl kernel to finish so that command queue slots
+// are freed; 2. the nccl kernel waits for the peer nccl kernel to launch; 3. the
+// peer nccl kernel must call host register before launching; 4. host register
+// waits for a lock; 5. that lock is held by MemcpyAsync, so the cycle deadlocks.
+// In the current distributed training scenario, collective communication runs
+// across multiple processes rather than inside a single process, so the problem
+// does not occur in practice. After discussion, we decided to disable these tests
+// for now; distributed training is covered by the python tests instead.
+
+// TODO: re-enable the collective communication tests
+#if 0
+
 TEST(TestOprCollectiveComm, AllReduce) {
     REQUIRE_GPU(2);
 
@@ -1779,3 +1792,5 @@ TEST(TestOprCollectiveComm, AllToAllWithGradThisNodeOnly) {
     MGB_ASSERT_TENSOR_EQ(host_expect_grad0, host_grad0);
     MGB_ASSERT_TENSOR_EQ(host_expect_grad1, host_grad1);
 }
+
+#endif
-- 
GitLab
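
For reference, below is a minimal standalone sketch of the lock cycle described in the comment above. It uses plain std::thread/std::mutex stand-ins (driver_lock, memcpy_async, peer_launch, peer_kernel_launched are all hypothetical names chosen for illustration) rather than real CUDA/NCCL or MegEngine APIs, and it uses a timed wait so it reports the cycle instead of hanging:

// Illustrative only: a sketch of the lock cycle from the comment above, under
// the assumption that the blocked resources can be modeled as one mutex plus a
// condition variable. Not CUDA/NCCL or MegEngine code.
#include <chrono>
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>

std::mutex driver_lock;             // stands in for the lock taken by MemcpyAsync
std::condition_variable cv;
std::mutex cv_mutex;
bool peer_kernel_launched = false;  // set once the "peer nccl kernel" is running

// Thread 1: "MemcpyAsync" holds driver_lock while waiting for the peer kernel,
// mirroring step 1 (wait for the nccl kernel) and step 5 (lock held).
void memcpy_async() {
    std::lock_guard<std::mutex> hold(driver_lock);
    std::unique_lock<std::mutex> lk(cv_mutex);
    if (!cv.wait_for(lk, std::chrono::seconds(2),
                     [] { return peer_kernel_launched; })) {
        std::puts("memcpy_async: timed out waiting for peer kernel -> deadlock cycle");
    }
}

// Thread 2: the "peer nccl kernel" cannot launch until "host register" takes
// driver_lock (steps 3-4); it blocks because thread 1 still holds that lock.
void peer_launch() {
    if (!driver_lock.try_lock()) {  // host register needs driver_lock
        std::puts("peer_launch: host register blocked on driver_lock");
        return;                     // the kernel is never launched
    }
    {
        std::lock_guard<std::mutex> lk(cv_mutex);
        peer_kernel_launched = true;
    }
    cv.notify_one();
    driver_lock.unlock();
}

int main() {
    std::thread t1(memcpy_async);
    std::this_thread::sleep_for(std::chrono::milliseconds(100));  // let t1 grab the lock
    std::thread t2(peer_launch);
    t1.join();
    t2.join();
}

When the two sides run as separate processes, as in the actual distributed training setup, they no longer share this in-process lock, which is why the problem is not expected to occur outside these single-process tests.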