Fix nccl_test_op failure on hopper (#51390)

* add sync
* Fix nccl_op_test

Fix nccl_test_op failure on hopper (#51390)
* add sync
* Fix nccl_op_test
b5fd7fc1 · Shijie · GitHub · 94cd1ba2 · b5fd7fc1
Hide whitespace changes
Inline | Side-by-side

Showing 1 changed file with 10 additions and 21 deletions

paddle/fluid/operators/nccl/nccl_op_test.cu.cc +10 -21

No files found.
--- a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc
+++ b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc
@@ -55,26 +55,13 @@ class NCCLTester : public ::testing::Test {
      gpu_list_.emplace_back(i);
    }
-    paddle::platform::CPUPlace cpu_place;
+    p::CPUPlace cpu_place;
-    for (size_t i = 0; i < gpu_list_.size(); ++i) {
+    f::InitDevices();
-      p::CUDAPlace place(i);
+    pool_ptr_ = &p::DeviceContextPool::Instance();
-      auto *ctx = new phi::GPUContext(place);
-      ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                            .GetAllocator(place, ctx->stream())
-                            .get());
-      ctx->PartialInitWithAllocator();
-      dev_ctxs_.emplace_back(ctx);
-    }
    NCCLInitOp();
  }
-  void TearDown() override {
-    for (auto &device_context : dev_ctxs_) {
-      delete device_context;
-    }
-  }
  void NCCLInitOp() {
    paddle::platform::CPUPlace cpu_place;
    std::unique_ptr<f::OpDesc> op1(new f::OpDesc);
@@ -104,7 +91,7 @@ class NCCLTester : public ::testing::Test {
    const f::OpDesc *op1 = &op_desc;
    p::CUDAPlace place(gpu_id);
-    auto &ctx = dev_ctxs_.at(gpu_id);
+    const auto &ctx = pool_ptr_->Get(place);
    auto *send_tensor = scope->Var("st")->GetMutable<phi::DenseTensor>();
    auto *recv_tensor = scope->Var("rt")->GetMutable<phi::DenseTensor>();
@@ -138,7 +125,7 @@ class NCCLTester : public ::testing::Test {
  void testNcclBcastOp();
 public:
-  std::vector<p::DeviceContext *> dev_ctxs_;
+  p::DeviceContextPool *pool_ptr_;
  f::Scope g_scope_;
  std::mutex mu_;
  std::vector<int> gpu_list_;
@@ -185,7 +172,7 @@ void NCCLTester::testNcclAllReduceOp() {
    result_tensor->Resize(kDims);
    auto *ct = result_tensor->mutable_data<float>(cpu_place);
-    auto *dev_ctx = static_cast<phi::GPUContext *>(dev_ctxs_[i]);
+    auto *dev_ctx = static_cast<phi::GPUContext *>(pool_ptr_->Get(gpu_place));
    paddle::memory::Copy(cpu_place,
                         ct,
                         p::CUDAPlace(gpu_list_[i]),
@@ -242,12 +229,14 @@ void NCCLTester::testNcclReduceOp() {
  result_tensor->Resize(kDims);
  auto *ct = result_tensor->mutable_data<float>(cpu_place);
+  auto *dev_ctx = static_cast<phi::GPUContext *>(pool_ptr_->Get(gpu_place));
  paddle::memory::Copy(cpu_place,
                       ct,
                       p::CUDAPlace(gpu_list_[kRoot]),
                       rt,
                       recv_tensor.numel() * sizeof(float),
-                       nullptr);
+                       dev_ctx->stream());
+  dev_ctx->Wait();
  for (int64_t j = 0; j < phi::product(kDims); ++j) {
    ASSERT_NEAR(ct[j], expected_result, 1e-5);
@@ -298,7 +287,7 @@ void NCCLTester::testNcclBcastOp() {
  result_tensor->Resize(kDims);
  auto *ct = result_tensor->mutable_data<float>(cpu_place);
-  auto *dev_ctx = static_cast<phi::GPUContext *>(dev_ctxs_[idx]);
+  auto *dev_ctx = static_cast<phi::GPUContext *>(pool_ptr_->Get(gpu_place));
  paddle::memory::Copy(cpu_place,
                       ct,
                       p::CUDAPlace(gpu_list_[idx]),