未验证 提交 b5fd7fc1 编写于 作者: S Shijie 提交者: GitHub

Fix nccl_op_test failure on Hopper (#51390)

* Add sync: wait on the device context after the GPU-to-CPU copy in the reduce test

* Fix nccl_op_test
上级 94cd1ba2
...@@ -55,26 +55,13 @@ class NCCLTester : public ::testing::Test { ...@@ -55,26 +55,13 @@ class NCCLTester : public ::testing::Test {
gpu_list_.emplace_back(i); gpu_list_.emplace_back(i);
} }
paddle::platform::CPUPlace cpu_place; p::CPUPlace cpu_place;
for (size_t i = 0; i < gpu_list_.size(); ++i) { f::InitDevices();
p::CUDAPlace place(i); pool_ptr_ = &p::DeviceContextPool::Instance();
auto *ctx = new phi::GPUContext(place);
ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx->stream())
.get());
ctx->PartialInitWithAllocator();
dev_ctxs_.emplace_back(ctx);
}
NCCLInitOp(); NCCLInitOp();
} }
void TearDown() override {
for (auto &device_context : dev_ctxs_) {
delete device_context;
}
}
void NCCLInitOp() { void NCCLInitOp() {
paddle::platform::CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
std::unique_ptr<f::OpDesc> op1(new f::OpDesc); std::unique_ptr<f::OpDesc> op1(new f::OpDesc);
...@@ -104,7 +91,7 @@ class NCCLTester : public ::testing::Test { ...@@ -104,7 +91,7 @@ class NCCLTester : public ::testing::Test {
const f::OpDesc *op1 = &op_desc; const f::OpDesc *op1 = &op_desc;
p::CUDAPlace place(gpu_id); p::CUDAPlace place(gpu_id);
auto &ctx = dev_ctxs_.at(gpu_id); const auto &ctx = pool_ptr_->Get(place);
auto *send_tensor = scope->Var("st")->GetMutable<phi::DenseTensor>(); auto *send_tensor = scope->Var("st")->GetMutable<phi::DenseTensor>();
auto *recv_tensor = scope->Var("rt")->GetMutable<phi::DenseTensor>(); auto *recv_tensor = scope->Var("rt")->GetMutable<phi::DenseTensor>();
...@@ -138,7 +125,7 @@ class NCCLTester : public ::testing::Test { ...@@ -138,7 +125,7 @@ class NCCLTester : public ::testing::Test {
void testNcclBcastOp(); void testNcclBcastOp();
public: public:
std::vector<p::DeviceContext *> dev_ctxs_; p::DeviceContextPool *pool_ptr_;
f::Scope g_scope_; f::Scope g_scope_;
std::mutex mu_; std::mutex mu_;
std::vector<int> gpu_list_; std::vector<int> gpu_list_;
...@@ -185,7 +172,7 @@ void NCCLTester::testNcclAllReduceOp() { ...@@ -185,7 +172,7 @@ void NCCLTester::testNcclAllReduceOp() {
result_tensor->Resize(kDims); result_tensor->Resize(kDims);
auto *ct = result_tensor->mutable_data<float>(cpu_place); auto *ct = result_tensor->mutable_data<float>(cpu_place);
auto *dev_ctx = static_cast<phi::GPUContext *>(dev_ctxs_[i]); auto *dev_ctx = static_cast<phi::GPUContext *>(pool_ptr_->Get(gpu_place));
paddle::memory::Copy(cpu_place, paddle::memory::Copy(cpu_place,
ct, ct,
p::CUDAPlace(gpu_list_[i]), p::CUDAPlace(gpu_list_[i]),
...@@ -242,12 +229,14 @@ void NCCLTester::testNcclReduceOp() { ...@@ -242,12 +229,14 @@ void NCCLTester::testNcclReduceOp() {
result_tensor->Resize(kDims); result_tensor->Resize(kDims);
auto *ct = result_tensor->mutable_data<float>(cpu_place); auto *ct = result_tensor->mutable_data<float>(cpu_place);
auto *dev_ctx = static_cast<phi::GPUContext *>(pool_ptr_->Get(gpu_place));
paddle::memory::Copy(cpu_place, paddle::memory::Copy(cpu_place,
ct, ct,
p::CUDAPlace(gpu_list_[kRoot]), p::CUDAPlace(gpu_list_[kRoot]),
rt, rt,
recv_tensor.numel() * sizeof(float), recv_tensor.numel() * sizeof(float),
nullptr); dev_ctx->stream());
dev_ctx->Wait();
for (int64_t j = 0; j < phi::product(kDims); ++j) { for (int64_t j = 0; j < phi::product(kDims); ++j) {
ASSERT_NEAR(ct[j], expected_result, 1e-5); ASSERT_NEAR(ct[j], expected_result, 1e-5);
...@@ -298,7 +287,7 @@ void NCCLTester::testNcclBcastOp() { ...@@ -298,7 +287,7 @@ void NCCLTester::testNcclBcastOp() {
result_tensor->Resize(kDims); result_tensor->Resize(kDims);
auto *ct = result_tensor->mutable_data<float>(cpu_place); auto *ct = result_tensor->mutable_data<float>(cpu_place);
auto *dev_ctx = static_cast<phi::GPUContext *>(dev_ctxs_[idx]); auto *dev_ctx = static_cast<phi::GPUContext *>(pool_ptr_->Get(gpu_place));
paddle::memory::Copy(cpu_place, paddle::memory::Copy(cpu_place,
ct, ct,
p::CUDAPlace(gpu_list_[idx]), p::CUDAPlace(gpu_list_[idx]),
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册