Unverified commit b5fd7fc1, authored by Shijie, committed by GitHub

Fix nccl_test_op failure on hopper (#51390)

* add sync

* Fix nccl_op_test
Parent 94cd1ba2
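At a glance, the diff below does two things: the test fixture stops constructing and deleting one phi::GPUContext per GPU and instead borrows contexts from the global p::DeviceContextPool (after f::InitDevices()), and the copy of the result tensor back to the host in testNcclReduceOp is issued on that context's stream and followed by an explicit wait before the host-side asserts run. A minimal sketch of that copy-and-sync pattern, assuming the namespace aliases already used in nccl_op_test.cu.cc (p = paddle::platform) and its headers; the helper name and parameters are illustrative, not part of the patch:

```cpp
// Sketch only: mirrors the pattern this commit introduces, not the test code.
void CopyResultAndSync(float *host_dst, const float *device_src,
                       size_t num_elems, int gpu_id) {
  p::CPUPlace cpu_place;
  p::CUDAPlace gpu_place(gpu_id);

  // Borrow the context owned by the global pool instead of new/delete
  // of a phi::GPUContext inside the test fixture.
  auto *dev_ctx = static_cast<phi::GPUContext *>(
      p::DeviceContextPool::Instance().Get(gpu_place));

  // Enqueue the device-to-host copy on the context's stream ...
  paddle::memory::Copy(cpu_place,
                       host_dst,
                       gpu_place,
                       device_src,
                       num_elems * sizeof(float),
                       dev_ctx->stream());

  // ... and block until that stream has drained, so the host reads
  // fully written data even on fast devices such as Hopper.
  dev_ctx->Wait();
}
```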
@@ -55,26 +55,13 @@ class NCCLTester : public ::testing::Test {
       gpu_list_.emplace_back(i);
     }
-    paddle::platform::CPUPlace cpu_place;
-    for (size_t i = 0; i < gpu_list_.size(); ++i) {
-      p::CUDAPlace place(i);
-      auto *ctx = new phi::GPUContext(place);
-      ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                            .GetAllocator(place, ctx->stream())
-                            .get());
-      ctx->PartialInitWithAllocator();
-      dev_ctxs_.emplace_back(ctx);
-    }
+    p::CPUPlace cpu_place;
+    f::InitDevices();
+    pool_ptr_ = &p::DeviceContextPool::Instance();
     NCCLInitOp();
   }
-  void TearDown() override {
-    for (auto &device_context : dev_ctxs_) {
-      delete device_context;
-    }
-  }
+  void TearDown() override {}
   void NCCLInitOp() {
     paddle::platform::CPUPlace cpu_place;
     std::unique_ptr<f::OpDesc> op1(new f::OpDesc);
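The hunk above replaces the hand-rolled per-GPU context setup (and the matching delete loop in TearDown) with the process-wide pool. A rough fixture skeleton of the pooled approach, under the same aliases (p = paddle::platform, f = paddle::framework) as the test; the class and helper names here are hypothetical:

```cpp
// Illustrative fixture skeleton; not the actual NCCLTester declaration.
class PooledContextTest : public ::testing::Test {
 protected:
  void SetUp() override {
    f::InitDevices();  // initializes devices and the global DeviceContextPool
    pool_ptr_ = &p::DeviceContextPool::Instance();
  }

  // Nothing to release: the pool owns the contexts for the whole process,
  // which is why the old delete loop in TearDown can go away.
  void TearDown() override {}

  // Hypothetical helper: fetch the pooled context for one GPU.
  phi::GPUContext *GetCtx(int gpu_id) {
    return static_cast<phi::GPUContext *>(
        pool_ptr_->Get(p::CUDAPlace(gpu_id)));
  }

  p::DeviceContextPool *pool_ptr_ = nullptr;
};
```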
@@ -104,7 +91,7 @@ class NCCLTester : public ::testing::Test {
     const f::OpDesc *op1 = &op_desc;
     p::CUDAPlace place(gpu_id);
-    auto &ctx = dev_ctxs_.at(gpu_id);
+    const auto &ctx = pool_ptr_->Get(place);
     auto *send_tensor = scope->Var("st")->GetMutable<phi::DenseTensor>();
     auto *recv_tensor = scope->Var("rt")->GetMutable<phi::DenseTensor>();
@@ -138,7 +125,7 @@ class NCCLTester : public ::testing::Test {
   void testNcclBcastOp();

  public:
-  std::vector<p::DeviceContext *> dev_ctxs_;
+  p::DeviceContextPool *pool_ptr_;
   f::Scope g_scope_;
   std::mutex mu_;
   std::vector<int> gpu_list_;
@@ -185,7 +172,7 @@ void NCCLTester::testNcclAllReduceOp() {
     result_tensor->Resize(kDims);
     auto *ct = result_tensor->mutable_data<float>(cpu_place);

-    auto *dev_ctx = static_cast<phi::GPUContext *>(dev_ctxs_[i]);
+    auto *dev_ctx = static_cast<phi::GPUContext *>(pool_ptr_->Get(gpu_place));
     paddle::memory::Copy(cpu_place,
                          ct,
                          p::CUDAPlace(gpu_list_[i]),
@@ -242,12 +229,14 @@ void NCCLTester::testNcclReduceOp() {
     result_tensor->Resize(kDims);
     auto *ct = result_tensor->mutable_data<float>(cpu_place);

+    auto *dev_ctx = static_cast<phi::GPUContext *>(pool_ptr_->Get(gpu_place));
     paddle::memory::Copy(cpu_place,
                          ct,
                          p::CUDAPlace(gpu_list_[kRoot]),
                          rt,
                          recv_tensor.numel() * sizeof(float),
-                         nullptr);
+                         dev_ctx->stream());
+    dev_ctx->Wait();

     for (int64_t j = 0; j < phi::product(kDims); ++j) {
       ASSERT_NEAR(ct[j], expected_result, 1e-5);
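The hunk above is where the "* add sync" part of the commit message lands: the reduced tensor used to be copied back with a null stream argument and asserted immediately, whereas it is now copied on the pooled context's stream and followed by dev_ctx->Wait(). In plain CUDA terms (illustrative only, outside Paddle's wrappers; function and parameter names are made up), the ordering this enforces looks like:

```cpp
#include <cuda_runtime.h>

// Plain-CUDA illustration of the ordering the test now enforces;
// dev_ctx->stream() / dev_ctx->Wait() wrap this kind of sequence.
void CopyThenCheck(float *host_buf, const float *dev_buf, size_t n,
                   cudaStream_t stream) {
  // Enqueue the copy on the same stream the preceding kernels used,
  // so it is ordered after them.
  cudaMemcpyAsync(host_buf, dev_buf, n * sizeof(float),
                  cudaMemcpyDeviceToHost, stream);
  // Block the host until the stream has drained; only now is host_buf
  // safe for the ASSERT_NEAR loop to read.
  cudaStreamSynchronize(stream);
}
```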
@@ -298,7 +287,7 @@ void NCCLTester::testNcclBcastOp() {
     result_tensor->Resize(kDims);
     auto *ct = result_tensor->mutable_data<float>(cpu_place);

-    auto *dev_ctx = static_cast<phi::GPUContext *>(dev_ctxs_[idx]);
+    auto *dev_ctx = static_cast<phi::GPUContext *>(pool_ptr_->Get(gpu_place));
     paddle::memory::Copy(cpu_place,
                          ct,
                          p::CUDAPlace(gpu_list_[idx]),
......