Commit 38d3adfe authored by Dong Zhihong

"add multioperator testcase"

Parent 94992a99
paddle/operators/nccl_op.cc

@@ -100,8 +100,8 @@ class NCCLReduceOp : public framework::OperatorWithKernel {
   }
 };
 
-// BcastSendOp
-class NCCLBcastSendOp : public framework::OperatorWithKernel {
+// BcastOp
+class NCCLBcastOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
@@ -111,20 +111,12 @@ class NCCLBcastSendOp : public framework::OperatorWithKernel {
                    " Input(X) of Bcast op input should not be NULL");
     PADDLE_ENFORCE(ctx->HasInput("Communicator"),
                    " Input(Communicator) of Bcast op input should not be NULL");
-  }
-};
-
-// BcastRecvOp
-class NCCLBcastRecvOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Communicator"),
-                   " Input(Communicator) of Bcast op input should not be NULL");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    " Output(Out) of Bcast op output should not be NULL");
+
+    auto x_dims = ctx->GetInputsDim("X");
+    ctx->SetOutputsDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
@@ -146,52 +138,41 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
 
-// BcastSend should be in the root
-// BcastSendOp
-class NCCLBcastSendOpMaker : public framework::OpProtoAndCheckerMaker {
+// ReduceOp
+class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  NCCLBcastSendOpMaker(framework::OpProto *proto,
-                       framework::OpAttrChecker *op_checker)
+  NCCLReduceOpMaker(framework::OpProto *proto,
+                    framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The input of BcastSend op");
+    AddInput("X", "The input of Reduce op");
     AddInput("Communicator", "Communicator for communicating between gpus");
-    AddAttr<int>("root", "root gpu of Bcast");
+    AddOutput("Out", "The output of Reduce op");
+    AddAttr<int>("root",
+                 "root gpu of the parameter. if not set(-1). hashed by name.")
+        .SetDefault(-1);
     AddComment(R"DOC(
-Bcast the tensors.
-)DOC");
+Reduce the tensors)DOC");
   }
 };
 
 // BcastOp
-class NCCLBcastRecvOpMaker : public framework::OpProtoAndCheckerMaker {
+class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  NCCLBcastRecvOpMaker(framework::OpProto *proto,
-                       framework::OpAttrChecker *op_checker)
+  NCCLBcastOpMaker(framework::OpProto *proto,
+                   framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input of BcastSend op");
     AddInput("Communicator", "Communicator for communicating between gpus");
-    AddAttr<int>("root", "root gpu of BcastRecv");
     AddOutput("Out", "The output of Bcast");
+    AddAttr<int>("root",
+                 "root gpu of the parameter. if not set(-1). hashed by name.")
+        .SetDefault(-1);
     AddComment(R"DOC(
 Bcast the tensors.
 )DOC");
   }
 };
 
-// BcastRecvOp
-class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  NCCLReduceOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The input of Reduce op");
-    AddInput("Communicator", "Communicator for communicating between gpus");
-    AddOutput("Out", "The output of Reduce op");
-    AddComment(R"DOC(
-Reduce the tensors.
-)DOC");
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle
 
@@ -201,9 +182,7 @@ REGISTER_OPERATOR(ncclInit, ops::NCCLInitOp,
 REGISTER_OP_WITHOUT_GRADIENT(ncclAllReduce, ops::NCCLAllReduceOp,
                              ops::NCCLAllReduceOpMaker);
-REGISTER_OP_WITHOUT_GRADIENT(ncclBcastSend, ops::NCCLBcastSendOp,
-                             ops::NCCLBcastSendOpMaker);
-REGISTER_OP_WITHOUT_GRADIENT(ncclBcastRecv, ops::NCCLBcastRecvOp,
-                             ops::NCCLBcastRecvOpMaker);
+REGISTER_OP_WITHOUT_GRADIENT(ncclBcast, ops::NCCLBcastOp,
+                             ops::NCCLBcastOpMaker);
 REGISTER_OP_WITHOUT_GRADIENT(ncclReduce, ops::NCCLReduceOp,
                              ops::NCCLReduceOpMaker);
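Reviewer note, not part of the patch: the change above folds the old ncclBcastSend / ncclBcastRecv pair into a single ncclBcast operator selected by a `root` attribute. That matches the underlying NCCL API, where broadcast is one symmetric call: every rank invokes `ncclBcast` with the same root, and the root's buffer is the one replicated everywhere. A minimal standalone sketch of that call pattern (assumes a single process driving two or more GPUs with NCCL's group API; this is illustrative, not Paddle code):

```cpp
// bcast_sketch.cu -- build with: nvcc bcast_sketch.cu -lnccl
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>
#include <nccl.h>

int main() {
  int ngpus = 0;
  cudaGetDeviceCount(&ngpus);
  if (ngpus < 2) { std::printf("need >= 2 GPUs\n"); return 0; }

  const int kRoot = 0;
  const int kCount = 4;
  std::vector<int> devs(ngpus);
  for (int i = 0; i < ngpus; ++i) devs[i] = i;
  std::vector<ncclComm_t> comms(ngpus);
  ncclCommInitAll(comms.data(), ngpus, devs.data());  // one comm per GPU

  // Each GPU starts with its own value; after Bcast all hold the root's.
  std::vector<float*> bufs(ngpus);
  for (int i = 0; i < ngpus; ++i) {
    cudaSetDevice(i);
    cudaMalloc(&bufs[i], kCount * sizeof(float));
    std::vector<float> h(kCount, static_cast<float>(i));
    cudaMemcpy(bufs[i], h.data(), kCount * sizeof(float),
               cudaMemcpyHostToDevice);
  }

  // The same symmetric call on every rank: the root sends, others receive.
  ncclGroupStart();
  for (int i = 0; i < ngpus; ++i) {
    cudaSetDevice(i);
    ncclBcast(bufs[i], kCount, ncclFloat, kRoot, comms[i], /*stream=*/0);
  }
  ncclGroupEnd();

  for (int i = 0; i < ngpus; ++i) { cudaSetDevice(i); cudaDeviceSynchronize(); }

  cudaSetDevice(1);
  float out = -1.f;
  cudaMemcpy(&out, bufs[1], sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("gpu1[0] = %f (expect 0, the root's value)\n", out);

  for (int i = 0; i < ngpus; ++i) {
    cudaSetDevice(i);
    cudaFree(bufs[i]);
    ncclCommDestroy(comms[i]);
  }
  return 0;
}
```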
paddle/operators/nccl_op.cu

@@ -83,6 +83,7 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
     auto ins = ctx.MultiInput<LoDTensor>("X");  // x0, x1, x2
     auto outs = ctx.MultiOutput<LoDTensor>("Out");
+    int root = ctx.Attr<int>("root");
 
     auto* comm = ctx.Input<Communicator>("Communicator");
 
@@ -97,7 +98,9 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
     auto ins_names = ctx.Inputs("X");
     std::hash<std::string> hasher;
     for (size_t i = 0; i < ins.size(); ++i) {
-      int root = hasher(ins_names[i]) % comm->comms_.size();
+      if (root == -1) {
+        root = hasher(ins_names[i]) % comm->comms_.size();
+      }
       T* recvbuffer = nullptr;
       if (root == device_id) {
         recvbuffer = outs[i]->mutable_data<T>(ctx.GetPlace());
 
@@ -135,8 +138,9 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
     int device_id =
         boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
     int idx = comm->GetCommId(device_id);
+
     if (idx == root) {
-      auto ins = ctx.MultiInput<Tensor>("X");
+      auto ins = ctx.MultiInput<LoDTensor>("X");
       for (size_t i = 0; i < ins.size(); ++i) {
         PADDLE_ENFORCE(platform::dynload::ncclBcast(
             (void*)ins[i]->data<T>(), ins[i]->numel(), NCCLTypeWrapper<T>::type,
 
@@ -144,7 +148,7 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
         PADDLE_ENFORCE(cudaStreamSynchronize(stream));
       }
     } else {
-      auto outs = ctx.MultiOutput<Tensor>("Out");
+      auto outs = ctx.MultiOutput<LoDTensor>("Out");
       for (size_t i = 0; i < outs.size(); ++i) {
         PADDLE_ENFORCE(platform::dynload::ncclBcast(
             outs[i]->mutable_data<T>(ctx.GetPlace()), outs[i]->numel(),
 
@@ -160,6 +164,5 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel<float>);
-REGISTER_OP_GPU_KERNEL(ncclBcastSend, ops::NCCLBcastKernel<float>);
+REGISTER_OP_GPU_KERNEL(ncclBcast, ops::NCCLBcastKernel<float>);
 REGISTER_OP_GPU_KERNEL(ncclReduce, ops::NCCLReduceKernel<float>);
-REGISTER_OP_GPU_KERNEL(ncclBcastRecv, ops::NCCLBcastKernel<float>);
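Reviewer note on the kernel hunk above: `root` is now read once from the attribute, and the name-hashing fallback only runs when it is still -1. Worth flagging: since the loop assigns `root` in place, the first hashed value is reused for every later input, whereas the old code hashed each input's name. The intended effect of the fallback, spreading reduction roots across GPUs by parameter name, looks like this in isolation (the parameter names and GPU count here are made up):

```cpp
#include <cstdio>
#include <functional>
#include <string>
#include <vector>

int main() {
  const size_t num_gpus = 4;  // stands in for comm->comms_.size()
  std::hash<std::string> hasher;
  const std::vector<std::string> params = {"fc_0.w", "fc_0.b",
                                           "fc_1.w", "fc_1.b"};
  for (const auto& name : params) {
    // same fallback as NCCLReduceKernel when root == -1
    const int root = static_cast<int>(hasher(name) % num_gpus);
    std::printf("%-7s -> reduce root: gpu %d\n", name.c_str(), root);
  }
  return 0;
}
```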
paddle/operators/nccl_op_test.cu

@@ -28,6 +28,7 @@
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/program_desc.h"
 #include "paddle/framework/var_desc.h"
+#include "paddle/operators/math/math_function.h"
 #include "paddle/operators/nccl/nccl_gpu_common.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/enforce.h"
 
@@ -37,8 +38,7 @@
 USE_NO_KERNEL_OP(ncclInit);
 USE_GPU_ONLY_OP(ncclAllReduce);
 USE_GPU_ONLY_OP(ncclReduce);
-USE_GPU_ONLY_OP(ncclBcastSend);
-USE_GPU_ONLY_OP(ncclBcastRecv);
+USE_GPU_ONLY_OP(ncclBcast);
 
 namespace f = paddle::framework;
 namespace p = paddle::platform;
@@ -144,12 +144,62 @@ class NCCLTester : public ::testing::Test {
 // }
 
 // ncclAllReduceOp with desc
-TEST_F(NCCLTester, ncclAllReduceOp) {
+// TEST_F(NCCLTester, ncclAllReduceOp) {
+//   std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
+//   op2->SetType("ncclAllReduce");
+//   op2->SetInput("X", {"st"});
+//   op2->SetInput("Communicator", {"comm"});
+//   op2->SetOutput("Out", {"rt"});
+
+//   std::vector<f::Scope *> dev_scopes;
+
+//   std::vector<std::thread> ths;
+
+//   for (size_t i = 0; i < gpu_list.size(); ++i) {
+//     dev_scopes.emplace_back(&g_scope.NewScope());
+//     std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
+//                    *op2.get(), dev_scopes[i]);
+//     ths.emplace_back(std::move(th));
+//   }
+
+//   for (size_t i = 0; i < gpu_list.size(); ++i) {
+//     ths[i].join();
+//   }
+
+//   // check results
+//   float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
+
+//   for (size_t i = 0; i < dev_scopes.size(); ++i) {
+//     p::CPUPlace cpu_place;
+//     p::GPUPlace gpu_place(gpu_list[i]);
+
+//     auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
+//     auto *rt = recv_tensor.data<float>();
+//     auto *result_tensor =
+//         dev_scopes[i]->Var("ct")->GetMutable<f::LoDTensor>();
+//     result_tensor->Resize(kDims);
+//     auto *ct = result_tensor->mutable_data<float>(cpu_place);
+
+//     paddle::memory::Copy(
+//         cpu_place, ct, p::GPUPlace(gpu_list[i]), rt,
+//         recv_tensor.numel() * sizeof(float),
+//         static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream());
+
+//     for (size_t j = 0; j < f::product(kDims); ++j) {
+//       ASSERT_NEAR(ct[j], result, 1e-5);
+//     }
+//   }
+// }
+
+// ncclAReduceOp with desc
+TEST_F(NCCLTester, ncclReduceOp) {
   std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
-  op2->SetType("ncclAllReduce");
+  const int kRoot = 0;
+  op2->SetType("ncclReduce");
   op2->SetInput("X", {"st"});
   op2->SetInput("Communicator", {"comm"});
   op2->SetOutput("Out", {"rt"});
+  op2->SetAttr("root", {kRoot});
 
   std::vector<f::Scope *> dev_scopes;
 
@@ -166,39 +216,43 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
     ths[i].join();
   }
 
-  // check results
-  float result = 0;
-  std::accumulate(gpu_list.begin(), gpu_list.end(), result);
-  for (size_t i = 0; i < dev_scopes.size(); ++i) {
-    auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
-    auto *rt = recv_tensor.data<float>();
-    p::CPUPlace cpu_place;
-    auto *result_tensor = dev_scopes[i]->Var("ct")->GetMutable<f::LoDTensor>();
-    result_tensor->Resize(kDims);
-    auto *ct = result_tensor->mutable_data<float>(cpu_place);
-    paddle::memory::Copy(
-        cpu_place, ct, p::GPUPlace(gpu_list[i]), rt,
-        recv_tensor.numel() * sizeof(float),
-        static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream());
-    for (size_t j = 0; j < f::product(kDims); ++j) {
-      ASSERT_NEAR(ct[j], result, 1e-5);
-    }
-  }
+  // check results on
+  float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
+
+  p::CPUPlace cpu_place;
+  p::GPUPlace gpu_place(gpu_list[kRoot]);
+
+  auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get<f::LoDTensor>();
+  auto *rt = recv_tensor.data<float>();
+  auto *result_tensor =
+      dev_scopes[kRoot]->Var("ct")->GetMutable<f::LoDTensor>();
+  result_tensor->Resize(kDims);
+  auto *ct = result_tensor->mutable_data<float>(cpu_place);
+
+  paddle::memory::Copy(
+      cpu_place, ct, p::GPUPlace(gpu_list[kRoot]), rt,
+      recv_tensor.numel() * sizeof(float),
+      static_cast<p::CUDADeviceContext *>(dev_ctxs[kRoot])->stream());
+
+  for (int j = 0; j < f::product(kDims); ++j) {
+    ASSERT_NEAR(ct[j], result, 1e-5);
+  }
 }
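The test above uses the harness pattern that recurs through this file: one std::thread per GPU, each executing the same op description in its own scope, all joined before the checks. The join has to wait on every rank, because an NCCL collective only completes once all ranks have entered it. Stripped of the Paddle scaffolding (PerThreadProgram, scopes, and device contexts are stubbed out here), the skeleton is roughly:

```cpp
#include <numeric>
#include <thread>
#include <vector>

// Placeholder for NCCLTester::PerThreadProgram: set the device, feed the
// input tensor, run the op, leave the result in that device's scope.
void RunOpOnDevice(int gpu_id) { /* ... */ }

int main() {
  const std::vector<int> gpu_list = {0, 1, 2, 3};

  std::vector<std::thread> ths;
  for (int id : gpu_list) {
    ths.emplace_back(RunOpOnDevice, id);  // one rank per GPU
  }
  for (auto& th : ths) th.join();  // collectives finish only when all ranks ran

  // If each rank contributes its gpu id, the reduced value at the root is:
  const int expected = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
  (void)expected;
  return 0;
}
```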
-// ncclReduceOp with desc
-TEST(NCCL, ncclReduceOp) {
+// ncclBcastOp with desc
+TEST_F(NCCLTester, ncclBcastOp) {
   std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
-  op2->SetType("ncclReduce");
+  const int kRoot = 0;
+  op2->SetType("ncclBcast");
   op2->SetInput("X", {"st"});
   op2->SetInput("Communicator", {"comm"});
   op2->SetOutput("Out", {"rt"});
+  op2->SetAttr("root", {kRoot});
 
   std::vector<f::Scope *> dev_scopes;
 
   std::vector<std::thread> ths;
   for (size_t i = 0; i < gpu_list.size(); ++i) {
     dev_scopes.emplace_back(&g_scope.NewScope());
     std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
 
@@ -210,76 +264,99 @@ TEST(NCCL, ncclReduceOp) {
     ths[i].join();
   }
 
-  // check results
-  float result = 0;
-  std::accumulate(gpu_list.begin(), gpu_list.end(), result);
-  for (size_t i = 0; i < dev_scopes.size(); ++i) {
-    auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
-    auto *rt = recv_tensor.data<float>();
-    p::CPUPlace cpu_place;
-    auto *result_tensor = dev_scopes[i]->Var("ct")->GetMutable<f::LoDTensor>();
-    result_tensor->Resize(kDims);
-    auto *ct = result_tensor->mutable_data<float>(cpu_place);
-    paddle::memory::Copy(
-        cpu_place, ct, p::GPUPlace(gpu_list[i]), rt,
-        recv_tensor.numel() * sizeof(float),
-        static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream());
-    for (size_t j = 0; j < f::product(kDims); ++j) {
-      ASSERT_NEAR(ct[j], result, 1e-5);
-    }
-  }
+  const int idx = 1;
+  // check results on
+  float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
+
+  p::CPUPlace cpu_place;
+  p::GPUPlace gpu_place(gpu_list[idx]);
+
+  auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
+  auto *rt = recv_tensor.data<float>();
+  auto *result_tensor = dev_scopes[idx]->Var("ct")->GetMutable<f::LoDTensor>();
+  result_tensor->Resize(kDims);
+  auto *ct = result_tensor->mutable_data<float>(cpu_place);
+
+  paddle::memory::Copy(
+      cpu_place, ct, p::GPUPlace(gpu_list[idx]), rt,
+      recv_tensor.numel() * sizeof(float),
+      static_cast<p::CUDADeviceContext *>(dev_ctxs[idx])->stream());
+
+  for (size_t j = 0; j < f::product(kDims); ++j) {
+    ASSERT_NEAR(ct[j], result, 1e-5);
+  }
 }
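Both rewritten checks also fix a quiet bug visible on the minus side: the old tests wrote `float result = 0; std::accumulate(...);` and discarded the return value, so every ASSERT_NEAR compared against 0. std::accumulate returns the sum rather than writing through its init argument:

```cpp
#include <cassert>
#include <numeric>
#include <vector>

int main() {
  const std::vector<int> gpu_list = {0, 1, 2};

  float result = 0;
  std::accumulate(gpu_list.begin(), gpu_list.end(), result);  // sum discarded
  assert(result == 0);  // the old pattern: result never changes

  result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0.0f);
  assert(result == 3.0f);  // the patched pattern
  return 0;
}
```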
-// ncclBcastOp with desc
-TEST(NCCL, ncclBcastOp) {
+// joint ncclBcastOp and ncclReduceOp
+TEST_F(NCCLTester, MultipleOp) {
+  const int kRoot = 0;
   std::unique_ptr<f::OpDescBind> op1(new f::OpDescBind);
-  op1->SetType("ncclBcastSend");
-  op1->SetInput("X", {"st"});
+  op1->SetType("ncclReduce");
+  op1->SetInput("X", {"rt"});
   op1->SetInput("Communicator", {"comm"});
+  op1->SetOutput("Out", {"rt"});
+  op2->SetAttr("root", {kRoot});
 
   std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
-  op2->SetType("ncclBcastRecv");
+  op2->SetType("ncclBcast");
+  op2->SetInput("X", {"st"});
   op2->SetInput("Communicator", {"comm"});
   op2->SetOutput("Out", {"rt"});
+  op2->SetAttr("root", {kRoot});
 
+  std::vector<f::Scope *> dev_scopes;
+
   std::vector<std::thread> ths;
-  for (size_t i = 1; i < gpu_list.size(); ++i) {
+
+  // run Bcast
+  for (size_t i = 0; i < gpu_list.size(); ++i) {
+    dev_scopes.emplace_back(&g_scope.NewScope());
     std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
-                   *op2.get(), &g_scope.NewScope());
+                   *op1.get(), dev_scopes[i]);
     ths.emplace_back(std::move(th));
   }
 
   for (size_t i = 0; i < gpu_list.size(); ++i) {
     ths[i].join();
   }
-}
 
-// joint ncclBcastOp and ncclReduceOp
-// TEST(NCCL, MultipleOp) {
-//   std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
-//   op2->SetType("ncclBcastSend");
-//   op2->SetInput("X", {"st"});
-//   op2->SetInput("Communicator", {"comm"});
+  ths.clear();
 
-//   std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
-//   op2->SetType("ncclBcastRecv");
-//   op2->SetInput("Communicator", {"comm"});
-//   op2->SetOutput("Out", {"rt"});
+  // run Reduce
+  for (size_t i = 0; i < gpu_list.size(); ++i) {
+    dev_scopes.emplace_back(&g_scope.NewScope());
+    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
+                   *op2.get(), dev_scopes[i]);
+    ths.emplace_back(std::move(th));
+  }
 
-//   std::vector<std::thread> ths;
-//   for (size_t i = 0; i < gpu_list.size(); ++i) {
-//     std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
-//                    *op2.get(),
-//                    &g_scope.NewScope());
-//     ths.emplace_back(std::move(th));
-//   }
+  for (size_t i = 0; i < gpu_list.size(); ++i) {
+    ths[i].join();
+  }
 
-//   for (size_t i = 0; i < gpu_list.size(); ++i) {
-//     ths[i].join();
-//   }
-// }
+  // check results
+  float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
+
+  for (size_t i = 0; i < dev_scopes.size(); ++i) {
+    p::CPUPlace cpu_place;
+    p::GPUPlace gpu_place(gpu_list[i]);
+
+    auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
+    auto *rt = recv_tensor.data<float>();
+    auto *result_tensor = dev_scopes[i]->Var("ct")->GetMutable<f::LoDTensor>();
+    result_tensor->Resize(kDims);
+    auto *ct = result_tensor->mutable_data<float>(cpu_place);
+
+    paddle::memory::Copy(
+        cpu_place, ct, p::GPUPlace(gpu_list[i]), rt,
+        recv_tensor.numel() * sizeof(float),
+        static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream());
+
+    for (int j = 0; j < f::product(kDims); ++j) {
+      ASSERT_NEAR(ct[j], result, 1e-5);
+    }
+  }
+}
 
 int main(int argc, char **argv) {
   const int dev_count = p::GetCUDADeviceCount();
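Reviewer note on the new MultipleOp test: it chains the two collectives by spawning a thread per GPU for op1, joining, clearing the pool, then doing the same for op2 before the check. Three details worth confirming against the intent: op1 is ncclReduce but runs under the `// run Bcast` comment (and op2 the reverse); op1's root is set through `op2->SetAttr` before op2 is even declared, which looks like it was meant to be `op1->SetAttr`; and dev_scopes keeps growing across both phases, so the final loop walks twice as many scopes as there are GPUs. The two-phase launch/join shape itself reduces to this sketch (RunOp1/RunOp2 are stand-ins for PerThreadProgram with each op desc):

```cpp
#include <thread>
#include <vector>

void RunOp1(int gpu_id) { /* first collective on this GPU */ }
void RunOp2(int gpu_id) { /* second collective on this GPU */ }

int main() {
  const std::vector<int> gpu_list = {0, 1, 2, 3};
  std::vector<std::thread> ths;

  // phase 1: every rank runs the first op, then all join
  for (int id : gpu_list) ths.emplace_back(RunOp1, id);
  for (auto& th : ths) th.join();
  ths.clear();  // reuse the pool, as the test does

  // phase 2: every rank runs the second op against the phase-1 results
  for (int id : gpu_list) ths.emplace_back(RunOp2, id);
  for (auto& th : ths) th.join();
  return 0;
}
```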