未验证 提交 55827199 编写于 作者: W WangXi 提交者: GitHub

Optimize error message, include dgc, nccl, size op (#24456), test=release/1.8 (#24524)

上级 f0c61017
...@@ -23,8 +23,8 @@ class DGCClipByNormOp : public ClipByNormOp { ...@@ -23,8 +23,8 @@ class DGCClipByNormOp : public ClipByNormOp {
protected: protected:
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("current_step"), OP_INOUT_CHECK(ctx->HasInput("current_step"), "Input", "current_step",
"current_step should be set."); "DGCClipByNormOp");
return ClipByNormOp::InferShape(ctx); return ClipByNormOp::InferShape(ctx);
} }
......
...@@ -25,28 +25,21 @@ class DGCOp : public framework::OperatorWithKernel { ...@@ -25,28 +25,21 @@ class DGCOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("U"), "Input(U) of DGCop should not be null."); OP_INOUT_CHECK(ctx->HasInput("U"), "Input", "U", "DGCOp");
PADDLE_ENFORCE(ctx->HasInput("V"), "Input(V) of DGCop should not be null."); OP_INOUT_CHECK(ctx->HasInput("V"), "Input", "V", "DGCOp");
PADDLE_ENFORCE(ctx->HasInput("Grad"), OP_INOUT_CHECK(ctx->HasInput("Grad"), "Input", "Grad", "DGCOp");
"Input(Grad) of DGCop should not be null."); OP_INOUT_CHECK(ctx->HasInput("Param"), "Input", "Param", "DGCOp");
PADDLE_ENFORCE_EQ( OP_INOUT_CHECK(ctx->HasInput("current_step"), "Input", "current_step",
ctx->HasInput("Param"), true, "DGCOp");
platform::errors::NotFound("Input(Param) of DGCop is not found.")); OP_INOUT_CHECK(ctx->HasInput("nranks"), "Input", "nranks", "DGCOp");
PADDLE_ENFORCE(ctx->HasInput("current_step"),
"Input(current_step) of DGCop should not be null."); OP_INOUT_CHECK(ctx->HasOutput("U_out"), "Output", "U_out", "DGCOp");
PADDLE_ENFORCE_EQ(ctx->HasInput("nranks"), true, OP_INOUT_CHECK(ctx->HasOutput("V_out"), "Output", "V_out", "DGCOp");
"Input(nranks) of DGCop should not be null."); OP_INOUT_CHECK(ctx->HasOutput("k"), "Output", "k", "DGCOp");
OP_INOUT_CHECK(ctx->HasOutput("EncodeGrad"), "Output", "EncodeGrad",
PADDLE_ENFORCE(ctx->HasOutput("U_out"), "DGCOp");
"Output(U_out) of DGCop should not be null."); OP_INOUT_CHECK(ctx->HasOutput("GatherBuff"), "Output", "GatherBuff",
PADDLE_ENFORCE(ctx->HasOutput("V_out"), "DGCOp");
"Output(V_out) of DGCop should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("k"),
"Output(k) of DGCop should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("EncodeGrad"),
"Output(EncodeGrad) of DGCop should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasOutput("GatherBuff"), true,
"Output(EncodeGrad) of DGCop should not be null.");
} }
protected: protected:
......
...@@ -24,14 +24,22 @@ namespace operators { ...@@ -24,14 +24,22 @@ namespace operators {
inline float get_period_sparcity(const std::vector<float>& sparsity, inline float get_period_sparcity(const std::vector<float>& sparsity,
float cur_step, float rampup_steps) { float cur_step, float rampup_steps) {
PADDLE_ENFORCE_GE(static_cast<int>(cur_step), 0); PADDLE_ENFORCE_GE(static_cast<int>(cur_step), 0,
platform::errors::InvalidArgument(
"DGC current step=%d, but it must >= 0, "
"please submit issue in github",
static_cast<int>(cur_step)));
size_t idx = static_cast<int>(cur_step * sparsity.size() / rampup_steps); size_t idx = static_cast<int>(cur_step * sparsity.size() / rampup_steps);
if (idx >= sparsity.size()) { if (idx >= sparsity.size()) {
idx = sparsity.size() - 1; idx = sparsity.size() - 1;
} }
PADDLE_ENFORCE_LT(idx, sparsity.size()); PADDLE_ENFORCE_LT(
idx, sparsity.size(),
platform::errors::OutOfRange(
"sparsity index out of bounds. idx=%d >= sparsity.size=%d", idx,
sparsity.size()));
return sparsity[idx]; return sparsity[idx];
} }
...@@ -55,7 +63,10 @@ class DGCOpKernel : public framework::OpKernel<T> { ...@@ -55,7 +63,10 @@ class DGCOpKernel : public framework::OpKernel<T> {
// nranks // nranks
auto nranks_tensor = ctx.Input<framework::Tensor>("nranks"); auto nranks_tensor = ctx.Input<framework::Tensor>("nranks");
const int nranks = static_cast<const int>(*nranks_tensor->data<float>()); const int nranks = static_cast<const int>(*nranks_tensor->data<float>());
PADDLE_ENFORCE_GT(nranks, 1, "DGC is not useful when num_trainers <= 1"); PADDLE_ENFORCE_GT(nranks, 1,
platform::errors::PreconditionNotMet(
"DGC is not useful when num_trainers <= 1. Please "
"use multi card or multi machine GPU"));
// regularization // regularization
auto p = ctx.Input<framework::Tensor>("Param"); auto p = ctx.Input<framework::Tensor>("Param");
...@@ -105,8 +116,10 @@ class DGCOpKernel : public framework::OpKernel<T> { ...@@ -105,8 +116,10 @@ class DGCOpKernel : public framework::OpKernel<T> {
1 - get_period_sparcity( 1 - get_period_sparcity(
sparsity, static_cast<float>(*current_step - rampup_begin_step), sparsity, static_cast<float>(*current_step - rampup_begin_step),
rampup_step); rampup_step);
PADDLE_ENFORCE_GE(ratio, 0.0); PADDLE_ENFORCE_GE(ratio, 0.0, platform::errors::InvalidArgument(
PADDLE_ENFORCE_LT(ratio, 1.0); "DGC sparsity ratio must >= 0"));
PADDLE_ENFORCE_LT(ratio, 1.0, platform::errors::InvalidArgument(
"DGC sparsity ratio must < 1"));
int k = static_cast<int>(g->numel() * ratio); int k = static_cast<int>(g->numel() * ratio);
VLOG(10) << "m:" << m << ", use_nesterov:" << use_nesterov VLOG(10) << "m:" << m << ", use_nesterov:" << use_nesterov
......
...@@ -31,12 +31,15 @@ class NCCLInitOp : public framework::OperatorBase { ...@@ -31,12 +31,15 @@ class NCCLInitOp : public framework::OperatorBase {
private: private:
void RunImpl(const framework::Scope &scope, void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override { const platform::Place &place) const override {
PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kParallelScopes)), PADDLE_ENFORCE_NOT_NULL(
"Can not find variable '%s' in the scope.", scope.FindVar(Input(kParallelScopes)),
kParallelScopes); platform::errors::NotFound("Can not find variable '%s' in the scope.",
kParallelScopes));
const auto &name = Output("Communicator"); const auto &name = Output("Communicator");
PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name), PADDLE_ENFORCE_NOT_NULL(
"Can not find variable '%s' in the scope.", name); scope.FindVar(name),
platform::errors::NotFound(
"Output(%s) is needed for ncclInit operator.", name));
// A parallel do may not use all the gpus. For example, the batch size is 7 // A parallel do may not use all the gpus. For example, the batch size is 7
// in the last batch while we have 8 gpu. In this case, parallel_do will // in the last batch while we have 8 gpu. In this case, parallel_do will
// create 7 parallel scopes, so should ncclInitOp create 7 gpu peers // create 7 parallel scopes, so should ncclInitOp create 7 gpu peers
...@@ -46,11 +49,9 @@ class NCCLInitOp : public framework::OperatorBase { ...@@ -46,11 +49,9 @@ class NCCLInitOp : public framework::OperatorBase {
for (int i = 0; i < static_cast<int>(parallel_scopes.size()); ++i) { for (int i = 0; i < static_cast<int>(parallel_scopes.size()); ++i) {
gpus[i] = i; gpus[i] = i;
} }
PADDLE_ENFORCE(!gpus.empty(), "NCCL init with 0 gpus."); PADDLE_ENFORCE_EQ(!gpus.empty(), true,
platform::errors::PreconditionNotMet(
if (scope.FindVar(name) == nullptr) { "gpus is empty, NCCL must init with gpus"));
PADDLE_THROW("Output(Communicator) is needed for ncclInit operator.");
}
platform::Communicator *comm = platform::Communicator *comm =
scope.FindVar(name)->GetMutable<platform::Communicator>(); scope.FindVar(name)->GetMutable<platform::Communicator>();
...@@ -92,17 +93,17 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel { ...@@ -92,17 +93,17 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel {
protected: protected:
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NCCLAllReduce");
" Input(X) of AllReduce op input should not be NULL"); OP_INOUT_CHECK(ctx->HasInput("Communicator"), "Input", "Communicator",
PADDLE_ENFORCE( "NCCLAllReduce");
ctx->HasInput("Communicator"),
" Input(Communicator) of AllReduce op input should not be NULL"); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NCCLAllReduce");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
" Output(Out) of AllReduce op output should not be NULL");
std::string reduction = ctx->Attrs().Get<std::string>("reduction"); std::string reduction = ctx->Attrs().Get<std::string>("reduction");
PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" || PADDLE_ENFORCE_EQ(
reduction == "ncclMin" || reduction == "ncclMax"), (reduction == "ncclSum" || reduction == "ncclProd" ||
"invalid reduction."); reduction == "ncclMin" || reduction == "ncclMax"),
true, platform::errors::InvalidArgument("invalid nccl reduction."));
auto x_dims = ctx->GetInputsDim("X"); auto x_dims = ctx->GetInputsDim("X");
ctx->SetOutputsDim("Out", x_dims); ctx->SetOutputsDim("Out", x_dims);
...@@ -137,18 +138,17 @@ class NCCLReduceOp : public framework::OperatorWithKernel { ...@@ -137,18 +138,17 @@ class NCCLReduceOp : public framework::OperatorWithKernel {
protected: protected:
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NCCLReduce");
" Input(X) of Reduce op input should not be NULL"); OP_INOUT_CHECK(ctx->HasInput("Communicator"), "Input", "Communicator",
PADDLE_ENFORCE( "NCCLReduce");
ctx->HasInput("Communicator"),
" Input(Communicator) of Reduce op input should not be NULL"); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NCCLReduce");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
" Input(X) of Reduce op input should not be NULL");
std::string reduction = ctx->Attrs().Get<std::string>("reduction"); std::string reduction = ctx->Attrs().Get<std::string>("reduction");
PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" || PADDLE_ENFORCE_EQ(
reduction == "ncclMin" || reduction == "ncclMax"), (reduction == "ncclSum" || reduction == "ncclProd" ||
"invalid reduction."); reduction == "ncclMin" || reduction == "ncclMax"),
true, platform::errors::InvalidArgument("invalid nccl reduction."));
auto x_dims = ctx->GetInputsDim("X"); auto x_dims = ctx->GetInputsDim("X");
ctx->SetOutputsDim("Out", x_dims); ctx->SetOutputsDim("Out", x_dims);
...@@ -188,15 +188,16 @@ class NCCLBcastOp : public framework::OperatorWithKernel { ...@@ -188,15 +188,16 @@ class NCCLBcastOp : public framework::OperatorWithKernel {
protected: protected:
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NCCLBcast");
" Input(X) of Bcast op input should not be NULL"); OP_INOUT_CHECK(ctx->HasInput("Communicator"), "Input", "Communicator",
PADDLE_ENFORCE(ctx->HasInput("Communicator"), "NCCLBcast");
" Input(Communicator) of Bcast op input should not be NULL");
PADDLE_ENFORCE(ctx->HasOutput("Out"), OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NCCLBcast");
" Output(Out) of Bcast op output should not be NULL");
int root = ctx->Attrs().Get<int>("root"); int root = ctx->Attrs().Get<int>("root");
PADDLE_ENFORCE(root != platform::kInvalidGPUId, "Bcast root must be set."); PADDLE_ENFORCE_EQ(
root != platform::kInvalidGPUId, true,
platform::errors::InvalidArgument("Bcast root must be set."));
auto x_dims = ctx->GetInputsDim("X"); auto x_dims = ctx->GetInputsDim("X");
ctx->SetOutputsDim("Out", x_dims); ctx->SetOutputsDim("Out", x_dims);
......
...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and ...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <functional> #include <functional>
#include <unordered_map>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -37,36 +38,42 @@ class NCCLTypeWrapper<double> { ...@@ -37,36 +38,42 @@ class NCCLTypeWrapper<double> {
static const ncclDataType_t type = ncclDouble; static const ncclDataType_t type = ncclDouble;
}; };
// Translates the string value of an op's "reduction" attribute into the
// corresponding ncclRedOp_t constant.
//
// reduction: one of "ncclSum", "ncclMin", "ncclMax" or "ncclProd".
// Returns the ncclRedOp_t constant registered for that name in the table
// below. Any other name fails the PADDLE_ENFORCE_EQ check, which is given an
// InvalidArgument error.
//
// The argument is taken by const reference because it is only used as a
// lookup key; taking it by value copied the string on every call.
static ncclRedOp_t str_to_nccl_red_type(const std::string& reduction) {
  // Built once on first use and shared by all later calls.
  static const std::unordered_map<std::string, ncclRedOp_t> str_to_type = {
      {"ncclSum", ncclSum},
      {"ncclMin", ncclMin},
      {"ncclMax", ncclMax},
      {"ncclProd", ncclProd},
  };
  const auto it = str_to_type.find(reduction);
  PADDLE_ENFORCE_EQ(it != str_to_type.end(), true,
                    platform::errors::InvalidArgument(
                        "Invalid nccl reduction. Must be ncclMin | ncclMax | "
                        "ncclProd | ncclSum"));
  return it->second;
}
template <typename T> template <typename T>
class NCCLAllReduceKernel : public framework::OpKernel<T> { class NCCLAllReduceKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
"This kernel only runs on GPU device."); platform::errors::PreconditionNotMet(
"This kernel only runs on GPU device."));
auto* x = ctx.Input<LoDTensor>("X"); auto* x = ctx.Input<LoDTensor>("X");
auto* out = ctx.Output<LoDTensor>("Out"); auto* out = ctx.Output<LoDTensor>("Out");
auto* comm = ctx.Input<Communicator>("Communicator"); auto* comm = ctx.Input<Communicator>("Communicator");
std::string reduction = ctx.Attr<std::string>("reduction"); std::string reduction = ctx.Attr<std::string>("reduction");
ncclRedOp_t reduction_op_ = ncclSum; auto reduction_op_ = str_to_nccl_red_type(reduction);
if (reduction == "ncclMin") {
reduction_op_ = ncclMin;
} else if (reduction == "ncclMax") {
reduction_op_ = ncclMax;
} else if (reduction == "ncclSum") {
reduction_op_ = ncclSum;
} else if (reduction == "ncclProd") {
reduction_op_ = ncclProd;
} else {
PADDLE_THROW("Invalid reduction. default ncclSum.");
}
// device id // device id
int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId(); int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
int idx = comm->GetCommId(gpu_id); int idx = comm->GetCommId(gpu_id);
VLOG(3) << "gpu : " VLOG(3) << "gpu : "
<< " invoke allreduce. send " << x->numel() << " recv " << " invoke allreduce. send " << x->numel() << " recv "
<< out->numel(); << out->numel();
PADDLE_ENFORCE(platform::dynload::ncclAllReduce( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
x->data<T>(), out->mutable_data<T>(ctx.GetPlace()), out->numel(), x->data<T>(), out->mutable_data<T>(ctx.GetPlace()), out->numel(),
NCCLTypeWrapper<T>::type, reduction_op_, comm->comms().at(idx), NCCLTypeWrapper<T>::type, reduction_op_, comm->comms().at(idx),
ctx.cuda_device_context().stream())); ctx.cuda_device_context().stream()));
...@@ -80,26 +87,17 @@ template <typename T> ...@@ -80,26 +87,17 @@ template <typename T>
class NCCLReduceKernel : public framework::OpKernel<T> { class NCCLReduceKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
"This kernel only runs on GPU device."); platform::errors::InvalidArgument(
"This kernel only runs on GPU device."));
auto x = ctx.Input<LoDTensor>("X"); // x0, x1, x2 auto x = ctx.Input<LoDTensor>("X"); // x0, x1, x2
auto out = ctx.Output<LoDTensor>("Out"); auto out = ctx.Output<LoDTensor>("Out");
auto* comm = ctx.Input<Communicator>("Communicator"); auto* comm = ctx.Input<Communicator>("Communicator");
int root = ctx.Attr<int>("root"); int root = ctx.Attr<int>("root");
std::string reduction = ctx.Attr<std::string>("reduction"); std::string reduction = ctx.Attr<std::string>("reduction");
ncclRedOp_t reduction_op_ = ncclSum; auto reduction_op_ = str_to_nccl_red_type(reduction);
if (reduction == "ncclMin") {
reduction_op_ = ncclMin;
} else if (reduction == "ncclMax") {
reduction_op_ = ncclMax;
} else if (reduction == "ncclSum") {
reduction_op_ = ncclSum;
} else if (reduction == "ncclProd") {
reduction_op_ = ncclProd;
} else {
PADDLE_THROW("Invalid reduction. default ncclSum.");
}
// device id // device id
int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId(); int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
int idx = comm->GetCommId(gpu_id); int idx = comm->GetCommId(gpu_id);
...@@ -111,7 +109,7 @@ class NCCLReduceKernel : public framework::OpKernel<T> { ...@@ -111,7 +109,7 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
} }
VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel() VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel()
<< " recv " << out->numel(); << " recv " << out->numel();
PADDLE_ENFORCE(platform::dynload::ncclReduce( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
x->data<T>(), recvbuffer, x->numel(), NCCLTypeWrapper<T>::type, x->data<T>(), recvbuffer, x->numel(), NCCLTypeWrapper<T>::type,
reduction_op_, root, comm->comms().at(idx), reduction_op_, root, comm->comms().at(idx),
ctx.cuda_device_context().stream())); ctx.cuda_device_context().stream()));
...@@ -124,8 +122,9 @@ template <typename T> ...@@ -124,8 +122,9 @@ template <typename T>
class NCCLBcastKernel : public framework::OpKernel<T> { class NCCLBcastKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
"This kernel only runs on GPU device."); platform::errors::InvalidArgument(
"This kernel only runs on GPU device."));
int root = ctx.Attr<int>("root"); int root = ctx.Attr<int>("root");
auto* comm = ctx.Input<Communicator>("Communicator"); auto* comm = ctx.Input<Communicator>("Communicator");
// device id // device id
...@@ -134,7 +133,7 @@ class NCCLBcastKernel : public framework::OpKernel<T> { ...@@ -134,7 +133,7 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
if (idx == root) { if (idx == root) {
auto* x = ctx.Input<LoDTensor>("X"); auto* x = ctx.Input<LoDTensor>("X");
VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel(); VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel();
PADDLE_ENFORCE(platform::dynload::ncclBcast( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), x->numel(), reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), x->numel(),
NCCLTypeWrapper<T>::type, root, comm->comms().at(idx), NCCLTypeWrapper<T>::type, root, comm->comms().at(idx),
ctx.cuda_device_context().stream())); ctx.cuda_device_context().stream()));
...@@ -143,7 +142,7 @@ class NCCLBcastKernel : public framework::OpKernel<T> { ...@@ -143,7 +142,7 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
auto* out = ctx.Output<LoDTensor>("Out"); auto* out = ctx.Output<LoDTensor>("Out");
VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. recv buffer " VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. recv buffer "
<< framework::product(out->dims()); << framework::product(out->dims());
PADDLE_ENFORCE(platform::dynload::ncclBcast( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
out->mutable_data<T>(ctx.GetPlace()), out->numel(), out->mutable_data<T>(ctx.GetPlace()), out->numel(),
NCCLTypeWrapper<T>::type, root, comm->comms().at(idx), NCCLTypeWrapper<T>::type, root, comm->comms().at(idx),
ctx.cuda_device_context().stream())); ctx.cuda_device_context().stream()));
......
...@@ -45,10 +45,9 @@ class NCCLTester : public ::testing::Test { ...@@ -45,10 +45,9 @@ class NCCLTester : public ::testing::Test {
public: public:
void SetUp() override { void SetUp() override {
int count = p::GetCUDADeviceCount(); int count = p::GetCUDADeviceCount();
if (count <= 1) { if (count <= 0) {
LOG(WARNING) LOG(WARNING) << "Cannot test gpu nccl, because the CUDA device count is "
<< "Cannot test multi-gpu nccl, because the CUDA device count is " << count;
<< count;
exit(0); exit(0);
} }
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
...@@ -114,8 +113,9 @@ class NCCLTester : public ::testing::Test { ...@@ -114,8 +113,9 @@ class NCCLTester : public ::testing::Test {
lk.unlock(); lk.unlock();
PADDLE_ENFORCE(send_tensor->numel() == f::product(kDims), PADDLE_ENFORCE_EQ(
"Tensor numel not match!"); send_tensor->numel(), f::product(kDims),
paddle::platform::errors::InvalidArgument("Tensor numel not match!"));
auto op = f::OpRegistry::CreateOp(*op1); auto op = f::OpRegistry::CreateOp(*op1);
...@@ -126,6 +126,10 @@ class NCCLTester : public ::testing::Test { ...@@ -126,6 +126,10 @@ class NCCLTester : public ::testing::Test {
VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type(); VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type();
} }
void testNcclReduceOp();
void testNcclAllReduceOp();
void testNcclBcastOp();
public: public:
std::vector<p::DeviceContext *> dev_ctxs_; std::vector<p::DeviceContext *> dev_ctxs_;
f::Scope g_scope_; f::Scope g_scope_;
...@@ -133,13 +137,7 @@ class NCCLTester : public ::testing::Test { ...@@ -133,13 +137,7 @@ class NCCLTester : public ::testing::Test {
std::vector<int> gpu_list_; std::vector<int> gpu_list_;
}; };
// ncclInitOp with desc void NCCLTester::testNcclAllReduceOp() {
TEST_F(NCCLTester, ncclInitOp) {}
// ncclAllReduceOp with desc
// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9367
/*
TEST_F(NCCLTester, ncclAllReduceOp) {
std::unique_ptr<f::OpDesc> op2(new f::OpDesc); std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
op2->SetType("ncclAllReduce"); op2->SetType("ncclAllReduce");
op2->SetInput("X", {"st"}); op2->SetInput("X", {"st"});
...@@ -186,10 +184,8 @@ TEST_F(NCCLTester, ncclAllReduceOp) { ...@@ -186,10 +184,8 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
} }
} }
} }
*/
// ncclReduceOp with desc void NCCLTester::testNcclReduceOp() {
TEST_F(NCCLTester, ncclReduceOp) {
std::unique_ptr<f::OpDesc> op2(new f::OpDesc); std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
const int kRoot = 0; const int kRoot = 0;
op2->SetType("ncclReduce"); op2->SetType("ncclReduce");
...@@ -236,10 +232,7 @@ TEST_F(NCCLTester, ncclReduceOp) { ...@@ -236,10 +232,7 @@ TEST_F(NCCLTester, ncclReduceOp) {
} }
} }
// ncclBcastOp with desc void NCCLTester::testNcclBcastOp() {
// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9540
/*
TEST_F(NCCLTester, ncclBcastOp) {
std::unique_ptr<f::OpDesc> op2(new f::OpDesc); std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
const int kRoot = 0; const int kRoot = 0;
op2->SetType("ncclBcast"); op2->SetType("ncclBcast");
...@@ -263,13 +256,17 @@ TEST_F(NCCLTester, ncclBcastOp) { ...@@ -263,13 +256,17 @@ TEST_F(NCCLTester, ncclBcastOp) {
ths[i].join(); ths[i].join();
} }
const int idx = 1; const int idx = gpu_list_.size() - 1;
float result = GetGPUData(kRoot); float result = GetGPUData(kRoot);
p::CPUPlace cpu_place; p::CPUPlace cpu_place;
p::CUDAPlace gpu_place(gpu_list_[idx]); p::CUDAPlace gpu_place(gpu_list_[idx]);
auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>(); std::string rt_str = "rt";
if (idx == kRoot) {
rt_str = "st";
}
auto &recv_tensor = dev_scopes[idx]->FindVar(rt_str)->Get<f::LoDTensor>();
auto *rt = recv_tensor.data<float>(); auto *rt = recv_tensor.data<float>();
auto *result_tensor = dev_scopes[idx]->Var("ct")->GetMutable<f::LoDTensor>(); auto *result_tensor = dev_scopes[idx]->Var("ct")->GetMutable<f::LoDTensor>();
result_tensor->Resize(kDims); result_tensor->Resize(kDims);
...@@ -284,4 +281,20 @@ TEST_F(NCCLTester, ncclBcastOp) { ...@@ -284,4 +281,20 @@ TEST_F(NCCLTester, ncclBcastOp) {
ASSERT_NEAR(ct[j], result, 1e-5); ASSERT_NEAR(ct[j], result, 1e-5);
} }
} }
*/
// ncclInitOp with desc
TEST_F(NCCLTester, ncclInitOp) {}
// Runs the reduce, all-reduce and broadcast checks back to back inside a
// single test case.
TEST_F(NCCLTester, ncclOp) {
// Serial execution is required for the same nccl comm.
// ncclReduceOp with desc
testNcclReduceOp();
// ncclAllReduceOp with desc
// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9367
testNcclAllReduceOp();
// ncclBcastOp with desc
// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9540
testNcclBcastOp();
}
...@@ -23,10 +23,9 @@ class SizeOp : public framework::OperatorWithKernel { ...@@ -23,10 +23,9 @@ class SizeOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"), OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Size");
"Input (Input) of Size op should not be null."); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Size");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output (Out) of Size op should not be null.");
ctx->SetOutputDim("Out", {1}); ctx->SetOutputDim("Out", {1});
} }
}; };
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册