Optimize error message, include dgc, nccl, size op (#24456)

027f9953 · WangXi · GitHub · a5361982 · 027f9953 · 027f9953
7 changed file
--- a/paddle/fluid/operators/dgc_clip_by_norm_op.cc
+++ b/paddle/fluid/operators/dgc_clip_by_norm_op.cc
@@ -23,8 +23,8 @@ class DGCClipByNormOp : public ClipByNormOp {

 protected:
  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("current_step"),
-                   "current_step should be set.");
+    OP_INOUT_CHECK(ctx->HasInput("current_step"), "Input", "current_step",
+                   "DGCClipByNormOp");

    return ClipByNormOp::InferShape(ctx);
  }

--- a/paddle/fluid/operators/dgc_op.cc
+++ b/paddle/fluid/operators/dgc_op.cc
@@ -25,28 +25,21 @@ class DGCOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("U"), "Input(U) of DGCop should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("V"), "Input(V) of DGCop should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(Grad) of DGCop should not be null.");
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("Param"), true,
-        platform::errors::NotFound("Input(Param) of DGCop is not found."));
-    PADDLE_ENFORCE(ctx->HasInput("current_step"),
-                   "Input(current_step) of DGCop should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("nranks"), true,
-                      "Input(nranks) of DGCop should not be null.");
-
-    PADDLE_ENFORCE(ctx->HasOutput("U_out"),
-                   "Output(U_out) of DGCop should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("V_out"),
-                   "Output(V_out) of DGCop should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("k"),
-                   "Output(k) of DGCop should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("EncodeGrad"),
-                   "Output(EncodeGrad) of DGCop should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("GatherBuff"), true,
-                      "Output(EncodeGrad) of DGCop should not be null.");
+    OP_INOUT_CHECK(ctx->HasInput("U"), "Input", "U", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasInput("V"), "Input", "V", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasInput("Grad"), "Input", "Grad", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasInput("Param"), "Input", "Param", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasInput("current_step"), "Input", "current_step",
+                   "DGCOp");
+    OP_INOUT_CHECK(ctx->HasInput("nranks"), "Input", "nranks", "DGCOp");
+
+    OP_INOUT_CHECK(ctx->HasOutput("U_out"), "Output", "U_out", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasOutput("V_out"), "Output", "V_out", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasOutput("k"), "Output", "k", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasOutput("EncodeGrad"), "Output", "EncodeGrad",
+                   "DGCOp");
+    OP_INOUT_CHECK(ctx->HasOutput("GatherBuff"), "Output", "GatherBuff",
+                   "DGCOp");
  }

 protected:

--- a/paddle/fluid/operators/dgc_op.h
+++ b/paddle/fluid/operators/dgc_op.h
@@ -24,14 +24,22 @@ namespace operators {

 inline float get_period_sparcity(const std::vector<float>& sparsity,
                                 float cur_step, float rampup_steps) {
-  PADDLE_ENFORCE_GE(static_cast<int>(cur_step), 0);
+  PADDLE_ENFORCE_GE(static_cast<int>(cur_step), 0,
+                    platform::errors::InvalidArgument(
+                        "DGC current step=%d, but it must >= 0, "
+                        "please submit issue in github",
+                        static_cast<int>(cur_step)));

  size_t idx = static_cast<int>(cur_step * sparsity.size() / rampup_steps);
  if (idx >= sparsity.size()) {
    idx = sparsity.size() - 1;
  }

-  PADDLE_ENFORCE_LT(idx, sparsity.size());
+  PADDLE_ENFORCE_LT(
+      idx, sparsity.size(),
+      platform::errors::OutOfRange(
+          "sparsity index out of bounds. idx=%d >= sparsity.size=%d", idx,
+          sparsity.size()));
  return sparsity[idx];
 }

@@ -55,7 +63,10 @@ class DGCOpKernel : public framework::OpKernel<T> {
    // nranks
    auto nranks_tensor = ctx.Input<framework::Tensor>("nranks");
    const int nranks = static_cast<const int>(*nranks_tensor->data<float>());
-    PADDLE_ENFORCE_GT(nranks, 1, "DGC is not useful when num_trainers <= 1");
+    PADDLE_ENFORCE_GT(nranks, 1,
+                      platform::errors::PreconditionNotMet(
+                          "DGC is not useful when num_trainers <= 1. Please "
+                          "use multi card or multi machine GPU"));

    // regularization
    auto p = ctx.Input<framework::Tensor>("Param");
@@ -105,8 +116,10 @@ class DGCOpKernel : public framework::OpKernel<T> {
        1 - get_period_sparcity(
                sparsity, static_cast<float>(*current_step - rampup_begin_step),
                rampup_step);
-    PADDLE_ENFORCE_GE(ratio, 0.0);
-    PADDLE_ENFORCE_LT(ratio, 1.0);
+    PADDLE_ENFORCE_GE(ratio, 0.0, platform::errors::InvalidArgument(
+                                      "DGC sparsity ratio must >= 0"));
+    PADDLE_ENFORCE_LT(ratio, 1.0, platform::errors::InvalidArgument(
+                                      "DGC sparsity ratio must < 1"));
    int k = static_cast<int>(g->numel() * ratio);

    VLOG(10) << "m:" << m << ", use_nesterov:" << use_nesterov

--- a/paddle/fluid/operators/nccl/nccl_op.cc
+++ b/paddle/fluid/operators/nccl/nccl_op.cc
@@ -31,12 +31,15 @@ class NCCLInitOp : public framework::OperatorBase {
 private:
  void RunImpl(const framework::Scope &scope,
               const platform::Place &place) const override {
-    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kParallelScopes)),
-                            "Can not find variable '%s' in the scope.",
-                            kParallelScopes);
+    PADDLE_ENFORCE_NOT_NULL(
+        scope.FindVar(Input(kParallelScopes)),
+        platform::errors::NotFound("Can not find variable '%s' in the scope.",
+                                   kParallelScopes));
    const auto &name = Output("Communicator");
-    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name),
-                            "Can not find variable '%s' in the scope.", name);
+    PADDLE_ENFORCE_NOT_NULL(
+        scope.FindVar(name),
+        platform::errors::NotFound(
+            "Output(%s) is needed for ncclInit operator.", name));
    // A parallel do may not use all the gpus. For example, the batch size is 7
    // in the last batch while we have 8 gpu. In this case, parallel_do will
    // create 7 parallel scopes, so should ncclInitOp create 7 gpu peers
@@ -46,11 +49,9 @@ class NCCLInitOp : public framework::OperatorBase {
    for (int i = 0; i < static_cast<int>(parallel_scopes.size()); ++i) {
      gpus[i] = i;
    }
-    PADDLE_ENFORCE(!gpus.empty(), "NCCL init with 0 gpus.");
-
-    if (scope.FindVar(name) == nullptr) {
-      PADDLE_THROW("Output(Communicator) is needed for ncclInit operator.");
-    }
+    PADDLE_ENFORCE_EQ(!gpus.empty(), true,
+                      platform::errors::PreconditionNotMet(
+                          "gpus is empty, NCCL must init with gpus"));

    platform::Communicator *comm =
        scope.FindVar(name)->GetMutable<platform::Communicator>();
@@ -92,17 +93,17 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel {

 protected:
  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   " Input(X) of AllReduce op input should not be NULL");
-    PADDLE_ENFORCE(
-        ctx->HasInput("Communicator"),
-        " Input(Communicator) of AllReduce op input should not be NULL");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   " Output(Out) of AllReduce op output should not be NULL");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NCCLAllReduce");
+    OP_INOUT_CHECK(ctx->HasInput("Communicator"), "Input", "Communicator",
+                   "NCCLAllReduce");
+
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NCCLAllReduce");
+
    std::string reduction = ctx->Attrs().Get<std::string>("reduction");
-    PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" ||
+    PADDLE_ENFORCE_EQ(
+        (reduction == "ncclSum" || reduction == "ncclProd" ||
         reduction == "ncclMin" || reduction == "ncclMax"),
-                   "invalid reduction.");
+        true, platform::errors::InvalidArgument("invalid nccl reduction."));

    auto x_dims = ctx->GetInputsDim("X");
    ctx->SetOutputsDim("Out", x_dims);
@@ -137,18 +138,17 @@ class NCCLReduceOp : public framework::OperatorWithKernel {

 protected:
  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   " Input(X) of Reduce op input should not be NULL");
-    PADDLE_ENFORCE(
-        ctx->HasInput("Communicator"),
-        " Input(Communicator) of Reduce op input should not be NULL");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   " Input(X) of Reduce op input should not be NULL");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NCCLReduce");
+    OP_INOUT_CHECK(ctx->HasInput("Communicator"), "Input", "Communicator",
+                   "NCCLReduce");
+
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NCCLReduce");

    std::string reduction = ctx->Attrs().Get<std::string>("reduction");
-    PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" ||
+    PADDLE_ENFORCE_EQ(
+        (reduction == "ncclSum" || reduction == "ncclProd" ||
         reduction == "ncclMin" || reduction == "ncclMax"),
-                   "invalid reduction.");
+        true, platform::errors::InvalidArgument("invalid nccl reduction."));

    auto x_dims = ctx->GetInputsDim("X");
    ctx->SetOutputsDim("Out", x_dims);
@@ -188,15 +188,16 @@ class NCCLBcastOp : public framework::OperatorWithKernel {

 protected:
  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   " Input(X) of Bcast op input should not be NULL");
-    PADDLE_ENFORCE(ctx->HasInput("Communicator"),
-                   " Input(Communicator) of Bcast op input should not be NULL");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   " Output(Out) of Bcast op output should not be NULL");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NCCLBcast");
+    OP_INOUT_CHECK(ctx->HasInput("Communicator"), "Input", "Communicator",
+                   "NCCLBcast");
+
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NCCLBcast");

    int root = ctx->Attrs().Get<int>("root");
-    PADDLE_ENFORCE(root != platform::kInvalidGPUId, "Bcast root must be set.");
+    PADDLE_ENFORCE_EQ(
+        root != platform::kInvalidGPUId, true,
+        platform::errors::InvalidArgument("Bcast root must be set."));

    auto x_dims = ctx->GetInputsDim("X");
    ctx->SetOutputsDim("Out", x_dims);

--- a/paddle/fluid/operators/nccl/nccl_op.cu.cc
+++ b/paddle/fluid/operators/nccl/nccl_op.cu.cc
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include <functional>
+#include <unordered_map>

 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -37,29 +38,35 @@ class NCCLTypeWrapper<double> {
  static const ncclDataType_t type = ncclDouble;
 };

+static ncclRedOp_t str_to_nccl_red_type(std::string reduction) {
+  static const std::unordered_map<std::string, ncclRedOp_t> str_to_type = {
+      {"ncclSum", ncclSum},
+      {"ncclMin", ncclMin},
+      {"ncclMax", ncclMax},
+      {"ncclProd", ncclProd},
+  };
+  auto it = str_to_type.find(reduction);
+  PADDLE_ENFORCE_EQ(it != str_to_type.end(), true,
+                    platform::errors::InvalidArgument(
+                        "Invalid nccl reduction. Must be ncclMin | ncclMax | "
+                        "ncclProd | ncclSum"));
+  return it->second;
+}
+
 template <typename T>
 class NCCLAllReduceKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
+                      platform::errors::PreconditionNotMet(
+                          "This kernel only runs on GPU device."));
    auto* x = ctx.Input<LoDTensor>("X");
    auto* out = ctx.Output<LoDTensor>("Out");
    auto* comm = ctx.Input<Communicator>("Communicator");
    std::string reduction = ctx.Attr<std::string>("reduction");

-    ncclRedOp_t reduction_op_ = ncclSum;
-    if (reduction == "ncclMin") {
-      reduction_op_ = ncclMin;
-    } else if (reduction == "ncclMax") {
-      reduction_op_ = ncclMax;
-    } else if (reduction == "ncclSum") {
-      reduction_op_ = ncclSum;
-    } else if (reduction == "ncclProd") {
-      reduction_op_ = ncclProd;
-    } else {
-      PADDLE_THROW("Invalid reduction. default ncclSum.");
-    }
+    auto reduction_op_ = str_to_nccl_red_type(reduction);
+
    // device id
    int gpu_id =
        BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId();
@@ -67,7 +74,7 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
    VLOG(3) << "gpu : "
            << " invoke allreduce. send " << x->numel() << " recv "
            << out->numel();
-    PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
        x->data<T>(), out->mutable_data<T>(ctx.GetPlace()), out->numel(),
        NCCLTypeWrapper<T>::type, reduction_op_, comm->comms().at(idx),
        ctx.cuda_device_context().stream()));
@@ -81,26 +88,17 @@ template <typename T>
 class NCCLReduceKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
+                      platform::errors::InvalidArgument(
+                          "This kernel only runs on GPU device."));
    auto x = ctx.Input<LoDTensor>("X");  // x0, x1, x2
    auto out = ctx.Output<LoDTensor>("Out");
    auto* comm = ctx.Input<Communicator>("Communicator");
    int root = ctx.Attr<int>("root");
    std::string reduction = ctx.Attr<std::string>("reduction");

-    ncclRedOp_t reduction_op_ = ncclSum;
-    if (reduction == "ncclMin") {
-      reduction_op_ = ncclMin;
-    } else if (reduction == "ncclMax") {
-      reduction_op_ = ncclMax;
-    } else if (reduction == "ncclSum") {
-      reduction_op_ = ncclSum;
-    } else if (reduction == "ncclProd") {
-      reduction_op_ = ncclProd;
-    } else {
-      PADDLE_THROW("Invalid reduction. default ncclSum.");
-    }
+    auto reduction_op_ = str_to_nccl_red_type(reduction);
+
    // device id
    int gpu_id =
        BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId();
@@ -113,7 +111,7 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
    }
    VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel()
            << " recv " << out->numel();
-    PADDLE_ENFORCE(platform::dynload::ncclReduce(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
        x->data<T>(), recvbuffer, x->numel(), NCCLTypeWrapper<T>::type,
        reduction_op_, root, comm->comms().at(idx),
        ctx.cuda_device_context().stream()));
@@ -126,8 +124,9 @@ template <typename T>
 class NCCLBcastKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
+                      platform::errors::InvalidArgument(
+                          "This kernel only runs on GPU device."));
    int root = ctx.Attr<int>("root");
    auto* comm = ctx.Input<Communicator>("Communicator");
    // device id
@@ -137,7 +136,7 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
    if (idx == root) {
      auto* x = ctx.Input<LoDTensor>("X");
      VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel();
-      PADDLE_ENFORCE(platform::dynload::ncclBcast(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
          reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), x->numel(),
          NCCLTypeWrapper<T>::type, root, comm->comms().at(idx),
          ctx.cuda_device_context().stream()));
@@ -146,7 +145,7 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
      auto* out = ctx.Output<LoDTensor>("Out");
      VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. recv buffer "
              << framework::product(out->dims());
-      PADDLE_ENFORCE(platform::dynload::ncclBcast(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
          out->mutable_data<T>(ctx.GetPlace()), out->numel(),
          NCCLTypeWrapper<T>::type, root, comm->comms().at(idx),
          ctx.cuda_device_context().stream()));

--- a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc
+++ b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc
@@ -45,9 +45,8 @@ class NCCLTester : public ::testing::Test {
 public:
  void SetUp() override {
    int count = p::GetCUDADeviceCount();
-    if (count <= 1) {
-      LOG(WARNING)
-          << "Cannot test multi-gpu nccl, because the CUDA device count is "
+    if (count <= 0) {
+      LOG(WARNING) << "Cannot test gpu nccl, because the CUDA device count is "
                   << count;
      exit(0);
    }
@@ -114,8 +113,9 @@ class NCCLTester : public ::testing::Test {

    lk.unlock();

-    PADDLE_ENFORCE(send_tensor->numel() == f::product(kDims),
-                   "Tensor numel not match!");
+    PADDLE_ENFORCE_EQ(
+        send_tensor->numel(), f::product(kDims),
+        paddle::platform::errors::InvalidArgument("Tensor numel not match!"));

    auto op = f::OpRegistry::CreateOp(*op1);

@@ -126,6 +126,10 @@ class NCCLTester : public ::testing::Test {
    VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type();
  }

+  void testNcclReduceOp();
+  void testNcclAllReduceOp();
+  void testNcclBcastOp();
+
 public:
  std::vector<p::DeviceContext *> dev_ctxs_;
  f::Scope g_scope_;
@@ -133,13 +137,7 @@ class NCCLTester : public ::testing::Test {
  std::vector<int> gpu_list_;
 };

-// ncclInitOp with desc
-TEST_F(NCCLTester, ncclInitOp) {}
-
-// ncclAllReduceOp with desc
-// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9367
-/*
-TEST_F(NCCLTester, ncclAllReduceOp) {
+void NCCLTester::testNcclAllReduceOp() {
  std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
  op2->SetType("ncclAllReduce");
  op2->SetInput("X", {"st"});
@@ -186,10 +184,8 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
    }
  }
 }
-*/

-// ncclReduceOp with desc
-TEST_F(NCCLTester, ncclReduceOp) {
+void NCCLTester::testNcclReduceOp() {
  std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
  const int kRoot = 0;
  op2->SetType("ncclReduce");
@@ -236,10 +232,7 @@ TEST_F(NCCLTester, ncclReduceOp) {
  }
 }

-// ncclBcastOp with desc
-// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9540
-/*
-TEST_F(NCCLTester, ncclBcastOp) {
+void NCCLTester::testNcclBcastOp() {
  std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
  const int kRoot = 0;
  op2->SetType("ncclBcast");
@@ -263,13 +256,17 @@ TEST_F(NCCLTester, ncclBcastOp) {
    ths[i].join();
  }

-  const int idx = 1;
+  const int idx = gpu_list_.size() - 1;
  float result = GetGPUData(kRoot);

  p::CPUPlace cpu_place;
  p::CUDAPlace gpu_place(gpu_list_[idx]);

-  auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
+  std::string rt_str = "rt";
+  if (idx == kRoot) {
+    rt_str = "st";
+  }
+  auto &recv_tensor = dev_scopes[idx]->FindVar(rt_str)->Get<f::LoDTensor>();
  auto *rt = recv_tensor.data<float>();
  auto *result_tensor = dev_scopes[idx]->Var("ct")->GetMutable<f::LoDTensor>();
  result_tensor->Resize(kDims);
@@ -284,4 +281,20 @@ TEST_F(NCCLTester, ncclBcastOp) {
    ASSERT_NEAR(ct[j], result, 1e-5);
  }
 }
-*/
+
+// ncclInitOp with desc
+TEST_F(NCCLTester, ncclInitOp) {}
+
+TEST_F(NCCLTester, ncclOp) {
+  // Serial execution is required for the same nccl comm.
+
+  // ncclAllReduceOp with desc
+  // TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9367
+  testNcclReduceOp();
+
+  testNcclAllReduceOp();
+
+  // ncclBcastOp with desc
+  // TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9540
+  testNcclBcastOp();
+}
--- a/paddle/fluid/operators/size_op.cc
+++ b/paddle/fluid/operators/size_op.cc
@@ -23,10 +23,9 @@ class SizeOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input (Input) of Size op should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output (Out) of Size op should not be null.");
+    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Size");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Size");
+
    ctx->SetOutputDim("Out", {1});
  }
 };