Commit 7a517dc9 authored by D dzhwinter

merge develop

......@@ -815,3 +815,8 @@ zeros
.. autofunction:: paddle.fluid.layers.zeros
:noindex:
topk
----
.. autofunction:: paddle.fluid.layers.topk
:noindex:
../../v2/dev/contribute_to_paddle_cn.md
\ No newline at end of file
../../v2/dev/contribute_to_paddle_en.md
\ No newline at end of file
......@@ -4,6 +4,8 @@
.. toctree::
:maxdepth: 1
contribute_to_paddle_cn.md
write_docs_cn.md
api_doc_std_cn.md
new_op_cn.md
new_op_kernel.md
......
......@@ -4,6 +4,8 @@ Development
.. toctree::
:maxdepth: 1
contribute_to_paddle_en.md
write_docs_en.md
api_doc_std_en.md
new_op_en.md
new_op_kernel.md
......
../../v2/dev/write_docs_cn.rst
\ No newline at end of file
../../v2/dev/write_docs_en.rst
\ No newline at end of file
......@@ -6,7 +6,43 @@ Data Reader Interface
DataTypes
=========
.. automodule:: paddle.v2.data_type
.. autofunction:: paddle.v2.data_type.dense_array
:noindex:
.. autofunction:: paddle.v2.data_type.integer_value
:noindex:
.. autofunction:: paddle.v2.data_type.integer_value_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.integer_value_sub_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_binary_vector
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_binary_vector_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_binary_vector_sub_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_float_vector
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_float_vector_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_float_vector_sub_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_non_value_slot
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_value_slot
:noindex:
.. autoclass:: paddle.v2.data_type.InputType
:members:
:noindex:
......
This diff is collapsed.
......@@ -102,7 +102,7 @@ cc_test(init_test SRCS init_test.cc DEPS init)
cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
cc_test(channel_test SRCS channel_test.cc)
# cc_test(channel_test SRCS channel_test.cc)
cc_test(tuple_test SRCS tuple_test.cc )
cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op
......
......@@ -51,23 +51,23 @@ void FetchOpHandle::RunImpl() {
auto *var = static_cast<VarHandle *>(input);
var->generated_op_->Wait(cpu_ctx);
}
tensors_.resize(inputs_.size());
auto *var = static_cast<VarHandle *>(inputs_[0]);
auto &var_name = var->name_;
auto *var_handle = static_cast<VarHandle *>(inputs_[0]);
auto &var_name = var_handle->name_;
platform::CPUPlace cpu;
auto &scopes = *local_scopes_;
for (size_t i = 0; i < scopes.size(); ++i) {
auto &scope = scopes[i];
auto &t = scope->FindVar(kLocalExecScopeName)
->Get<Scope *>()
->FindVar(var_name)
->Get<framework::LoDTensor>();
if (platform::is_gpu_place(var->place_)) {
auto *var =
scope->FindVar(kLocalExecScopeName)->Get<Scope *>()->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope",
var_name);
auto &t = var->Get<framework::LoDTensor>();
if (platform::is_gpu_place(t.place())) {
#ifdef PADDLE_WITH_CUDA
TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]);
dev_ctxes_[t.place()]->Wait();
dev_ctxes_.at(t.place())->Wait();
#endif
} else {
tensors_[i].ShareDataWith(t);
......
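A short aside on the switch from dev_ctxes_[t.place()] to dev_ctxes_.at(t.place()) above; the reasoning and the generic map in this snippet are this note's assumptions, not text from the commit:
#include <cassert>
#include <map>
// operator[] on a std::map default-constructs the mapped value for a missing
// key (for a pointer type that means a silent nullptr), while at() throws
// std::out_of_range, so a missing device context fails loudly instead.
int main() {
  std::map<int, int*> ctxs;
  int* a = ctxs[1];  // inserts {1, nullptr} and returns nullptr
  assert(a == nullptr);
  // int* b = ctxs.at(2);  // would throw std::out_of_range
  return 0;
}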
......@@ -89,101 +89,25 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
bool is_forwarding = true;
for (auto *op : program.Block(0).AllOps()) {
bool change_forward = false;
if (!is_forwarding) {
// FIXME(yy): Do not hard code like this
if (op->OutputArgumentNames().size() == 1 &&
op->OutputArgumentNames()[0] == GradVarName(loss_var_name_)) {
continue; // Drop fill 1. for backward coeff;
}
}
// append send op if program is distributed trainer main program.
// always use the first device
if (!is_forwarding && op->Type() == "send") {
auto &p = places_[0];
auto *s = local_scopes_[0];
// FIXME(wuyi): send op always copy from GPU 0
result.ops_.emplace_back(new SendOpHandle(*op, s, p));
// Create inputs on the original place; no SSA output
// is created for the send op.
CreateOpHandleIOs(&result, *op, p, 0);
continue;
}
for (size_t i = 0; i < places_.size(); ++i) {
auto &p = places_[i];
auto *s = local_scopes_[i];
result.ops_.emplace_back(new ComputationOpHandle(*op, s, p));
auto *op_handle = result.ops_.back().get();
CreateOpHandleIOs(&result, *op, p, i);
auto var_names = op->OutputArgumentNames();
if (is_forwarding) {
if (var_names.size() == 1 && var_names[0] == loss_var_name_) {
// Insert ScaleCost OpHandle
#ifdef PADDLE_WITH_CUDA
auto *communication_dev_ctx = nccl_ctxs_->DevCtx(p);
#else
auto *communication_dev_ctx =
platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
#endif
op_handle = new ScaleLossGradOpHandle(local_scopes_.size(), s, p,
communication_dev_ctx);
result.ops_.emplace_back(op_handle);
// FIXME: Currently ScaleLossGradOp only uses device_count as the scale
// factor, so it does not depend on any other operators.
// VarHandle *loss = GetVarHandle(loss_var_name, place);
// loss->pending_ops_.emplace_back(op_handle);
// op_handle->inputs_.emplace_back(loss);
CreateOpOutput(&result, op_handle, GradVarName(loss_var_name_), p, i);
change_forward = true;
}
}
}
if (change_forward) {
if (op->Type() == "send") {
// append send op if program is distributed trainer main program.
// always use the first device
CreateSendOp(&result, *op);
} else if (IsScaleLossOp(*op)) {
CreateScaleLossGradOp(&result);
is_forwarding = false;
}
if (!is_forwarding) {
auto var_names = op->OutputArgumentNames();
// Currently, we assume that once a gradient is generated, it can be
// broadcast, and each gradient is broadcast only once. Other cases are not
// yet handled; for example, we may need to adjust the gradient according to
// the input when the gradient is obtained, which is not considered at present.
for (auto &og : var_names) {
if (grad_names_.count(og) != 0 &&
og_has_been_broadcast.count(og) == 0) { // is param grad
// Insert NCCL AllReduce Op
og_has_been_broadcast.insert(og);
#ifdef PADDLE_WITH_CUDA
result.ops_.emplace_back(
new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
auto *op_handle = result.ops_.back().get();
for (size_t i = 0; i < places_.size(); ++i) {
auto &p = places_[i];
auto &vars = result.vars_[i][og];
if (vars.empty()) { // This device has no data. continue.
continue;
}
auto &prev_grad = vars[vars.size() - 1];
op_handle->AddInput(prev_grad.get());
auto var = new VarHandle(vars.size() - 1, i, og, p);
vars.emplace_back(var);
op_handle->AddOutput(var);
} else {
CreateComputationalOps(&result, *op);
if (!is_forwarding) {
// Currently, we assume that once a gradient is generated, it can be
// broadcast, and each gradient is broadcast only once. Other cases are
// not yet handled; for example, we may need to adjust the gradient
// according to the input when the gradient is obtained, which is not
// considered at present.
for (auto &og : op->OutputArgumentNames()) {
if (IsParameterGradientOnce(og, &og_has_been_broadcast)) {
InsertNCCLAllReduceOp(&result, og);
}
#else
PADDLE_ENFORCE("Not implemented");
#endif
}
}
}
......@@ -207,7 +131,95 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
}
return std::unique_ptr<SSAGraph>(graph);
} // namespace details
}
void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp(
SSAGraph *result, const std::string &og) const {
#ifdef PADDLE_WITH_CUDA
result->ops_.emplace_back(
new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
auto *op_handle = result->ops_.back().get();
for (size_t i = 0; i < places_.size(); ++i) {
auto &p = places_[i];
auto &vars = result->vars_[i][og];
PADDLE_ENFORCE(!vars.empty());
auto &prev_grad = vars.back();
op_handle->AddInput(prev_grad.get());
auto var = new VarHandle(vars.size() - 1, i, og, p);
vars.emplace_back(var);
op_handle->AddOutput(var);
}
#else
PADDLE_ENFORCE("Not implemented");
#endif
}
bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
const std::string &og,
std::unordered_set<std::string> *og_has_been_broadcast) const {
bool is_pg_once =
grad_names_.count(og) != 0 && og_has_been_broadcast->count(og) == 0;
if (is_pg_once) {
// Insert NCCL AllReduce Op
og_has_been_broadcast->insert(og);
}
return is_pg_once;
}
void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
for (size_t i = 0; i < places_.size(); ++i) {
// Insert ScaleCost OpHandle
#ifdef PADDLE_WITH_CUDA
auto *communication_dev_ctx = nccl_ctxs_->DevCtx(places_[i]);
#else
auto *communication_dev_ctx =
platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
#endif
auto *op_handle =
new ScaleLossGradOpHandle(local_scopes_.size(), local_scopes_[i],
places_[i], communication_dev_ctx);
result->ops_.emplace_back(op_handle);
// FIXME: Currently ScaleLossGradOp only uses device_count as the scale
// factor, so it does not depend on any other operators.
// VarHandle *loss = GetVarHandle(loss_var_name, place);
// loss->pending_ops_.emplace_back(op_handle);
// op_handle->inputs_.emplace_back(loss);
CreateOpOutput(result, op_handle, GradVarName(loss_var_name_), places_[i],
i);
}
}
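// A sketch of the scaling the FIXME above refers to, assuming (as the FIXME
// suggests) that ScaleLossGradOpHandle derives its coefficient from the device
// count, roughly coeff_ = 1.0f / local_scopes_.size(). Each of the N devices
// then starts the backward pass from a loss gradient of 1/N, so the later
// all-reduce (a sum across devices) of parameter gradients produces their mean.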
void MultiDevSSAGraphBuilder::CreateComputationalOps(SSAGraph *result,
const OpDesc &op) const {
for (size_t scope_idx = 0; scope_idx < places_.size(); ++scope_idx) {
auto p = places_[scope_idx];
auto s = local_scopes_[scope_idx];
result->ops_.emplace_back(new ComputationOpHandle(op, s, p));
CreateOpHandleIOs(result, op, p, scope_idx);
}
}
void MultiDevSSAGraphBuilder::CreateSendOp(SSAGraph *result,
const OpDesc &op) const {
auto &p = places_[0];
auto *s = local_scopes_[0];
// FIXME(wuyi): send op always copy from GPU 0
result->ops_.emplace_back(new SendOpHandle(op, s, p));
// Create inputs on the original place; no SSA output
// is created for the send op.
CreateOpHandleIOs(result, op, p, 0);
}
bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {
// FIXME(yy): Do not hard code like this
return op.OutputArgumentNames().size() == 1 &&
op.OutputArgumentNames()[0] == GradVarName(loss_var_name_);
}
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -57,6 +57,20 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
#ifdef PADDLE_WITH_CUDA
platform::NCCLContextMap *nccl_ctxs_;
#endif
bool IsScaleLossOp(const OpDesc &op) const;
void CreateSendOp(SSAGraph *result, const OpDesc &op) const;
void CreateComputationalOps(SSAGraph *result, const OpDesc &op) const;
void CreateScaleLossGradOp(SSAGraph *result) const;
bool IsParameterGradientOnce(
const std::string &og,
std::unordered_set<std::string> *og_has_been_broadcast) const;
void InsertNCCLAllReduceOp(SSAGraph *result, const std::string &og) const;
};
} // namespace details
} // namespace framework
......
......@@ -73,8 +73,9 @@ void NCCLAllReduceOpHandle::RunImpl() {
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto *s = local_scopes_[i];
auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto &lod_tensor = s->FindVar(var_name)->Get<LoDTensor>();
auto &lod_tensor = local_scope.FindVar(var_name)->Get<LoDTensor>();
lod_tensors.emplace_back(lod_tensor);
}
......@@ -110,17 +111,21 @@ void NCCLAllReduceOpHandle::RunImpl() {
}
});
} else { // Special handle CPU only Operator's gradient. Like CRF
auto &trg =
*this->local_scopes_[0]->Var()->GetMutable<framework::LoDTensor>();
auto &trg = *this->local_scopes_[0]
->FindVar(kLocalExecScopeName)
->Get<Scope *>()
->Var()
->GetMutable<framework::LoDTensor>();
// Reduce All Tensor to trg in CPU
ReduceLoDTensor func(lod_tensors, &trg);
VisitDataType(ToDataType(lod_tensors[0].type()), func);
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &scope = local_scopes_[i];
auto &scope =
*local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto &p = places_[i];
auto *var = scope->FindVar(var_name);
auto *var = scope.FindVar(var_name);
auto *dev_ctx = dev_ctxes_[p];
RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
......
......@@ -30,10 +30,11 @@ ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
void ScaleLossGradOpHandle::RunImpl() {
std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name_;
auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
float *tmp =
scope_->FindVar(var_name)->GetMutable<LoDTensor>()->mutable_data<float>(
make_ddim({1}), place_);
float *tmp = local_scope.FindVar(var_name)
->GetMutable<LoDTensor>()
->mutable_data<float>(make_ddim({1}), place_);
if (platform::is_cpu_place(place_)) {
*tmp = coeff_;
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <string>
#include <unordered_set>
#include <vector>
......@@ -69,8 +70,7 @@ class GradOpDescMakerBase {
" for input argument with a list of variables, "
" drop_empty_grad is not allowed because it makes"
" the correspondence bewteen a variable and its gradient"
" ambiguous. Use REGISTER_OP_EX to register the op"
" or call InputGrad(?,false) in GradOpDescMaker."
" ambiguous."
" Op type %s",
fwd_op_.Type());
......
......@@ -16,6 +16,8 @@ limitations under the License. */
#include <algorithm>
#include <atomic>
#include <string>
#include <tuple>
#include <type_traits>
#include <typeinfo>
#include <unordered_map>
......@@ -141,36 +143,6 @@ class OpKernelRegistrar : public Registrar {
return 0; \
}
/**
* Macro to register Operator. When the input is duplicable, you should
* use REGISTER_OP_EX with drop_empty_grad=false instead.
*/
#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \
grad_op_class) \
REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type, \
grad_op_class, true)
// When an argument is duplicable, we need to use this version.
// Perhaps we can omit DropEmptyIG template parameter and
// only have one version of REGISTER_OP.
#define REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type, \
grad_op_class, drop_empty_grad) \
REGISTER_OPERATOR(grad_op_type, grad_op_class); \
class _GradOpDescMaker_##grad_op_type##_ \
: public ::paddle::framework::DefaultGradOpDescMaker<drop_empty_grad> { \
using ::paddle::framework::DefaultGradOpDescMaker< \
drop_empty_grad>::DefaultGradOpDescMaker; \
\
protected: \
virtual std::string GradOpType() const { return #grad_op_type; } \
}; \
REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_, \
op_maker_class);
#define REGISTER_OP_WITH_KERNEL(op_type, ...) \
REGISTER_OPERATOR(op_type, ::paddle::framework::OperatorWithKernel, \
##__VA_ARGS__)
#define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \
REGISTER_OPERATOR(op_type, op_class, op_maker_class)
......
......@@ -44,6 +44,7 @@ class ParallelExecutorPrivate {
#endif
std::vector<std::tuple<std::string, proto::VarType::Type, bool>> var_types_;
bool own_local_scope;
};
std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
......@@ -63,13 +64,16 @@ ParallelExecutor::ParallelExecutor(
// Step 1. Bcast the params to devs.
// Create local scopes
if (local_scopes.empty()) {
for (size_t i = 0; i < member_->places_.size(); ++i) {
member_->local_scopes_.push_back(&scope->NewScope());
member_->own_local_scope = true;
member_->local_scopes_.emplace_back(member_->global_scope_);
for (size_t i = 1; i < member_->places_.size(); ++i) {
member_->local_scopes_.emplace_back(&scope->NewScope());
}
} else {
member_->own_local_scope = false;
PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
for (size_t i = 0; i < member_->places_.size(); ++i) {
member_->local_scopes_.push_back(local_scopes[i]);
member_->local_scopes_.emplace_back(local_scopes[i]);
}
}
......@@ -159,7 +163,9 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
const std::string &fetched_var_name) {
platform::RecordBlock b(0);
// Create local scopes.
for (auto &scope : member_->local_scopes_) {
for (auto it = member_->local_scopes_.rbegin();
it != member_->local_scopes_.rend(); ++it) {
auto &scope = *it;
Scope &local_scope = scope->NewScope();
*scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
&local_scope;
......@@ -173,7 +179,7 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
std::get<1>(name_type_pair));
} else {
InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
InitializeVariable(local_scope.Var(std::get<0>(name_type_pair)),
std::get<1>(name_type_pair));
}
}
......@@ -228,5 +234,13 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
}
}
ParallelExecutor::~ParallelExecutor() {
if (member_->own_local_scope) {
for (size_t i = 1; i < member_->local_scopes_.size(); ++i) {
member_->global_scope_->DeleteScope(member_->local_scopes_[i]);
}
}
}
} // namespace framework
} // namespace paddle
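A reading of the scope-ownership convention introduced in the hunks above (an interpretation of the diff, stated here as comments):
// own_local_scope == true:  local_scopes_[0] aliases global_scope_ and is not
//                           owned; local_scopes_[1..N-1] come from NewScope()
//                           and are the ones deleted in ~ParallelExecutor()
//                           (hence the destructor loop starting at i = 1).
// own_local_scope == false: every scope is caller-provided, so the destructor
//                           deletes none of them.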
......@@ -42,6 +42,8 @@ class ParallelExecutor {
const std::vector<Scope*>& local_scopes,
bool allow_op_delay);
~ParallelExecutor();
std::vector<Scope*>& GetLocalScopes();
/**
......
......@@ -110,12 +110,12 @@ function(op_library TARGET)
# Note that it's enough to just add one operator to pybind in a *_op.cc file.
# And for detailed pybind information, please see the generated paddle/pybind/pybind.h.
file(READ ${TARGET}.cc TARGET_CONTENT)
string(REGEX MATCH "REGISTER_OP\\(.*REGISTER_OP\\(" multi_register "${TARGET_CONTENT}")
string(REGEX MATCH "REGISTER_OP\\([a-z0-9_]*," one_register "${multi_register}")
string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}")
string(REGEX MATCH "REGISTER_OPERATOR\\([a-z0-9_]*," one_register "${multi_register}")
if (one_register STREQUAL "")
string(REPLACE "_op" "" TARGET "${TARGET}")
else ()
string(REPLACE "REGISTER_OP(" "" TARGET "${one_register}")
string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}")
string(REPLACE "," "" TARGET "${TARGET}")
endif()
......
......@@ -469,8 +469,6 @@ REGISTER_ACTIVATION_OP_GRAD_MAKER(HardSigmoid, hard_sigmoid);
namespace ops = paddle::operators;
void DummyFunctor() {}
#define FOR_EACH_INPLACE_OP_FUNCTOR(__macro) \
__macro(Sigmoid, sigmoid); \
__macro(Relu, relu); \
......
......@@ -648,7 +648,7 @@ struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
bool Inplace() const { return IsInplace("softrelu"); }
bool Inplace() const { return IsInplace("soft_relu"); }
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
......
......@@ -153,9 +153,11 @@ class BilinearTensorProductOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(bilinear_tensor_product, ops::BilinearTensorProductOp,
ops::BilinearTensorProductOpMaker, bilinear_tensor_product_grad,
ops::BilinearTensorProductOpGrad);
REGISTER_OPERATOR(bilinear_tensor_product, ops::BilinearTensorProductOp,
ops::BilinearTensorProductOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(bilinear_tensor_product_grad,
ops::BilinearTensorProductOpGrad)
REGISTER_OP_CPU_KERNEL(
bilinear_tensor_product,
ops::BilinearTensorProductKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -81,8 +81,9 @@ class ClipOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker<float>, clip_grad,
ops::ClipOpGrad);
REGISTER_OPERATOR(clip, ops::ClipOp, ops::ClipOpMaker<float>,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(clip_grad, ops::ClipOpGrad)
REGISTER_OP_CPU_KERNEL(
clip, ops::ClipKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -103,8 +103,10 @@ class ConcatOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_EX(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad,
ops::ConcatOpGrad, false)
REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker,
paddle::framework::DefaultGradOpDescMaker<
false> /* set false to disable empty grad */)
REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad)
REGISTER_OP_CPU_KERNEL(
concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>)
REGISTER_OP_CPU_KERNEL(
......
......@@ -335,14 +335,17 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
ops::ConvOpGrad);
REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad)
// depthwise convolution op
REGISTER_OP(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
depthwise_conv2d_grad, ops::ConvOpGrad);
REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
ops::ConvOpGrad);
REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad)
REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad)
// depthwise conv kernel
// TODO(xingzhaolong): neon kernel for mobile
......
......@@ -193,8 +193,9 @@ class ConvShiftGradKernel<platform::CPUPlace, T>
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(conv_shift, ops::ConvShiftOp, ops::ConvShiftOpMaker,
conv_shift_grad, ops::ConvShiftGradOp);
REGISTER_OPERATOR(conv_shift, ops::ConvShiftOp, ops::ConvShiftOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(conv_shift_grad, ops::ConvShiftGradOp)
REGISTER_OP_CPU_KERNEL(conv_shift,
ops::ConvShiftKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -298,8 +298,10 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType(
namespace ops = paddle::operators;
REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker,
conv2d_transpose_grad, ops::ConvTransposeOpGrad);
REGISTER_OPERATOR(conv2d_transpose, ops::ConvTransposeOp,
ops::Conv2DTransposeOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(conv2d_transpose_grad, ops::ConvTransposeOpGrad)
REGISTER_OP_CPU_KERNEL(
conv2d_transpose,
......@@ -311,8 +313,10 @@ REGISTER_OP_CPU_KERNEL(
ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
double>);
REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker,
conv3d_transpose_grad, ops::ConvTransposeOpGrad);
REGISTER_OPERATOR(conv3d_transpose, ops::ConvTransposeOp,
ops::Conv3DTransposeOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(conv3d_transpose_grad, ops::ConvTransposeOpGrad)
REGISTER_OP_CPU_KERNEL(
conv3d_transpose,
......
......@@ -153,8 +153,9 @@ class CosSimOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(cos_sim, ops::CosSimOp, ops::CosSimOpMaker, cos_sim_grad,
ops::CosSimOpGrad);
REGISTER_OPERATOR(cos_sim, ops::CosSimOp, ops::CosSimOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(cos_sim_grad, ops::CosSimOpGrad)
REGISTER_OP_CPU_KERNEL(
cos_sim, ops::CosSimKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -153,7 +153,9 @@ class CropOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(crop, ops::CropOp, ops::CropOpMaker, crop_grad, ops::CropOpGrad);
REGISTER_OPERATOR(crop, ops::CropOp, ops::CropOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(crop_grad, ops::CropOpGrad);
REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel<float>);
REGISTER_OP_CPU_KERNEL(
crop_grad, ops::CropGradKernel<paddle::platform::CPUDeviceContext, float>);
......@@ -164,8 +164,9 @@ or not. But the output only shares the LoD information with input X.
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker,
cross_entropy_grad, ops::CrossEntropyGradientOp);
REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp)
REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<float>,
ops::CrossEntropyOpKernel<double>);
REGISTER_OP_CPU_KERNEL(cross_entropy_grad,
......
......@@ -82,7 +82,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
platform::CPUPlace cpu;
auto& gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext&>(ctx);
auto copy_size = tensor.memory_size();
auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type());
payload = memory::Alloc(cpu, copy_size);
memory::Copy(cpu, payload,
......@@ -99,7 +99,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
} else {
payload = tensor.data<void>();
}
payload_size = tensor.memory_size();
payload_size = tensor.numel() * framework::SizeOfType(tensor.type());
e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
} break;
case framework::proto::VarType_Type_SELECTED_ROWS: {
......@@ -118,7 +118,8 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
platform::CPUPlace cpu;
auto& gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext&>(ctx);
auto copy_size = tensor->memory_size();
auto copy_size =
tensor->numel() * framework::SizeOfType(tensor->type());
payload = memory::Alloc(cpu, copy_size);
memory::Copy(cpu, payload,
boost::get<platform::CUDAPlace>(tensor->place()),
......@@ -133,7 +134,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
} else {
payload = slr->mutable_value()->data<void>();
}
payload_size = tensor->memory_size();
payload_size = tensor->numel() * framework::SizeOfType(tensor->type());
e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
} break;
default:
......
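A worked example of the payload-size computation introduced above, with hypothetical shapes and the Paddle headers and namespaces assumed:
// framework::LoDTensor t;
// t.Resize({4, 8});
// t.mutable_data<float>(platform::CPUPlace());
// // t.numel() == 32 and framework::SizeOfType(t.type()) == sizeof(float) == 4,
// // so payload_size = t.numel() * framework::SizeOfType(t.type()) == 128 bytes.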
......@@ -101,8 +101,9 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker, dropout_grad,
ops::DropoutOpGrad);
REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad)
REGISTER_OP_CPU_KERNEL(
dropout, ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -30,8 +30,10 @@ class ElementwiseDivOpMaker : public ElementwiseOpMaker {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(elementwise_div, ops::ElementwiseOp, ops::ElementwiseDivOpMaker,
elementwise_div_grad, ops::ElementwiseOpGrad);
REGISTER_OPERATOR(elementwise_div, ops::ElementwiseOp,
ops::ElementwiseDivOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(elementwise_div_grad, ops::ElementwiseOpGrad)
REGISTER_OP_CPU_KERNEL(
elementwise_div,
ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -29,8 +29,10 @@ class ElementwiseMaxOpMaker : public ElementwiseOpMaker {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(elementwise_max, ops::ElementwiseOp, ops::ElementwiseMaxOpMaker,
elementwise_max_grad, ops::ElementwiseOpGrad);
REGISTER_OPERATOR(elementwise_max, ops::ElementwiseOp,
ops::ElementwiseMaxOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(elementwise_max_grad, ops::ElementwiseOpGrad)
REGISTER_OP_CPU_KERNEL(
elementwise_max,
ops::ElementwiseMaxKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -29,8 +29,10 @@ class ElementwiseMinOpMaker : public ElementwiseOpMaker {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(elementwise_min, ops::ElementwiseOp, ops::ElementwiseMinOpMaker,
elementwise_min_grad, ops::ElementwiseOpGrad);
REGISTER_OPERATOR(elementwise_min, ops::ElementwiseOp,
ops::ElementwiseMinOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(elementwise_min_grad, ops::ElementwiseOpGrad)
REGISTER_OP_CPU_KERNEL(
elementwise_min,
ops::ElementwiseMinKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -31,8 +31,10 @@ class ElementwiseMulOpMaker : public ElementwiseOpMaker {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker,
elementwise_mul_grad, ops::ElementwiseOpGrad);
REGISTER_OPERATOR(elementwise_mul, ops::ElementwiseOp,
ops::ElementwiseMulOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(elementwise_mul_grad, ops::ElementwiseOpGrad)
REGISTER_OP_CPU_KERNEL(
elementwise_mul,
ops::ElementwiseMulKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -29,8 +29,10 @@ class ElementwiseSubOpMaker : public ElementwiseOpMaker {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(elementwise_sub, ops::ElementwiseOp, ops::ElementwiseSubOpMaker,
elementwise_sub_grad, ops::ElementwiseOpGrad);
REGISTER_OPERATOR(elementwise_sub, ops::ElementwiseOp,
ops::ElementwiseSubOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(elementwise_sub_grad, ops::ElementwiseOpGrad)
REGISTER_OP_CPU_KERNEL(
elementwise_sub,
ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -14,6 +14,8 @@ limitations under the License. */
#include "paddle/fluid/operators/expand_op.h"
#include <vector>
namespace paddle {
namespace operators {
......@@ -128,8 +130,9 @@ class ExpandGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(expand, ops::ExpandOp, ops::ExpandOpMaker, expand_grad,
ops::ExpandGradOp);
REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp)
REGISTER_OP_CPU_KERNEL(
expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -98,5 +98,6 @@ FCOpMaker::FCOpMaker(OpProto* proto, OpAttrChecker* op_checker)
} // namespace operators
} // namespace paddle
REGISTER_OP(fc, paddle::operators::FCOp, paddle::operators::FCOpMaker, fc_grad,
paddle::operators::FCOpGrad);
REGISTER_OPERATOR(fc, paddle::operators::FCOp, paddle::operators::FCOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(fc_grad, paddle::operators::FCOpGrad)
......@@ -100,7 +100,8 @@ Out = [[3, 4],
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(gather, ops::GatherOp, ops::GatherOpMaker, gather_grad,
ops::GatherGradOp);
REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(gather_grad, ops::GatherGradOp)
REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>);
REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>);
......@@ -216,7 +216,9 @@ class GRUGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(gru, ops::GRUOp, ops::GRUOpMaker, gru_grad, ops::GRUGradOp);
REGISTER_OPERATOR(gru, ops::GRUOp, ops::GRUOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(gru_grad, ops::GRUGradOp)
REGISTER_OP_CPU_KERNEL(
gru, ops::GRUKernel<paddle::platform::CPUDeviceContext, float>,
ops::GRUKernel<paddle::platform::CPUDeviceContext, double>);
......
......@@ -198,8 +198,9 @@ class GRUUnitGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(gru_unit, ops::GRUUnitOp, ops::GRUUnitOpMaker, gru_unit_grad,
ops::GRUUnitGradOp);
REGISTER_OPERATOR(gru_unit, ops::GRUUnitOp, ops::GRUUnitOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(gru_unit_grad, ops::GRUUnitGradOp)
REGISTER_OP_CPU_KERNEL(
gru_unit, ops::GRUUnitKernel<paddle::platform::CPUDeviceContext, float>,
ops::GRUUnitKernel<paddle::platform::CPUDeviceContext, double>);
......
......@@ -103,8 +103,9 @@ class HingeLossGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker<float>,
hinge_loss_grad, ops::HingeLossGradOp);
REGISTER_OPERATOR(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker<float>,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(hinge_loss_grad, ops::HingeLossGradOp)
REGISTER_OP_CPU_KERNEL(
hinge_loss,
ops::HingeLossKernel<paddle::platform::CPUDeviceContext, float>);
......
......@@ -121,8 +121,9 @@ class HuberLossGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker<float>,
huber_loss_grad, ops::HuberLossGradOp);
REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker<float>,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(huber_loss_grad, ops::HuberLossGradOp)
REGISTER_OP_CPU_KERNEL(
huber_loss,
ops::HuberLossKernel<paddle::platform::CPUDeviceContext, float>);
......
......@@ -148,8 +148,9 @@ class Im2SequenceGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(im2sequence, ops::Im2SequenceOp, ops::Im2SequenceOpMaker,
im2sequence_grad, ops::Im2SequenceGradOp);
REGISTER_OPERATOR(im2sequence, ops::Im2SequenceOp, ops::Im2SequenceOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(im2sequence_grad, ops::Im2SequenceGradOp)
REGISTER_OP_CPU_KERNEL(
im2sequence,
ops::Im2SequenceKernel<paddle::platform::CPUDeviceContext, float>);
......
......@@ -67,8 +67,9 @@ $$Out = \sum{|X|}$$
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(l1_norm, ops::L1NormOp, ops::L1NormOpMaker, l1_norm_grad,
ops::L1NormGradOp);
REGISTER_OPERATOR(l1_norm, ops::L1NormOp, ops::L1NormOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(l1_norm_grad, ops::L1NormGradOp)
REGISTER_OP_CPU_KERNEL(
l1_norm, ops::L1NormKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -117,8 +117,9 @@ class LabelSmoothGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker,
label_smooth_grad, ops::LabelSmoothGradOp);
REGISTER_OPERATOR(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(label_smooth_grad, ops::LabelSmoothGradOp)
REGISTER_OP_CPU_KERNEL(
label_smooth,
ops::LabelSmoothKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -162,8 +162,9 @@ class LayerNormGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
layer_norm_grad, ops::LayerNormGradOp);
REGISTER_OPERATOR(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(layer_norm_grad, ops::LayerNormGradOp)
REGISTER_OP_CPU_KERNEL(
layer_norm, ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>,
ops::LayerNormKernel<paddle::platform::CPUDeviceContext, double>);
......
......@@ -256,8 +256,10 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(linear_chain_crf, ops::LinearChainCRFOp, ops::LinearChainCRFOpMaker,
linear_chain_crf_grad, ops::LinearChainCRFGradOp);
REGISTER_OPERATOR(linear_chain_crf, ops::LinearChainCRFOp,
ops::LinearChainCRFOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(linear_chain_crf_grad, ops::LinearChainCRFGradOp)
REGISTER_OP_CPU_KERNEL(
linear_chain_crf,
ops::LinearChainCRFOpKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -155,8 +155,9 @@ class LoDResetGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker, lod_reset_grad,
ops::LoDResetGradOp);
REGISTER_OPERATOR(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(lod_reset_grad, ops::LoDResetGradOp)
REGISTER_OP_CPU_KERNEL(
lod_reset, ops::LoDResetKernel<paddle::platform::CPUPlace, float>,
ops::LoDResetKernel<paddle::platform::CPUPlace, double>,
......
......@@ -106,8 +106,9 @@ class LogLossGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(log_loss, ops::LogLossOp, ops::LogLossOpMaker<float>, log_loss_grad,
ops::LogLossGradOp);
REGISTER_OPERATOR(log_loss, ops::LogLossOp, ops::LogLossOpMaker<float>,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(log_loss_grad, ops::LogLossGradOp)
REGISTER_OP_CPU_KERNEL(
log_loss, ops::LogLossKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -276,7 +276,9 @@ class LRNOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(lrn, ops::LRNOp, ops::LRNOpMaker<float>, lrn_grad, ops::LRNOpGrad);
REGISTER_OPERATOR(lrn, ops::LRNOp, ops::LRNOpMaker<float>,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(lrn_grad, ops::LRNOpGrad)
REGISTER_OP_CPU_KERNEL(
lrn, ops::LRNKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -273,7 +273,9 @@ class LSTMGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(lstm, ops::LSTMOp, ops::LSTMOpMaker, lstm_grad, ops::LSTMGradOp);
REGISTER_OPERATOR(lstm, ops::LSTMOp, ops::LSTMOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(lstm_grad, ops::LSTMGradOp)
REGISTER_OP_CPU_KERNEL(
lstm, ops::LSTMKernel<paddle::platform::CPUDeviceContext, float>,
ops::LSTMKernel<paddle::platform::CPUDeviceContext, double>);
......
......@@ -97,8 +97,9 @@ class LstmUnitGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker, lstm_unit_grad,
ops::LstmUnitGradOp);
REGISTER_OPERATOR(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(lstm_unit_grad, ops::LstmUnitGradOp)
REGISTER_OP_CPU_KERNEL(lstm_unit,
ops::LstmUnitKernel<paddle::platform::CPUPlace, float>,
ops::LstmUnitKernel<paddle::platform::CPUPlace, double>);
......
......@@ -322,8 +322,9 @@ class LSTMPGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(lstmp, ops::LSTMPOp, ops::LSTMPOpMaker, lstmp_grad,
ops::LSTMPGradOp);
REGISTER_OPERATOR(lstmp, ops::LSTMPOp, ops::LSTMPOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(lstmp_grad, ops::LSTMPGradOp)
REGISTER_OP_CPU_KERNEL(
lstmp, ops::LSTMPKernel<paddle::platform::CPUDeviceContext, float>,
ops::LSTMPKernel<paddle::platform::CPUDeviceContext, double>);
......
......@@ -111,9 +111,10 @@ class MarginRankLossGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(margin_rank_loss, ops::MarginRankLossOp,
ops::MarginRankLossOpMaker<float>, margin_rank_loss_grad,
ops::MarginRankLossGradOp);
REGISTER_OPERATOR(margin_rank_loss, ops::MarginRankLossOp,
ops::MarginRankLossOpMaker<float>,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(margin_rank_loss_grad, ops::MarginRankLossGradOp)
REGISTER_OP_CPU_KERNEL(
margin_rank_loss,
ops::MarginRankLossKernel<paddle::platform::CPUDeviceContext, float>);
......
......@@ -237,8 +237,9 @@ class MatMulOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(matmul, ops::MatMulOp, ops::MatMulOpMaker, matmul_grad,
ops::MatMulOpGrad);
REGISTER_OPERATOR(matmul, ops::MatMulOp, ops::MatMulOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(matmul_grad, ops::MatMulOpGrad)
REGISTER_OP_CPU_KERNEL(
matmul, ops::MatMulKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -101,8 +101,9 @@ class MaxOutOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(maxout, ops::MaxOutOp, ops::MaxOutOpMaker, maxout_grad,
ops::MaxOutOpGrad);
REGISTER_OPERATOR(maxout, ops::MaxOutOp, ops::MaxOutOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(maxout_grad, ops::MaxOutOpGrad)
REGISTER_OP_CPU_KERNEL(
maxout, ops::MaxOutKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -108,9 +108,10 @@ class ModifiedHuberLossGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(modified_huber_loss, ops::ModifiedHuberLossOp,
ops::ModifiedHuberLossOpMaker, modified_huber_loss_grad,
ops::ModifiedHuberLossGradOp);
REGISTER_OPERATOR(modified_huber_loss, ops::ModifiedHuberLossOp,
ops::ModifiedHuberLossOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(modified_huber_loss_grad, ops::ModifiedHuberLossGradOp)
REGISTER_OP_CPU_KERNEL(
modified_huber_loss,
......
......@@ -160,7 +160,9 @@ class MulGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulGradOp);
REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(mul_grad, ops::MulGradOp)
REGISTER_OP_CPU_KERNEL(
mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -14,6 +14,8 @@ limitations under the License. */
#include "paddle/fluid/operators/nce_op.h"
#include <vector>
namespace paddle {
namespace operators {
......@@ -179,7 +181,9 @@ class NCEOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(nce, ops::NCEOp, ops::NCEOpMaker, nce_grad, ops::NCEOpGrad);
REGISTER_OPERATOR(nce, ops::NCEOp, ops::NCEOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(nce_grad, ops::NCEOpGrad)
REGISTER_OP_CPU_KERNEL(nce, ops::NCEKernel<paddle::platform::CPUPlace, float>,
ops::NCEKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(nce_grad,
......
......@@ -85,8 +85,9 @@ class NormOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(norm, ops::NormOp, ops::NormOpMaker<float>, norm_grad,
ops::NormOpGrad);
REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker<float>,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(norm_grad, ops::NormOpGrad)
REGISTER_OP_CPU_KERNEL(
norm, ops::NormKernel<paddle::platform::CPUDeviceContext, float>,
ops::NormKernel<paddle::platform::CPUDeviceContext, double, float>);
......
......@@ -333,8 +333,9 @@ Example:
namespace ops = paddle::operators;
REGISTER_OP(pool2d, ops::PoolOp, ops::Pool2dOpMaker, pool2d_grad,
ops::PoolOpGrad);
REGISTER_OPERATOR(pool2d, ops::PoolOp, ops::Pool2dOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(pool2d_grad, ops::PoolOpGrad)
REGISTER_OP_CPU_KERNEL(
pool2d, ops::PoolKernel<paddle::platform::CPUDeviceContext, float>,
......@@ -343,8 +344,9 @@ REGISTER_OP_CPU_KERNEL(
pool2d_grad, ops::PoolGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::PoolGradKernel<paddle::platform::CPUDeviceContext, double>)
REGISTER_OP(pool3d, ops::PoolOp, ops::Pool3dOpMaker, pool3d_grad,
ops::PoolOpGrad);
REGISTER_OPERATOR(pool3d, ops::PoolOp, ops::Pool3dOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(pool3d_grad, ops::PoolOpGrad)
REGISTER_OP_CPU_KERNEL(
pool3d, ops::PoolKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -258,9 +258,10 @@ Example:
namespace ops = paddle::operators;
REGISTER_OP(max_pool2d_with_index, ops::MaxPoolWithIndexOp,
ops::MaxPool2dWithIndexOpMaker, max_pool2d_with_index_grad,
ops::MaxPoolWithIndexOpGrad);
REGISTER_OPERATOR(max_pool2d_with_index, ops::MaxPoolWithIndexOp,
ops::MaxPool2dWithIndexOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(max_pool2d_with_index_grad, ops::MaxPoolWithIndexOpGrad)
REGISTER_OP_CPU_KERNEL(
max_pool2d_with_index,
......@@ -274,9 +275,10 @@ REGISTER_OP_CPU_KERNEL(
ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, double,
int>)
REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp,
ops::MaxPool3dWithIndexOpMaker, max_pool3d_with_index_grad,
ops::MaxPoolWithIndexOpGrad);
REGISTER_OPERATOR(max_pool3d_with_index, ops::MaxPoolWithIndexOp,
ops::MaxPool3dWithIndexOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(max_pool3d_with_index_grad, ops::MaxPoolWithIndexOpGrad)
REGISTER_OP_CPU_KERNEL(
max_pool3d_with_index,
......
......@@ -83,8 +83,9 @@ class PReluGradOp : public framework::OperatorWithKernel {
namespace ops = paddle::operators;
REGISTER_OP(prelu, ops::PReluOp, ops::PReluOpMaker, prelu_grad,
ops::PReluGradOp);
REGISTER_OPERATOR(prelu, ops::PReluOp, ops::PReluOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(prelu_grad, ops::PReluGradOp)
REGISTER_OP_CPU_KERNEL(
prelu, ops::PReluKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -121,8 +121,9 @@ class RankLossGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(rank_loss, ops::RankLossOp, ops::RankLossOpMaker, rank_loss_grad,
ops::RankLossGradOp);
REGISTER_OPERATOR(rank_loss, ops::RankLossOp, ops::RankLossOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(rank_loss_grad, ops::RankLossGradOp)
REGISTER_OP_CPU_KERNEL(
rank_loss, ops::RankLossKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -14,6 +14,9 @@ limitations under the License. */
#include "paddle/fluid/operators/reduce_op.h"
#include <string>
#include <vector>
namespace paddle {
namespace operators {
......@@ -122,18 +125,18 @@ If reduce_all is true, just reduce along all dimensions and output a scalar.
protected:
std::string comment_;
void Replace(std::string &src, std::string from, std::string to) {
void Replace(std::string *src, std::string from, std::string to) {
std::size_t len_from = std::strlen(from.c_str());
std::size_t len_to = std::strlen(to.c_str());
for (std::size_t pos = src.find(from); pos != std::string::npos;
pos = src.find(from, pos + len_to)) {
src.replace(pos, len_from, to);
for (std::size_t pos = src->find(from); pos != std::string::npos;
pos = src->find(from, pos + len_to)) {
src->replace(pos, len_from, to);
}
}
void SetComment(std::string name, std::string op) {
Replace(comment_, "{ReduceOp}", name);
Replace(comment_, "{reduce}", op);
Replace(&comment_, "{ReduceOp}", name);
Replace(&comment_, "{reduce}", op);
}
};
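A small usage sketch of the pointer-taking Replace and SetComment above, with a hypothetical comment template:
// std::string comment = "{ReduceOp} Operator computes the {reduce} of the input.";
// Replace(&comment, "{ReduceOp}", "ReduceSum");
// Replace(&comment, "{reduce}", "sum");
// // comment == "ReduceSum Operator computes the sum of the input."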
......@@ -187,20 +190,25 @@ class ReduceProdOpMaker : public ReduceOpMaker {
namespace ops = paddle::operators;
REGISTER_OP(reduce_sum, ops::ReduceOp, ops::ReduceSumOpMaker, reduce_sum_grad,
ops::ReduceGradOp);
REGISTER_OPERATOR(reduce_sum, ops::ReduceOp, ops::ReduceSumOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(reduce_sum_grad, ops::ReduceGradOp)
REGISTER_OP(reduce_mean, ops::ReduceOp, ops::ReduceMeanOpMaker,
reduce_mean_grad, ops::ReduceGradOp);
REGISTER_OPERATOR(reduce_mean, ops::ReduceOp, ops::ReduceMeanOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(reduce_mean_grad, ops::ReduceGradOp)
REGISTER_OP(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker, reduce_max_grad,
ops::ReduceGradOp);
REGISTER_OPERATOR(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(reduce_max_grad, ops::ReduceGradOp)
REGISTER_OP(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker, reduce_min_grad,
ops::ReduceGradOp);
REGISTER_OPERATOR(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(reduce_min_grad, ops::ReduceGradOp)
REGISTER_OP(reduce_prod, ops::ReduceOp, ops::ReduceProdOpMaker,
reduce_prod_grad, ops::ReduceGradOp);
REGISTER_OPERATOR(reduce_prod, ops::ReduceOp, ops::ReduceProdOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(reduce_prod_grad, ops::ReduceGradOp)
#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor) \
REGISTER_OP_CPU_KERNEL(reduce_type, \
......
......@@ -113,8 +113,9 @@ class ReshapeGradOp : public framework::OperatorWithKernel {
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
REGISTER_OP(reshape, ops::ReshapeOp, ops::ReshapeOpMaker, reshape_grad,
ops::ReshapeGradOp);
REGISTER_OPERATOR(reshape, ops::ReshapeOp, ops::ReshapeOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(reshape_grad, ops::ReshapeGradOp)
REGISTER_OP_CPU_KERNEL(reshape, ops::ReshapeKernel<CPU, float>,
ops::ReshapeKernel<CPU, double>,
ops::ReshapeKernel<CPU, int>,
......
......@@ -153,8 +153,9 @@ https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, roi_pool_grad,
ops::ROIPoolGradOp);
REGISTER_OPERATOR(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(roi_pool_grad, ops::ROIPoolGradOp)
REGISTER_OP_CPU_KERNEL(
roi_pool,
ops::CPUROIPoolOpKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -250,8 +250,9 @@ class RowConvGradKernel<platform::CPUDeviceContext, T>
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(row_conv, ops::RowConvOp, ops::RowConvOpMaker, row_conv_grad,
ops::RowConvGradOp);
REGISTER_OPERATOR(row_conv, ops::RowConvOp, ops::RowConvOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(row_conv_grad, ops::RowConvGradOp)
REGISTER_OP_CPU_KERNEL(
row_conv, ops::RowConvKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -102,7 +102,8 @@ $$
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(scatter, ops::ScatterOp, ops::ScatterOpMaker, scatter_grad,
ops::ScatterGradOp);
REGISTER_OPERATOR(scatter, ops::ScatterOp, ops::ScatterOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(scatter_grad, ops::ScatterGradOp)
REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel<float>);
REGISTER_OP_CPU_KERNEL(scatter_grad, ops::ScatterGradientOpKernel<float>);
......@@ -124,9 +124,11 @@ class SequenceConcatGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_EX(sequence_concat, ops::SequenceConcatOp,
ops::SequenceConcatOpMaker, sequence_concat_grad,
ops::SequenceConcatGradOp, false);
REGISTER_OPERATOR(sequence_concat, ops::SequenceConcatOp,
ops::SequenceConcatOpMaker,
paddle::framework::DefaultGradOpDescMaker<
false> /* set false to disable empty grad */)
REGISTER_OPERATOR(sequence_concat_grad, ops::SequenceConcatGradOp);
REGISTER_OP_CPU_KERNEL(
sequence_concat,
ops::SequenceConcatOpKernel<paddle::platform::CPUDeviceContext, float>);
......
......@@ -14,6 +14,8 @@ limitations under the License. */
#include "paddle/fluid/operators/sequence_conv_op.h"
#include <algorithm>
namespace paddle {
namespace operators {
......@@ -174,8 +176,9 @@ context_length, context_stride and context_start.
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker,
sequence_conv_grad, ops::SequenceConvGradOp);
REGISTER_OPERATOR(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(sequence_conv_grad, ops::SequenceConvGradOp)
REGISTER_OP_CPU_KERNEL(
sequence_conv,
......
......@@ -200,8 +200,10 @@ class SequenceExpandOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(sequence_expand, ops::SequenceExpandOp, ops::SequenceExpandOpMaker,
sequence_expand_grad, ops::SequenceExpandOpGrad);
REGISTER_OPERATOR(sequence_expand, ops::SequenceExpandOp,
ops::SequenceExpandOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(sequence_expand_grad, ops::SequenceExpandOpGrad)
REGISTER_OP_CPU_KERNEL(
sequence_expand,
ops::SequenceExpandKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -120,8 +120,10 @@ NOTE: The first dimension size of input, the size of offset and Length, should b
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(sequence_slice, ops::SequenceSliceOp, ops::SequenceSliceOpMaker,
sequence_slice_grad, ops::SequenceSliceGradOp);
REGISTER_OPERATOR(sequence_slice, ops::SequenceSliceOp,
ops::SequenceSliceOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(sequence_slice_grad, ops::SequenceSliceGradOp)
REGISTER_OP_CPU_KERNEL(
sequence_slice,
ops::SequenceSliceOpKernel<paddle::platform::CPUDeviceContext, float>);
......
......@@ -155,9 +155,10 @@ class SequenceSoftmaxGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(sequence_softmax, ops::SequenceSoftmaxOp,
ops::SequenceSoftmaxOpMaker, sequence_softmax_grad,
ops::SequenceSoftmaxGradOp);
REGISTER_OPERATOR(sequence_softmax, ops::SequenceSoftmaxOp,
ops::SequenceSoftmaxOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(sequence_softmax_grad, ops::SequenceSoftmaxGradOp)
REGISTER_OP_CPU_KERNEL(
sequence_softmax,
ops::SequenceSoftmaxKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -135,11 +135,12 @@ However the output only shares the LoD with input `X`.
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(sigmoid_cross_entropy_with_logits,
ops::SigmoidCrossEntropyWithLogitsOp,
ops::SigmoidCrossEntropyWithLogitsOpMaker,
sigmoid_cross_entropy_with_logits_grad,
ops::SigmoidCrossEntropyWithLogitsGradOp);
REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits,
ops::SigmoidCrossEntropyWithLogitsOp,
ops::SigmoidCrossEntropyWithLogitsOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits_grad,
ops::SigmoidCrossEntropyWithLogitsGradOp)
REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits,
ops::SigmoidCrossEntropyWithLogitsKernel<
paddle::platform::CPUDeviceContext, float>);
......
......@@ -132,8 +132,9 @@ class SmoothL1LossGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(smooth_l1_loss, ops::SmoothL1LossOp, ops::SmoothL1LossOpMaker,
smooth_l1_loss_grad, ops::SmoothL1LossGradOp);
REGISTER_OPERATOR(smooth_l1_loss, ops::SmoothL1LossOp, ops::SmoothL1LossOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(smooth_l1_loss_grad, ops::SmoothL1LossGradOp)
REGISTER_OP_CPU_KERNEL(
smooth_l1_loss,
ops::SmoothL1LossKernel<paddle::platform::CPUDeviceContext, float>);
......
......@@ -160,8 +160,9 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
namespace ops = paddle::operators;
REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, softmax_grad,
ops::SoftmaxOpGrad);
REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad)
REGISTER_OP_CPU_KERNEL(
softmax, ops::SoftmaxKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/split_byref_op.h"
#include "paddle/fluid/operators/split_op.h"
namespace paddle {
namespace operators {
using framework::Tensor;
class SplitByrefOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of SplitOp should not be null.");
PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL,
"Outputs(Out) of SplitOp should not be empty.");
auto in_dims = ctx->GetInputDim("X");
auto outs_names = ctx->Outputs("Out");
size_t num = static_cast<size_t>(ctx->Attrs().Get<int>("num"));
std::vector<int> sections = static_cast<std::vector<int>>(
ctx->Attrs().Get<std::vector<int>>("sections"));
const size_t outs_number = outs_names.size();
std::vector<framework::DDim> outs_dims;
outs_dims.reserve(outs_number);
if (num > 0) {
int64_t in_axis_dim = in_dims[0];
PADDLE_ENFORCE_EQ(in_axis_dim % num, 0,
"tensor split does not result"
" in an equal division");
size_t out_axis_dim = in_axis_dim / num;
for (size_t i = 0; i < outs_number; ++i) {
auto dim = in_dims;
dim[0] = out_axis_dim;
outs_dims.push_back(dim);
}
} else if (sections.size() > 0) {
PADDLE_ENFORCE_EQ(sections.size(), outs_number,
"tensor split sections size"
"should be equal to output size.");
for (size_t i = 0; i < outs_number; ++i) {
auto dim = in_dims;
dim[0] = sections[i];
outs_dims.push_back(dim);
}
}
ctx->SetOutputsDim("Out", outs_dims);
}
};
class SplitByrefOpMaker : public framework::OpProtoAndCheckerMaker {
public:
SplitByrefOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "(Tensor) Input tensor of the split operator.");
AddOutput("Out", "(Tensor) Output tensors of the split operator.")
.AsDuplicable();
AddComment(R"DOC(
SplitByref operator
Split the source tensor into several tensors along axis 0. No copy is performed
in this operator; the output tensors share the same blocks of memory as the input.
)DOC");
AddAttr<std::vector<int>>("sections",
"(vector<int>) "
"the length of each output along the "
"specified axis.")
.SetDefault(std::vector<int>{});
AddAttr<int>("num",
"(int, default 0)"
"Number of sub-tensors. This must evenly divide "
"Input.dims()[axis]")
.SetDefault(0);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// NOTE: concat op default axis must be 0!
USE_CPU_ONLY_OP(concat);
REGISTER_OPERATOR(split_byref, ops::SplitByrefOp, ops::SplitByrefOpMaker,
ops::SplitGradMaker);
REGISTER_OP_CPU_KERNEL(
split_byref, ops::SplitByrefOpKernel<paddle::platform::CPUPlace, float>);
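The shape-inference rule above is compact but easy to misread in diff form. The following plain-Python sketch (a hypothetical helper, not part of the operator code) restates how the output dims are derived from either `num` or `sections`:
```python
def infer_split_dims(in_dims, outs_number, num=0, sections=None):
    # Mirrors SplitByrefOp::InferShape: split along axis 0 either into
    # `num` equal pieces or into pieces sized by `sections`.
    sections = sections or []
    outs_dims = []
    if num > 0:
        assert in_dims[0] % num == 0, "tensor split does not result in an equal division"
        out_axis_dim = in_dims[0] // num
        outs_dims = [[out_axis_dim] + list(in_dims[1:]) for _ in range(outs_number)]
    elif sections:
        assert len(sections) == outs_number, "sections size should be equal to output size"
        outs_dims = [[s] + list(in_dims[1:]) for s in sections]
    return outs_dims

# e.g. infer_split_dims([6, 4], outs_number=3, num=3) -> [[2, 4], [2, 4], [2, 4]]
```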
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/split_byref_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
split_byref,
ops::SplitByrefOpKernel<paddle::platform::CUDADeviceContext, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class SplitByrefOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto outs = ctx.MultiOutput<framework::Tensor>("Out");
auto place = ctx.GetPlace();
size_t row_offset = 0;
for (size_t i = 0; i < outs.size(); ++i) {
// NOTE: no need to call mutable_data here to allocate memory.
auto* out = outs[i];
VLOG(3) << "spliting by ref: " << row_offset << " " << out->dims()[0];
*out = std::move(in->Slice(row_offset, row_offset + out->dims()[0]));
row_offset += out->dims()[0];
}
}
};
} // namespace operators
} // namespace paddle
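A rough way to picture the "by ref" behavior of this kernel is NumPy slicing, where each output is a view over the input's rows rather than a copy (illustrative only; the real kernel uses `Tensor::Slice`):
```python
import numpy as np

def split_byref(x, sections):
    # Each output aliases a contiguous block of rows of `x`; nothing is copied.
    outs, row_offset = [], 0
    for rows in sections:
        outs.append(x[row_offset:row_offset + rows])  # a view, not a copy
        row_offset += rows
    return outs

x = np.arange(12).reshape(6, 2)
a, b = split_byref(x, [4, 2])
assert np.shares_memory(a, x) and np.shares_memory(b, x)
```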
......@@ -108,21 +108,6 @@ Example:
}
};
class SplitGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto op = new framework::OpDesc();
op->SetType("concat");
op->SetInput("X", OutputGrad("Out"));
op->SetOutput("Out", InputGrad("X"));
op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(op);
}
};
} // namespace operators
} // namespace paddle
......
......@@ -44,5 +44,20 @@ class SplitOpKernel : public framework::OpKernel<T> {
}
};
class SplitGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto op = new framework::OpDesc();
op->SetType("concat");
op->SetInput("X", OutputGrad("Out"));
op->SetOutput("Out", InputGrad("X"));
op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(op);
}
};
} // namespace operators
} // namespace paddle
......@@ -92,7 +92,9 @@ class SppOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(spp, ops::SppOp, ops::SppOpMaker, spp_grad, ops::SppOpGrad);
REGISTER_OPERATOR(spp, ops::SppOp, ops::SppOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(spp_grad, ops::SppOpGrad)
REGISTER_OP_CPU_KERNEL(
spp, ops::SppKernel<paddle::platform::CPUDeviceContext, float>,
ops::SppKernel<paddle::platform::CPUDeviceContext, double>);
......
......@@ -109,9 +109,10 @@ class SquaredL2DistanceGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(squared_l2_distance, ops::SquaredL2DistanceOp,
ops::SquaredL2DistanceOpMaker, squared_l2_distance_grad,
ops::SquaredL2DistanceGradOp);
REGISTER_OPERATOR(squared_l2_distance, ops::SquaredL2DistanceOp,
ops::SquaredL2DistanceOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(squared_l2_distance_grad, ops::SquaredL2DistanceGradOp)
REGISTER_OP_CPU_KERNEL(
squared_l2_distance,
ops::SquaredL2DistanceKernel<paddle::platform::CPUDeviceContext, float>);
......
......@@ -67,8 +67,10 @@ $$Out = \sum_{i} X_{i}^2$$
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(squared_l2_norm, ops::SquaredL2NormOp, ops::SquaredL2NormOpMaker,
squared_l2_norm_grad, ops::SquaredL2NormGradOp);
REGISTER_OPERATOR(squared_l2_norm, ops::SquaredL2NormOp,
ops::SquaredL2NormOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(squared_l2_norm_grad, ops::SquaredL2NormGradOp)
REGISTER_OP_CPU_KERNEL(
squared_l2_norm,
ops::SquaredL2NormKernel<paddle::platform::CPUDeviceContext, float>);
......
......@@ -24,7 +24,6 @@ namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
......@@ -36,9 +35,9 @@ class TopkKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override {
// Get the top k elements of each row of input tensor
// FIXME: only deal with matrix(2d tensor).
auto* input = ctx.Input<LoDTensor>("X");
auto* output = ctx.Output<LoDTensor>("Out");
auto* indices = ctx.Output<LoDTensor>("Indices");
auto* input = ctx.Input<Tensor>("X");
auto* output = ctx.Output<Tensor>("Out");
auto* indices = ctx.Output<Tensor>("Indices");
// k is determined by Attr
const size_t k = static_cast<int>(ctx.Attr<int>("k"));
......
......@@ -118,8 +118,9 @@ class TransposeOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(transpose, ops::TransposeOp, ops::TransposeOpMaker, transpose_grad,
ops::TransposeOpGrad);
REGISTER_OPERATOR(transpose, ops::TransposeOp, ops::TransposeOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad)
REGISTER_OP_CPU_KERNEL(
transpose, ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -132,8 +132,9 @@ class UnpoolOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad,
ops::UnpoolOpGrad);
REGISTER_OPERATOR(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(unpool_grad, ops::UnpoolOpGrad)
REGISTER_OP_CPU_KERNEL(
unpool, ops::UnpoolKernel<paddle::platform::CPUDeviceContext, float>,
ops::UnpoolKernel<paddle::platform::CPUDeviceContext, double>);
......
......@@ -132,8 +132,9 @@ class WarpCTCGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(warpctc, ops::WarpCTCOp, ops::WarpCTCOpMaker, warpctc_grad,
ops::WarpCTCGradOp);
REGISTER_OPERATOR(warpctc, ops::WarpCTCOp, ops::WarpCTCOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(warpctc_grad, ops::WarpCTCGradOp)
REGISTER_OP_CPU_KERNEL(
warpctc, ops::WarpCTCKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -39,20 +39,19 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) {
class NCCLGroupGuard {
public:
static std::mutex &NCCLMutex() {
static std::mutex mtx;
return mtx;
}
inline NCCLGroupGuard() {
mutex().lock();
NCCLMutex().lock();
PADDLE_ENFORCE(dynload::ncclGroupStart());
}
inline ~NCCLGroupGuard() {
PADDLE_ENFORCE(dynload::ncclGroupEnd());
mutex().unlock();
}
private:
static std::mutex &mutex() {
static std::mutex mtx;
return mtx;
NCCLMutex().unlock();
}
};
......@@ -68,26 +67,6 @@ struct NCCLContext {
int device_id() const {
return boost::get<platform::CUDAPlace>(ctx_->GetPlace()).device;
}
static void InitNCCLContext(std::unordered_map<int, NCCLContext> *contexts,
const std::vector<platform::Place> &places) {
std::vector<ncclComm_t> comms;
std::vector<int> devs;
comms.resize(contexts->size());
devs.reserve(contexts->size());
for (auto &p : places) {
devs.push_back(boost::get<platform::CUDAPlace>(p).device);
}
PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
&comms[0], static_cast<int>(contexts->size()), &devs[0]));
int i = 0;
for (auto &dev_id : devs) {
contexts->at(dev_id).comm_ = comms[i++];
}
}
};
struct NCCLContextMap {
......@@ -107,12 +86,12 @@ struct NCCLContextMap {
"NCCL Context Map does not support contain two or more same device");
if (places.size() > 1) {
std::vector<ncclComm_t> comms;
comms.resize(order_.size());
PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
&comms[0], static_cast<int>(order_.size()), &order_[0]));
std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]);
{
std::lock_guard<std::mutex> guard(NCCLGroupGuard::NCCLMutex());
PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
comms.get(), static_cast<int>(order_.size()), order_.data()));
}
int i = 0;
for (auto &dev_id : order_) {
contexts_.at(dev_id).comm_ = comms[i++];
......@@ -120,6 +99,9 @@ struct NCCLContextMap {
}
}
NCCLContextMap(const NCCLContextMap &other) = delete;
NCCLContextMap &operator=(const NCCLContextMap &other) = delete;
CUDADeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); }
CUDADeviceContext *DevCtx(platform::Place p) const {
......
......@@ -420,13 +420,14 @@ class DistributeTranspiler:
# append op to the current block
per_opt_block = append_block
for _, opt_op in enumerate(opt_op_on_pserver):
for idx, opt_op in enumerate(opt_op_on_pserver):
for _, op in enumerate(self.optimize_ops):
# optimizer is connected to itself
if ufind.is_connected(op, opt_op) and \
op not in global_ops:
__append_optimize_op__(op, per_opt_block)
per_opt_block = pserver_program.create_block(append_block.idx)
if idx == len(opt_op_on_pserver) - 1 and global_ops:
per_opt_block = pserver_program.create_block(append_block.idx)
# append global ops
for glb_op in global_ops:
......@@ -824,7 +825,7 @@ class DistributeTranspiler:
for v in splited_vars:
sections.append(v.shape[0])
program.global_block().append_op(
type="split",
type="split_byref",
inputs={"X": orig_var},
outputs={"Out": splited_vars},
attrs={"sections": sections} # assume split evenly
......
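For context, the `sections` attribute built in this hunk is simply the first dimension of each already-split variable; a small sketch with made-up shapes:
```python
# Shapes of the split variables (hypothetical numbers for illustration).
splited_shapes = [(3, 8), (3, 8), (2, 8)]
sections = [shape[0] for shape in splited_shapes]
assert sections == [3, 3, 2] and sum(sections) == 8  # recovers the original axis-0 size
```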
......@@ -32,7 +32,6 @@ __all__ = [
'Switch',
'lod_rank_table',
'max_sequence_len',
'topk',
'lod_tensor_to_array',
'array_to_lod_tensor',
'increment',
......@@ -751,43 +750,6 @@ def max_sequence_len(rank_table):
return res
def topk(input, k):
"""
**topk**
This function performs the operation that selects the k entries in the input
vector and outputs their values and indices as vectors. Thus topk_out[j] is
the j-th largest entry in input, and its index is topk_indices[j]
Args:
input (Variable|list): The input tensor that has all the data.
k (int): The number of top elements that the function will pick.
Returns:
Variable: The variable of type array that contains the k largest entries
from input.
Variable: The variable of type array that contains the indices of k
largest entries from input.
Examples:
.. code-block:: python
x = fluid.layers.data(name='x', shape=[10])
k = 5
array = fluid.layers.topk(x, k)
"""
helper = LayerHelper('topk', **locals())
topk_out = helper.create_tmp_variable(dtype=input.dtype)
topk_indices = helper.create_tmp_variable(dtype='int64')
helper.append_op(
type='top_k',
inputs={'X': [input]},
outputs={'Out': [topk_out],
'Indices': [topk_indices]},
attrs={'k': k})
return topk_out, topk_indices
def lod_tensor_to_array(x, table):
""" Convert a LOD_TENSOR to an LOD_TENSOR_ARRAY.
......
......@@ -20,7 +20,7 @@ from ..initializer import init_on_cpu
__all__ = [
'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
'polynomial_decay', 'piecewise_decay'
'polynomial_decay', 'piecewise_decay', 'noam_decay'
]
"""
When training a model, it's often useful to decay the
......@@ -32,14 +32,41 @@ strategy according to this module.
"""
def _decay_step_counter():
def _decay_step_counter(begin=0):
# the first global step is zero in learning rate decay
global_step = nn.autoincreased_step_counter(
counter_name='@LR_DECAY_COUNTER@', begin=0, step=1)
counter_name='@LR_DECAY_COUNTER@', begin=begin, step=1)
global_step = tensor.cast(global_step, 'float32')
return global_step
def noam_decay(d_model, warmup_steps):
"""Apply decay to learning rate.
```python
lr_value = np.power(d_model, -0.5) * np.min([
np.power(current_steps, -0.5),
np.power(warmup_steps, -1.5) * current_steps
])
```
Args:
d_model(Variable): The dimensionality of input and output of model.
Reference: attention is all you need
https://arxiv.org/pdf/1706.03762.pdf
warmup_steps(Variable): A super parameter.
Returns:
The decayed learning rate.
"""
global_step = _decay_step_counter(1)
with init_on_cpu():
a = global_step**-0.5
b = (warmup_steps**-1.5) * global_step
lr_value = (d_model**-0.5) * ops.elementwise_min(a, b)
return lr_value
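As a sanity check, the schedule can be evaluated outside the graph with plain Python arithmetic (the `d_model` and `warmup_steps` values below are arbitrary examples, not defaults of the API):
```python
def noam_lr(step, d_model=512, warmup_steps=4000):
    # Same formula as the docstring: warmup, then inverse-square-root decay.
    return d_model ** -0.5 * min(step ** -0.5, warmup_steps ** -1.5 * step)

schedule = [noam_lr(s) for s in (1, 1000, 4000, 16000)]
# The learning rate rises during warmup, peaks near step == warmup_steps,
# and decays as 1/sqrt(step) afterwards.
```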
def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
"""Applies exponential decay to the learning rate.
......
......@@ -20,6 +20,7 @@ from ..layer_helper import LayerHelper
from ..initializer import Normal, Constant
from ..framework import Variable
from ..param_attr import ParamAttr
import nn
__all__ = ['accuracy', 'auc']
......@@ -27,17 +28,10 @@ __all__ = ['accuracy', 'auc']
def accuracy(input, label, k=1, correct=None, total=None):
"""
This function computes the accuracy using the input and label.
The output is the top_k inputs and their indices.
The output is the top k inputs and their indices.
"""
helper = LayerHelper("accuracy", **locals())
topk_out = helper.create_tmp_variable(dtype=input.dtype)
topk_indices = helper.create_tmp_variable(dtype="int64")
helper.append_op(
type="top_k",
inputs={"X": [input]},
outputs={"Out": [topk_out],
"Indices": [topk_indices]},
attrs={"k": k})
topk_out, topk_indices = nn.topk(input, k=k)
acc_out = helper.create_tmp_variable(dtype="float32")
if correct is None:
correct = helper.create_tmp_variable(dtype="int64")
......@@ -68,12 +62,7 @@ def auc(input, label, curve='ROC', num_thresholds=200):
helper = LayerHelper("auc", **locals())
topk_out = helper.create_tmp_variable(dtype=input.dtype)
topk_indices = helper.create_tmp_variable(dtype="int64")
helper.append_op(
type="top_k",
inputs={"X": [input]},
outputs={"Out": [topk_out],
"Indices": [topk_indices]},
attrs={"k": k})
topk_out, topk_indices = nn.topk(input, k=k)
auc_out = helper.create_tmp_variable(dtype="float32")
if correct is None:
correct = helper.create_tmp_variable(dtype="int64")
......
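The rewritten `accuracy` (and `auc`) now reuse `nn.topk`; the metric itself is the usual top-k accuracy, sketched here in NumPy with made-up scores:
```python
import numpy as np

def topk_accuracy(scores, labels, k=1):
    # A sample is correct if its label is among the k highest-scoring classes.
    topk_idx = np.argsort(-scores, axis=-1)[:, :k]
    hits = (topk_idx == labels.reshape(-1, 1)).any(axis=1)
    return hits.mean()

scores = np.array([[0.1, 0.7, 0.2], [0.5, 0.3, 0.2]])
labels = np.array([1, 2])
print(topk_accuracy(scores, labels, k=2))  # 0.5: only the first sample hits
```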
......@@ -60,6 +60,7 @@ __all__ = [
'edit_distance',
'l2_normalize',
'matmul',
'topk',
'warpctc',
'sequence_reshape',
'transpose',
......@@ -2576,6 +2577,53 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
return out
def topk(input, k):
"""
This operator is used to find values and indices of the k largest entries
for the last dimension.
If the input is a vector (rank=1), finds the k largest entries in the vector
and outputs their values and indices as vectors. Thus values[j] is the j-th
largest entry in input, and its index is indices[j].
If the input is a Tensor with higher rank, this operator computes the top k
entries along the last dimension.
Args:
input(Variable): The input variable which can be a vector or Tensor with
higher rank.
k(int): An integer value to specify the top k largest elements.
Returns:
values(Variable): The k largest elements along each last dimensional
slice.
indices(Variable): The indices of values within the last dimension of
input.
Examples:
.. code-block:: python
top5_values, top5_indices = layers.topk(input, k=5)
"""
shape = input.shape
if k < 1 or k >= shape[-1]:
raise ValueError("k must be greater than 0 and less than %d." %
(shape[-1]))
helper = LayerHelper("top_k", **locals())
values = helper.create_tmp_variable(dtype=input.dtype)
indices = helper.create_tmp_variable(dtype="int64")
helper.append_op(
type="top_k",
inputs={"X": [input]},
outputs={"Out": [values],
"Indices": [indices]},
attrs={"k": k})
values.stop_gradient = True
indices.stop_gradient = True
return values, indices
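A NumPy reference for what `topk` returns along the last dimension (illustrative only; inside a fluid program the computation is done by the `top_k` operator):
```python
import numpy as np

def topk_ref(x, k):
    idx = np.argsort(-x, axis=-1)[..., :k]       # indices of the k largest entries
    vals = np.take_along_axis(x, idx, axis=-1)   # their values
    return vals, idx

x = np.array([[1.0, 5.0, 3.0], [9.0, 2.0, 4.0]])
vals, idx = topk_ref(x, k=2)
# vals -> [[5., 3.], [9., 4.]], idx -> [[1, 2], [0, 2]]
```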
def edit_distance(input, label, normalized=True, ignored_tokens=None,
name=None):
"""
......@@ -2717,15 +2765,7 @@ def ctc_greedy_decoder(input, blank, name=None):
cost = fluid.layers.ctc_greedy_decoder(input=x, blank=0)
"""
helper = LayerHelper("ctc_greedy_decoder", **locals())
# top 1 op
topk_out = helper.create_tmp_variable(dtype=input.dtype)
topk_indices = helper.create_tmp_variable(dtype="int64")
helper.append_op(
type="top_k",
inputs={"X": [input]},
outputs={"Out": [topk_out],
"Indices": [topk_indices]},
attrs={"k": 1})
_, topk_indices = topk(input, k=1)
# ctc align op
ctc_out = helper.create_tmp_variable(dtype="int64")
......
......@@ -16,6 +16,7 @@ import core
import multiprocessing
import framework
import executor
import warnings
import sys
__all__ = ['ParallelExecutor']
......@@ -62,8 +63,8 @@ class ParallelExecutor(object):
main_program=test_program,
share_vars_from=train_exe)
train_loss, = train_exe.run([loss.name], feed_dict=feed_dict)
test_loss, = test_exe.run([loss.name], feed_dict=feed_dict)
train_loss, = train_exe.run([loss.name], feed=feed_dict)
test_loss, = test_exe.run([loss.name], feed=feed_dict)
"""
self._places = []
......@@ -103,8 +104,8 @@ class ParallelExecutor(object):
self.persistable_vars = [
v.name
for v in filter(lambda var: \
var.persistable and var.type != core.VarDesc.VarType.RAW,
for v in filter(
lambda var: var.persistable and var.type != core.VarDesc.VarType.RAW,
main.list_vars())
]
......@@ -163,7 +164,7 @@ class ParallelExecutor(object):
Returns: fetched result list.
"""
if feed is None:
if feed is None and feed_dict is not None:
feed = feed_dict
print >> sys.stderr, "`feed_dict` is deprecated. Please use `feed=`"
......