diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.cc b/paddle/fluid/framework/details/all_reduce_deps_pass.cc
index 878b950858a71ba0e10ab2643667922420d29099..c44793cd11d22b29b4b3422a047d81fe26624982 100644
--- a/paddle/fluid/framework/details/all_reduce_deps_pass.cc
+++ b/paddle/fluid/framework/details/all_reduce_deps_pass.cc
@@ -13,125 +13,186 @@
 // limitations under the License.
 
 #include
-#include
 #include
 #include
 #include
+#include
 #include
-#include "paddle/fluid/framework/details/all_reduce_deps_pass.h"
 #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
+#include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/op_graph_view.h"
-#include "paddle/fluid/framework/details/var_handle.h"
+#include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
 
-VarHandle* GetValidInput(const OpHandleBase* a) {
-  for (auto p : a->Inputs()) {
-    VarHandle* b = dynamic_cast<VarHandle*>(p);
-    if (b) {
-      return b;
+class AllReduceDepsPass : public ir::Pass {
+ protected:
+  void ApplyImpl(ir::Graph* graph) const override {
+    std::vector<AllReduceOpHandle*> all_reduce_op_handles =
+        GetSortedAllReduceOps(*graph);
+
+    for (size_t i = 1; i < all_reduce_op_handles.size(); ++i) {
+      auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());
+      graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
+      all_reduce_op_handles[i - 1]->AddOutput(dep_var);
+      all_reduce_op_handles[i]->AddInput(dep_var);
     }
-  }
-  return nullptr;
-}
-
-void AllReduceDepsPass::ApplyImpl(ir::Graph* graph) const {
-  auto graph_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
-
-  // get vars order
-  int order = 0;
-  std::unordered_map<std::string, int> vars;
-  // TODO(gongwb): use graph topology sort to find the order of operators.
-  // Note that must assert topology sort is stable
-  auto& ops = graph->Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
-  for (auto* op_desc : ops) {
-    try {
-      bool is_bk_op =
-          static_cast<bool>(boost::get<int>(op_desc->GetAttr(
-                                OpProtoAndCheckerMaker::OpRoleAttrName())) &
-                            static_cast<int>(OpRole::kBackward));
-      if (!is_bk_op) continue;
-
-      auto backward_vars =
-          boost::get<std::vector<std::string>>(op_desc->GetNullableAttr(
-              OpProtoAndCheckerMaker::OpRoleVarAttrName()));
-      PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
-
-      auto outputs = op_desc->Outputs();
-      for (auto& o_it : outputs) {
-        for (auto& v : o_it.second) {  // values
-          vars[v] = order;
-          VLOG(10) << "in all_reduce_deps_pass:" << v;
-        }
-      }
-      order++;
-    } catch (boost::bad_get e) {
+    if (VLOG_IS_ON(10)) {
+      DebugString(*graph, all_reduce_op_handles);
     }
   }
 
-  std::vector<OpHandleBase*> dist_ops;
-  // get allreduce ops.
-  for (auto& op : graph_ops) {
-    // FIXME(gongwb): add broad cast.
-    if (op->Name() == "all_reduce" || op->Name() == "reduce") {
-      dist_ops.push_back(op);
+  std::vector<AllReduceOpHandle*> GetSortedAllReduceOps(
+      const ir::Graph& graph) const {
+    std::vector<AllReduceOpHandle*> all_reduce_op_handles;
+    std::unordered_map<OpHandleBase*, size_t> pending_ops;
+    std::unordered_set<OpHandleBase*> ready_ops;
+    std::unordered_set<OpHandleBase*> next_ready_ops;
+
+    auto op_handles = ir::FilterByNodeWrapper<OpHandleBase>(graph);
+    size_t num_of_ops = op_handles.size();
+    for (OpHandleBase* op : op_handles) {
+      size_t not_ready_vars = op->NotReadyInputSize();
+      if (not_ready_vars) {
+        pending_ops.insert({op, not_ready_vars});
+      } else {
+        ready_ops.insert(op);
+      }
     }
-  }
-
-  VLOG(10) << "dist_ops size:" << dist_ops.size()
-           << ", outputs size:" << vars.size() << ", ops size:" << ops.size();
-
-  std::sort(dist_ops.begin(), dist_ops.end(), [&](OpHandleBase* op1,
-                                                  OpHandleBase* op2) {
-    VarHandle* i0 = dynamic_cast<VarHandle*>(GetValidInput(op1));
-    VarHandle* i1 = dynamic_cast<VarHandle*>(GetValidInput(op2));
-
-    PADDLE_ENFORCE(i0 != nullptr && i1 != nullptr, "%s convert to %s error",
-                   op1->DebugString(), op2->DebugString());
-
-    auto l_it = vars.find(i0->name());
-    auto r_it = vars.find(i1->name());
-
-    PADDLE_ENFORCE(l_it != vars.end() && r_it != vars.end(),
-                   "can't find var's name %s and %s in opdesc", i0->name(),
-                   i1->name());
-
-    if (l_it->second < r_it->second) return true;
+    GetSortedAllReduceOps(ready_ops, &all_reduce_op_handles);
+
+    size_t has_run_ops = ready_ops.size();
+    while (has_run_ops != num_of_ops) {
+      for (auto* op : ready_ops) {
+        for (auto& ready_var : op->Outputs()) {
+          for (auto* pend_op : ready_var->PendingOps()) {
+            auto& deps = --pending_ops[pend_op];
+            if (deps == 0) {
+              next_ready_ops.insert(pend_op);
+            }
+          }
+        }
+      }
 
-    if (l_it->second == r_it->second) {
-      return i0->name() < i1->name();
+      PADDLE_ENFORCE_NE(next_ready_ops.size(), 0, "There may be a cycle.");
+      ready_ops.clear();
+      std::swap(ready_ops, next_ready_ops);
+      GetSortedAllReduceOps(ready_ops, &all_reduce_op_handles);
+      has_run_ops += ready_ops.size();
     }
+    return all_reduce_op_handles;
+  }
 
-    return false;
-  });
-
-  // add dependency.
-  auto& sorted_ops = dist_ops;
-  for (size_t i = 1; i < sorted_ops.size(); ++i) {
-    auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());
-
-    auto* pre_op = sorted_ops[i - 1];
-    auto* op = sorted_ops[i];
-
-    pre_op->AddOutput(dep_var);
-    op->AddInput(dep_var);
-    graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
+  void GetSortedAllReduceOps(
+      const std::unordered_set<OpHandleBase*>& ready_ops,
+      std::vector<AllReduceOpHandle*>* all_reduce_op_handles) const {
+    std::vector<AllReduceOpHandle*> current_all_reduce_op_handles;
+    for (auto& op_handle : ready_ops) {
+      auto all_reduce_op_handle = dynamic_cast<AllReduceOpHandle*>(op_handle);
+      if (all_reduce_op_handle) {
+        current_all_reduce_op_handles.emplace_back(all_reduce_op_handle);
+      }
+    }
 
-    VLOG(10) << "add all_reduce sequential dependencies between " << pre_op
-             << " and " << op;
+    // NOTE(zcd): For distributed training, it is important to keep the order
+    // of allReduce on each node consistent. Otherwise, a hang may occur.
+    // Sort current_all_reduce_op_handles according to the name of the input.
+    std::sort(current_all_reduce_op_handles.begin(),
+              current_all_reduce_op_handles.end(),
+              [](const AllReduceOpHandle* left,
+                 const AllReduceOpHandle* right) -> bool {
+                auto left_in_vars = DynamicCast<VarHandle>(left->Inputs());
+                auto right_in_vars = DynamicCast<VarHandle>(right->Inputs());
+                PADDLE_ENFORCE_GT(left_in_vars.size(), 0);
+                PADDLE_ENFORCE_EQ(left_in_vars.size(), right_in_vars.size());
+                return left_in_vars[0]->Name() > right_in_vars[0]->Name();
+              });
+
+    all_reduce_op_handles->insert(all_reduce_op_handles->end(),
+                                  current_all_reduce_op_handles.begin(),
+                                  current_all_reduce_op_handles.end());
+  }
 
-  VLOG(10) << "pre_op:" << pre_op->DebugString()
-           << ", op:" << op->DebugString();
+  void DebugString(
+      const ir::Graph& graph,
+      const std::vector<AllReduceOpHandle*>& all_reduce_op_handles) const {
+    // get vars order
+    std::map<int, std::vector<std::string>> vars =
+        GetSortedGradientsFromStaleProgram(graph);
+    std::stringstream out;
+    size_t grads_of_stale_program = 0;
+    out << "Get Order From kStaleProgramOpDescs: ";
+    for (auto& var : vars) {
+      out << "Order " << var.first << " [";
+      for (auto& var_name : var.second) {
+        out << var_name << ", ";
+        ++grads_of_stale_program;
+      }
+      out << "], ";
+    }
+    VLOG(10) << out.str();
+
+    std::stringstream out2;
+    out2 << "Get Order From Topological order: ";
+    for (auto& op : all_reduce_op_handles) {
+      bool find_valid_input = false;
+      for (auto& in_var : op->Inputs()) {
+        if (dynamic_cast<VarHandle*>(in_var)) {
+          out2 << in_var->Name() << ", ";
+          find_valid_input = true;
+          break;
+        }
+      }
+      PADDLE_ENFORCE(find_valid_input, "Did not find a valid input.");
+    }
+    VLOG(10) << out2.str();
+    if (grads_of_stale_program != all_reduce_op_handles.size()) {
+      VLOG(10) << "The number of gradients in the stale program and in the "
+                  "graph is not equal.";
+    }
   }
-}
 
+  std::map<int, std::vector<std::string>> GetSortedGradientsFromStaleProgram(
+      const ir::Graph& graph) const {
+    std::map<int, std::vector<std::string>> vars;
+    auto ops = graph.Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
+    int order = 0;
+    for (auto* op_desc : ops) {
+      try {
+        bool is_bk_op =
+            static_cast<bool>(boost::get<int>(op_desc->GetAttr(
+                                  OpProtoAndCheckerMaker::OpRoleAttrName())) &
+                              static_cast<int>(OpRole::kBackward));
+        if (!is_bk_op) continue;
+
+        auto backward_vars =
+            boost::get<std::vector<std::string>>(op_desc->GetNullableAttr(
+                OpProtoAndCheckerMaker::OpRoleVarAttrName()));
+        if (backward_vars.empty()) continue;
+
+        PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
+        for (size_t i = 1; i < backward_vars.size(); i += 2) {
+          vars[order].emplace_back(backward_vars[i]);
+          VLOG(1) << "get parameter and gradient: " << backward_vars[i - 1]
+                  << ", " << backward_vars[i];
+        }
+        order++;
+      } catch (boost::bad_get e) {
+      }
+    }
+    return vars;
+  }
+};
 
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
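The core of the rewritten pass is easy to state: collect the all_reduce ops level by level in a stable topological order, sort each level deterministically, then chain consecutive ops with a control-dependency variable. The sketch below is a self-contained C++ model of that chaining, not Paddle code; FakeOp, the gradient names, and the pointer-based control edge are invented stand-ins, and the comparator mirrors the one used in GetSortedAllReduceOps.

    // Standalone sketch (not Paddle code): models how one BFS level of
    // all-reduce ops is ordered and then chained with control dependencies.
    #include <algorithm>
    #include <cstdio>
    #include <string>
    #include <vector>

    struct FakeOp {
      std::string first_input;             // stands in for the first VarHandle name
      const FakeOp* control_dep = nullptr; // stands in for the DummyVarHandle edge
    };

    int main() {
      // Suppose one BFS level of the graph exposed these all-reduce ops.
      std::vector<FakeOp> level = {{"w2@GRAD"}, {"w0@GRAD"}, {"w1@GRAD"}};

      // Deterministic per-level order, mirroring the pass's comparator
      // (descending by the name of the first input).
      std::sort(level.begin(), level.end(),
                [](const FakeOp& l, const FakeOp& r) {
                  return l.first_input > r.first_input;
                });

      // Chain op i after op i-1, as ApplyImpl does with DummyVarHandle.
      for (size_t i = 1; i < level.size(); ++i) {
        level[i].control_dep = &level[i - 1];
      }
      for (const auto& op : level) {
        std::printf("all_reduce(%s) after %s\n", op.first_input.c_str(),
                    op.control_dep ? op.control_dep->first_input.c_str()
                                   : "none");
      }
      return 0;
    }

Because every trainer sorts each level by the same key, the chained execution order is identical across nodes, which is exactly the property the NOTE(zcd) comment says is needed to avoid hangs.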
diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.h b/paddle/fluid/framework/details/all_reduce_deps_pass.h
deleted file mode 100644
index 4ed3736587aa3d45e288e3dc7e6ab3099f935f41..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/all_reduce_deps_pass.h
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-// TODO(gongwb): overlap allreduce with backward computation.
-class AllReduceDepsPass : public ir::Pass {
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index 6e477cd2977561ddb914e4a6343f677044fad4be..ed75b48090b27a9c430afe067467a1a39d711938 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -28,7 +28,7 @@
 // asynchronous nccl allreduce or synchronous issue:
 // https://github.com/PaddlePaddle/Paddle/issues/15049
 DEFINE_bool(
-    sync_nccl_allreduce, false,
+    sync_nccl_allreduce, true,
    "If set true, will call `cudaStreamSynchronize(nccl_stream)`"
    "after allreduce, this mode can get better performance in some scenarios.");
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 027de4cda410178fbae11f1db9a580c2b7ad22a3..f8bf43bcb48226b4d1317a78ade7179741097378 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -163,14 +163,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
                   "graph_printer", new details::GraphvizSSAGraphPrinter);
     }
 
-    // Verify that the graph is correct for multi-device executor.
-    AppendPass("multi_devices_check_pass");
-
-    if (VLOG_IS_ON(2)) {
-      AppendPass("all_reduce_deps_pass");
-    }
-
-    if (SeqOnlyAllReduceOps(strategy_)) {
+    // Experiments show that the program runs faster if the
+    // all_reduce_deps_pass is appended here.
+    if (!strategy_.enable_parallel_graph_ &&
+        (SeqOnlyAllReduceOps(strategy_) ||
+         strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce)) {
       VLOG(10) << "Add all_reduce_deps_pass";
       AppendPass("all_reduce_deps_pass");
     }
@@ -179,6 +176,9 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
       VLOG(10) << "Add modify_op_lock_and_record_event_pass";
       AppendPass("modify_op_lock_and_record_event_pass");
     }
+
+    // Verify that the graph is correct for multi-device executor.
+    AppendPass("multi_devices_check_pass");
   }
 
   // Convert graph to run on multi-devices.
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index 5811693b7ce6d6f2aebc9a8896960226295bd3e5..cc48c51e924039d93b2e1e18bea752611e7bef92 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -91,7 +91,11 @@ struct BuildStrategy {
 
   bool enable_sequential_execution_{false};
 
-  bool fuse_broadcast_op_{false};
+  // NOTE(zcd): In reduce mode, fusing broadcast ops may make the program
+  // faster, because fusing broadcast ops is equivalent to delaying the
+  // execution of all broadcast ops; in that case, all nccl streams are used
+  // only for reduce operations for a period of time.
+  bool fuse_broadcast_ops_{false};
 
   // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
   // num_trainers is 1, so the current fields of build_strategy doesn't tell if
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index 297ee92fc3c84c2feec9cb85bd8671ce8ad94ed0..3e805bd5b480241954960f92a72514723c3a8bb7 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -56,6 +56,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
   fetches.resize(fetch_tensors.size());
   std::unordered_map<std::string, std::vector<VarHandleBase*>> fetched_vars;
   std::vector<FetchOpHandle*> fetch_ops;
+  std::vector<FetchOpHandle*> ready_fetch_ops;
 
   for (auto &fetch_var_name : fetch_tensors) {
     for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
@@ -70,8 +71,9 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
     auto &var_name = fetch_tensors[i];
     auto fetched_var_it = fetched_vars.find(var_name);
     PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(),
-                   "Cannot find fetched variable.(Perhaps the main_program "
-                   "is not set to ParallelExecutor)");
+                   "Cannot find fetched variable(%s).(Perhaps the main_program "
+                   "is not set to ParallelExecutor)",
+                   var_name);
 
     auto &vars = fetched_var_it->second;
 
@@ -88,7 +90,11 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
       op->AddInput(var);
     }
 
-    (*op_deps)[op] = static_cast<int>(op->NotReadyInputSize());
+    int dep = static_cast<int>(op->NotReadyInputSize());
+    (*op_deps)[op] = dep;
+    if (dep == 0) {
+      ready_fetch_ops.emplace_back(op);
+    }
   }
 
   size_t num_complete = 0;
@@ -97,7 +103,9 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
   for (auto op : bootstrap_ops_) {
     RunOpAsync(op_deps.get(), op, complete_q);
   }
-
+  for (auto op : ready_fetch_ops) {
+    RunOpAsync(op_deps.get(), op, complete_q);
+  }
   while (num_complete != op_deps->size()) {
     size_t num_comp = complete_q->Pop();
     if (num_comp == -1UL) {
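The executor changes above (and the threaded_ssa_graph_executor change below) are instances of the same counter-based scheduling: an op whose not-ready-input count is zero is runnable immediately, and a finishing op decrements the counters of its consumers. A minimal standalone sketch of that discipline, with invented op names rather than the executor's actual types:

    // Standalone sketch (not Paddle code): dependency-count scheduling.
    #include <cstdio>
    #include <map>
    #include <queue>
    #include <string>
    #include <vector>

    int main() {
      // op name -> number of not-yet-ready inputs (invented example data).
      std::map<std::string, int> op_deps = {
          {"fetch_loss", 1}, {"fetch_acc", 0}, {"all_reduce", 0}};
      // edges: when the key op finishes, these ops lose one pending input.
      std::map<std::string, std::vector<std::string>> pending = {
          {"all_reduce", {"fetch_loss"}}};

      std::queue<std::string> ready;
      for (auto& kv : op_deps)
        if (kv.second == 0) ready.push(kv.first);  // the "ready_fetch_ops" idea

      while (!ready.empty()) {
        auto op = ready.front();
        ready.pop();
        std::printf("run %s\n", op.c_str());
        for (auto& next : pending[op])
          if (--op_deps[next] == 0) ready.push(next);
      }
      return 0;
    }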
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 232d82a5da596a78d2999c4a4c4f7dda0c7cad7e..6c8b8937ebe646042f71cb58cfbc2d32426a4e3c 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -13,9 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
-
 #include <string>
 #include <vector>
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
@@ -44,6 +44,7 @@ void FetchOpHandle::WaitAndMergeCPUTensors() const {
 }
 
 void FetchOpHandle::RunImpl() {
+  platform::RecordEvent record_event(Name());
   WaitInputVarGenerated(platform::CPUPlace());
 
   tensors_.resize(inputs_.size());
@@ -62,7 +63,8 @@ void FetchOpHandle::RunImpl() {
       auto &t = var->Get<framework::LoDTensor>();
       if (platform::is_gpu_place(t.place())) {
 #ifdef PADDLE_WITH_CUDA
-        TensorCopySync(t, cpu, &tensors_[i]);
+        TensorCopy(t, cpu, *dev_ctxes_.at(t.place()), &tensors_[i]);
+        dev_ctxes_.at(t.place())->Wait();
 #endif
       } else {
         tensors_[i].ShareDataWith(t);
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index e22bd3917895be8d84a83b3986c6919564b2ddab..f213e07b555ca9fc4b73a2f91412063f4e7f47d4 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -658,7 +658,7 @@ bool ReduceSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result,
 
 void ReduceSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const {
   if (UseGPU()) {
-    if (strategy_.fuse_broadcast_op_) {
+    if (strategy_.fuse_broadcast_ops_) {
       CreateFusedBroadcastOp(result, bcast_var_name_set_);
     } else {
       for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) {
@@ -1021,7 +1021,7 @@ void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const {
       strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
     return;
   }
-  if (strategy_.fuse_broadcast_op_) {
+  if (strategy_.fuse_broadcast_ops_) {
     CreateFusedBroadcastOp(result, bcast_var_name_set_);
   } else {
     for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) {
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index 413b14961631b3459e0d05af685ad1c5395844c2..69cd84ebf2d678c089141f09a92c46e3a03fe4d9 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -68,7 +68,7 @@ void OpHandleBase::Run(bool use_cuda) {
       if (out_var_handle) {
         PADDLE_ENFORCE(
             platform::is_same_place(place, out_var_handle->place()),
-            "The place of input(%s) is not consistent with the "
+            "The place of output(%s) is not consistent with the "
            "place of current op(%s).",
             out_var_handle->Name(), Name());
         out_var_handle->SetGenerateEvent(events_.at(dev_id));
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 5dde0b76b816bc5309b455d58deb8942300c6af5..67246a4dd448b0ce2f115d6438c5fdd6cc39ca6d 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -80,7 +80,6 @@ inline FeedFetchList ThreadedSSAGraphExecutor::RunImpl(
     }
     set.clear();
   };
-  auto run_all_op = [&](OpHandleBase *op) { RunOp(ready_vars, op); };
   // Clean run context
   run_op_futures_.clear();
   exception_holder_.Clear();
@@ -116,7 +115,7 @@ inline FeedFetchList ThreadedSSAGraphExecutor::RunImpl(
           auto &deps = pending_ops[op];
           --deps;
           if (deps == 0) {
-            run_all_op(op);
+            ready_ops.insert(op);
           }
         }
       }
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index 353db435213c74982d582e5be298ecfb1a810f30..e6f5b15af8cd440a9304235acfe62787c5f1b134 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -617,6 +617,25 @@ void OpDesc::Flush() {
 
 static std::once_flag init_infer_shape_funcs;
 
+/**
+ * NOTE(paddle-dev): Very tricky code here. Maybe we should find a
+ * better way to register the compile-time infershape method gently.
+ *
+ * Normally, we can register a class derived from InferShapeBase, so that
+ * we can set the field of `infer_shape_` inside OpInfo when registering op.
+ *
+ * However, there is another way we can set the field of `infer_shape_` inside
+ * OpInfo. Usually, we overload the InferShape method of OperatorWithKernel.
+ * After running the following method InitInferShapeFuncs, `infer_shape_` would
+ * be set to the InferShape method of OperatorWithKernel. That is to say, we
+ * borrow the run-time InferShape method of OperatorWithKernel to be the
+ * compile-time InferShape method.
+ *
+ * However, during compile time, we may not know the inputs, outputs and attrs
+ * of the run-time OperatorWithKernel. So the following code creates a fake
+ * OperatorWithKernel object. That is why the field info_ of OperatorBase
+ * would be null.
+ */
 static void InitInferShapeFuncs() {
   std::call_once(init_infer_shape_funcs, [] {
     auto &map = OpInfoMap::Instance();
@@ -628,11 +647,16 @@ static void InitInferShapeFuncs() {
       PADDLE_ENFORCE(it != info_map.end(), "%s has not been registered",
                      op_type);
       auto &op_info = it->second;
-      auto op = static_cast<OperatorWithKernel *>(op_info.Creator()(
-          "", VariableNameMap{}, VariableNameMap{}, AttributeMap{}));
       if (op_info.infer_shape_) {  // infer_shape has been registered.
         continue;
       }
+
+      auto op = dynamic_cast<OperatorWithKernel *>(op_info.Creator()(
+          "", VariableNameMap{}, VariableNameMap{}, AttributeMap{}));
+
+      PADDLE_ENFORCE_NOT_NULL(
+          op, "InferShapeBase is not registered to Operator %s", op_type);
+
       op_info.infer_shape_ = [op](InferShapeContext *ctx) {
         op->InferShape(ctx);
       };
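The NOTE in op_desc.cc describes a registration trick: a throwaway operator instance is created only so that its virtual InferShape method can be captured into the compile-time infer_shape_ functor. A minimal sketch of the same capture pattern, with invented type names rather than Paddle's:

    // Standalone sketch (not Paddle code): "borrowing" a virtual method from
    // a throwaway instance by capturing the instance in a std::function.
    #include <cstdio>
    #include <functional>

    struct Context {};

    struct RuntimeOp {
      virtual ~RuntimeOp() = default;
      virtual void InferShape(Context* /*ctx*/) const {
        std::printf("runtime rule\n");
      }
    };

    int main() {
      std::function<void(Context*)> infer_shape;  // plays the role of infer_shape_

      // A fake instance created only so its virtual InferShape can be reused;
      // it intentionally leaks here, mirroring the lambda capture in
      // InitInferShapeFuncs.
      auto* op = new RuntimeOp();
      infer_shape = [op](Context* ctx) { op->InferShape(ctx); };

      Context ctx;
      infer_shape(&ctx);  // compile-time call routed to the run-time method
      return 0;
    }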
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index a2a8083da955c93175ab2f01a37737c145e6f1b8..4245caf1689c76d72b410c742488c55562c8b998 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -19,11 +19,6 @@ limitations under the License. */
 #include
 #include
 #include
-#include "paddle/fluid/framework/ir/graph_helper.h"
-
-#include "paddle/fluid/framework/ir/graph.h"
-
-#include "paddle/fluid/framework/details/all_reduce_deps_pass.h"
 #include "paddle/fluid/framework/details/async_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
@@ -31,6 +26,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/details/reference_count_pass_helper.h"
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/platform/profiler.h"
 
 #ifdef WITH_GPERFTOOLS
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index e5036d940197ef012cbfd8f52700c8aeb54fb6c5..0109b4a4fa7617880f642f5a39639bca38475515 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -142,7 +142,6 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
 
 void AnalysisConfig::EnableMKLDNN() {
 #ifdef PADDLE_WITH_MKLDNN
-  pass_builder()->EnableMKLDNN();
   use_mkldnn_ = true;
 #else
   LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
@@ -235,16 +234,13 @@ void AnalysisConfig::Update() {
   }
 
   if (use_mkldnn_) {
+#ifdef PADDLE_WITH_MKLDNN
     if (!enable_ir_optim_) {
       LOG(ERROR)
           << "EnableMKLDNN() only works when IR optimization is enabled.";
+    } else {
+      pass_builder()->EnableMKLDNN();
     }
-#ifdef PADDLE_WITH_MKLDNN
-    pass_builder()->EnableMKLDNN();
-    use_mkldnn_ = true;
-#else
-    LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
-    use_mkldnn_ = false;
 #endif
   }
 
@@ -256,9 +252,6 @@ void AnalysisConfig::Update() {
   }
 #ifdef PADDLE_WITH_MKLDNN
   pass_builder()->EnableMkldnnQuantizer();
-#else
-  LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnQuantizer";
-  use_mkldnn_quantizer_ = false;
 #endif
 }
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index 258a79fa4e884177490fab79778151ae52537aa0..c89dd41e0a6283e0723e2925f28c0372cda6a2b2 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -27,6 +27,7 @@
 #include
 #include
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/string/printf.h"
 
@@ -266,17 +267,17 @@ static std::string DescribeZeroCopyTensor(const ZeroCopyTensor &tensor) {
 }
 
 static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
-                      double latency, int epoch = 1) {
-  LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat
-            << ", threads: " << num_threads << ", thread id: " << tid
-            << ", latency: " << latency << "ms, fps: " << 1 / (latency / 1000.f)
+                      double batch_latency, int epoch = 1) {
+  PADDLE_ENFORCE(batch_size > 0, "Non-positive batch size.");
+  double sample_latency = batch_latency / batch_size;
+  LOG(INFO) << "====== threads: " << num_threads << ", thread id: " << tid
             << " ======";
-  if (epoch > 1) {
-    int samples = batch_size * epoch;
-    LOG(INFO) << "====== sample number: " << samples
-              << ", average latency of each sample: " << latency / samples
-              << "ms ======";
-  }
+  LOG(INFO) << "====== batch_size: " << batch_size << ", iterations: " << epoch
+            << ", repetitions: " << repeat << " ======";
+  LOG(INFO) << "====== batch latency: " << batch_latency
+            << "ms, number of samples: " << batch_size * epoch
+            << ", sample latency: " << sample_latency
+            << "ms, fps: " << 1000.f / sample_latency << " ======";
 }
 
 static bool IsFileExists(const std::string &path) {
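The reworked PrintTime derives per-sample latency and fps from the measured per-batch latency, so the arithmetic is worth sanity-checking in isolation. A small sketch with invented numbers (batch size 50, two iterations), not the helper itself:

    // Standalone sketch of the latency bookkeeping in PrintTime.
    #include <cassert>
    #include <cstdio>

    int main() {
      int batch_size = 50;
      int iterations = 2;               // number of processed batches
      double batch_latency_ms = 125.0;  // measured average per batch
      assert(batch_size > 0);

      double sample_latency_ms = batch_latency_ms / batch_size;  // 2.5 ms
      double fps = 1000.0 / sample_latency_ms;                   // 400 samples/s
      int num_samples = batch_size * iterations;                 // 100

      std::printf("batch latency: %.2f ms, samples: %d\n", batch_latency_ms,
                  num_samples);
      std::printf("sample latency: %.2f ms, fps: %.1f\n", sample_latency_ms,
                  fps);
      return 0;
    }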
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 1d1d39e44096b9f50e5bc9603fa12aba92b0e8e2..87e02a02caebd93d701dfd9e51c35fb974c770ed 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -64,10 +64,12 @@ void PaddlePassBuilder::DeletePass(size_t idx) {
   passes_.erase(std::begin(passes_) + idx);
 }
 
-void GpuPassStrategy::EnableMKLDNN() {
-  LOG(ERROR) << "GPU not support MKLDNN yet";
+void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
+  analysis_passes_.push_back(pass);
 }
 
+void PaddlePassBuilder::ClearPasses() { passes_.clear(); }
+
 // The following passes works for Anakin sub-graph engine.
 const std::vector<std::string> kAnakinSubgraphPasses({
     "infer_clean_graph_pass",  //
@@ -102,12 +104,12 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
   use_gpu_ = true;
 }
 
-void GpuPassStrategy::EnableMkldnnQuantizer() {
-  LOG(ERROR) << "GPU not support MKL-DNN quantization";
+void GpuPassStrategy::EnableMKLDNN() {
+  LOG(ERROR) << "GPU does not support MKLDNN yet";
 }
 
-void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
-  analysis_passes_.push_back(pass);
+void GpuPassStrategy::EnableMkldnnQuantizer() {
+  LOG(ERROR) << "GPU does not support MKL-DNN quantization";
 }
 
 CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
@@ -135,5 +137,39 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
   });
   use_gpu_ = false;
 }
-void PaddlePassBuilder::ClearPasses() { passes_.clear(); }
+
+void CpuPassStrategy::EnableMKLDNN() {
+// TODO(Superjomn) Consider the way to mix CPU with GPU.
+#ifdef PADDLE_WITH_MKLDNN
+  if (!use_mkldnn_) {
+    passes_.insert(passes_.begin(), "mkldnn_placement_pass");
+
+    for (auto &pass : std::vector<std::string>(
+             {"depthwise_conv_mkldnn_pass",    //
+              "conv_bn_fuse_pass",             // Execute BN passes again to
+              "conv_eltwiseadd_bn_fuse_pass",  // preserve correct pass order
+              "conv_bias_mkldnn_fuse_pass",    //
+              "conv3d_bias_mkldnn_fuse_pass",  //
+              "conv_elementwise_add_mkldnn_fuse_pass",
+              "conv_relu_mkldnn_fuse_pass"})) {
+      passes_.push_back(pass);
+    }
+  }
+  use_mkldnn_ = true;
+#else
+  use_mkldnn_ = false;
+#endif
+}
+
+void CpuPassStrategy::EnableMkldnnQuantizer() {
+#ifdef PADDLE_WITH_MKLDNN
+  if (!use_mkldnn_quantizer_) {
+    passes_.push_back("cpu_quantize_placement_pass");
+  }
+  use_mkldnn_quantizer_ = true;
+#else
+  use_mkldnn_quantizer_ = false;
+#endif
+}
+
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index 48da8c156f426477011bcc060260c812ad94df23..09ef195d5e66aff0cef17f1594de34c656187a35 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -109,43 +109,16 @@ class CpuPassStrategy : public PassStrategy {
   CpuPassStrategy();
 
   explicit CpuPassStrategy(const CpuPassStrategy &other)
-      : PassStrategy(other.AllPasses()) {}
+      : PassStrategy(other.AllPasses()) {
+    use_gpu_ = other.use_gpu_;
+    use_mkldnn_ = other.use_mkldnn_;
+    use_mkldnn_quantizer_ = other.use_mkldnn_quantizer_;
+  }
 
   virtual ~CpuPassStrategy() = default;
 
-  void EnableMKLDNN() override {
-// TODO(Superjomn) Consider the way to mix CPU with GPU.
-#ifdef PADDLE_WITH_MKLDNN
-    if (!use_mkldnn_) {
-      passes_.insert(passes_.begin(), "mkldnn_placement_pass");
-
-      for (auto &pass : std::vector<std::string>(
-               {"depthwise_conv_mkldnn_pass",    //
-                "conv_bn_fuse_pass",             // Execute BN passes again to
-                "conv_eltwiseadd_bn_fuse_pass",  // preserve correct pass order
-                "conv_bias_mkldnn_fuse_pass",    //
-                "conv3d_bias_mkldnn_fuse_pass",  //
-                "conv_relu_mkldnn_fuse_pass",    //
-                "conv_elementwise_add_mkldnn_fuse_pass"})) {
-        passes_.push_back(pass);
-      }
-    }
-    use_mkldnn_ = true;
-#else
-    use_mkldnn_ = false;
-#endif
-  }
-
-  void EnableMkldnnQuantizer() override {
-#ifdef PADDLE_WITH_MKLDNN
-    if (!use_mkldnn_quantizer_) {
-      passes_.push_back("cpu_quantize_placement_pass");
-    }
-    use_mkldnn_quantizer_ = true;
-#else
-    use_mkldnn_quantizer_ = false;
-#endif
-  }
+  void EnableMKLDNN() override;
+  void EnableMkldnnQuantizer() override;
 
  protected:
   bool use_mkldnn_quantizer_{false};
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 647913cc80727786379e2e5525b372818e423d23..8ecb0310c9775393631b99681e13cbea7a5b781e 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -26,7 +26,11 @@ endfunction()
 function(inference_analysis_api_int8_test target model_dir data_dir filename)
   inference_analysis_test(${target} SRCS ${filename}
       EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} benchmark
-      ARGS --infer_model=${model_dir}/model --infer_data=${data_dir}/data.bin --batch_size=100)
+      ARGS --infer_model=${model_dir}/model
+           --infer_data=${data_dir}/data.bin
+           --warmup_batch_size=100
+           --batch_size=50
+           --iterations=2)
 endfunction()
 
 function(inference_analysis_api_test_with_fake_data target install_dir filename model_name)
@@ -146,22 +150,22 @@ inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv
 
 # int8 image classification tests
 if(WITH_MKLDNN)
-  set(INT8_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8")
+  set(INT8_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8v2")
   if (NOT EXISTS ${INT8_DATA_DIR})
-    inference_download_and_uncompress(${INT8_DATA_DIR} ${INFERENCE_URL}"/int8" "imagenet_val_100.tar.gz")
+    inference_download_and_uncompress(${INT8_DATA_DIR} "${INFERENCE_URL}/int8" "imagenet_val_100_tail.tar.gz")
   endif()
 
   #resnet50 int8
   set(INT8_RESNET50_MODEL_DIR "${INT8_DATA_DIR}/resnet50")
   if (NOT EXISTS ${INT8_RESNET50_MODEL_DIR})
-    inference_download_and_uncompress(${INT8_RESNET50_MODEL_DIR} ${INFERENCE_URL}"/int8" "resnet50_int8_model.tar.gz" )
+    inference_download_and_uncompress(${INT8_RESNET50_MODEL_DIR} "${INFERENCE_URL}/int8" "resnet50_int8_model.tar.gz" )
   endif()
   inference_analysis_api_int8_test(test_analyzer_int8_resnet50 ${INT8_RESNET50_MODEL_DIR} ${INT8_DATA_DIR} analyzer_int8_image_classification_tester.cc SERIAL)
 
   #mobilenet int8
   set(INT8_MOBILENET_MODEL_DIR "${INT8_DATA_DIR}/mobilenet")
   if (NOT EXISTS ${INT8_MOBILENET_MODEL_DIR})
-    inference_download_and_uncompress(${INT8_MOBILENET_MODEL_DIR} ${INFERENCE_URL}"/int8" "mobilenetv1_int8_model.tar.gz" )
+    inference_download_and_uncompress(${INT8_MOBILENET_MODEL_DIR} "${INFERENCE_URL}/int8" "mobilenetv1_int8_model.tar.gz" )
   endif()
   inference_analysis_api_int8_test(test_analyzer_int8_mobilenet ${INT8_MOBILENET_MODEL_DIR} ${INT8_DATA_DIR} analyzer_int8_image_classification_tester.cc SERIAL)
 endif()
diff --git a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc
index e73358d8827a40786beb05fad931267b0dd88f6b..9b2e74ec16eb3b6e98bfcc8cc546ed74a7966f33 100644
--- a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc
@@ -154,7 +154,7 @@ void profile(bool use_mkldnn = false) {
     config.EnableMKLDNN();
   }
 
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
   std::vector<std::vector<PaddleTensor>> inputs;
   LoadInputData(&inputs);
   TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&config),
diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
index 735e4fb563788438ee49ff6308d11f4dbe4962be..e10d239a5d1b30e089a110c6155520e3b035860a 100644
--- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
@@ -197,7 +197,7 @@ void profile(bool use_mkldnn = false) {
     cfg.SetMKLDNNOp(op_list);
   }
 
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
 
@@ -206,9 +206,11 @@ void profile(bool use_mkldnn = false) {
 
   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
     PADDLE_ENFORCE_GT(outputs.size(), 0);
-    size_t size = GetSize(outputs[0]);
+    auto output = outputs.back();
+    PADDLE_ENFORCE_GT(output.size(), 0);
+    size_t size = GetSize(output[0]);
     PADDLE_ENFORCE_GT(size, 0);
-    float *result = static_cast<float *>(outputs[0].data.data());
+    float *result = static_cast<float *>(output[0].data.data());
     for (size_t i = 0; i < size; i++) {
       EXPECT_NEAR(result[i], result_data[i], 1e-3);
     }
diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc
index 5a4f9a31a164a8fca3f80ce2fe2e6065fd04b340..ece094717b8076321c68d7fdd29f07c4da6b0ed4 100644
--- a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc
@@ -17,8 +17,6 @@ limitations under the License. */
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
-DEFINE_int32(iterations, 0, "Number of iterations");
-
 namespace paddle {
 namespace inference {
 namespace analysis {
@@ -30,8 +28,13 @@ void SetConfig(AnalysisConfig *cfg) {
   cfg->SwitchIrOptim();
   cfg->SwitchSpecifyInputNames(false);
   cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
-  cfg->EnableMKLDNN();
+
+  cfg->pass_builder()->SetPasses(
+      {"infer_clean_graph_pass", "mkldnn_placement_pass",
+       "depthwise_conv_mkldnn_pass", "conv_bn_fuse_pass",
+       "conv_eltwiseadd_bn_fuse_pass", "conv_bias_mkldnn_fuse_pass",
+       "conv_elementwise_add_mkldnn_fuse_pass", "conv_relu_mkldnn_fuse_pass",
+       "fc_fuse_pass", "is_test_pass"});
 }
 
 template <typename T>
@@ -40,8 +43,8 @@ class TensorReader {
  public:
   TensorReader(std::ifstream &file, size_t beginning_offset,
                std::vector<int> shape, std::string name)
       : file_(file), position(beginning_offset), shape_(shape), name_(name) {
-    numel =
-        std::accumulate(shape_.begin(), shape_.end(), 1, std::multiplies<T>());
+    numel = std::accumulate(shape_.begin(), shape_.end(), size_t{1},
+                            std::multiplies<size_t>());
   }
 
   PaddleTensor NextBatch() {
@@ -71,10 +74,14 @@ class TensorReader {
 };
 
 std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData(
-    const std::vector<std::vector<PaddleTensor>> &test_data, int num_images) {
+    const std::vector<std::vector<PaddleTensor>> &test_data,
+    int num_images = FLAGS_warmup_batch_size) {
   int test_data_batch_size = test_data[0][0].shape[0];
-  CHECK_LE(static_cast<size_t>(num_images),
-           test_data.size() * test_data_batch_size);
+  auto iterations_max = test_data.size();
+  PADDLE_ENFORCE(
+      static_cast<size_t>(num_images) <= iterations_max * test_data_batch_size,
+      "The requested quantization warmup data size " +
+          std::to_string(num_images) + " is larger than the test data size.");
 
   PaddleTensor images;
   images.name = "input";
@@ -120,20 +127,17 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs,
   std::vector<int> image_batch_shape{batch_size, 3, 224, 224};
   std::vector<int> label_batch_shape{batch_size, 1};
+  auto images_offset_in_file = static_cast<size_t>(file.tellg());
   auto labels_offset_in_file =
-      static_cast<size_t>(file.tellg()) +
-      sizeof(float) * total_images *
-          std::accumulate(image_batch_shape.begin() + 1,
-                          image_batch_shape.end(), 1, std::multiplies<int>());
+      images_offset_in_file + sizeof(float) * total_images * 3 * 224 * 224;
 
-  TensorReader<float> image_reader(file, 0, image_batch_shape, "input");
+  TensorReader<float> image_reader(file, images_offset_in_file,
+                                   image_batch_shape, "input");
   TensorReader<int64_t> label_reader(file, labels_offset_in_file,
                                      label_batch_shape, "label");
 
-  auto iterations = total_images / batch_size;
-  if (FLAGS_iterations > 0 && FLAGS_iterations < iterations)
-    iterations = FLAGS_iterations;
-  for (auto i = 0; i < iterations; i++) {
+  auto iterations_max = total_images / batch_size;
+  for (auto i = 0; i < iterations_max; i++) {
     auto images = image_reader.NextBatch();
     auto labels = label_reader.NextBatch();
     inputs->emplace_back(
@@ -148,20 +152,21 @@ TEST(Analyzer_int8_resnet50, quantization) {
   AnalysisConfig q_cfg;
   SetConfig(&q_cfg);
 
+  // read data from file and prepare batches with test data
   std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all, 100);
+  SetInput(&input_slots_all);
 
+  // prepare warmup batch from input data read earlier
+  // warmup batch size can be different than batch size
   std::shared_ptr<std::vector<PaddleTensor>> warmup_data =
-      GetWarmupData(input_slots_all, 100);
+      GetWarmupData(input_slots_all);
 
+  // configure quantizer
   q_cfg.EnableMkldnnQuantizer();
   q_cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data);
-  q_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(100);
+  q_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(FLAGS_warmup_batch_size);
 
-  CompareQuantizedAndAnalysis(
-      reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-      reinterpret_cast<const PaddlePredictor::Config *>(&q_cfg),
-      input_slots_all);
+  CompareQuantizedAndAnalysis(&cfg, &q_cfg, input_slots_all);
 }
 
 }  // namespace analysis
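Pulled out of the test above, the quantization setup reduces to a few AnalysisConfig calls. The following is a usage sketch under assumptions, not the test's exact flow: the model path is a placeholder, and the warmup batch is assumed to be prepared elsewhere (for example by a helper like GetWarmupData).

    // Hypothetical usage sketch based on the test above (not the test itself).
    #include <memory>
    #include <vector>
    #include "paddle/fluid/inference/api/paddle_analysis_config.h"
    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    std::unique_ptr<paddle::PaddlePredictor> BuildInt8Predictor(
        std::shared_ptr<std::vector<paddle::PaddleTensor>> warmup_data) {
      paddle::AnalysisConfig q_cfg;
      q_cfg.SetModel("/path/to/model");  // placeholder path

      // Collect scale statistics from a warmup batch, then quantize.
      q_cfg.EnableMkldnnQuantizer();
      q_cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data);
      q_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(100);

      return paddle::CreatePaddlePredictor(q_cfg);
    }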
diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
index 347672eaae314aa42096d48a3b044014f2ddbf84..142905dcd8d9964d93d0c5f7444823eef2b84900 100644
--- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
@@ -124,7 +124,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 TEST(Analyzer_LAC, profile) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -137,11 +137,13 @@ TEST(Analyzer_LAC, profile) {
         24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, 25, 25, 25, 25, 44, 24,
         25, 25, 25, 36, 42, 43, 44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39,
         14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23};
-    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
-    size_t size = GetSize(outputs[0]);
+    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    auto output = outputs.back();
+    PADDLE_ENFORCE_EQ(output.size(), 1UL);
+    size_t size = GetSize(output[0]);
     size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t);
     PADDLE_ENFORCE_GE(size, batch1_size);
-    int64_t *pdata = static_cast<int64_t *>(outputs[0].data.data());
+    int64_t *pdata = static_cast<int64_t *>(output[0].data.data());
     for (size_t i = 0; i < batch1_size; ++i) {
       EXPECT_EQ(pdata[i], lac_ref_data[i]);
     }
diff --git a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc
index 089f655c180d784af66af60277bdbf32a6019599..2eb347a44b394a55706d5aa88bee7fe1fcc7838e 100644
--- a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc
@@ -96,7 +96,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 void profile(bool use_mkldnn = false) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
 
   if (use_mkldnn) {
     cfg.EnableMKLDNN();
@@ -108,8 +108,9 @@ void profile(bool use_mkldnn = false) {
                  input_slots_all, &outputs, FLAGS_num_threads);
 
   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
-    PADDLE_ENFORCE_EQ(outputs.size(), 2UL);
-    for (auto &output : outputs) {
+    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    PADDLE_ENFORCE_EQ(outputs.back().size(), 2UL);
+    for (auto &output : outputs.back()) {
       size_t size = GetSize(output);
       PADDLE_ENFORCE_GT(size, 0);
       float *result = static_cast<float *>(output.data.data());
diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
index a70aa7a6ac41121a0c8ea397ebc7e24e4b206d12..36e07d5f55600dc7aa96227289f707fb19f92d56 100644
--- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
@@ -106,7 +106,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 void profile(bool memory_load = false) {
   AnalysisConfig cfg;
   SetConfig(&cfg, memory_load);
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -117,10 +117,12 @@ void profile(bool memory_load = false) {
     // the first inference result
     const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
                                            48, 39, 38, 16, 25};
-    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
-    size_t size = GetSize(outputs[0]);
+    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    auto output = outputs.back();
+    PADDLE_ENFORCE_EQ(output.size(), 1UL);
+    size_t size = GetSize(output[0]);
     PADDLE_ENFORCE_GT(size, 0);
-    int64_t *result = static_cast<int64_t *>(outputs[0].data.data());
+    int64_t *result = static_cast<int64_t *>(output[0].data.data());
     for (size_t i = 0; i < std::min(11UL, size); i++) {
       EXPECT_EQ(result[i], chinese_ner_result_data[i]);
     }
diff --git a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc
index 5157bd280d0f3ee327d5cee7799477b5e6fd3f71..9443b08063b8f61d3d6b291a7217d645d8825c54 100644
--- a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc
@@ -127,7 +127,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 TEST(Analyzer_Pyramid_DNN, profile) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -135,10 +135,12 @@ TEST(Analyzer_Pyramid_DNN, profile) {
                  input_slots_all, &outputs, FLAGS_num_threads);
 
   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data && !FLAGS_zero_copy) {
-    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
-    size_t size = GetSize(outputs[0]);
+    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    auto output = outputs.back();
+    PADDLE_ENFORCE_EQ(output.size(), 1UL);
+    size_t size = GetSize(output[0]);
     PADDLE_ENFORCE_GT(size, 0);
-    float *result = static_cast<float *>(outputs[0].data.data());
+    float *result = static_cast<float *>(output[0].data.data());
     // output is probability, which is in (0, 1).
     for (size_t i = 0; i < size; i++) {
       EXPECT_GT(result[i], 0);
diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
index 629981d565f1b6eeabc192287cb9f892df21b8e4..d4330e6cddf8818ace01be2f13a4c18a192c46e1 100644
--- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
@@ -40,7 +40,7 @@ void profile(bool use_mkldnn = false) {
   if (use_mkldnn) {
     cfg.EnableMKLDNN();
   }
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
index dcf4b38ce8a9230148738cfd0840ca96b0c7cf8c..54fd3a4a4caba52110ab636e6d44ee2a473f0cb0 100644
--- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
@@ -229,7 +229,7 @@ TEST(Analyzer_rnn1, profile) {
   SetConfig(&cfg);
   cfg.DisableGpu();
   cfg.SwitchIrDebug();
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -280,7 +280,7 @@ TEST(Analyzer_rnn1, compare_determine) {
 TEST(Analyzer_rnn1, multi_thread) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
index 007f9f0b66a7b276f5f2e8500a3001788ad41e79..9ccbf58cbd2bbaab9b1a132c27e50356e1a5df37 100644
--- a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
@@ -126,7 +126,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 TEST(Analyzer_rnn2, profile) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -136,9 +136,11 @@ TEST(Analyzer_rnn2, profile) {
   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
     // the first inference result
     PADDLE_ENFORCE_GT(outputs.size(), 0);
-    size_t size = GetSize(outputs[0]);
+    auto output = outputs.back();
+    PADDLE_ENFORCE_GT(output.size(), 0);
+    size_t size = GetSize(output[0]);
     PADDLE_ENFORCE_GT(size, 0);
-    float *result = static_cast<float *>(outputs[0].data.data());
+    float *result = static_cast<float *>(output[0].data.data());
     for (size_t i = 0; i < size; i++) {
       EXPECT_NEAR(result[i], result_data[i], 1e-3);
     }
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
index 47c1d7375843e4bad212c1d7d621c9e6d45e5982..9f23b9f037bcaeb758312d011067ae29c82e73cd 100644
--- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
@@ -110,7 +110,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 TEST(Analyzer_seq_conv1, profile) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -119,10 +119,12 @@ TEST(Analyzer_seq_conv1, profile) {
 
   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
     // the first inference result
-    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
-    size_t size = GetSize(outputs[0]);
+    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    auto output = outputs.back();
+    PADDLE_ENFORCE_EQ(output.size(), 1UL);
+    size_t size = GetSize(output[0]);
     PADDLE_ENFORCE_GT(size, 0);
-    float *result = static_cast<float *>(outputs[0].data.data());
+    float *result = static_cast<float *>(output[0].data.data());
     // output is probability, which is in (0, 1).
     for (size_t i = 0; i < size; i++) {
       EXPECT_GT(result[i], 0);
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
index 19fa5528da4d11d2eb1a2f932f60a84c3f5468e7..d6f7f468a6c83bd6c4ac087931d0c6b7cac3cc1c 100644
--- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
@@ -156,7 +156,7 @@ void profile(bool use_mkldnn = false) {
   AnalysisConfig cfg;
   SetConfig(&cfg, use_mkldnn);
 
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
   TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
index 2003be82019333ca97b9fa8ef83668825fe5710d..54492dbc238bbaf25f86b300fdd6585f74365088 100644
--- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
@@ -70,7 +70,7 @@ TEST(Analyzer_Text_Classification, profile) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
   cfg.SwitchIrDebug();
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -79,8 +79,9 @@ TEST(Analyzer_Text_Classification, profile) {
 
   if (FLAGS_num_threads == 1) {
     // Get output
-    LOG(INFO) << "get outputs " << outputs.size();
-    for (auto &output : outputs) {
+    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    LOG(INFO) << "get outputs " << outputs.back().size();
+    for (auto &output : outputs.back()) {
       LOG(INFO) << "output.shape: " << to_string(output.shape);
       // no lod ?
       CHECK_EQ(output.lod.size(), 0UL);
diff --git a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc
index a925da312cde30380b4997b8b76a0d425a71e817..bd4f1b61973fb0de06dcc288e329c94756d5ed47 100644
--- a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc
@@ -186,7 +186,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 void profile(bool use_mkldnn = false) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
   if (use_mkldnn) {
     cfg.EnableMKLDNN();
   }
diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
index ca04c1365cbbffcb4a2786cde9ab240cc20aa3d8..fb47048cd0ccc887927cb4b533d45df11ef633eb 100644
--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@@ -87,7 +87,7 @@ void profile(bool use_mkldnn = false) {
     cfg.EnableMKLDNN();
   }
   // cfg.pass_builder()->TurnOnDebug();
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -100,7 +100,8 @@ void profile(bool use_mkldnn = false) {
   auto refer = ProcessALine(line);
   file.close();
 
-  auto &output = outputs.front();
+  PADDLE_ENFORCE_GT(outputs.size(), 0);
+  auto &output = outputs.back().front();
   size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
   CHECK_EQ(numel, refer.data.size());
   for (size_t i = 0; i < numel; ++i) {
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index 33f1d0254858814be20eee1a6c2faaf00c2e8178..9a0dcc722cf00984b8c0e3ac20f13849e2904102 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -41,7 +41,10 @@
 DEFINE_string(model_name, "", "model name");
 DEFINE_string(infer_model, "", "model path");
 DEFINE_string(infer_data, "", "data file");
 DEFINE_string(refer_result, "", "reference result for comparison");
-DEFINE_int32(batch_size, 1, "batch size.");
+DEFINE_int32(batch_size, 1, "batch size");
+DEFINE_int32(warmup_batch_size, 100, "batch size for quantization warmup");
+// setting iterations to 0 means processing the whole dataset
+DEFINE_int32(iterations, 0, "number of batches to process");
 DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
 DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
 DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
@@ -239,7 +242,7 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
       }
       input.shape = shape;
       input.dtype = PaddleDType::FLOAT32;
-      size_t len = std::accumulate(shape.begin(), shape.end(), 1,
+      size_t len = std::accumulate(shape.begin(), shape.end(), size_t{1},
                                    [](int a, int b) { return a * b; });
       input.data.Resize(len * sizeof(float));
       input.lod.assign({{0, static_cast<size_t>(FLAGS_batch_size)}});
@@ -286,17 +289,18 @@ void ConvertPaddleTensorToZeroCopyTensor(
 
 void PredictionWarmUp(PaddlePredictor *predictor,
                       const std::vector<std::vector<PaddleTensor>> &inputs,
-                      std::vector<PaddleTensor> *outputs, int num_threads,
-                      int tid) {
+                      std::vector<std::vector<PaddleTensor>> *outputs,
+                      int num_threads, int tid) {
   int batch_size = FLAGS_batch_size;
   LOG(INFO) << "Running thread " << tid << ", warm up run...";
   if (FLAGS_zero_copy) {
     ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[0]);
   }
+  outputs->resize(1);
   Timer warmup_timer;
   warmup_timer.tic();
   if (!FLAGS_zero_copy) {
-    predictor->Run(inputs[0], outputs, batch_size);
+    predictor->Run(inputs[0], &(*outputs)[0], batch_size);
   } else {
     predictor->ZeroCopyRun();
   }
@@ -308,11 +312,16 @@ void PredictionWarmUp(PaddlePredictor *predictor,
 
 void PredictionRun(PaddlePredictor *predictor,
                    const std::vector<std::vector<PaddleTensor>> &inputs,
-                   std::vector<PaddleTensor> *outputs, int num_threads,
-                   int tid) {
-  int batch_size = FLAGS_batch_size;
+                   std::vector<std::vector<PaddleTensor>> *outputs,
+                   int num_threads, int tid) {
   int num_times = FLAGS_repeat;
-  LOG(INFO) << "Thread " << tid << " run " << num_times << " times...";
+  int iterations = inputs.size();  // process the whole dataset ...
+  if (FLAGS_iterations > 0 && FLAGS_iterations < inputs.size())
+    iterations =
+        FLAGS_iterations;  // ... unless the number of iterations is set
+  outputs->resize(iterations);
+  LOG(INFO) << "Thread " << tid << ", number of threads " << num_threads
+            << ", run " << num_times << " times...";
   Timer run_timer;
   double elapsed_time = 0;
 #ifdef WITH_GPERFTOOLS
@@ -320,14 +329,14 @@ void PredictionRun(PaddlePredictor *predictor,
 #endif
   if (!FLAGS_zero_copy) {
     run_timer.tic();
-    for (size_t i = 0; i < inputs.size(); i++) {
+    for (size_t i = 0; i < iterations; i++) {
       for (int j = 0; j < num_times; j++) {
-        predictor->Run(inputs[i], outputs, batch_size);
+        predictor->Run(inputs[i], &(*outputs)[i], FLAGS_batch_size);
       }
     }
     elapsed_time = run_timer.toc();
   } else {
-    for (size_t i = 0; i < inputs.size(); i++) {
+    for (size_t i = 0; i < iterations; i++) {
       ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[i]);
       run_timer.tic();
       for (int j = 0; j < num_times; j++) {
@@ -340,13 +349,14 @@ void PredictionRun(PaddlePredictor *predictor,
   ProfilerStop();
 #endif
 
-  PrintTime(batch_size, num_times, num_threads, tid, elapsed_time / num_times,
-            inputs.size());
+  auto batch_latency = elapsed_time / (iterations * num_times);
+  PrintTime(FLAGS_batch_size, num_times, num_threads, tid, batch_latency,
+            iterations);
   if (FLAGS_record_benchmark) {
     Benchmark benchmark;
     benchmark.SetName(FLAGS_model_name);
-    benchmark.SetBatchSize(batch_size);
-    benchmark.SetLatency(elapsed_time / num_times);
+    benchmark.SetBatchSize(FLAGS_batch_size);
+    benchmark.SetLatency(batch_latency);
     benchmark.PersistToFile("benchmark_record.txt");
   }
 }
@@ -354,16 +364,17 @@ void PredictionRun(PaddlePredictor *predictor,
 void TestOneThreadPrediction(
     const PaddlePredictor::Config *config,
     const std::vector<std::vector<PaddleTensor>> &inputs,
-    std::vector<PaddleTensor> *outputs, bool use_analysis = true) {
+    std::vector<std::vector<PaddleTensor>> *outputs, bool use_analysis = true) {
   auto predictor = CreateTestPredictor(config, use_analysis);
-  PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0);
-  PredictionRun(predictor.get(), inputs, outputs, 1, 0);
+  PredictionWarmUp(predictor.get(), inputs, outputs, FLAGS_paddle_num_threads,
+                   0);
+  PredictionRun(predictor.get(), inputs, outputs, FLAGS_paddle_num_threads, 0);
 }
 
 void TestMultiThreadPrediction(
     const PaddlePredictor::Config *config,
     const std::vector<std::vector<PaddleTensor>> &inputs,
-    std::vector<PaddleTensor> *outputs, int num_threads,
+    std::vector<std::vector<PaddleTensor>> *outputs, int num_threads,
     bool use_analysis = true) {
   std::vector<std::thread> threads;
   std::vector<std::unique_ptr<PaddlePredictor>> predictors;
@@ -376,7 +387,7 @@ void TestMultiThreadPrediction(
     threads.emplace_back([&, tid]() {
       // Each thread should have local inputs and outputs.
      // The inputs of each thread are all the same.
-      std::vector<PaddleTensor> outputs_tid;
+      std::vector<std::vector<PaddleTensor>> outputs_tid;
       auto &predictor = predictors[tid];
 #ifdef PADDLE_WITH_MKLDNN
       if (use_analysis) {
@@ -384,8 +395,8 @@ void TestMultiThreadPrediction(
             ->SetMkldnnThreadID(static_cast<int>(tid) + 1);
       }
 #endif
-      PredictionWarmUp(predictor.get(), inputs, outputs, num_threads, tid);
-      PredictionRun(predictor.get(), inputs, outputs, num_threads, tid);
+      PredictionWarmUp(predictor.get(), inputs, &outputs_tid, num_threads, tid);
+      PredictionRun(predictor.get(), inputs, &outputs_tid, num_threads, tid);
     });
   }
   for (int i = 0; i < num_threads; ++i) {
@@ -395,8 +406,8 @@ void TestMultiThreadPrediction(
 
 void TestPrediction(const PaddlePredictor::Config *config,
                     const std::vector<std::vector<PaddleTensor>> &inputs,
-                    std::vector<PaddleTensor> *outputs, int num_threads,
-                    bool use_analysis = FLAGS_use_analysis) {
+                    std::vector<std::vector<PaddleTensor>> *outputs,
+                    int num_threads, bool use_analysis = FLAGS_use_analysis) {
   PrintConfig(config, use_analysis);
   if (num_threads == 1) {
     TestOneThreadPrediction(config, inputs, outputs, use_analysis);
@@ -406,30 +417,41 @@ void TestPrediction(const PaddlePredictor::Config *config,
   }
 }
 
-void CompareTopAccuracy(const std::vector<PaddleTensor> &output_slots1,
-                        const std::vector<PaddleTensor> &output_slots2) {
-  // first output: avg_cost
-  if (output_slots1.size() == 0 || output_slots2.size() == 0)
+void CompareTopAccuracy(
+    const std::vector<std::vector<PaddleTensor>> &output_slots_quant,
+    const std::vector<std::vector<PaddleTensor>> &output_slots_ref) {
+  if (output_slots_quant.size() == 0 || output_slots_ref.size() == 0)
     throw std::invalid_argument(
         "CompareTopAccuracy: output_slots vector is empty.");
-  PADDLE_ENFORCE(output_slots1.size() >= 2UL);
-  PADDLE_ENFORCE(output_slots2.size() >= 2UL);
-  // second output: acc_top1
-  if (output_slots1[1].lod.size() > 0 || output_slots2[1].lod.size() > 0)
-    throw std::invalid_argument(
-        "CompareTopAccuracy: top1 accuracy output has nonempty LoD.");
-  if (output_slots1[1].dtype != paddle::PaddleDType::FLOAT32 ||
-      output_slots2[1].dtype != paddle::PaddleDType::FLOAT32)
-    throw std::invalid_argument(
-        "CompareTopAccuracy: top1 accuracy output is of a wrong type.");
-  float *top1_quantized = static_cast<float *>(output_slots1[1].data.data());
-  float *top1_reference = static_cast<float *>(output_slots2[1].data.data());
-  LOG(INFO) << "top1 INT8 accuracy: " << *top1_quantized;
-  LOG(INFO) << "top1 FP32 accuracy: " << *top1_reference;
+
+  float total_accs1_quant{0};
+  float total_accs1_ref{0};
+  for (size_t i = 0; i < output_slots_quant.size(); ++i) {
+    PADDLE_ENFORCE(output_slots_quant[i].size() >= 2UL);
+    PADDLE_ENFORCE(output_slots_ref[i].size() >= 2UL);
+    // second output: acc_top1
+    if (output_slots_quant[i][1].lod.size() > 0 ||
+        output_slots_ref[i][1].lod.size() > 0)
+      throw std::invalid_argument(
+          "CompareTopAccuracy: top1 accuracy output has nonempty LoD.");
+    if (output_slots_quant[i][1].dtype != paddle::PaddleDType::FLOAT32 ||
+        output_slots_ref[i][1].dtype != paddle::PaddleDType::FLOAT32)
+      throw std::invalid_argument(
+          "CompareTopAccuracy: top1 accuracy output is of a wrong type.");
+    total_accs1_quant +=
+        *static_cast<float *>(output_slots_quant[i][1].data.data());
+    total_accs1_ref +=
+        *static_cast<float *>(output_slots_ref[i][1].data.data());
+  }
+  float avg_acc1_quant = total_accs1_quant / output_slots_quant.size();
+  float avg_acc1_ref = total_accs1_ref / output_slots_ref.size();
+
+  LOG(INFO) << "Avg top1 INT8 accuracy: " << std::fixed << std::setw(6)
+            << std::setprecision(4) << avg_acc1_quant;
+  LOG(INFO) << "Avg top1 FP32 accuracy: " << std::fixed << std::setw(6)
+            << std::setprecision(4) << avg_acc1_ref;
threshold: " << FLAGS_quantized_accuracy; - CHECK_LE(std::abs(*top1_quantized - *top1_reference), - FLAGS_quantized_accuracy); + CHECK_LE(std::abs(avg_acc1_quant - avg_acc1_ref), FLAGS_quantized_accuracy); } void CompareDeterministic( @@ -455,20 +477,35 @@ void CompareNativeAndAnalysis( const PaddlePredictor::Config *config, const std::vector> &inputs) { PrintConfig(config, true); - std::vector native_outputs, analysis_outputs; + std::vector> native_outputs, analysis_outputs; TestOneThreadPrediction(config, inputs, &native_outputs, false); TestOneThreadPrediction(config, inputs, &analysis_outputs, true); - CompareResult(analysis_outputs, native_outputs); + PADDLE_ENFORCE(native_outputs.size() > 0, "Native output is empty."); + PADDLE_ENFORCE(analysis_outputs.size() > 0, "Analysis output is empty."); + CompareResult(analysis_outputs.back(), native_outputs.back()); } void CompareQuantizedAndAnalysis( - const PaddlePredictor::Config *config, - const PaddlePredictor::Config *qconfig, + const AnalysisConfig *config, const AnalysisConfig *qconfig, const std::vector> &inputs) { - PrintConfig(config, true); - std::vector analysis_outputs, quantized_outputs; - TestOneThreadPrediction(config, inputs, &analysis_outputs, true); - TestOneThreadPrediction(qconfig, inputs, &quantized_outputs, true); + PADDLE_ENFORCE_EQ(inputs[0][0].shape[0], FLAGS_batch_size, + "Input data has to be packed batch by batch."); + LOG(INFO) << "FP32 & INT8 prediction run: batch_size " << FLAGS_batch_size + << ", warmup batch size " << FLAGS_warmup_batch_size << "."; + + LOG(INFO) << "--- FP32 prediction start ---"; + auto *cfg = reinterpret_cast(config); + PrintConfig(cfg, true); + std::vector> analysis_outputs; + TestOneThreadPrediction(cfg, inputs, &analysis_outputs, true); + + LOG(INFO) << "--- INT8 prediction start ---"; + auto *qcfg = reinterpret_cast(qconfig); + PrintConfig(qcfg, true); + std::vector> quantized_outputs; + TestOneThreadPrediction(qcfg, inputs, &quantized_outputs, true); + + LOG(INFO) << "--- comparing outputs --- "; CompareTopAccuracy(quantized_outputs, analysis_outputs); } @@ -578,9 +615,9 @@ static bool CompareTensorData(const framework::LoDTensor &a, const framework::LoDTensor &b) { auto a_shape = framework::vectorize(a.dims()); auto b_shape = framework::vectorize(b.dims()); - size_t a_size = std::accumulate(a_shape.begin(), a_shape.end(), 1, + size_t a_size = std::accumulate(a_shape.begin(), a_shape.end(), size_t{1}, [](int a, int b) { return a * b; }); - size_t b_size = std::accumulate(b_shape.begin(), b_shape.end(), 1, + size_t b_size = std::accumulate(b_shape.begin(), b_shape.end(), size_t{1}, [](int a, int b) { return a * b; }); if (a_size != b_size) { LOG(ERROR) << string::Sprintf("tensor data size not match, %d != %d", diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index cb668a4174134ba3ce9517955ff740ada568e97b..98ce225a0476b38c021b0b81489f69d7953ae456 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -74,7 +74,7 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) { SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); } - std::vector outputs; + std::vector> outputs; if (use_analysis || use_tensorrt) { AnalysisConfig config; config.EnableUseGpu(100, 0); diff --git a/paddle/fluid/op_use_default_grad_op_maker.spec b/paddle/fluid/op_use_default_grad_op_maker.spec index 
568db2cf06d3f1993ebc540bf83a28de8d225bd1..f0e3d3e86f24cb68f6e9d41f48c9698b43bca13e 100644 --- a/paddle/fluid/op_use_default_grad_op_maker.spec +++ b/paddle/fluid/op_use_default_grad_op_maker.spec @@ -3,15 +3,11 @@ acos asin atan attention_lstm -bilinear_tensor_product brelu conv_shift cos cos_sim dequantize -elementwise_div -elementwise_max -elementwise_min elu fc flatten @@ -29,8 +25,6 @@ gelu gru hard_shrink hierarchical_sigmoid -hinge_loss -huber_loss leaky_relu log logsigmoid @@ -43,7 +37,6 @@ max_pool3d_with_index maxout modified_huber_loss nce -norm pool2d pool3d pow @@ -59,17 +52,7 @@ requantize reshape rnn_memory_helper round -row_conv -sequence_concat -sequence_conv -sequence_expand -sequence_expand_as -sequence_pad -sequence_scatter -sequence_slice sequence_softmax -sequence_unpad -sigmoid_cross_entropy_with_logits sin softplus softshrink @@ -84,7 +67,6 @@ stanh swish tanh_shrink teacher_student_sigmoid_loss -temporal_shift tensor_array_to_tensor thresholded_relu transpose diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc index 1de59a5165c83a314a0ff8f4e4351aa3326beb67..9d7100cc3db91f5bf7dbd993c9f9ba5d4fc98ea6 100644 --- a/paddle/fluid/operators/affine_grid_op.cc +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/affine_grid_op.h" +#include #include +#include #include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cudnn_helper.h" @@ -173,9 +175,10 @@ class AffineGridOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - auto theta_dims = ctx->GetInputDim("Theta"); if (ctx->HasOutput(framework::GradVarName("Theta"))) { - ctx->SetOutputDim(framework::GradVarName("Theta"), theta_dims); + auto output_dims = ctx->GetInputDim(framework::GradVarName("Output")); + ctx->SetOutputDim(framework::GradVarName("Theta"), + {output_dims[0], 2, 3}); } } diff --git a/paddle/fluid/operators/batch_size_like.h b/paddle/fluid/operators/batch_size_like.h index fc15d56891cf7af10a91ca22a09c84fa2e52d465..7e2740f148f1d273310f44ed4a35d413e7201394 100644 --- a/paddle/fluid/operators/batch_size_like.h +++ b/paddle/fluid/operators/batch_size_like.h @@ -74,5 +74,8 @@ class BatchSizeLikeOpMaker : public framework::OpProtoAndCheckerMaker { virtual void Apply() = 0; }; +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(BatchSizeLikeNoNeedBufferVarsInference, + "Input"); + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cc b/paddle/fluid/operators/bilinear_tensor_product_op.cc index 8d261a118a75ee16027faf60341cefd30c3cdbba..bd69f422e5dbd5a5dc95150b10daa302f47ec5ff 100644 --- a/paddle/fluid/operators/bilinear_tensor_product_op.cc +++ b/paddle/fluid/operators/bilinear_tensor_product_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
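NOTE: in the affine_grid hunk above, the backward InferShape no longer touches Theta: for a 2-D affine grid Theta is [N, 2, 3], so dTheta's shape follows from dOutput's batch dimension alone and Theta's buffer can be freed before the backward pass. A minimal standalone sketch of that shape reasoning (illustrative C++ only; InferThetaGradDims is a hypothetical name, not framework code):

#include <array>
#include <cassert>
#include <cstdint>
// Recover dTheta's dims from dOutput's batch size; Theta itself is unused.
std::array<int64_t, 3> InferThetaGradDims(int64_t output_grad_batch) {
  return {output_grad_batch, 2, 3};  // Theta is always N x 2 x 3 in 2-D
}
int main() {
  const auto dims = InferThetaGradDims(8);
  assert(dims[0] == 8 && dims[1] == 2 && dims[2] == 3);
  return 0;
}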
*/ #include "paddle/fluid/operators/bilinear_tensor_product_op.h" +#include +#include +#include namespace paddle { namespace operators { @@ -121,15 +124,9 @@ class BilinearTensorProductOpGrad : public framework::OperatorWithKernel { "The second dimension of input(Out@GRAD) must be equal to " "the third dimension of the Input(Weight)."); - if (ctx->HasInput("Bias")) { - auto bias_dims = ctx->GetInputDim("Bias"); - PADDLE_ENFORCE_EQ( - bias_dims[1], out_dims[1], - "The second dimension of input(Out@GRAD) must be equal to " - "the second dimension of the Input(Bias)."); - auto bias_grad_name = framework::GradVarName("Bias"); - if (ctx->HasOutput(bias_grad_name)) - ctx->SetOutputDim(bias_grad_name, bias_dims); + auto bias_grad_name = framework::GradVarName("Bias"); + if (ctx->HasOutput(bias_grad_name)) { + ctx->SetOutputDim(bias_grad_name, {1, out_dims[1]}); } auto x_grad_name = framework::GradVarName("X"); @@ -148,13 +145,39 @@ class BilinearTensorProductOpGrad : public framework::OperatorWithKernel { } }; +class BilinearTensorProductGradOpDescMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("bilinear_tensor_product_grad"); + op->SetAttrMap(Attrs()); + op->SetInput("X", Input("X")); + op->SetInput("Y", Input("Y")); + op->SetInput("Weight", Input("Weight")); + if (ForwardOp().Inputs().count("Bias") > 0) { + op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); + } + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Y"), InputGrad("Y")); + op->SetOutput(framework::GradVarName("Weight"), InputGrad("Weight")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(bilinear_tensor_product, ops::BilinearTensorProductOp, ops::BilinearTensorProductOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::BilinearTensorProductGradOpDescMaker); REGISTER_OPERATOR(bilinear_tensor_product_grad, ops::BilinearTensorProductOpGrad); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc index 85612ba47448a7b0d712e9314e3980019c96e9c3..530a54b7ca186008bc8ec4b083254e65378ae619 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc @@ -13,10 +13,47 @@ See the License for the specific language governing permissions and limitations under the License. 
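NOTE: the bilinear_tensor_product hunk above introduces the pattern this PR repeats for many operators: replace DefaultGradOpDescMaker, which forwards every forward input and output into the grad op and so keeps them all alive, with a hand-written SingleGradOpDescMaker that wires only what the backward kernel actually reads. A condensed sketch of the pattern (the op name "foo" is illustrative; the framework calls are the ones used in the hunks above and below):

class FooGradOpDescMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
    op->SetType("foo_grad");
    // Wire only the tensors the backward kernel dereferences; every forward
    // input omitted here is memory the runtime no longer has to hold.
    op->SetInput("X", Input("X"));
    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
    op->SetAttrMap(Attrs());
    return op;
  }
};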
*/
 
 #include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
+#include <memory>
+#include <string>
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
+
+namespace paddle {
+namespace operators {
+
+class ElementwiseDivOpMaker : public ElementwiseOpMaker {
+ protected:
+  std::string GetName() const override { return "Div"; }
+  std::string GetEquation() const override { return "Out = X / Y"; }
+};
+
+class ElementwiseDivGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("elementwise_div_grad");
+    op->SetInput("Y", Input("Y"));
+    op->SetInput("Out", Output("Out"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
 namespace ops = paddle::operators;
-REGISTER_ELEMWISE_OP(elementwise_div, "Div", "Out = X / Y");
+REGISTER_OPERATOR(elementwise_div, ops::ElementwiseOp,
+                  ops::ElementwiseDivOpMaker, ops::ElementwiseOpInferVarType,
+                  ops::ElementwiseDivGradOpDescMaker);
+
+REGISTER_OPERATOR(elementwise_div_grad, ops::ElementwiseOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
     elementwise_div,
diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h
index 8a07339077aeaa4403ffd1e1e30e0d58a9cc30e7..0f0ad8637301772f073bca305b9196b9c7865daf 100644
--- a/paddle/fluid/operators/elementwise/elementwise_div_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h
@@ -47,7 +47,7 @@ struct DivGradDX {
 template <typename T>
 struct DivGradDY {
   HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
-    return -dout * x / (y * y);
+    return -dout * out / y;
   }
 };
 
@@ -58,13 +58,15 @@ class ElementwiseDivGradKernel : public ElemwiseGradKernel<T> {
     ElemwiseGradKernel<T>::Compute(ctx);
     using Tensor = framework::Tensor;
 
-    auto* x = ctx.Input<Tensor>("X");
     auto* y = ctx.Input<Tensor>("Y");
     auto* out = ctx.Input<Tensor>("Out");
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
     int axis = ctx.Attr<int>("axis");
+
+    auto* x = dout;  // Fake x, not used
+
     ElemwiseGradCompute<DeviceContext, T, DivGradDX<T>, DivGradDY<T>>(
         ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX<T>(), DivGradDY<T>());
   }
diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cc b/paddle/fluid/operators/elementwise/elementwise_max_op.cc
index ea0dcd736e5700fb0f341938ac3e3e3b178f29c1..b7df9c6f845dfc941e3c6acbc986a584e984a1de 100644
--- a/paddle/fluid/operators/elementwise/elementwise_max_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cc
@@ -13,9 +13,48 @@ See the License for the specific language governing permissions and
 limitations under the License.
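NOTE: the DivGradDY rewrite above is an algebraic simplification, not a behavior change: since out = x / y, we get -dout * x / (y * y) == -dout * out / y, which is exactly why the backward kernel can drop X ("fake x"). A quick standalone check with illustrative values:

#include <cassert>
#include <cmath>
int main() {
  const double x = 3.0, y = 4.0, dout = 0.5;
  const double out = x / y;                   // forward result
  const double dy_old = -dout * x / (y * y);  // original formula, needs x
  const double dy_new = -dout * out / y;      // rewritten formula, needs out
  assert(std::fabs(dy_old - dy_new) < 1e-12);
  return 0;
}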
*/ #include "paddle/fluid/operators/elementwise/elementwise_max_op.h" +#include +#include #include "paddle/fluid/operators/elementwise/elementwise_op.h" + +namespace paddle { +namespace operators { + +class ElementwiseMaxOpMaker : public ElementwiseOpMaker { + protected: + std::string GetName() const override { return "Max"; } + std::string GetEquation() const override { return "Out = max(X, Y)"; } +}; + +class ElementwiseMaxGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("elementwise_max_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Y", Input("Y")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Y"), InputGrad("Y")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; -REGISTER_ELEMWISE_OP(elementwise_max, "Max", "Out = max(X, Y)"); + +REGISTER_OPERATOR(elementwise_max, ops::ElementwiseOp, + ops::ElementwiseMaxOpMaker, ops::ElementwiseOpInferVarType, + ops::ElementwiseMaxGradOpDescMaker); + +REGISTER_OPERATOR(elementwise_max_grad, ops::ElementwiseOpGrad); + REGISTER_OP_CPU_KERNEL( elementwise_max, ops::ElementwiseMaxKernel, diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.h b/paddle/fluid/operators/elementwise/elementwise_max_op.h index 3ee0c32e0d5d5df02d5d157416918fb4fb3aca92..abdb1b9671de80d02b9a6a788088f47929fcc6f0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.h @@ -63,10 +63,10 @@ class ElementwiseMaxGradKernel : public ElemwiseGradKernel { auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* out = dout; // Fake out, not used int axis = ctx.Attr("axis"); ElemwiseGradCompute, MaxGradDy>( ctx, *x, *y, *out, *dout, axis, dx, dy, MaxGradDx(), MaxGradDy()); diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cc b/paddle/fluid/operators/elementwise/elementwise_min_op.cc index b263b9addd40cfd329d2cc8588c278df2cb008e9..f60c0ed8a0faad384f4eaa631c2758f83bc56414 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cc @@ -13,9 +13,48 @@ See the License for the specific language governing permissions and limitations under the License. 
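NOTE: the same trick applies to elementwise_max above (and elementwise_min below): the backward formulas dX = dout * (x > y) and dY = dout * (x <= y) read only X, Y and dOut, so the forward Out tensor can safely be aliased to dOut ("fake out"). Standalone illustration:

#include <cassert>
int main() {
  const double x = 2.0, y = 5.0, dout = 1.5;
  const double dx = dout * static_cast<double>(x > y);   // 0.0, y was the max
  const double dy = dout * static_cast<double>(x <= y);  // 1.5, grad goes to y
  assert(dx == 0.0 && dy == 1.5);
  return 0;
}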
*/ #include "paddle/fluid/operators/elementwise/elementwise_min_op.h" +#include +#include #include "paddle/fluid/operators/elementwise/elementwise_op.h" + +namespace paddle { +namespace operators { + +class ElementwiseMinOpMaker : public ElementwiseOpMaker { + protected: + std::string GetName() const override { return "Min"; } + std::string GetEquation() const override { return "Out = min(X, Y)"; } +}; + +class ElementwiseMinGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("elementwise_min_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Y", Input("Y")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Y"), InputGrad("Y")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; -REGISTER_ELEMWISE_OP(elementwise_min, "Min", "Out = min(X, Y)"); + +REGISTER_OPERATOR(elementwise_min, ops::ElementwiseOp, + ops::ElementwiseMinOpMaker, ops::ElementwiseOpInferVarType, + ops::ElementwiseMinGradOpDescMaker); + +REGISTER_OPERATOR(elementwise_min_grad, ops::ElementwiseOpGrad); + REGISTER_OP_CPU_KERNEL( elementwise_min, ops::ElementwiseMinKernel, diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.h b/paddle/fluid/operators/elementwise/elementwise_min_op.h index d04e372faaa4e6296e982afe6155cdde2fec4f81..1a49a6013987ae1ec685ec91ca656e4756ba7c32 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.h @@ -62,10 +62,10 @@ class ElementwiseMinGradKernel : public ElemwiseGradKernel { auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* out = dout; // Fake out, not used int axis = ctx.Attr("axis"); ElemwiseGradCompute, MinGradDy>( ctx, *x, *y, *out, *dout, axis, dx, dy, MinGradDx(), MinGradDy()); diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 6dbb9072495f743a4df1ff05e029a227c2cf618b..95246b38f530ff5f81e1fbb5f1dd22149943c8ff 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -173,12 +173,12 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { using Tensor = framework::Tensor; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + auto out_grad_name = framework::GradVarName("Out"); PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + PADDLE_ENFORCE(ctx->HasInput(out_grad_name), "Input(Out@GRAD) should not be null"); - auto x_dims = ctx->GetInputDim("X"); + auto x_dims = ctx->GetInputDim(out_grad_name); auto y_dims = ctx->GetInputDim("Y"); PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), @@ -187,8 +187,8 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { auto x_grad_name = framework::GradVarName("X"); auto y_grad_name = framework::GradVarName("Y"); if (ctx->HasOutput(x_grad_name)) { - 
ctx->ShareDim("X", /*->*/ x_grad_name); - ctx->ShareLoD("X", /*->*/ x_grad_name); + ctx->ShareDim(out_grad_name, /*->*/ x_grad_name); + ctx->ShareLoD(out_grad_name, /*->*/ x_grad_name); } if (ctx->HasOutput(y_grad_name)) { ctx->ShareDim("Y", /*->*/ y_grad_name); diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc index 453a1b32a0171a2ca88879ab3287e89c4d3c7759..b8921b171cf37be17fb62d270a5c22f9d1806c64 100644 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc @@ -46,6 +46,7 @@ obtained from the `input` tensor. )DOC"); } }; + } // namespace operators } // namespace paddle @@ -53,7 +54,8 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(fill_constant_batch_size_like, ops::FillConstantBatchSizeLikeOp, paddle::framework::EmptyGradOpMaker, - ops::FillConstantBatchSizeLikeOpMaker); + ops::FillConstantBatchSizeLikeOpMaker, + ops::BatchSizeLikeNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( fill_constant_batch_size_like, ops::FillConstantBatchSizeLikeOpKernel(ctx.Attr("dtype")), + ctx.GetPlace()); + } }; + +class FillZerosLikeOp2Maker : public FillZerosLikeOpMaker { + protected: + void ExtraMake() override { + this->AddAttr("dtype", + "(int, default 5(FP32)) " + "Output data type.") + .SetDefault(framework::proto::VarType::FP32); + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(FillZerosLikeOp2NoNeedBufferVarsInference, + "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, ops::FillZerosLikeOp, ops::FillZerosLikeOpMaker); + +REGISTER_OPERATOR(fill_zeros_like2, ops::FillZerosLikeOp2, + ops::FillZerosLikeOp2Maker, + ops::FillZerosLikeOp2NoNeedBufferVarsInference, + paddle::framework::EmptyGradOpMaker); + REGISTER_OP_CPU_KERNEL( fill_zeros_like, ops::FillZerosLikeKernel, @@ -58,3 +95,11 @@ REGISTER_OP_CPU_KERNEL( ops::FillZerosLikeKernel, ops::FillZerosLikeKernel, ops::FillZerosLikeKernel); + +REGISTER_OP_CPU_KERNEL( + fill_zeros_like2, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel); diff --git a/paddle/fluid/operators/fill_zeros_like_op.cu.cc b/paddle/fluid/operators/fill_zeros_like_op.cu.cc index e80a703c30c0335124c089ea82ba4f6fe055acde..1831635def79b3ccb713dbc14cc70b8beeb609fc 100644 --- a/paddle/fluid/operators/fill_zeros_like_op.cu.cc +++ b/paddle/fluid/operators/fill_zeros_like_op.cu.cc @@ -26,3 +26,13 @@ REGISTER_OP_CUDA_KERNEL( ops::FillZerosLikeKernel, ops::FillZerosLikeKernel); + +REGISTER_OP_CUDA_KERNEL( + fill_zeros_like2, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel); diff --git a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc index 98ebe1fdf4bb3308b2f07a073072031e79e14146..01302687a421165e908b2aa0646ba8b9c835034e 100644 --- a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc +++ b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc @@ -65,17 +65,13 @@ by input arguments. 
} }; -DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( - GaussianRandomBatchSizeLikeNoNeedBufferVarsInference, "Input"); - } // namespace operators } // namespace paddle -REGISTER_OPERATOR( - gaussian_random_batch_size_like, - paddle::operators::GaussianRandomBatchSizeLikeOp, - paddle::operators::GaussianRandomBatchSizeLikeOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::operators::GaussianRandomBatchSizeLikeNoNeedBufferVarsInference); +REGISTER_OPERATOR(gaussian_random_batch_size_like, + paddle::operators::GaussianRandomBatchSizeLikeOp, + paddle::operators::GaussianRandomBatchSizeLikeOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::operators::BatchSizeLikeNoNeedBufferVarsInference); // Kernels are registered in gaussian_random_op.cc and gaussian_random_op.cu diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc index 2ab40f482d7a1463703085037bcb94fd4aecf377..09fd6a25d18d5484f4d1c1631faae8da2fbd5473 100644 --- a/paddle/fluid/operators/group_norm_op.cc +++ b/paddle/fluid/operators/group_norm_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include +#include namespace paddle { namespace operators { @@ -107,8 +108,6 @@ class GroupNormGradOp : public framework::OperatorWithKernel { // check input PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of GroupNormOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Mean"), - "Input(Mean) of GroupNormOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Variance"), "Input(Variance) of GroupNormOp should not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), @@ -159,7 +158,6 @@ class GroupNormGradMaker : public framework::SingleGradOpDescMaker { op->SetInput("Bias", Input("Bias")); op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); op->SetInput("Y", Output("Y")); - op->SetInput("Mean", Output("Mean")); op->SetInput("Variance", Output("Variance")); op->SetOutput(framework::GradVarName("X"), InputGrad("X")); diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index f458ce6c83bfcfb56d558409b0802f27f13a4761..b6cfa9cc43c312e60a1b7c5e13d1ecbe6bc5dc7d 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
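NOTE: the gaussian_random hunk above deletes the op's private no-need-buffer class in favor of the shared BatchSizeLikeNoNeedBufferVarsInference declared in batch_size_like.h earlier in this diff. Judging from the hand-written SequenceConvGradNoNeedBufferVarsInference later in this diff, the DECLARE_ macro presumably expands to roughly the following (a sketch under that assumption, not the actual macro output):

class BatchSizeLikeNoNeedBufferVarsInference
    : public framework::NoNeedBufferVarsInference {
 public:
  using framework::NoNeedBufferVarsInference::NoNeedBufferVarsInference;

  std::unordered_set<std::string> operator()() const override {
    // *_batch_size_like ops read only dims(Input)[0]; the data buffer of
    // "Input" can therefore be released ahead of time.
    return {"Input"};
  }
};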
*/ #include "paddle/fluid/operators/hinge_loss_op.h" +#include +#include +#include namespace paddle { namespace operators { @@ -97,12 +100,29 @@ class HingeLossGradOp : public framework::OperatorWithKernel { } }; +class HingeLossGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("hinge_loss_grad"); + op->SetInput("Logits", Input("Logits")); + op->SetInput("Labels", Input("Labels")); + op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); + op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::HingeLossGradOpDescMaker); REGISTER_OPERATOR(hinge_loss_grad, ops::HingeLossGradOp); REGISTER_OP_CPU_KERNEL( hinge_loss, diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc index 253b65a5f33308fc2c94537641b0fa19378b0cc9..a72db384c1f09f66ecf7ce85271d6263bbdcb523 100644 --- a/paddle/fluid/operators/huber_loss_op.cc +++ b/paddle/fluid/operators/huber_loss_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/huber_loss_op.h" +#include +#include +#include namespace paddle { namespace operators { @@ -90,38 +93,45 @@ class HuberLossGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Residual"), - "Input(Residual) should not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null."); - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); auto residual_dims = ctx->GetInputDim("Residual"); - auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out")); - - PADDLE_ENFORCE_EQ(residual_dims, x_dims); - PADDLE_ENFORCE_EQ(out_grad_dims, x_dims); auto x_grad_name = framework::GradVarName("X"); auto y_grad_name = framework::GradVarName("Y"); if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); + ctx->SetOutputDim(x_grad_name, residual_dims); } if (ctx->HasOutput(y_grad_name)) { - ctx->SetOutputDim(y_grad_name, y_dims); + ctx->SetOutputDim(y_grad_name, residual_dims); } } }; +class HuberLossGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("huber_loss_grad"); + op->SetInput("Residual", Output("Residual")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Y"), InputGrad("Y")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker, - 
paddle::framework::DefaultGradOpDescMaker); + ops::HuberLossGradOpDescMaker); REGISTER_OPERATOR(huber_loss_grad, ops::HuberLossGradOp); REGISTER_OP_CPU_KERNEL( huber_loss, ops::HuberLossKernel, diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc index aa19c62c83648814e86b1e7062424be3693e4b98..81fbe3e514241ecdd2832141eba4250ced2017a9 100644 --- a/paddle/fluid/operators/norm_op.cc +++ b/paddle/fluid/operators/norm_op.cc @@ -13,6 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/norm_op.h" +#include +#include +#include + namespace paddle { namespace operators { @@ -74,6 +78,24 @@ class NormOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } }; + +class NormOpGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("norm_grad"); + op->SetAttrMap(Attrs()); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetInput("Norm", Output("Norm")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + return op; + } +}; + } // namespace operators } // namespace paddle @@ -81,7 +103,7 @@ namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::NormOpGradOpDescMaker); REGISTER_OPERATOR(norm_grad, ops::NormOpGrad); REGISTER_OP_CPU_KERNEL(norm, ops::NormKernel, ops::NormKernel); diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc index 6ef2dacb3869ab3b20505699c2fbe2f129c20068..9731aefa95c5243e29ace87ad8c35d5b01904e60 100644 --- a/paddle/fluid/operators/pad2d_op.cc +++ b/paddle/fluid/operators/pad2d_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include +#include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" @@ -612,8 +615,9 @@ class Pad2dOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.GetPlace()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); } }; @@ -625,7 +629,9 @@ class Pad2dOpGradMaker : public framework::SingleGradOpDescMaker { std::unique_ptr Apply() const override { auto* bind = new framework::OpDesc(); bind->SetInput("X", Input("X")); - bind->SetInput("Paddings", Input("Paddings")); + if (ForwardOp().Inputs().count("Paddings") > 0) { + bind->SetInput("Paddings", Input("Paddings")); + } bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); bind->SetOutput(framework::GradVarName("X"), InputGrad("X")); bind->SetAttrMap(Attrs()); @@ -634,6 +640,10 @@ class Pad2dOpGradMaker : public framework::SingleGradOpDescMaker { } }; +// TODO(zjl): Paddings can also be skipped! 
+DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(Pad2dOpGradNoNeedBufferVarsInference, + "X"); + } // namespace operators } // namespace paddle @@ -641,6 +651,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(pad2d, ops::Pad2dOp, ops::Pad2dOpMaker, ops::Pad2dOpGradMaker); -REGISTER_OPERATOR(pad2d_grad, ops::Pad2dOpGrad); +REGISTER_OPERATOR(pad2d_grad, ops::Pad2dOpGrad, + ops::Pad2dOpGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL(pad2d, ops::Pad2dCPUKernel); REGISTER_OP_CPU_KERNEL(pad2d_grad, ops::Pad2dGradCPUKernel); diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc index d283bddbe9f974ac6835ee91d5a7851453687b80..81aabdd0061b3940f23d4731d55fc5cbe5817004 100644 --- a/paddle/fluid/operators/row_conv_op.cc +++ b/paddle/fluid/operators/row_conv_op.cc @@ -13,6 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/row_conv_op.h" +#include +#include +#include + #include "paddle/fluid/framework/eigen.h" namespace paddle { @@ -54,7 +58,6 @@ class RowConvGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); PADDLE_ENFORCE(ctx->HasInput("Filter"), "Input(Filter) should not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), @@ -62,8 +65,8 @@ class RowConvGradOp : public framework::OperatorWithKernel { auto x_grad_name = framework::GradVarName("X"); if (ctx->HasOutput(x_grad_name)) { - auto x_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim(x_grad_name, x_dims); + auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); + ctx->SetOutputDim(x_grad_name, dout_dims); } auto filter_grad_name = framework::GradVarName("Filter"); @@ -259,12 +262,31 @@ class RowConvGradKernel } } }; + +class RowConvGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("row_conv_grad"); + op->SetAttrMap(Attrs()); + op->SetInput("X", Input("X")); + op->SetInput("Filter", Input("Filter")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter")); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(row_conv, ops::RowConvOp, ops::RowConvOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::RowConvGradOpDescMaker); REGISTER_OPERATOR(row_conv_grad, ops::RowConvGradOp); REGISTER_OP_CPU_KERNEL( row_conv, ops::RowConvKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc index 37f1b9dda50ba4b62d7cf75765125e0ad663d9d8..d652f9216f8faf53deeac2c9ce1f737651c3939b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc @@ -13,6 +13,7 @@ // limitations under the License. 
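NOTE: a second recurring pattern, first visible in the pad2d and row_conv hunks above: once "X" is no longer an input whose buffer the grad op may read, the grad kernel's data type has to be derived from Grad("Out") instead of "X". Condensed form of the override used throughout (Tensor is assumed to alias framework::Tensor, as in these files):

framework::OpKernelType GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const override {
  // Take the dtype from dOut; X's buffer may already have been released.
  return framework::OpKernelType(
      ctx.Input<framework::Tensor>(framework::GradVarName("Out"))->type(),
      ctx.GetPlace());
}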
#include "paddle/fluid/operators/sequence_ops/sequence_concat_op.h" +#include #include namespace paddle { @@ -73,13 +74,43 @@ class SeqConcatShapeInferer : public framework::InferShapeBase { } }; -class SeqConcatGradShapeInferer : public framework::InferShapeBase { +class SeqConcatGradOpDescMaker : public framework::SingleGradOpDescMaker { public: - void operator()(framework::InferShapeContext *context) const override { + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_concat_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X", false)); + op->SetAttrMap(Attrs()); + return op; + } +}; + +class SeqConcatGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *context) const override { context->SetOutputsDim(framework::GradVarName("X"), context->GetInputsDim("X")); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } }; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(SeqConcatGradNoNeedBufferVarsInference, + "X"); + } // namespace operators } // namespace paddle @@ -87,14 +118,14 @@ namespace op = paddle::operators; REGISTER_OPERATOR(sequence_concat, paddle::framework::OperatorWithKernel, op::SeqConcatOpMaker, op::SeqConcatShapeInferer, - paddle::framework::DefaultGradOpDescMaker); + op::SeqConcatGradOpDescMaker); template using Kernel = op::SeqConcatKernel; REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel, Kernel, Kernel); -REGISTER_OPERATOR(sequence_concat_grad, paddle::framework::OperatorWithKernel, - op::SeqConcatGradShapeInferer); +REGISTER_OPERATOR(sequence_concat_grad, op::SeqConcatGradOp, + op::SeqConcatGradNoNeedBufferVarsInference); template using GradKernel = op::SeqConcatGradKernel; diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.h b/paddle/fluid/operators/sequence_ops/sequence_concat_op.h index ff035f421c4907ba940b973b3fd2a9421ed2dbae..f9b2ed3846a0f29bd2b058b944360a8fb66c24f8 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.h @@ -14,7 +14,9 @@ #pragma once +#include #include +#include "boost/optional.hpp" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/math/concat_and_split.h" @@ -89,37 +91,49 @@ class SeqConcatGradKernel : public framework::OpKernel { dxs[i]->mutable_data(context.GetPlace()); } } + std::vector sliced_x; - std::vector> sliced_dx; + std::vector> sliced_dx; for (size_t i = 1; i < xs[0]->lod()[0].size(); ++i) { for (size_t j = 0; j < xs.size(); ++j) { const framework::LoDTensor *x = xs[j]; + framework::DDim x_dims = x->dims(); + framework::LoDTensor *dx = dxs[j]; auto &x_lod = x->lod()[0]; - sliced_x.emplace_back(x->Slice(x_lod[i - 1], x_lod[i])); - if (dx != nullptr) { - sliced_dx.emplace_back(dx->Slice(x_lod[i - 1], x_lod[i])); + + auto prev_lod = x_lod[i - 1]; + auto next_lod = x_lod[i]; + + x_dims[0] = next_lod - prev_lod; + + sliced_x.emplace_back(); + sliced_x.back().Resize(x_dims); + + if (dx) { + sliced_dx.emplace_back(dx->Slice(prev_lod, 
next_lod)); } else { - sliced_dx.emplace_back(boost::blank()); + sliced_dx.emplace_back(boost::none); } } } - math::SplitFunctor functor; std::vector sliced_x_ptr; - std::vector sliced_dx_ptr; + sliced_x_ptr.reserve(sliced_x.size()); for (auto &x : sliced_x) { sliced_x_ptr.emplace_back(&x); } + std::vector sliced_dx_ptr; + sliced_dx_ptr.reserve(sliced_dx.size()); for (auto &dx : sliced_dx) { - try { - sliced_dx_ptr.emplace_back(&boost::get(dx)); - } catch (boost::bad_get &) { - sliced_dx_ptr.emplace_back(nullptr); + if (dx) { + sliced_dx_ptr.emplace_back(&dx.get()); } } + + math::SplitFunctor functor; functor(context.template device_context(), detail::Ref( context.Input(framework::GradVarName("Out")), diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc index 65cd9edbc7125f605d6fb437a2e056054eb9a6d7..89c1fe834832802cc86dacd5a2d8c22bafa6072b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc @@ -15,6 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_conv_op.h" #include +#include +#include +#include namespace paddle { namespace operators { @@ -171,13 +174,57 @@ context_length, context_stride and context_start. } }; +class SequenceConvGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_conv_grad"); + op->SetAttrMap(Attrs()); + + if (boost::get(Attrs().at("paddingTrainable")) && + ForwardOp().Inputs().count("PaddingData") > 0) { + op->SetInput("PaddingData", Input("PaddingData")); + op->SetOutput(framework::GradVarName("PaddingData"), + InputGrad("PaddingData")); + } + + op->SetInput("X", Input("X")); + op->SetInput("Filter", Input("Filter")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter")); + + return op; + } +}; + +class SequenceConvGradNoNeedBufferVarsInference + : public framework::NoNeedBufferVarsInference { + public: + using framework::NoNeedBufferVarsInference::NoNeedBufferVarsInference; + + std::unordered_set operator()() const override { + if (!boost::get(Attrs().at("paddingTrainable"))) { + return {"PaddingData"}; + } else { + return {}; + } + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(sequence_conv_grad, ops::SequenceConvGradOp); + ops::SequenceConvGradOpDescMaker); + +REGISTER_OPERATOR(sequence_conv_grad, ops::SequenceConvGradOp, + ops::SequenceConvGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( sequence_conv, diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc index 3b79d0c71975bb740b4085ce80f7d95b65f600c1..e1f6c3e3d599340acfa9bb5b47017b003721e4a3 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
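NOTE: the sequence_concat grad kernel above also swaps the boost::blank / boost::bad_get juggling for boost::optional, and sliced_x entries are now only Resize()d (shape metadata, no data buffer), consistent with "X" being a no-need-buffer input. Minimal standalone illustration of the optional access pattern (assumes Boost is available, as in the file above):

#include <boost/optional.hpp>
#include <vector>
int main() {
  std::vector<boost::optional<int>> slots{1, boost::none, 3};
  int sum = 0;
  for (auto &s : slots) {
    if (s) sum += s.get();  // dereference only engaged optionals
  }
  return sum == 4 ? 0 : 1;  // exits 0 on the expected result
}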
*/ #include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h" +#include +#include namespace paddle { namespace operators { @@ -70,6 +72,12 @@ class SequenceExpandAsOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", out_dims); ctx->ShareLoD("Y", /*->*/ "Out"); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); + } }; class SequenceExpandAsOpMaker : public framework::OpProtoAndCheckerMaker { @@ -131,7 +139,6 @@ class SequenceExpandAsOpGrad : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) should not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null."); @@ -143,16 +150,48 @@ class SequenceExpandAsOpGrad : public framework::OperatorWithKernel { ctx->ShareLoD("X", x_grad_name); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } }; +class SequenceExpandAsOpGradOpDescMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_expand_as_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Y", Input("Y")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequenceExpandAsOpNoNeedBufferVarsInference, "Y"); +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequenceExpandAsGradOpNoNeedBufferVarsInference, "X", "Y"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_expand_as, ops::SequenceExpandAsOp, ops::SequenceExpandAsOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(sequence_expand_as_grad, ops::SequenceExpandAsOpGrad); + ops::SequenceExpandAsOpGradOpDescMaker, + ops::SequenceExpandAsOpNoNeedBufferVarsInference); +REGISTER_OPERATOR(sequence_expand_as_grad, ops::SequenceExpandAsOpGrad, + ops::SequenceExpandAsGradOpNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( sequence_expand_as, ops::SequenceExpandAsKernel, diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc index f6c42415301bc8d6f3509bfba2ff356265643bad..b7c0420636ab60e8a3e0a9332cbd3858aacda1b0 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
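NOTE: in sequence_expand_as above, "Y" is declared no-need-buffer for both forward and backward because only Y's LoD (the sequence offsets) drives the expansion; Y's data is never read. The semantics reduced to plain vectors (illustrative only, not framework code):

#include <cassert>
#include <cstddef>
#include <vector>
int main() {
  const std::vector<int> x{10, 20};
  const std::vector<std::size_t> y_lod{0, 3, 5};  // row 0: 3 copies, row 1: 2
  std::vector<int> out;
  for (std::size_t i = 0; i + 1 < y_lod.size(); ++i)
    for (std::size_t j = y_lod[i]; j < y_lod[i + 1]; ++j) out.push_back(x[i]);
  assert((out == std::vector<int>{10, 10, 10, 20, 20}));
  return 0;
}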
*/ #include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h" +#include namespace paddle { namespace operators { @@ -96,6 +97,12 @@ class SequenceExpandOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", out_dims); ctx->ShareLoD("X", /*->*/ "Out"); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); + } }; class SequenceExpandOpMaker : public framework::OpProtoAndCheckerMaker { @@ -188,7 +195,6 @@ class SequenceExpandOpGrad : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) should not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null."); @@ -199,16 +205,47 @@ class SequenceExpandOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(x_grad_name, x_dims); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } }; +class SequenceExpandOpGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_expand_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Y", Input("Y")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(SequenceExpandOpNoNeedBufferVarsInference, + "Y"); +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequenceExpandGradOpNoNeedBufferVarsInference, "X", "Y"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_expand, ops::SequenceExpandOp, ops::SequenceExpandOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(sequence_expand_grad, ops::SequenceExpandOpGrad); + ops::SequenceExpandOpGradDescMaker, + ops::SequenceExpandOpNoNeedBufferVarsInference); +REGISTER_OPERATOR(sequence_expand_grad, ops::SequenceExpandOpGrad, + ops::SequenceExpandGradOpNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( sequence_expand, ops::SequenceExpandKernel, diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc index 23c7bf7cea830bb0ccf5e81f99130043c2d5f80b..5290d0e6c6a2569e389345f61a0844ce3cbde10f 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/sequence_ops/sequence_pad_op.h" +#include +#include namespace paddle { namespace operators { @@ -194,18 +196,39 @@ class SequencePadGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + auto data_type = framework::GetDataTypeOfVar( + ctx.InputVar(framework::GradVarName("Out"))); return framework::OpKernelType(data_type, ctx.device_context()); } }; +class SequencePadGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_pad_grad"); + op->SetAttrMap(Attrs()); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequencePadGradOpNoNeedBufferVarsInference, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_pad, ops::SequencePadOp, ops::SequencePadOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(sequence_pad_grad, ops::SequencePadGradOp); + ops::SequencePadGradOpDescMaker); +REGISTER_OPERATOR(sequence_pad_grad, ops::SequencePadGradOp, + ops::SequencePadGradOpNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( sequence_pad, ops::SequencePadOpKernel, diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc index 1754221e7711b09c38f81c3f5803daa5372ed0dd..b4923571df95432d030d393a69d427f3ae17f298 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
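NOTE: the sequence_pad hunk above shows the subtlety of no-need-buffer inputs: the grad maker still wires "X" into the grad op even though "X" is declared no-need-buffer. The variable stays connected so InferShape can read its dims and LoD; only the data buffer becomes eligible for early release. In excerpt form:

// Grad-desc maker: keep the edge so X's dims()/lod() stay readable.
op->SetInput("X", Input("X"));
// Registration: additionally mark X's *buffer* as skippable, as above:
// REGISTER_OPERATOR(sequence_pad_grad, ops::SequencePadGradOp,
//                   ops::SequencePadGradOpNoNeedBufferVarsInference);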
*/ #include "paddle/fluid/operators/sequence_ops/sequence_pool_op.h" +#include #include namespace paddle { @@ -114,8 +115,9 @@ class SequencePoolGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.device_context()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.device_context()); } }; @@ -138,13 +140,17 @@ class SequencePoolGradOpMaker : public framework::SingleGradOpDescMaker { } }; +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequencePoolGradOpNoNeedBufferVarsInference, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_pool, ops::SequencePoolOp, ops::SequencePoolOpMaker, ops::SequencePoolGradOpMaker); -REGISTER_OPERATOR(sequence_pool_grad, ops::SequencePoolGradOp); +REGISTER_OPERATOR(sequence_pool_grad, ops::SequencePoolGradOp, + ops::SequencePoolGradOpNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( sequence_pool, ops::SequencePoolKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc index 8267c04f9f20511deba363f9a0aae761736ba90b..5a22212edf29cc79d28b12029dc7595ae5f1aab3 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_scatter_op.h" +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/gather.h" @@ -124,25 +125,49 @@ class SequenceScatterGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { ctx->SetOutputDim(framework::GradVarName("Updates"), ctx->GetInputDim("Updates")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->SetOutputDim(framework::GradVarName("X"), + ctx->GetInputDim(framework::GradVarName("Out"))); } protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - platform::CPUPlace()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + platform::CPUPlace()); } }; +class SequenceScatterGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_scatter_grad"); + op->SetInput("Ids", Input("Ids")); + op->SetInput("Updates", Input("Updates")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Updates"), InputGrad("Updates")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequenceScatterGradNoNeedBufferVarsInference, "Updates"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_scatter, ops::SequenceScatterOp, ops::SequenceScatterOpMaker, - paddle::framework::DefaultGradOpDescMaker); 
-REGISTER_OPERATOR(sequence_scatter_grad, ops::SequenceScatterGradOp); + ops::SequenceScatterGradDescMaker); +REGISTER_OPERATOR(sequence_scatter_grad, ops::SequenceScatterGradOp, + ops::SequenceScatterGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL(sequence_scatter, ops::SequenceScatterOpKernel, ops::SequenceScatterOpKernel, ops::SequenceScatterOpKernel, diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc index 35f49f78cedaca59d58ea19b909e5a950281c6e9..4b2ec6e7cad7c04e248c0ffbb117951fba1ec877 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_slice_op.h" +#include namespace paddle { namespace operators { @@ -70,8 +71,9 @@ class SequenceSliceGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.device_context()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.device_context()); } }; @@ -113,14 +115,35 @@ NOTE: The first dimension size of input, the size of offset and Length, should b } }; +class SequenceSliceGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_slice_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Offset", Input("Offset")); + op->SetInput("Length", Input("Length")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequenceSliceGradNoNeedBufferVarsInference, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_slice, ops::SequenceSliceOp, - ops::SequenceSliceOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(sequence_slice_grad, ops::SequenceSliceGradOp); + ops::SequenceSliceOpMaker, ops::SequenceSliceGradOpDescMaker); +REGISTER_OPERATOR(sequence_slice_grad, ops::SequenceSliceGradOp, + ops::SequenceSliceGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( sequence_slice, ops::SequenceSliceOpKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc index 2cf508e0b707ecc986886e72e5d42fde3c84894d..6c98a3e8731abb989f8dab97eff5c6ad56111742 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/sequence_ops/sequence_unpad_op.h" +#include +#include namespace paddle { namespace operators { @@ -125,19 +127,39 @@ class SequenceUnpadGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + auto data_type = framework::GetDataTypeOfVar( + ctx.InputVar(framework::GradVarName("Out"))); return framework::OpKernelType(data_type, ctx.device_context()); } }; +class SequenceUnpadGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_unpad_grad"); + op->SetAttrMap(Attrs()); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequenceUnpadGradOpNoNeedBufferVarsInference, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_unpad, ops::SequenceUnpadOp, - ops::SequenceUnpadOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(sequence_unpad_grad, ops::SequenceUnpadGradOp); + ops::SequenceUnpadOpMaker, ops::SequenceUnpadGradOpDescMaker); +REGISTER_OPERATOR(sequence_unpad_grad, ops::SequenceUnpadGradOp, + ops::SequenceUnpadGradOpNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( sequence_unpad, ops::SequenceUnpadOpKernel, diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h index 07df3dca831d7e646050ae57402c1a493c2e50e9..fe8ca41b698159a782547ce673a374d074d3b73d 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h @@ -81,10 +81,9 @@ class SequenceUnpadGradOpKernel : public framework::OpKernel { auto* d_x = ctx.Output(framework::GradVarName("X")); if (d_x) { const auto* d_out = ctx.Input(framework::GradVarName("Out")); - const auto* x_t = ctx.Input("X"); d_x->mutable_data(ctx.GetPlace()); - int padded_length = x_t->dims()[1]; + int padded_length = d_x->dims()[1]; LoDTensor zero_pads; zero_pads.Resize({1, 1}); diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc index 26355e58615454c8e9aea1d6a5405368e6006e87..ad6fb3510f02ae783c8ae4318f559a8db74a59d1 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cc +++ b/paddle/fluid/operators/shuffle_channel_op.cc @@ -11,6 +11,7 @@ limitations under the License. 
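NOTE: in the sequence_unpad kernel above, padded_length now comes from d_x rather than x: dX is allocated with X's dims, so the two reads are identical and X's buffer never has to survive into the backward pass. In excerpt form:

// dX shares X's shape, so the padded length is recoverable without X:
int padded_length = d_x->dims()[1];  // == x->dims()[1]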
*/ #include "paddle/fluid/operators/shuffle_channel_op.h" #include +#include namespace paddle { namespace operators { @@ -73,12 +74,7 @@ class ShuffleChannelGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@Grad) should not be null"); - PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), - "Output(X@Grad) should not be null"); - - auto input_dims = ctx->GetInputDim("X"); + auto input_dims = ctx->GetInputDim(framework::GradVarName("Out")); PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); ctx->SetOutputDim(framework::GradVarName("X"), input_dims); @@ -87,8 +83,9 @@ class ShuffleChannelGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.device_context()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.device_context()); } }; @@ -100,7 +97,6 @@ class ShuffleChannelGradDescMaker : public framework::SingleGradOpDescMaker { std::unique_ptr Apply() const override { std::unique_ptr op(new framework::OpDesc()); op->SetType("shuffle_channel_grad"); - op->SetInput("X", Input("X")); op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); op->SetOutput(framework::GradVarName("X"), InputGrad("X")); op->SetAttrMap(Attrs()); diff --git a/paddle/fluid/operators/shuffle_channel_op.cu b/paddle/fluid/operators/shuffle_channel_op.cu index 9506343b3d508459c6e10dc68eba13504b07338f..dbc3e1a7ebe26ffccd24d1749093d014751d866f 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cu +++ b/paddle/fluid/operators/shuffle_channel_op.cu @@ -78,10 +78,14 @@ template class ShuffleChannelGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + int group = ctx.Attr("group"); - auto input_dims = input->dims(); + const auto& input_dims = input_grad->dims(); auto num = input_dims[0]; auto channel = input_dims[1]; auto height = input_dims[2]; @@ -91,10 +95,7 @@ class ShuffleChannelGradOpCUDAKernel : public framework::OpKernel { int group_row = group; int group_column = channel / group_row; - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); const T* output_grad_data = output_grad->data(); diff --git a/paddle/fluid/operators/shuffle_channel_op.h b/paddle/fluid/operators/shuffle_channel_op.h index f6af1bc88598870ebccef81bd37f93f376940851..3ce1e0c770bb3fe6c4b0a54dad14e47f372958af 100644 --- a/paddle/fluid/operators/shuffle_channel_op.h +++ b/paddle/fluid/operators/shuffle_channel_op.h @@ -57,10 +57,14 @@ template class ShuffleChannelGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + int group = ctx.Attr("group"); - auto input_dims = input->dims(); + const auto& input_dims = 
input_grad->dims(); auto num = input_dims[0]; auto channel = input_dims[1]; auto height = input_dims[2]; @@ -71,10 +75,6 @@ class ShuffleChannelGradOpKernel : public framework::OpKernel { int group_row = group; int group_column = channel / group_row; - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); const T* output_grad_data = output_grad->data(); for (int n = 0; n < num; ++n) { diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index c21b0c13c752b82b80c120cb5a5d4a010ef18287..5c92588cc1d073612d2f6a7b315edf16cc14bedd 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" +#include +#include +#include namespace paddle { namespace operators { @@ -139,6 +142,24 @@ However the output only shares the LoD with input `X`. } }; +class SigmoidCrossEntropyWithLogitsGradOpDescMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sigmoid_cross_entropy_with_logits_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Label", Input("Label")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle @@ -146,7 +167,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits, ops::SigmoidCrossEntropyWithLogitsOp, ops::SigmoidCrossEntropyWithLogitsOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::SigmoidCrossEntropyWithLogitsGradOpDescMaker); REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits_grad, ops::SigmoidCrossEntropyWithLogitsGradOp); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index 94995fc99612adb1164e60f1a51747f74eacfb73..589c98e51e32bc9eb7d6ccfb721a6a5f091470cf 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/slice_op.h" #include +#include #include namespace paddle { @@ -135,6 +136,13 @@ class SliceOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(x_grad_name, x_dims); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } }; class SliceOpGradMaker : public framework::SingleGradOpDescMaker { @@ -153,13 +161,17 @@ class SliceOpGradMaker : public framework::SingleGradOpDescMaker { } }; +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(SliceOpGradNoNeedBufferVarsInference, + "Input"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(slice, ops::SliceOp, ops::SliceOpMaker, ops::SliceOpGradMaker); -REGISTER_OPERATOR(slice_grad, ops::SliceOpGrad); +REGISTER_OPERATOR(slice_grad, ops::SliceOpGrad, + ops::SliceOpGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( slice, ops::SliceKernel, diff --git a/paddle/fluid/operators/temporal_shift_op.cc b/paddle/fluid/operators/temporal_shift_op.cc index 7df649fc5b7bf8671303a28d727be1d85c1fa6e4..3b7d90b795b45d97dfdbe90f7e37ea28b942f2a0 100644 --- a/paddle/fluid/operators/temporal_shift_op.cc +++ b/paddle/fluid/operators/temporal_shift_op.cc @@ -10,6 +10,9 @@ limitations under the License. */ #include "paddle/fluid/operators/temporal_shift_op.h" +#include +#include +#include #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -125,19 +128,32 @@ class TemporalShiftOpGrad : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null"); - auto dim_x = ctx->GetInputDim("X"); if (ctx->HasOutput(framework::GradVarName("X"))) { - ctx->SetOutputDim(framework::GradVarName("X"), dim_x); + ctx->SetOutputDim(framework::GradVarName("X"), + ctx->GetInputDim(framework::GradVarName("Out"))); } } framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.GetPlace()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } +}; + +class TemporalShiftGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("temporal_shift_grad"); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; } }; @@ -146,8 +162,7 @@ class TemporalShiftOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OPERATOR(temporal_shift, ops::TemporalShiftOp, - ops::TemporalShiftOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::TemporalShiftOpMaker, ops::TemporalShiftGradOpDescMaker); REGISTER_OPERATOR(temporal_shift_grad, ops::TemporalShiftOpGrad); REGISTER_OP_CPU_KERNEL(temporal_shift, ops::TemporalShiftKernel, ops::TemporalShiftKernel); diff --git a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc index 
75d6181749e4e9bd81a3c02de69caf0acd81eef9..7260fe25d6ebb357040af8774c574b767bfd9f13 100644 --- a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc +++ b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc @@ -64,8 +64,9 @@ with random values sampled from a uniform distribution. } // namespace operators } // namespace paddle -REGISTER_OP_WITHOUT_GRADIENT( - uniform_random_batch_size_like, - paddle::operators::UniformRandomBatchSizeLikeOp, - paddle::operators::UniformRandomBatchSizeLikeOpMaker); +REGISTER_OPERATOR(uniform_random_batch_size_like, + paddle::operators::UniformRandomBatchSizeLikeOp, + paddle::operators::UniformRandomBatchSizeLikeOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::operators::BatchSizeLikeNoNeedBufferVarsInference); // Kernels are registered in uniform_random_op.cc and uniform_random_op.cu diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 5b79b759d555934bbc40a03da316d138f4e81a99..044677fb756e0368c65b84f15fdf2540abbd14b8 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1299,7 +1299,20 @@ All parameter, weight, gradient are variables in Paddle. to fuse relu and depthwise_conv2d, it will save GPU memory and may make the execution faster. This options is only available in GPU devices. - Default False)DOC") + Default False.)DOC") + .def_property( + "fuse_broadcast_ops", + [](const BuildStrategy &self) { return self.fuse_broadcast_ops_; }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized."); + self.fuse_broadcast_ops_ = b; + }, + R"DOC(The type is BOOL, fuse_broadcast_ops indicates whether + to fuse the broadcast ops. Note that, in Reduce mode, + fusing broadcast ops may make the program faster, because + fusing broadcast ops is equivalent to delaying the execution of all + broadcast ops; in this case, all nccl streams are used only + for NCCLReduce operations for a period of time. Default False.)DOC") + .def_property("fuse_all_optimizer_ops", [](const BuildStrategy &self) { return self.fuse_all_optimizer_ops_; diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 6303be003a701e57a8aa1e2f925459f416cdb543..9fd53a74bf51929f9e115fdc94f2f85f8e2fbdda 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -231,9 +231,16 @@ def _remove_no_grad_branch_(op_descs, no_grad_set): for idx, op_desc in enumerate(op_descs): for arg in op_desc.input_arg_names(): if core.grad_var_suffix() in arg and arg in no_grad_set: - to_insert.append((_create_op_desc_("fill_zeros_like", { - "X": [_strip_grad_suffix_(arg)] - }, {"Out": [arg]}, {}), idx)) + x_in = _strip_grad_suffix_(arg) + x_in_var_desc = op_desc.block().find_var_recursive( + cpt.to_bytes(x_in)) + assert x_in_var_desc is not None, "Variable {} not found".format( + x_in) + dtype = x_in_var_desc.dtype() + + to_insert.append( + (_create_op_desc_("fill_zeros_like2", {"X": [x_in]}, + {"Out": [arg]}, {"dtype": dtype}), idx)) list([op_descs.insert(p[1], p[0]) for p in reversed(to_insert)]) diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 7665703a726050b2a236b5d96bbafb6c24b2617a..bf3a16addcb532e7b6a781e3845e0f5791ac5d71 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -15,19 +15,20 @@ from __future__ import print_function from six.moves import reduce -import numpy as np from .. import core from ..layers import utils from . 
import layers -from ..framework import Variable, OpProtoHolder, Parameter -from ..layers import layer_function_generator +from ..framework import Variable, _in_dygraph_mode, OpProtoHolder, Parameter from ..param_attr import ParamAttr from ..initializer import Normal, Constant, NumpyArrayInitializer +import numpy as np __all__ = [ - 'Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit', 'LayerNorm', - 'NCE', 'PRelu', 'BilinearTensorProduct', 'Conv2DTranspose', 'SequenceConv' + 'Conv2D', 'Conv3D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit', + 'LayerNorm', 'NCE', 'PRelu', 'BilinearTensorProduct', 'Conv2DTranspose', + 'Conv3DTranspose', 'SequenceConv', 'RowConv', 'GroupNorm', 'SpectralNorm', + 'TreeConv' ] @@ -240,6 +241,303 @@ class Conv2D(layers.Layer): return self._helper.append_activation(pre_act, act=self._act) +class Conv3D(layers.Layer): + """ + **Convolution3D Layer** + + The convolution3D layer calculates the output based on the input, filter, + strides, paddings, dilations, and groups parameters. Input(Input) and + Output(Output) are in NCDHW format, where N is batch size, C is the number of + channels, D is the depth of the feature, H is the height of the feature, + and W is the width of the feature. Convolution3D is similar to Convolution2D + but adds one dimension (depth). If bias attribute and activation type are + provided, bias is added to the output of the convolution, and the + corresponding activation function is applied to the final result. + + For each input :math:`X`, the equation is: + + .. math:: + + Out = \sigma (W \\ast X + b) + + In the above equation: + + * :math:`X`: Input value, a tensor with NCDHW format. + * :math:`W`: Filter value, a tensor with MCDHW format. + * :math:`\\ast`: Convolution operation. + * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. + * :math:`\\sigma`: Activation function. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + + Example: + + - Input: + + Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` + + Filter shape: :math:`(C_{out}, C_{in}, D_f, H_f, W_f)` + + - Output: + Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` + + Where + + .. math:: + + D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\ + H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\ + W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1 + + Args: + input (Variable): The input image with [N, C, D, H, W] format. + num_filters(int): The number of filters. It is the same as the output + image channels. + filter_size (int|tuple|None): The filter size. If filter_size is a tuple, + it must contain three integers, (filter_size_D, filter_size_H, filter_size_W). + Otherwise, the filter will be a cube. + stride (int|tuple): The stride size. If stride is a tuple, it must + contain three integers, (stride_D, stride_H, stride_W). Otherwise, + stride_D = stride_H = stride_W = stride. Default: stride = 1. + padding (int|tuple): The padding size. If padding is a tuple, it must + contain three integers, (padding_D, padding_H, padding_W). Otherwise, + padding_D = padding_H = padding_W = padding. Default: padding = 0. + dilation (int|tuple): The dilation size. If dilation is a tuple, it must + contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, + dilation_D = dilation_H = dilation_W = dilation. Default: dilation = 1. 
+ groups (int): The number of groups of the Conv3D layer. According to grouped + convolution in Alex Krizhevsky's Deep CNN paper: when group=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. Default: groups=1 + param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights + of conv3d. If it is set to None or one attribute of ParamAttr, conv3d + will create ParamAttr as param_attr. If it is set to None, the parameter + is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is + :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv3d. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, conv3d + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. + use_cudnn (bool): Use cudnn kernel or not; it is valid only when the cudnn + library is installed. Default: True. + act (str): Activation type; if it is set to None, activation is not appended. + Default: None. + name (str|None): A name for this layer (optional). If set to None, the layer + will be named automatically. Default: None. + + Returns: + Variable: The tensor variable storing the convolution and \ + non-linearity activation result. + + Raises: + ValueError: If the shapes of input, filter_size, stride, padding and + groups mismatch. + + Examples: + .. code-block:: python + + data = fluid.layers.data(name='data', shape=[3, 12, 32, 32], dtype='float32') + conv3d = fluid.layers.conv3d(input=data, num_filters=2, filter_size=3, act="relu") + """ + + def __init__(self, + name_scope, + num_filters, + filter_size, + stride=1, + padding=0, + dilation=1, + groups=None, + param_attr=None, + bias_attr=None, + use_cudnn=True, + act=None): + assert param_attr is not False, "param_attr should not be False here." 
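For reference, each output-shape formula in the docstring above reduces to one integer expression per spatial axis. A minimal sketch of that arithmetic (the helper name is an assumption, not part of the patch):

def conv3d_out_dim(in_dim, padding, dilation, filter_dim, stride):
    # One spatial axis of the docstring formula:
    # out = (in + 2*pad - (dilation*(k - 1) + 1)) / stride + 1
    return (in_dim + 2 * padding -
            (dilation * (filter_dim - 1) + 1)) // stride + 1

# A 6x6x6 input with filter_size=2 and the defaults (stride=1, padding=0,
# dilation=1) -- the test_conv3d case later in this diff -- yields 5x5x5:
assert conv3d_out_dim(6, padding=0, dilation=1, filter_dim=2, stride=1) == 5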
+ super(Conv3D, self).__init__(name_scope) + self._groups = groups + self._stride = utils.convert_to_list(stride, 3, 'stride') + self._padding = utils.convert_to_list(padding, 3, 'padding') + self._dilation = utils.convert_to_list(dilation, 3, 'dilation') + self._act = act + if not isinstance(use_cudnn, bool): + raise ValueError("use_cudnn should be True or False") + self._use_cudnn = use_cudnn + self._filter_size = filter_size + self._num_filters = num_filters + self._param_attr = param_attr + self._bias_attr = bias_attr + + def _build_once(self, input): + num_channels = input.shape[1] + self._dtype = self._helper.input_dtype(input) + + if self._groups is None: + num_filter_channels = num_channels + else: + if num_channels % self._groups != 0: + raise ValueError("num_channels must be divisible by groups.") + num_filter_channels = num_channels // self._groups + + filter_size = utils.convert_to_list(self._filter_size, 3, 'filter_size') + + filter_shape = [self._num_filters, num_filter_channels] + filter_size + + def _get_default_param_initializer(): + filter_elem_num = filter_size[0] * filter_size[1] * filter_size[ + 2] * num_channels + std = (2.0 / filter_elem_num)**0.5 + return Normal(0.0, std, 0) + + self._filter_param = self.create_parameter( + attr=self._param_attr, + shape=filter_shape, + dtype=self._dtype, + default_initializer=_get_default_param_initializer()) + + self._bias_param = self.create_parameter( + attr=self._bias_attr, + shape=[self._num_filters], + dtype=self._dtype, + is_bias=True) + + def forward(self, input): + pre_bias = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + + self._helper.append_op( + type='conv3d', + inputs={ + 'Input': input, + 'Filter': self._filter_param, + }, + outputs={"Output": pre_bias}, + attrs={ + 'strides': self._stride, + 'paddings': self._padding, + 'dilations': self._dilation, + 'groups': self._groups if self._groups else 1, + 'use_cudnn': self._use_cudnn, + 'use_mkldnn': False + }) + + pre_act = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + + self._helper.append_op( + type='elementwise_add', + inputs={'X': [pre_bias], + 'Y': [self._bias_param]}, + outputs={'Out': [pre_act]}, + attrs={'axis': 1}) + + return self._helper.append_activation(pre_act, act=self._act) + + +class Conv3DTranspose(layers.Layer): + def __init__(self, + name_scope, + num_filters, + output_size=None, + filter_size=None, + padding=0, + stride=1, + dilation=1, + groups=None, + param_attr=None, + bias_attr=None, + use_cudnn=True, + act=None, + name=None): + super(Conv3DTranspose, self).__init__(name_scope) + if not isinstance(use_cudnn, bool): + raise ValueError("use_cudnn should be True or False") + assert param_attr is not False, "param_attr should not be False in conv3d_transpose." 
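The default initializer chosen in `_build_once` above is a He/MSRA-style rule: std = sqrt(2 / fan_in), with fan_in taken as the filter volume times the number of input channels. A quick check of that arithmetic, with illustrative values only:

# Illustrative values: a 3x3x3 filter over 4 input channels.
filter_elem_num = 3 * 3 * 3 * 4        # fan-in, as in _get_default_param_initializer
std = (2.0 / filter_elem_num) ** 0.5   # ~0.136
# The filter parameter is then drawn from Normal(0.0, std).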
+ self._padding = utils.convert_to_list(padding, 3, 'padding') + self._stride = utils.convert_to_list(stride, 3, 'stride') + self._dilation = utils.convert_to_list(dilation, 3, 'dilation') + self._param_attr = param_attr + self._filter_size = filter_size + self._output_size = output_size + self._groups = 1 if groups is None else groups + self._num_filters = num_filters + self._use_cudnn = use_cudnn + self._bias_attr = bias_attr + self._act = act + + def _build_once(self, input): + self._dtype = self._helper.input_dtype(input) + self._input_channel = input.shape[1] + + if self._filter_size is None: + if self._output_size is None: + raise ValueError( + "output_size must be set when filter_size is None") + if isinstance(self._output_size, int): + # an int output_size must expand to all three spatial axes + self._output_size = [self._output_size] * 3 + + d_in = input.shape[2] + h_in = input.shape[3] + w_in = input.shape[4] + + filter_size_d = (self._output_size[0] - + (d_in - 1) * self._stride[0] + 2 * self._padding[0] + - 1) // self._dilation[0] + 1 + filter_size_h = (self._output_size[1] - + (h_in - 1) * self._stride[1] + 2 * self._padding[1] + - 1) // self._dilation[1] + 1 + filter_size_w = (self._output_size[2] - + (w_in - 1) * self._stride[2] + 2 * self._padding[2] + - 1) // self._dilation[2] + 1 + self._filter_size = [filter_size_d, filter_size_h, filter_size_w] + else: + self._filter_size = utils.convert_to_list( + self._filter_size, 3, 'conv3d_transpose.filter_size') + + filter_shape = [ + self._input_channel, self._num_filters // self._groups + ] + self._filter_size + self._img_filter = self.create_parameter( + dtype=self._dtype, shape=filter_shape, attr=self._param_attr) + if self._bias_attr: + self._bias_param = self.create_parameter( + attr=self._bias_attr, + shape=[self._num_filters], + dtype=self._dtype, + is_bias=True) + + def forward(self, input): + pre_bias = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + self._helper.append_op( + type="conv3d_transpose", + inputs={'Input': [input], + 'Filter': [self._img_filter]}, + outputs={'Output': pre_bias}, + attrs={ + 'strides': self._stride, + 'paddings': self._padding, + 'dilations': self._dilation, + 'groups': self._groups if self._groups else 1, + 'use_cudnn': self._use_cudnn + }) + + if self._bias_attr: + pre_act = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + self._helper.append_op( + type='elementwise_add', + inputs={'X': [pre_bias], + 'Y': [self._bias_param]}, + outputs={'Out': [pre_act]}, + attrs={'axis': 1}) + else: + pre_act = pre_bias + + # Currently, we don't support inplace in imperative mode + return self._helper.append_activation(pre_act, act=self._act) + + class Pool2D(layers.Layer): """ ${comment} @@ -1720,6 +2018,8 @@ class SequenceConv(layers.Layer): bias_attr=None, param_attr=None, act=None): + assert not _in_dygraph_mode( + ), "SequenceConv is not supported by dynamic graph mode yet!" 
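When only `output_size` is given, `_build_once` above derives the filter size by inverting the transposed-convolution output relation. A minimal sketch of that inversion (the helper name is an assumption):

def infer_transpose_filter_dim(out_dim, in_dim, stride, padding, dilation):
    # Rearranged from out = (in - 1)*stride - 2*pad + dilation*(k - 1) + 1,
    # solved for k with floor division, as in the layer code above.
    return (out_dim - (in_dim - 1) * stride + 2 * padding - 1) // dilation + 1

# A 2x2x2 input with stride=1, padding=0, dilation=1 and a desired 13x13x13
# output needs filter_size 12 -- consistent with test_conv3d_transpose near
# the end of this diff (filter_size=12 on a 2x2x2 input):
assert infer_transpose_filter_dim(13, 2, 1, 0, 1) == 12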
super(SequenceConv, self).__init__(name_scope) self._num_filters = num_filters self._filter_size = filter_size @@ -1729,12 +2029,10 @@ class SequenceConv(layers.Layer): self._param_attr = param_attr def _build_once(self, input): - self._dtype = self._helper.input_dtype(input) - print(self._filter_size) filter_shape = [self._filter_size * input.shape[1], self._num_filters] self._filter_param = self.create_parameter( - attr=self.param_attr, shape=filter_shape, dtype=self._dtype) + attr=self._param_attr, shape=filter_shape, dtype=self._dtype) def forward(self, input): pre_bias = self._helper.create_variable_for_type_inference(self._dtype) @@ -1752,3 +2050,237 @@ class SequenceConv(layers.Layer): }) pre_act = self._helper.append_bias_op(pre_bias) return self._helper.append_activation(pre_act) + + +class RowConv(layers.Layer): + def __init__(self, + name_scope, + future_context_size, + param_attr=None, + act=None): + assert not _in_dygraph_mode( + ), "RowConv is not supported by dynamic graph mode yet!" + super(RowConv, self).__init__(name_scope) + self._act = act + self._param_attr = param_attr + self._future_context_size = future_context_size + + def _build_once(self, input): + self._dtype = self._helper.input_dtype(input) + filter_shape = [self._future_context_size + 1, input.shape[1]] + self._filter_param = self.create_parameter( + attr=self._param_attr, + shape=filter_shape, + dtype=self._dtype, + is_bias=False) + + def forward(self, input): + out = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type='row_conv', + inputs={'X': [input], + 'Filter': [self._filter_param]}, + outputs={'Out': [out]}) + return self._helper.append_activation(out, act=self._act) + + +class GroupNorm(layers.Layer): + """ + **Group Normalization Layer** + + Refer to `Group Normalization `_ . + + Args: + name_scope (str): See base class. + groups(int): The number of groups the channels are divided into. + epsilon(float): The small value added to the variance to prevent + division by zero. + param_attr(ParamAttr|None): The parameter attribute for the learnable + scale :math:`g`. If it is set to False, no scale will be added to the output units. + If it is set to None, the scale is initialized one. Default: None. + bias_attr(ParamAttr|None): The parameter attribute for the learnable + bias :math:`b`. If it is set to False, no bias will be added to the output units. + If it is set to None, the bias is initialized zero. Default: None. + act(str): Activation to be applied to the output of group normalization. + data_layout(string|NCHW): Only NCHW is supported. + dtype(np.dtype|core.VarDesc.VarType|str): The data type: float32, float16, int, etc. + + Returns: + Variable: A tensor variable which is the result after applying group normalization on the input. 
+ + + """ + + def __init__(self, + name_scope, + groups, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + act=None, + data_layout='NCHW'): + super(GroupNorm, self).__init__(name_scope) + self._param_attr = param_attr + self._bias_attr = bias_attr + self._epsilon = epsilon + self._groups = groups + self._act = act + if data_layout != 'NCHW': + raise ValueError("unsupported data layout:" + data_layout) + + def _build_once(self, input): + self._dtype = self._helper.input_dtype(input) + param_shape = [input.shape[1]] + if self._bias_attr: + self._bias = self.create_parameter( + attr=self._bias_attr, + shape=param_shape, + dtype=self._dtype, + is_bias=True) + + if self._param_attr: + self._scale = self.create_parameter( + attr=self._param_attr, + shape=param_shape, + dtype=self._dtype, + default_initializer=Constant(1.0)) + + def forward(self, input): + inputs = {'X': input} + # Scale and Bias are created in _build_once only when the corresponding + # attr is set, so guard on the attrs rather than on the parameters. + if self._bias_attr: + inputs['Bias'] = self._bias + if self._param_attr: + inputs['Scale'] = self._scale + + # create output + mean_out = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + variance_out = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + group_norm_out = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + + self._helper.append_op( + type="group_norm", + inputs=inputs, + outputs={ + "Y": group_norm_out, + "Mean": mean_out, + "Variance": variance_out, + }, + attrs={"epsilon": self._epsilon, + "groups": self._groups}) + + return self._helper.append_activation(group_norm_out, self._act) + + +class SpectralNorm(layers.Layer): + def __init__(self, name_scope, dim=0, power_iters=1, eps=1e-12, name=None): + super(SpectralNorm, self).__init__(name_scope) + self._power_iters = power_iters + self._eps = eps + self._dim = dim + + def _build_once(self, weight): + self._dtype = self._helper.input_dtype(weight) + input_shape = weight.shape + h = input_shape[self._dim] + w = np.prod(input_shape) // h + + self.u = self.create_parameter( + attr=ParamAttr(), + shape=[h], + dtype=self._dtype, + default_initializer=Normal(0., 1.)) + self.u.stop_gradient = True + + self.v = self.create_parameter( + attr=ParamAttr(), + shape=[w], + dtype=self._dtype, + default_initializer=Normal(0., 1.)) + self.v.stop_gradient = True + + def forward(self, weight): + inputs = {'Weight': weight, 'U': self.u, 'V': self.v} + out = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type="spectral_norm", + inputs=inputs, + outputs={"Out": out, }, + attrs={ + "dim": self._dim, + "power_iters": self._power_iters, + "eps": self._eps, + }) + + return out + + +class TreeConv(layers.Layer): + def __init__(self, + name_scope, + output_size, + num_filters=1, + max_depth=2, + act='tanh', + param_attr=None, + bias_attr=None, + name=None): + super(TreeConv, self).__init__(name_scope) + self._name = name + self._output_size = output_size + self._act = act + self._max_depth = max_depth + self._num_filters = num_filters + self._bias_attr = bias_attr + self._param_attr = param_attr + + def _build_once(self, nodes_vector, edge_set): + assert isinstance(nodes_vector, Variable) + assert isinstance(edge_set, Variable) + self._dtype = self._helper.input_dtype(nodes_vector) + + feature_size = nodes_vector.shape[2] + w_shape = [feature_size, 3, self._output_size, self._num_filters] + if self._bias_attr: + self._bias_param = self.create_parameter( + attr=self._bias_attr, + shape=[self._num_filters], + dtype=self._dtype, + is_bias=True) + 
self.W = self.create_parameter( + attr=self._param_attr, + shape=w_shape, + dtype=self._dtype, + is_bias=False) + + def forward(self, nodes_vector, edge_set): + if self._name: + out = self.create_variable( + name=self._name, dtype=self._dtype, persistable=False) + else: + out = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + + self._helper.append_op( + type='tree_conv', + inputs={ + 'NodesVector': nodes_vector, + 'EdgeSet': edge_set, + 'Filter': self.W + }, + outputs={'Out': out, }, + attrs={'max_depth': self._max_depth}) + if self._bias_attr: + pre_activation = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + self._helper.append_op( + type='elementwise_add', + inputs={'X': [out], + 'Y': [self._bias_param]}, + outputs={'Out': [pre_activation]}, + attrs={'axis': 1}) + else: + pre_activation = out + return self._helper.append_activation(pre_activation, act=self._act) diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index fd07ff0ba3d21721fbbc46099f7dcb6937f93524..c7c82f28e7c441b4aa24ffa81a8695e565d737d8 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -227,7 +227,7 @@ class Precision(MetricBase): metric.reset() for data in train_reader(): loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) - metric.update(preds=preds, labels=labels) + metric.update(preds=preds, labels=labels) numpy_precision = metric.eval() """ @@ -241,9 +241,11 @@ class Precision(MetricBase): raise ValueError("The 'preds' must be a numpy ndarray.") if not _is_numpy_(labels): raise ValueError("The 'labels' must be a numpy ndarray.") - sample_num = labels[0] + sample_num = labels.shape[0] + preds = np.rint(preds).astype("int32") + for i in range(sample_num): - pred = preds[i].astype("int32") + pred = preds[i] label = labels[i] if label == 1: if pred == label: diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 1390e759d7e309674a2ecb61c59043b0f5032400..0291bc25ed3145a580582fa30a965fd76047daba 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -81,6 +81,7 @@ list(REMOVE_ITEM TEST_OPS test_imperative_resnet) list(REMOVE_ITEM TEST_OPS test_imperative_se_resnext) list(REMOVE_ITEM TEST_OPS test_imperative_mnist) list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer) +list(REMOVE_ITEM TEST_OPS test_layers) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) @@ -118,7 +119,7 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450) py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL) - +py_test_modules(test_layers MODULES test_layers ENVS FLAGS_cudnn_deterministic=1) if(NOT WIN32) py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer SERIAL) endif() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_no_need_buffer_vars_inference.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_no_need_buffer_vars_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..a84ff1fd6d46c30ad7aa72f1b29a8ae668b90e20 --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/test_eager_deletion_no_need_buffer_vars_inference.py @@ -0,0 +1,48 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle.fluid as fluid +import importlib + +fluid.core._set_eager_deletion_mode(0.0, 1.0, True) + +from test_elementwise_add_op import * +from test_elementwise_sub_op import * +from test_concat_op import * +from test_gather_op import * +from test_gaussian_random_batch_size_like_op import * +from test_uniform_random_batch_size_like_op import * +from test_fill_constant_batch_size_like_op import * +from test_lod_reset_op import * +from test_scatter_op import * +from test_mean_op import * +from test_slice_op import * +from test_linear_chain_crf_op import * +from test_bilinear_interp_op import * +from test_nearest_interp_op import * +from test_sequence_concat import * +from test_seq_conv import * +from test_seq_pool import * +from test_sequence_expand_as import * +from test_sequence_expand import * +from test_sequence_pad_op import * +from test_sequence_unpad_op import * +from test_sequence_scatter_op import * +from test_sequence_slice_op import * +from test_pad2d_op import * +from test_fill_zeros_like2_op import * + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fill_zeros_like2_op.py b/python/paddle/fluid/tests/unittests/test_fill_zeros_like2_op.py new file mode 100644 index 0000000000000000000000000000000000000000..935653b07a6a4e1d344e8040fa4a0ed72b9b164d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fill_zeros_like2_op.py @@ -0,0 +1,50 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
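The `fill_zeros_like2` op covered by the next test is the one `backward.py` (earlier in this diff) now inserts for pruned gradient branches; unlike `fill_zeros_like`, it takes an explicit `dtype` attribute so the zero gradient can be pinned to the forward variable's dtype. A numpy sketch of that contract (the function here is an illustration, not the op's implementation):

import numpy as np

def fill_zeros_like2(x, dtype):
    # Out: zeros with X's shape and the explicitly requested dtype.
    return np.zeros(x.shape, dtype=dtype)

x = np.random.random((219, 232)).astype(np.float32)
assert fill_zeros_like2(x, np.float16).shape == x.shape
assert fill_zeros_like2(x, np.float16).dtype == np.float16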
+ +from __future__ import print_function + +import unittest +import numpy as np +from paddle.fluid.framework import convert_np_dtype_to_dtype_ +from op_test import OpTest + + +class TestFillZerosLike2Op(OpTest): + def setUp(self): + self.op_type = "fill_zeros_like2" + self.dtype = np.float32 + self.init_dtype() + self.inputs = {'X': np.random.random((219, 232)).astype(self.dtype)} + self.outputs = {'Out': np.zeros_like(self.inputs["X"])} + self.attrs = {'dtype': convert_np_dtype_to_dtype_(self.dtype)} + + def init_dtype(self): + pass + + def test_check_output(self): + self.check_output() + + +class TestFillZerosLike2OpFp16(TestFillZerosLike2Op): + def init_dtype(self): + self.dtype = np.float16 + + +class TestFillZerosLike2OpFp64(TestFillZerosLike2Op): + def init_dtype(self): + self.dtype = np.float64 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index 959b48d03cac533af55017f2c7297cc1ac006f51..576ca57a3c518b68540ea8e682118bf4b731c308 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -348,6 +348,55 @@ class TestImperative(unittest.TestCase): self.assertEqual(mlp._fc2, sublayers[1]) self.assertEqual(len(sublayers), 2) + def test_dygraph_vs_static(self): + inp1 = np.random.rand(4, 3, 3) + inp2 = np.random.rand(4, 3, 3) + + # dynamic graph + with fluid.dygraph.guard(): + if np.sum(inp1) < np.sum(inp2): + x = fluid.layers.elementwise_add(inp1, inp2) + else: + x = fluid.layers.elementwise_sub(inp1, inp2) + dygraph_result = x._numpy() + + # static graph + with new_program_scope(): + inp_data1 = fluid.layers.data( + name='inp1', shape=[3, 3], dtype=np.float32) + inp_data2 = fluid.layers.data( + name='inp2', shape=[3, 3], dtype=np.float32) + + a = fluid.layers.expand( + fluid.layers.reshape( + fluid.layers.reduce_sum(inp_data1), [1, 1]), [4, 1]) + b = fluid.layers.expand( + fluid.layers.reshape( + fluid.layers.reduce_sum(inp_data2), [1, 1]), [4, 1]) + cond = fluid.layers.less_than(x=a, y=b) + + ie = fluid.layers.IfElse(cond) + with ie.true_block(): + d1 = ie.input(inp_data1) + d2 = ie.input(inp_data2) + d3 = fluid.layers.elementwise_add(d1, d2) + ie.output(d3) + + with ie.false_block(): + d1 = ie.input(inp_data1) + d2 = ie.input(inp_data2) + d3 = fluid.layers.elementwise_sub(d1, d2) + ie.output(d3) + out = ie() + + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) + static_result = exe.run(fluid.default_main_program(), + feed={'inp1': inp1, + 'inp2': inp2}, + fetch_list=out)[0] + self.assertTrue(np.allclose(dygraph_result, static_result)) + def test_rnn(self): np_inp = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0], [10.0, 11.0, 12.0]]) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 2fd82884f4a8d07ced06d5775e5b414ebc2ef654..1abbdf7d6702449ff84768533dfe80b941afe79c 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -595,6 +595,280 @@ class TestLayer(LayerTest): self.assertTrue(np.allclose(static_rlt2, static_rlt)) self.assertTrue(np.allclose(nce_loss3.numpy(), static_rlt)) + def test_conv3d(self): + with self.static_graph(): + images = layers.data( + name='pixel', shape=[3, 6, 6, 6], dtype='float32') + ret = layers.conv3d(input=images, num_filters=3, 
filter_size=2) + static_ret = self.get_static_graph_result( + feed={'pixel': np.ones( + [2, 3, 6, 6, 6], dtype='float32')}, + fetch_list=[ret])[0] + + with self.static_graph(): + images = layers.data( + name='pixel', shape=[3, 6, 6, 6], dtype='float32') + conv3d = nn.Conv3D('conv3d', num_filters=3, filter_size=2) + ret = conv3d(images) + static_ret2 = self.get_static_graph_result( + feed={'pixel': np.ones( + [2, 3, 6, 6, 6], dtype='float32')}, + fetch_list=[ret])[0] + + with self.dynamic_graph(): + images = np.ones([2, 3, 6, 6, 6], dtype='float32') + conv3d = nn.Conv3D('conv3d', num_filters=3, filter_size=2) + dy_ret = conv3d(base.to_variable(images)) + + self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) + self.assertTrue(np.allclose(static_ret, static_ret2)) + + def test_row_conv(self): + input = np.arange(15).reshape([3, 5]).astype('float32') + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + + with self.static_graph(): + x = layers.data( + name='X', + shape=[3, 5], + dtype='float32', + lod_level=1, + append_batch_size=False) + ret = layers.row_conv(input=x, future_context_size=2) + static_ret = self.get_static_graph_result( + feed={ + 'X': fluid.create_lod_tensor( + data=input, recursive_seq_lens=[[1, 1, 1]], place=place) + }, + fetch_list=[ret], + with_lod=True)[0] + + with self.static_graph(): + x = layers.data( + name='X', + shape=[3, 5], + dtype='float32', + lod_level=1, + append_batch_size=False) + rowConv = nn.RowConv('RowConv', future_context_size=2) + ret = rowConv(x) + static_ret2 = self.get_static_graph_result( + feed={ + 'X': fluid.create_lod_tensor( + data=input, recursive_seq_lens=[[1, 1, 1]], place=place) + }, + fetch_list=[ret], + with_lod=True)[0] + + # TODO: dygraph can't support LODTensor + + self.assertTrue(np.allclose(static_ret, static_ret2)) + + def test_group_norm(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + + shape = (2, 4, 3, 3) + + input = np.random.random(shape).astype('float32') + + with self.static_graph(): + X = fluid.layers.data( + name='X', + shape=shape, + dtype='float32', + lod_level=1, + append_batch_size=False) + ret = layers.group_norm(input=X, groups=2) + static_ret = self.get_static_graph_result( + feed={ + 'X': fluid.create_lod_tensor( + data=input, recursive_seq_lens=[[1, 1]], place=place) + }, + fetch_list=[ret], + with_lod=True)[0] + + with self.static_graph(): + X = fluid.layers.data( + name='X', + shape=shape, + dtype='float32', + lod_level=1, + append_batch_size=False) + groupNorm = nn.GroupNorm('GroupNorm', groups=2) + ret = groupNorm(X) + static_ret2 = self.get_static_graph_result( + feed={ + 'X': fluid.create_lod_tensor( + data=input, recursive_seq_lens=[[1, 1]], place=place) + }, + fetch_list=[ret], + with_lod=True)[0] + + with self.dynamic_graph(): + groupNorm = nn.GroupNorm('GroupNorm', groups=2) + dy_ret = groupNorm(base.to_variable(input)) + + self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) + self.assertTrue(np.allclose(static_ret, static_ret2)) + + def test_spectral_norm(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + + shape = (2, 4, 3, 3) + + input = np.random.random(shape).astype('float32') + + with self.static_graph(): + Weight = fluid.layers.data( + name='Weight', + shape=shape, + dtype='float32', + lod_level=1, + append_batch_size=False) + ret = layers.spectral_norm(weight=Weight, dim=1, power_iters=2) + static_ret = 
self.get_static_graph_result( + feed={ + 'Weight': fluid.create_lod_tensor( + data=input, recursive_seq_lens=[[1, 1]], place=place), + }, + fetch_list=[ret], + with_lod=True)[0] + + with self.static_graph(): + Weight = fluid.layers.data( + name='Weight', + shape=shape, + dtype='float32', + lod_level=1, + append_batch_size=False) + spectralNorm = nn.SpectralNorm('SpectralNorm', dim=1, power_iters=2) + ret = spectralNorm(Weight) + static_ret2 = self.get_static_graph_result( + feed={ + 'Weight': fluid.create_lod_tensor( + data=input, recursive_seq_lens=[[1, 1]], place=place) + }, + fetch_list=[ret], + with_lod=True)[0] + + with self.dynamic_graph(): + spectralNorm = nn.SpectralNorm('SpectralNorm', dim=1, power_iters=2) + dy_ret = spectralNorm(base.to_variable(input)) + + self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) + self.assertTrue(np.allclose(static_ret, static_ret2)) + + def test_tree_conv(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + adj_array = [1, 2, 1, 3, 1, 4, 1, 5, 2, 6, 2, 7, 2, 8, 4, 9, 4, 10] + adj = np.array(adj_array).reshape((1, 9, 2)).astype('int32') + adj = np.tile(adj, (1, 1, 1)) + vectors = np.random.random((1, 10, 5)).astype('float32') + with self.static_graph(): + NodesVector = fluid.layers.data( + name='NodesVector', + shape=(1, 10, 5), + dtype='float32', + lod_level=1, + append_batch_size=False) + EdgeSet = fluid.layers.data( + name='EdgeSet', + shape=(1, 9, 2), + dtype='int32', + lod_level=1, + append_batch_size=False) + ret = layers.tree_conv( + nodes_vector=NodesVector, + edge_set=EdgeSet, + output_size=6, + num_filters=1, + max_depth=2) + static_ret = self.get_static_graph_result( + feed={ + 'NodesVector': fluid.create_lod_tensor( + data=vectors, recursive_seq_lens=[[1]], place=place), + 'EdgeSet': fluid.create_lod_tensor( + data=adj, recursive_seq_lens=[[1]], place=place) + }, + fetch_list=[ret], + with_lod=False)[0] + + with self.static_graph(): + NodesVector = fluid.layers.data( + name='NodesVector', + shape=(1, 10, 5), + dtype='float32', + lod_level=1, + append_batch_size=False) + EdgeSet = fluid.layers.data( + name='EdgeSet', + shape=(1, 9, 2), + dtype='int32', + lod_level=1, + append_batch_size=False) + treeConv = nn.TreeConv( + 'TreeConv', output_size=6, num_filters=1, max_depth=2) + ret = treeConv(NodesVector, EdgeSet) + static_ret2 = self.get_static_graph_result( + feed={ + 'NodesVector': fluid.create_lod_tensor( + data=vectors, recursive_seq_lens=[[1]], place=place), + 'EdgeSet': fluid.create_lod_tensor( + data=adj, recursive_seq_lens=[[1]], place=place) + }, + fetch_list=[ret], + with_lod=False)[0] + + with self.dynamic_graph(): + treeConv = nn.TreeConv( + 'TreeConv', output_size=6, num_filters=1, max_depth=2) + dy_ret = treeConv(base.to_variable(vectors), base.to_variable(adj)) + + self.assertTrue(np.allclose(static_ret, static_ret2)) + self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) + + def test_conv3d_transpose(self): + input_array = np.arange(0, 48).reshape( + [2, 3, 2, 2, 2]).astype('float32') + + with self.static_graph(): + img = layers.data(name='pixel', shape=[3, 2, 2, 2], dtype='float32') + out = layers.conv3d_transpose( + input=img, num_filters=12, filter_size=12, use_cudnn=False) + static_rlt = self.get_static_graph_result( + feed={'pixel': input_array}, fetch_list=[out])[0] + with self.static_graph(): + img = layers.data(name='pixel', shape=[3, 2, 2, 2], dtype='float32') + conv3d_transpose = nn.Conv3DTranspose( + 'Conv3DTranspose', + num_filters=12, + 
filter_size=12, + use_cudnn=False) + out = conv3d_transpose(img) + static_rlt2 = self.get_static_graph_result( + feed={'pixel': input_array}, fetch_list=[out])[0] + with self.dynamic_graph(): + conv3d_transpose = nn.Conv3DTranspose( + 'Conv3DTranspose', + num_filters=12, + filter_size=12, + use_cudnn=False) + dy_rlt = conv3d_transpose(base.to_variable(input_array)) + self.assertTrue(np.allclose(static_rlt2, static_rlt)) + self.assertTrue(np.allclose(dy_rlt._numpy(), static_rlt)) + class TestBook(unittest.TestCase): def test_fit_a_line(self): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py index bda8b666dcde22b0e4bacdb5db252267f4c7e34b..645b0188d5f45935ace074ba343de246af476b41 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py @@ -38,7 +38,15 @@ def Lenet(data, class_dim): class TestFetchAndFeed(unittest.TestCase): - def parallel_exe(self, use_cuda, run_parallel_exe, seed=1): + @classmethod + def setUpClass(cls): + os.environ['CPU_NUM'] = str(4) + + def parallel_exe(self, + use_cuda, + run_parallel_exe, + use_experimental_executor=False, + seed=1): main_program = fluid.Program() startup = fluid.Program() startup.random_seed = seed @@ -63,8 +71,12 @@ class TestFetchAndFeed(unittest.TestCase): build_strategy = fluid.BuildStrategy() build_strategy.enable_inplace = False build_strategy.memory_optimize = False + exec_strategy = fluid.ExecutionStrategy() + exec_strategy.use_experimental_executor = use_experimental_executor train_cp = compiler.CompiledProgram(main_program).with_data_parallel( - loss_name=loss.name, build_strategy=build_strategy) + loss_name=loss.name, + build_strategy=build_strategy, + exec_strategy=exec_strategy) run_parallel_exe(train_cp, exe, use_cuda, data, label, loss) @@ -131,8 +143,7 @@ class TestFetchAndFeed(unittest.TestCase): if batch_id == 2: break - def test_fetch(self): - os.environ['CPU_NUM'] = str(4) + def test_fetch_with_threaded_executor(self): if core.is_compiled_with_cuda(): self.parallel_exe( use_cuda=True, @@ -140,8 +151,18 @@ class TestFetchAndFeed(unittest.TestCase): self.parallel_exe( use_cuda=False, run_parallel_exe=self.run_parallel_exe_with_fetch) + def test_fetch_with_fast_threaded_executor(self): + if core.is_compiled_with_cuda(): + self.parallel_exe( + use_cuda=True, + run_parallel_exe=self.run_parallel_exe_with_fetch, + use_experimental_executor=True) + self.parallel_exe( + use_cuda=False, + run_parallel_exe=self.run_parallel_exe_with_fetch, + use_experimental_executor=True) + def test_feed(self): - os.environ['CPU_NUM'] = str(4) if core.is_compiled_with_cuda(): self.parallel_exe( use_cuda=True, run_parallel_exe=self.run_parallel_exe_with_feed)
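The executor selection that the updated test toggles comes down to a few lines of strategy wiring. A minimal sketch, assuming a `main_program` with a `loss` variable has already been built:

import paddle.fluid as fluid
from paddle.fluid import compiler

exec_strategy = fluid.ExecutionStrategy()
exec_strategy.use_experimental_executor = True  # the fast threaded executor

build_strategy = fluid.BuildStrategy()
train_cp = compiler.CompiledProgram(main_program).with_data_parallel(
    loss_name=loss.name,
    build_strategy=build_strategy,
    exec_strategy=exec_strategy)
# train_cp is then passed to exe.run(...), as in the test above.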