diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index bc7fe5454f5883108e43b4ca47920995dc13a1ff..69da9b98198de358348621ecdb444f2f81c7757f 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -201,7 +201,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
         SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64")
     ENDIF()
 
-    SET(PROTOBUF_REPO "https://github.com/google/protobuf.git")
+    SET(PROTOBUF_REPO "https://github.com/protocolbuffers/protobuf.git")
     SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546")
 
     ExternalProject_Add(
diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.cc b/paddle/fluid/framework/details/all_reduce_deps_pass.cc
index 878b950858a71ba0e10ab2643667922420d29099..c44793cd11d22b29b4b3422a047d81fe26624982 100644
--- a/paddle/fluid/framework/details/all_reduce_deps_pass.cc
+++ b/paddle/fluid/framework/details/all_reduce_deps_pass.cc
@@ -13,125 +13,186 @@
 // limitations under the License.
 
 #include <algorithm>
-#include <memory>
+#include <map>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
+#include <utility>
 #include <vector>
 
-#include "paddle/fluid/framework/details/all_reduce_deps_pass.h"
 #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
+#include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/op_graph_view.h"
-#include "paddle/fluid/framework/details/var_handle.h"
+#include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
 
-VarHandle* GetValidInput(const OpHandleBase* a) {
-  for (auto p : a->Inputs()) {
-    VarHandle* b = dynamic_cast<VarHandle*>(p);
-    if (b) {
-      return b;
+class AllReduceDepsPass : public ir::Pass {
+ protected:
+  void ApplyImpl(ir::Graph* graph) const override {
+    std::vector<AllReduceOpHandle*> all_reduce_op_handles =
+        GetSortedAllReduceOps(*graph);
+
+    for (size_t i = 1; i < all_reduce_op_handles.size(); ++i) {
+      auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());
+      graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
+      all_reduce_op_handles[i - 1]->AddOutput(dep_var);
+      all_reduce_op_handles[i]->AddInput(dep_var);
     }
-  }
 
-  return nullptr;
-}
-
-void AllReduceDepsPass::ApplyImpl(ir::Graph* graph) const {
-  auto graph_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
-
-  // get vars order
-  int order = 0;
-  std::unordered_map<std::string, int> vars;
-  // TODO(gongwb): use graph topology sort to find the order of operators.
-  //               Note that must assert topology sort is stable
-  auto& ops = graph->Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
-  for (auto* op_desc : ops) {
-    try {
-      bool is_bk_op =
-          static_cast<bool>(boost::get<int>(op_desc->GetAttr(
-                                OpProtoAndCheckerMaker::OpRoleAttrName())) &
-                            static_cast<int>(OpRole::kBackward));
-      if (!is_bk_op) continue;
-
-      auto backward_vars =
-          boost::get<std::vector<std::string>>(op_desc->GetNullableAttr(
-              OpProtoAndCheckerMaker::OpRoleVarAttrName()));
-      PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
-
-      auto outputs = op_desc->Outputs();
-      for (auto& o_it : outputs) {
-        for (auto& v : o_it.second) {  // values
-          vars[v] = order;
-          VLOG(10) << "in all_reduce_deps_pass:" << v;
-        }
-      }
-      order++;
-    } catch (boost::bad_get e) {
+    if (VLOG_IS_ON(10)) {
+      DebugString(*graph, all_reduce_op_handles);
     }
   }
 
-  std::vector<OpHandleBase*> dist_ops;
-  // get allreduce ops.
-  for (auto& op : graph_ops) {
-    // FIXME(gongwb):add broad cast.
-    if (op->Name() == "all_reduce" || op->Name() == "reduce") {
-      dist_ops.push_back(op);
+  std::vector<AllReduceOpHandle*> GetSortedAllReduceOps(
+      const ir::Graph& graph) const {
+    std::vector<AllReduceOpHandle*> all_reduce_op_handles;
+    std::unordered_map<OpHandleBase*, size_t> pending_ops;
+    std::unordered_set<OpHandleBase*> ready_ops;
+    std::unordered_set<OpHandleBase*> next_ready_ops;
+
+    auto op_handles = ir::FilterByNodeWrapper<OpHandleBase>(graph);
+    size_t num_of_ops = op_handles.size();
+    for (OpHandleBase* op : op_handles) {
+      size_t not_ready_vars = op->NotReadyInputSize();
+      if (not_ready_vars) {
+        pending_ops.insert({op, not_ready_vars});
+      } else {
+        ready_ops.insert(op);
+      }
     }
-  }
-
-  VLOG(10) << "dist_ops size:" << dist_ops.size()
-           << ", outputs size:" << vars.size() << ", ops size:" << ops.size();
-
-  std::sort(dist_ops.begin(), dist_ops.end(), [&](OpHandleBase* op1,
-                                                  OpHandleBase* op2) {
-    VarHandle* i0 = dynamic_cast<VarHandle*>(GetValidInput(op1));
-    VarHandle* i1 = dynamic_cast<VarHandle*>(GetValidInput(op2));
-
-    PADDLE_ENFORCE(i0 != nullptr && i1 != nullptr, "%s convert to %s error",
-                   op1->DebugString(), op2->DebugString());
 
-    auto l_it = vars.find(i0->name());
-    auto r_it = vars.find(i1->name());
-
-    PADDLE_ENFORCE(l_it != vars.end() && r_it != vars.end(),
-                   "can't find var's name %s and %s in opdesc", i0->name(),
-                   i1->name());
-
-    if (l_it->second < r_it->second) return true;
+    GetSortedAllReduceOps(ready_ops, &all_reduce_op_handles);
+
+    size_t has_run_ops = ready_ops.size();
+    while (has_run_ops != num_of_ops) {
+      for (auto* op : ready_ops) {
+        for (auto& ready_var : op->Outputs()) {
+          for (auto* pend_op : ready_var->PendingOps()) {
+            auto& deps = --pending_ops[pend_op];
+            if (deps == 0) {
+              next_ready_ops.insert(pend_op);
+            }
+          }
+        }
+      }
 
-    if (l_it->second == r_it->second) {
-      return i0->name() < i1->name();
+      PADDLE_ENFORCE_NE(next_ready_ops.size(), 0, "There maybe have a cycle.");
+      ready_ops.clear();
+      std::swap(ready_ops, next_ready_ops);
+      GetSortedAllReduceOps(ready_ops, &all_reduce_op_handles);
+      has_run_ops += ready_ops.size();
     }
+    return all_reduce_op_handles;
+  }
 
-    return false;
-  });
-
-  // add dependency.
-  auto& sorted_ops = dist_ops;
-  for (size_t i = 1; i < sorted_ops.size(); ++i) {
-    auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());
-
-    auto* pre_op = sorted_ops[i - 1];
-    auto* op = sorted_ops[i];
-
-    pre_op->AddOutput(dep_var);
-    op->AddInput(dep_var);
-    graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
+  void GetSortedAllReduceOps(
+      const std::unordered_set<OpHandleBase*>& ready_ops,
+      std::vector<AllReduceOpHandle*>* all_reduce_op_handles) const {
+    std::vector<AllReduceOpHandle*> current_all_reduce_op_handles;
+    for (auto& op_handle : ready_ops) {
+      auto all_reduce_op_handle = dynamic_cast<AllReduceOpHandle*>(op_handle);
+      if (all_reduce_op_handle) {
+        current_all_reduce_op_handles.emplace_back(all_reduce_op_handle);
+      }
+    }
 
-    VLOG(10) << "add all_reduce sequential dependencies between " << pre_op
-             << " and " << op;
+    // NOTE(zcd): For distributed training, it is important to keep the order of
+    // allReduce on each node consistent. Otherwise, hang may occur.
+    // Sort the current_all_reduce_op_handles according to the name of input.
+    sort(current_all_reduce_op_handles.begin(),
+         current_all_reduce_op_handles.end(),
+         [](const AllReduceOpHandle* left,
+            const AllReduceOpHandle* right) -> bool {
+           auto left_in_vars = DynamicCast<VarHandle>(left->Inputs());
+           auto right_in_vars = DynamicCast<VarHandle>(right->Inputs());
+           PADDLE_ENFORCE_GT(left_in_vars.size(), 0);
+           PADDLE_ENFORCE_EQ(left_in_vars.size(), right_in_vars.size());
+           return left_in_vars[0]->Name() > right_in_vars[0]->Name();
+         });
+
+    all_reduce_op_handles->insert(all_reduce_op_handles->end(),
+                                  current_all_reduce_op_handles.begin(),
+                                  current_all_reduce_op_handles.end());
+  }
 
-    VLOG(10) << "pre_op:" << pre_op->DebugString()
-             << ", op:" << op->DebugString();
+  void DebugString(
+      const ir::Graph& graph,
+      const std::vector<AllReduceOpHandle*>& all_reduce_op_handles) const {
+    // get vars order
+    std::map<int, std::vector<std::string>> vars =
+        GetSoredGradientsFromStaleProgram(graph);
+    std::stringstream out;
+    size_t grads_of_stale_program = 0;
+    out << "Get Order From kStaleProgramOpDescs: ";
+    for (auto& var : vars) {
+      out << "Order " << var.first << " [";
+      for (auto& var_name : var.second) {
+        out << var_name << ", ";
+        ++grads_of_stale_program;
+      }
+      out << "], ";
+    }
+    VLOG(10) << out.str();
+
+    std::stringstream out2;
+    out2 << "Get Order From Topological order: ";
+    for (auto& op : all_reduce_op_handles) {
+      bool find_valid_input = false;
+      for (auto& in_var : op->Inputs()) {
+        if (dynamic_cast<VarHandle*>(in_var)) {
+          out2 << in_var->Name() << ", ";
+          find_valid_input = true;
+          break;
+        }
+      }
+      PADDLE_ENFORCE(find_valid_input, "Doesn't find valid input.");
+    }
+    VLOG(10) << out2.str();
+    if (grads_of_stale_program != all_reduce_op_handles.size()) {
+      VLOG(10)
+          << "The gradients number of stale program and graph is not equal.";
+    }
   }
-}
 
+  std::map<int, std::vector<std::string>> GetSoredGradientsFromStaleProgram(
+      const ir::Graph& graph) const {
+    std::map<int, std::vector<std::string>> vars;
+    auto ops = graph.Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
+    int order = 0;
+    for (auto* op_desc : ops) {
+      try {
+        bool is_bk_op =
+            static_cast<bool>(boost::get<int>(op_desc->GetAttr(
+                                  OpProtoAndCheckerMaker::OpRoleAttrName())) &
+                              static_cast<int>(OpRole::kBackward));
+        if (!is_bk_op) continue;
+
+        auto backward_vars =
+            boost::get<std::vector<std::string>>(op_desc->GetNullableAttr(
+                OpProtoAndCheckerMaker::OpRoleVarAttrName()));
+        if (backward_vars.empty()) continue;
+
+        PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
+        for (size_t i = 1; i < backward_vars.size(); i += 2) {
+          vars[order].emplace_back(backward_vars[i]);
+          VLOG(1) << "get parameter and gradient: " << backward_vars[i - 1]
+                  << ", " << backward_vars[i];
+        }
+        order++;
+      } catch (boost::bad_get e) {
+      }
+    }
+    return vars;
+  }
+};
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.h b/paddle/fluid/framework/details/all_reduce_deps_pass.h
deleted file mode 100644
index 4ed3736587aa3d45e288e3dc7e6ab3099f935f41..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/all_reduce_deps_pass.h
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-// TODO(gongwb): overlap allreduce with backward computation.
-class AllReduceDepsPass : public ir::Pass {
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index 6e477cd2977561ddb914e4a6343f677044fad4be..61276efedeeca76a8818c15ddab73b3c53725c4b 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -28,7 +28,7 @@
 // asynchronous nccl allreduce or synchronous issue:
 // https://github.com/PaddlePaddle/Paddle/issues/15049
 DEFINE_bool(
-    sync_nccl_allreduce, false,
+    sync_nccl_allreduce, true,
     "If set true, will call `cudaStreamSynchronize(nccl_stream)`"
     "after allreduce, this mode can get better performance in some scenarios.");
 
@@ -53,6 +53,10 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
       this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
     }
   }
+  // TODO(gongwb) :polish them!
+  if (is_encoded) {
+    VLOG(1) << "Use dgc allreduce mode";
+  }
 }
 #else
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
@@ -86,7 +90,7 @@ void AllReduceOpHandle::RunImplEncoded() {
         paddle::framework::GradOriginalVarName(in_var_handles[i]->name());
     auto encode_var_name = original_name + g_dgc_encoded;
     auto *in_var = local_scope->FindVar(encode_var_name);
-    PADDLE_ENFORCE_NOT_NULL(in_var);
+    PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name);
     auto &in = in_var->Get<LoDTensor>();
     ins.emplace_back(&in);
 
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index df69b11ec6ae3bb08ba03b749c69eb718525de4d..36720b8ad97c6cacfccc106066bcdbe20c39ff47 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -163,15 +163,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
           "graph_printer", new details::GraphvizSSAGraphPrinter);
     }
 
-    // Verify that the graph is correct for multi-device executor.
-    AppendPass("multi_devices_check_pass");
-
-    if (VLOG_IS_ON(2)) {
-      AppendPass("all_reduce_deps_pass");
-    }
-
-    if (SeqOnlyAllReduceOps(strategy_)) {
-      VLOG(10) << "Add all_reduce_deps_pass";
+    // experimental shows that the program will be faster if append
+    // all_reduce_deps_pass here.
+    if (!strategy_.enable_parallel_graph_ &&
+        (SeqOnlyAllReduceOps(strategy_) ||
+         strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce)) {
       AppendPass("all_reduce_deps_pass");
     }
 
@@ -179,6 +175,9 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
       VLOG(10) << "Add modify_op_lock_and_record_event_pass";
       AppendPass("modify_op_lock_and_record_event_pass");
     }
+
+    // Verify that the graph is correct for multi-device executor.
+    AppendPass("multi_devices_check_pass");
   }
 
   // Convert graph to run on multi-devices.
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index 297ee92fc3c84c2feec9cb85bd8671ce8ad94ed0..3e805bd5b480241954960f92a72514723c3a8bb7 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -56,6 +56,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
   fetches.resize(fetch_tensors.size());
   std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
   std::vector<FetchOpHandle *> fetch_ops;
+  std::vector<OpHandleBase *> ready_fetch_ops;
 
   for (auto &fetch_var_name : fetch_tensors) {
     for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
@@ -70,8 +71,9 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
     auto &var_name = fetch_tensors[i];
     auto fetched_var_it = fetched_vars.find(var_name);
     PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(),
-                   "Cannot find fetched variable.(Perhaps the main_program "
-                   "is not set to ParallelExecutor)");
+                   "Cannot find fetched variable(%s).(Perhaps the main_program "
+                   "is not set to ParallelExecutor)",
+                   var_name);
 
     auto &vars = fetched_var_it->second;
 
@@ -88,7 +90,11 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
       op->AddInput(var);
     }
 
-    (*op_deps)[op] = static_cast<int>(op->NotReadyInputSize());
+    int dep = static_cast<int>(op->NotReadyInputSize());
+    (*op_deps)[op] = dep;
+    if (dep == 0) {
+      ready_fetch_ops.emplace_back(op);
+    }
   }
 
   size_t num_complete = 0;
@@ -97,7 +103,9 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
   for (auto op : bootstrap_ops_) {
     RunOpAsync(op_deps.get(), op, complete_q);
   }
-
+  for (auto op : ready_fetch_ops) {
+    RunOpAsync(op_deps.get(), op, complete_q);
+  }
   while (num_complete != op_deps->size()) {
     size_t num_comp = complete_q->Pop();
     if (num_comp == -1UL) {
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 232d82a5da596a78d2999c4a4c4f7dda0c7cad7e..6c8b8937ebe646042f71cb58cfbc2d32426a4e3c 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -13,9 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
-
 #include <string>
 #include <vector>
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
@@ -44,6 +44,7 @@ void FetchOpHandle::WaitAndMergeCPUTensors() const {
 }
 
 void FetchOpHandle::RunImpl() {
+  platform::RecordEvent record_event(Name());
   WaitInputVarGenerated(platform::CPUPlace());
 
   tensors_.resize(inputs_.size());
@@ -62,7 +63,8 @@ void FetchOpHandle::RunImpl() {
     auto &t = var->Get<framework::LoDTensor>();
     if (platform::is_gpu_place(t.place())) {
 #ifdef PADDLE_WITH_CUDA
-      TensorCopySync(t, cpu, &tensors_[i]);
+      TensorCopy(t, cpu, *dev_ctxes_.at(t.place()), &tensors_[i]);
+      dev_ctxes_.at(t.place())->Wait();
 #endif
     } else {
       tensors_[i].ShareDataWith(t);
diff --git a/paddle/fluid/framework/details/inplace_op_pass.cc b/paddle/fluid/framework/details/inplace_op_pass.cc
index 79150f719e379ca4e2b87d2e7db1b2daeee9aa67..84c9e4a379a5e07dc3a8e85409c804eebc390c73 100644
--- a/paddle/fluid/framework/details/inplace_op_pass.cc
+++ b/paddle/fluid/framework/details/inplace_op_pass.cc
@@ -305,6 +305,12 @@ void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
 
     VLOG(4) << "Try to inplace " << in_var_name << " with " << out_var_name;
 
+    if (var_nodes_[in_var_name].back() != in_node) {
+      VLOG(4) << "SKIP since " << in_var_name
+              << " is also used as output by other ops";
+      continue;
+    }
+
     bool can_replace = true;
     if (in_var_name == out_var_name) {
       can_replace = false;
@@ -527,6 +533,9 @@ void GraphView::Build(ir::Graph* g) {
   };
   for (auto& node : g->Nodes()) {
     if (!node->IsOp()) continue;
+    // avoid optimize the variable used in sub-blocks
+    if (OpHasSubBlock(node->Op())) update_skip_set(node);
+
     if (node->Name() == "send") update_skip_set(node);
     if (node->Name() == "recv") update_skip_set(node);
     if (node->Name() == "prefetch") update_skip_set(node);
diff --git a/paddle/fluid/framework/details/memory_optimize_helper.cc b/paddle/fluid/framework/details/memory_optimize_helper.cc
index 894d7dad2e623649fe96b00bb515c9605c89a404..1af57dc4087d2fd734c43e9549a4bd4526af4d35 100644
--- a/paddle/fluid/framework/details/memory_optimize_helper.cc
+++ b/paddle/fluid/framework/details/memory_optimize_helper.cc
@@ -131,16 +131,7 @@ size_t NodeSize(const VarDesc& node) {
   return type_size * std::abs(size);
 }
 
-size_t NodeSize(ir::Node* n) {
-  VarDesc* desc = nullptr;
-  // some op do not have block pointer
-  if (n->inputs[0]->Op() != nullptr) {
-    desc = FindVarDescInBlock(n);
-  } else {
-    desc = n->Var();
-  }
-  return NodeSize(*desc);
-}
+size_t NodeSize(ir::Node* n) { return NodeSize(*(n->Var())); }
 
 std::string DebugStringImpl(VarDesc* var) {
   std::stringstream ss;
@@ -163,24 +154,22 @@ std::string DebugStringImpl(VarDesc* var) {
 }
 
 std::string DebugString(ir::Node* var) {
-  return DebugStringImpl(FindVarDescInBlock(var));
+  return DebugStringImpl(GetVarDesc(var));
 }
 
 // NOTE(dzh): based ir node, if a large node has been reused
 // by a small size node, then next time it appear in pool, it will
 // have the small size. Find the original node shap from blockdesc.
-VarDesc* FindVarDescInBlock(ir::Node* n) {
+VarDesc* GetVarDesc(ir::Node* n) {
   PADDLE_ENFORCE(n->IsVar() && !n->IsCtrlVar() && n->inputs.size() == 1);
-  BlockDesc* block = n->inputs[0]->Op()->Block();
-  PADDLE_ENFORCE(block->HasVar(n->Name()),
-                 string::Sprintf("Block do not has var %s", n->Name()));
-  return block->FindVar(n->Name());
+  return n->Var();
 }
 
 struct NodeComparator {
   bool operator()(ir::Node* lhs, ir::Node* rhs) const {
-    auto* lhs_desc = FindVarDescInBlock(lhs);
-    auto* rhs_desc = FindVarDescInBlock(rhs);
+    if (lhs->Var()->GetType() != rhs->Var()->GetType()) return false;
+    auto* lhs_desc = GetVarDesc(lhs);
+    auto* rhs_desc = GetVarDesc(rhs);
     // match data type
     if (lhs_desc->GetDataType() != rhs_desc->GetDataType()) {
       return false;
@@ -204,7 +193,7 @@ void OrderedSet::Insert(ir::Node* var) {
     return;
   }
 
-  auto* var_desc = FindVarDescInBlock(var);
+  auto* var_desc = var->Var();
   auto var_shape = var_desc->GetShape();
   int batch_size = static_cast<int>(var_shape[0]);
 
@@ -212,7 +201,7 @@ void OrderedSet::Insert(ir::Node* var) {
   Iter it = nodes_.begin();
   while (it != nodes_.end()) {
     auto& prev = it->front();
-    auto* cache_desc = FindVarDescInBlock(prev);
+    auto* cache_desc = GetVarDesc(prev);
     int cache_batch_size = cache_desc->GetShape()[0];
     if ((cache_batch_size == -1 && batch_size == -1) ||
         (cache_batch_size != -1 && batch_size != -1)) {
@@ -336,10 +325,16 @@ int MinChunkSize() {
 bool NodeCanReused(const VarDesc& node) {
   auto type = node.GetType();
   // only these types holds bulk of gpu memory
-  if (!(type == proto::VarType::LOD_TENSOR ||
-        type == proto::VarType::LOD_TENSOR_ARRAY)) {
-    return false;
-  }
+  // FIXME(liuwei1031) did not find good ways to test SELECTED_ROWS and
+  // LOD_TENSOR_ARRAY re-use logic,
+  // disable them in version 1.4
+  // if (!(type == proto::VarType::LOD_TENSOR ||
+  //       type == proto::VarType::SELECTED_ROWS ||
+  //       type == proto::VarType::LOD_TENSOR_ARRAY)) {
+  //   return false;
+  // }
+  if (type != proto::VarType::LOD_TENSOR) return false;
+
   // persistable variable is parameter
   if (node.Persistable()) {
     return false;
diff --git a/paddle/fluid/framework/details/memory_optimize_helper.h b/paddle/fluid/framework/details/memory_optimize_helper.h
index b5348cc66eaa446719b299b63caa340eab3e2ab9..65c7017d2d462976cf8cd4d7b5f660e279e12b6a 100644
--- a/paddle/fluid/framework/details/memory_optimize_helper.h
+++ b/paddle/fluid/framework/details/memory_optimize_helper.h
@@ -20,6 +20,7 @@
 #include <map>
 #include <set>
 #include <string>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
@@ -140,11 +141,7 @@ size_t NodeSize(const VarDesc&);
 
 std::string DebugString(ir::Node* var);
 
-// NOTE(dzhwinter)
-// after node reuse, the replaced node shape is
-// different with its VarDesc. So need to find the
-// correct VarDesc in Block.
-VarDesc* FindVarDescInBlock(ir::Node* n);
+VarDesc* GetVarDesc(ir::Node* n);
 
 static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
   return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index 413b14961631b3459e0d05af685ad1c5395844c2..69cd84ebf2d678c089141f09a92c46e3a03fe4d9 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -68,7 +68,7 @@ void OpHandleBase::Run(bool use_cuda) {
         if (out_var_handle) {
           PADDLE_ENFORCE(
               platform::is_same_place(place, out_var_handle->place()),
-              "The place of input(%s) is not consistent with the "
+              "The place of output(%s) is not consistent with the "
               "place of current op(%s).",
               out_var_handle->Name(), Name());
           out_var_handle->SetGenerateEvent(events_.at(dev_id));
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index c00932a7bdb170e63b5fd4d43ccb2072f1a0a9c9..356ec373b780d917f8cc192148526ebb2dfe0d2f 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -68,7 +68,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     }
     set.clear();
   };
-  auto run_all_op = [&](OpHandleBase *op) { RunOp(ready_vars, op); };
+
   // Clean run context
   run_op_futures_.clear();
   exception_holder_.Clear();
@@ -102,7 +102,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
         auto &deps = pending_ops[op];
         --deps;
         if (deps == 0) {
-          run_all_op(op);
+          ready_ops.insert(op);
         }
       }
     }
diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
index dcc48fb934e7a06f2e85fa34fde335261f551415..a8720ff4bfb5c7fa7aee6d23949b030c328b90e6 100644
--- a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
+++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
@@ -84,7 +84,8 @@ void BatchMergePass::ApplyImpl(ir::Graph* graph) const {
 
   // 1. record op nodes of different roles
   for (auto node : nodes) {
-    if (node->IsVar()) continue;
+    if (!node->IsOp()) continue;
+    PADDLE_ENFORCE(node->Op(), "must find opdesc");
     int op_role = boost::get<int>(node->Op()->GetAttr(
         framework::OpProtoAndCheckerMaker::OpRoleAttrName()));
     if ((op_role == static_cast<int>(framework::OpRole::kForward)) ||
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index ab0947c631fe9a409406b3b092972ae6512beae7..3f10daf56ebe0d7a351236b58514d7ad6bb6352e 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -19,17 +19,14 @@ limitations under the License. */
 #include <tuple>
 #include <utility>
 #include <vector>
-#include "paddle/fluid/framework/ir/graph_helper.h"
-
-#include "paddle/fluid/framework/ir/graph.h"
-
-#include "paddle/fluid/framework/details/all_reduce_deps_pass.h"
 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/reference_count_pass_helper.h"
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/platform/profiler.h"
 
 #ifdef WITH_GPERFTOOLS
diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index 036d2a50a4a7ea3ce7e052a56202b1d54465b03e..bc03285a4c5fe6db2abf2b271d6ddc86e75a9412 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -122,14 +122,14 @@ class Autograd {
       std::map<std::string, std::vector<VarBase*>> input_grads =
           ready_op->ApplyGrad();
 
-      for (auto it : input_grads) {
-        const std::vector<VarBase*>& ingrads = it.second;
+      for (auto it = input_grads.rbegin(); it != input_grads.rend(); ++it) {
+        const std::vector<VarBase*>& ingrads = it->second;
         for (size_t i = 0; i < ingrads.size(); ++i) {
           if (!ingrads[i]) continue;
-          if (ready_op->input_vars_[it.first][i]->IsStopGradient()) {
+          if (ready_op->input_vars_[it->first][i]->IsStopGradient()) {
             continue;
           }
-          OpBase* pre_op = ready_op->pre_ops_[it.first][i];
+          OpBase* pre_op = ready_op->pre_ops_[it->first][i];
           if (!pre_op) continue;
 
           dep_counts[pre_op] -= 1;
diff --git a/paddle/fluid/inference/anakin/convert/CMakeLists.txt b/paddle/fluid/inference/anakin/convert/CMakeLists.txt
index 1e7f5ac799de0d7a1debec0529d262f021bba790..d3d1522dccf0d8af4f26eec4e0c57257279880e0 100644
--- a/paddle/fluid/inference/anakin/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/anakin/convert/CMakeLists.txt
@@ -1,5 +1,4 @@
-cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc
- elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc  softmax.cc batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc DEPS anakin_engine framework_proto scope op_registry)
+cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc  softmax.cc batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc DEPS anakin_engine framework_proto scope op_registry)
 
 cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op SERIAL)
 cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv SERIAL)
diff --git a/paddle/fluid/inference/anakin/convert/activation.cc b/paddle/fluid/inference/anakin/convert/activation.cc
index c85b958d7b85cb3e21df8714c89eee10b9b3fecc..a9aeb19ffd5f04c03df593e8f48976e7fa6155ab 100644
--- a/paddle/fluid/inference/anakin/convert/activation.cc
+++ b/paddle/fluid/inference/anakin/convert/activation.cc
@@ -34,6 +34,7 @@ ActivationOpConverter::ActivationOpConverter(const std::string &op_type)
 }
 
 void ActivationOpConverter::operator()(const framework::proto::OpDesc &op,
+                                       const framework::BlockDesc &block_desc,
                                        const framework::Scope &scope,
                                        bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
diff --git a/paddle/fluid/inference/anakin/convert/activation.h b/paddle/fluid/inference/anakin/convert/activation.h
index 49a4518bef418491a7fbc0bcde403bf047f774bd..592a3d5bd9d1272aae8a13d0d0acc77f8990c6b3 100644
--- a/paddle/fluid/inference/anakin/convert/activation.h
+++ b/paddle/fluid/inference/anakin/convert/activation.h
@@ -27,6 +27,7 @@ class ActivationOpConverter : public AnakinOpConverter {
   explicit ActivationOpConverter(const std::string &op_type);
 
   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~ActivationOpConverter() {}
diff --git a/paddle/fluid/inference/anakin/convert/batch_norm.cc b/paddle/fluid/inference/anakin/convert/batch_norm.cc
index 94014802bdbe1792e9eaba28d7134624dd3edc90..38cf6172027b3b200a378a61b6d5b395cc571de7 100644
--- a/paddle/fluid/inference/anakin/convert/batch_norm.cc
+++ b/paddle/fluid/inference/anakin/convert/batch_norm.cc
@@ -29,6 +29,7 @@ namespace inference {
 namespace anakin {
 
 void BatchNormOpConverter::operator()(const framework::proto::OpDesc &op,
+                                      const framework::BlockDesc &block_desc,
                                       const framework::Scope &scope,
                                       bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
diff --git a/paddle/fluid/inference/anakin/convert/batch_norm.h b/paddle/fluid/inference/anakin/convert/batch_norm.h
index cee5c43ae76bf28284118380ca4c861d5cbedd1c..c56735f15b435b46cf9f623bd284b5731a36c327 100644
--- a/paddle/fluid/inference/anakin/convert/batch_norm.h
+++ b/paddle/fluid/inference/anakin/convert/batch_norm.h
@@ -25,6 +25,7 @@ class BatchNormOpConverter : public AnakinOpConverter {
   BatchNormOpConverter() = default;
 
   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~BatchNormOpConverter() {}
diff --git a/paddle/fluid/inference/anakin/convert/concat.cc b/paddle/fluid/inference/anakin/convert/concat.cc
index e2d1111acbb60690167530a25aeaf59858b71987..ae90c083690da6e108a05460de68be2eb0cd9b48 100644
--- a/paddle/fluid/inference/anakin/convert/concat.cc
+++ b/paddle/fluid/inference/anakin/convert/concat.cc
@@ -29,6 +29,7 @@ namespace inference {
 namespace anakin {
 
 void ConcatOpConverter::operator()(const framework::proto::OpDesc &op,
+                                   const framework::BlockDesc &block_desc,
                                    const framework::Scope &scope,
                                    bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
diff --git a/paddle/fluid/inference/anakin/convert/concat.h b/paddle/fluid/inference/anakin/convert/concat.h
index 4ff2b6d85b758efc7529c5034a34e094ee06cccb..974ff689bfef681f8993d5dbb0dbbbdde91f33bd 100644
--- a/paddle/fluid/inference/anakin/convert/concat.h
+++ b/paddle/fluid/inference/anakin/convert/concat.h
@@ -25,6 +25,7 @@ class ConcatOpConverter : public AnakinOpConverter {
   ConcatOpConverter() = default;
 
   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~ConcatOpConverter() {}
diff --git a/paddle/fluid/inference/anakin/convert/conv2d.cc b/paddle/fluid/inference/anakin/convert/conv2d.cc
index b99c6e71c4dfd2b567d85904f57ebecf0ed9a1cc..308f14604b9c83f2278499359328109d31f9ff17 100644
--- a/paddle/fluid/inference/anakin/convert/conv2d.cc
+++ b/paddle/fluid/inference/anakin/convert/conv2d.cc
@@ -28,6 +28,7 @@ namespace inference {
 namespace anakin {
 
 void Conv2dOpConverter::operator()(const framework::proto::OpDesc &op,
+                                   const framework::BlockDesc &block_desc,
                                    const framework::Scope &scope,
                                    bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
diff --git a/paddle/fluid/inference/anakin/convert/conv2d.h b/paddle/fluid/inference/anakin/convert/conv2d.h
index 75a30c10d481762fe5579ccb4d79feeba73dc98a..dca5d19f468ac6d6e2f4bcda8ecaa3922d80e6b1 100644
--- a/paddle/fluid/inference/anakin/convert/conv2d.h
+++ b/paddle/fluid/inference/anakin/convert/conv2d.h
@@ -25,6 +25,7 @@ class Conv2dOpConverter : public AnakinOpConverter {
   Conv2dOpConverter() = default;
 
   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~Conv2dOpConverter() {}
diff --git a/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc b/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc
index 4d105430dd298076fa8aa4c1925329c3a0e356a1..fa1ab0efeeb5cacd112ca1b644735eaaf49e55f8 100644
--- a/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc
+++ b/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc
@@ -28,6 +28,7 @@ namespace inference {
 namespace anakin {
 
 void Conv2dFusionOpConverter::operator()(const framework::proto::OpDesc &op,
+                                         const framework::BlockDesc &block_desc,
                                          const framework::Scope &scope,
                                          bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
diff --git a/paddle/fluid/inference/anakin/convert/conv2d_fusion.h b/paddle/fluid/inference/anakin/convert/conv2d_fusion.h
index 07359b9cba05bf7c885eb38d64816bdb718a6aba..0d9ef28183b309c4b50714fcbe64e24c5d9dfbaa 100644
--- a/paddle/fluid/inference/anakin/convert/conv2d_fusion.h
+++ b/paddle/fluid/inference/anakin/convert/conv2d_fusion.h
@@ -25,6 +25,7 @@ class Conv2dFusionOpConverter : public AnakinOpConverter {
   Conv2dFusionOpConverter() = default;
 
   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~Conv2dFusionOpConverter() {}
diff --git a/paddle/fluid/inference/anakin/convert/density_prior_box.cc b/paddle/fluid/inference/anakin/convert/density_prior_box.cc
index 35e02919aa70c211da5d4a5785a9833747d99ce2..30796f7592427191a4396a154be62838b7e666ad 100644
--- a/paddle/fluid/inference/anakin/convert/density_prior_box.cc
+++ b/paddle/fluid/inference/anakin/convert/density_prior_box.cc
@@ -27,9 +27,9 @@ namespace paddle {
 namespace inference {
 namespace anakin {
 
-void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc& op,
-                                            const framework::Scope& scope,
-                                            bool test_mode) {
+void DensityPriorBoxOpConverter::operator()(
+    const framework::proto::OpDesc& op, const framework::BlockDesc& block_desc,
+    const framework::Scope& scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   auto input_name = op_desc.Input("Input").front();
   auto image_name = op_desc.Input("Image").front();
diff --git a/paddle/fluid/inference/anakin/convert/density_prior_box.h b/paddle/fluid/inference/anakin/convert/density_prior_box.h
index 44265cbf2e968e8821bc1a9ae3225c9b7d405235..bf9210711a0f69595c241803cd40d42770ccd5d7 100644
--- a/paddle/fluid/inference/anakin/convert/density_prior_box.h
+++ b/paddle/fluid/inference/anakin/convert/density_prior_box.h
@@ -27,6 +27,7 @@ class DensityPriorBoxOpConverter : public AnakinOpConverter {
   DensityPriorBoxOpConverter() = default;
 
   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~DensityPriorBoxOpConverter() {}
diff --git a/paddle/fluid/inference/anakin/convert/detection_out.cc b/paddle/fluid/inference/anakin/convert/detection_out.cc
index 67636651017cfb18967cf8dc76d4f4a552fbd021..262ad28a654609cddde979d387621bb0c7c1a7f9 100644
--- a/paddle/fluid/inference/anakin/convert/detection_out.cc
+++ b/paddle/fluid/inference/anakin/convert/detection_out.cc
@@ -26,6 +26,7 @@ namespace inference {
 namespace anakin {
 
 void DetectionOutOpConverter::operator()(const framework::proto::OpDesc &op,
+                                         const framework::BlockDesc &block_desc,
                                          const framework::Scope &scope,
                                          bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
diff --git a/paddle/fluid/inference/anakin/convert/detection_out.h b/paddle/fluid/inference/anakin/convert/detection_out.h
index 5bf1c3ecbc89795d075301a2fd568312236bd874..ca78f10fdc2a7c7064ae0399e7f1afff1383ce67 100644
--- a/paddle/fluid/inference/anakin/convert/detection_out.h
+++ b/paddle/fluid/inference/anakin/convert/detection_out.h
@@ -27,6 +27,7 @@ class DetectionOutOpConverter : public AnakinOpConverter {
   DetectionOutOpConverter() = default;
 
   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~DetectionOutOpConverter() {}
diff --git a/paddle/fluid/inference/anakin/convert/dropout.cc b/paddle/fluid/inference/anakin/convert/dropout.cc
index ed6d7f7561cb78666855146864b33254026926ef..bc9b26dcf2733369e558cde2954e9d0caaba86b0 100644
--- a/paddle/fluid/inference/anakin/convert/dropout.cc
+++ b/paddle/fluid/inference/anakin/convert/dropout.cc
@@ -31,6 +31,7 @@ namespace inference {
 namespace anakin {
 
 void DropoutOpConverter::operator()(const framework::proto::OpDesc &op,
+                                    const framework::BlockDesc &block_desc,
                                     const framework::Scope &scope,
                                     bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
diff --git a/paddle/fluid/inference/anakin/convert/dropout.h b/paddle/fluid/inference/anakin/convert/dropout.h
index 2a0fb6e76ac8354d884f9d815a4df785248e6475..11412e217ef5fa77bd22d7530d88be1347f2616f 100644
--- a/paddle/fluid/inference/anakin/convert/dropout.h
+++ b/paddle/fluid/inference/anakin/convert/dropout.h
@@ -25,6 +25,7 @@ class DropoutOpConverter : public AnakinOpConverter {
   DropoutOpConverter() = default;
 
   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~DropoutOpConverter() {}
diff --git a/paddle/fluid/inference/anakin/convert/elementwise.cc b/paddle/fluid/inference/anakin/convert/elementwise.cc
index 55b12390baf90a9365fd4d197b19a3c5cd675afd..fe9a896d8266e06250b712be0c75290c039e9a08 100644
--- a/paddle/fluid/inference/anakin/convert/elementwise.cc
+++ b/paddle/fluid/inference/anakin/convert/elementwise.cc
@@ -30,9 +30,9 @@ namespace paddle {
 namespace inference {
 namespace anakin {
 
-void ElementwiseAddOpConverter::operator()(const framework::proto::OpDesc &op,
-                                           const framework::Scope &scope,
-                                           bool test_mode) {
+void ElementwiseAddOpConverter::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
   PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);
@@ -50,9 +50,9 @@ void ElementwiseAddOpConverter::operator()(const framework::proto::OpDesc &op,
   engine_->AddOpAttr<PTuple<float>>(op_name, "coeff", coeff);
 }
 
-void ElementwiseMulOpConverter::operator()(const framework::proto::OpDesc &op,
-                                           const framework::Scope &scope,
-                                           bool test_mode) {
+void ElementwiseMulOpConverter::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
   PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);
diff --git a/paddle/fluid/inference/anakin/convert/elementwise.h b/paddle/fluid/inference/anakin/convert/elementwise.h
index 47525e41daafcbca0c7c86bad44066f18a3ac79c..e4664493a9d3ce1ed9a0c79a05fb466c4e781b3e 100644
--- a/paddle/fluid/inference/anakin/convert/elementwise.h
+++ b/paddle/fluid/inference/anakin/convert/elementwise.h
@@ -25,6 +25,7 @@ class ElementwiseAddOpConverter : public AnakinOpConverter {
   ElementwiseAddOpConverter() = default;
 
   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~ElementwiseAddOpConverter() {}
@@ -37,6 +38,7 @@ class ElementwiseMulOpConverter : public AnakinOpConverter {
   ElementwiseMulOpConverter() = default;
 
   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~ElementwiseMulOpConverter() {}
diff --git a/paddle/fluid/inference/anakin/convert/fc.cc b/paddle/fluid/inference/anakin/convert/fc.cc
index 2514eb1e093b4e05b7e6b2814cfd8185b3aede6c..a80a1a47e91aa085935b5febb3858e028f396091 100644
--- a/paddle/fluid/inference/anakin/convert/fc.cc
+++ b/paddle/fluid/inference/anakin/convert/fc.cc
@@ -27,6 +27,7 @@ namespace inference {
 namespace anakin {
 
 void FcBaseOpConverter::operator()(const framework::proto::OpDesc &op,
+                                   const framework::BlockDesc &block_desc,
                                    const framework::Scope &scope,
                                    bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
diff --git a/paddle/fluid/inference/anakin/convert/fc.h b/paddle/fluid/inference/anakin/convert/fc.h
index 060c649b19ef335a9e926eb205ec691a2a188fe1..fb461908b35e0111065e1a46c52306c64ace7d7c 100644
--- a/paddle/fluid/inference/anakin/convert/fc.h
+++ b/paddle/fluid/inference/anakin/convert/fc.h
@@ -25,6 +25,7 @@ class FcBaseOpConverter : public AnakinOpConverter {
   FcBaseOpConverter() = default;
 
   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~FcBaseOpConverter() {}
diff --git a/paddle/fluid/inference/anakin/convert/flatten.cc b/paddle/fluid/inference/anakin/convert/flatten.cc
index c6c372bbef87de7f38c1f66a21c170cabac8c0ed..7f5c1510960d1014c33bd565939812fe7c7dfc06 100644
--- a/paddle/fluid/inference/anakin/convert/flatten.cc
+++ b/paddle/fluid/inference/anakin/convert/flatten.cc
@@ -26,6 +26,7 @@ namespace inference {
 namespace anakin {
 
 void FlattenOpConverter::operator()(const framework::proto::OpDesc &op,
+                                    const framework::BlockDesc &block_desc,
                                     const framework::Scope &scope,
                                     bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
diff --git a/paddle/fluid/inference/anakin/convert/flatten.h b/paddle/fluid/inference/anakin/convert/flatten.h
index 1ace76b16381980a9eaec12806e0bc94d7b1fb85..c9cc0006eb2448917bbcc0952f5e2cae72b73de1 100644
--- a/paddle/fluid/inference/anakin/convert/flatten.h
+++ b/paddle/fluid/inference/anakin/convert/flatten.h
@@ -25,6 +25,7 @@ class FlattenOpConverter : public AnakinOpConverter {
   FlattenOpConverter() = default;
 
   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~FlattenOpConverter() {}
diff --git a/paddle/fluid/inference/anakin/convert/im2sequence.cc b/paddle/fluid/inference/anakin/convert/im2sequence.cc
index 568d7e4746f11b13ce8ea9e5a47a1b43d1c12693..2cc330c3829f6033229748523c3df750b951626f 100644
--- a/paddle/fluid/inference/anakin/convert/im2sequence.cc
+++ b/paddle/fluid/inference/anakin/convert/im2sequence.cc
@@ -31,6 +31,7 @@ namespace inference {
 namespace anakin {
 
 void Im2SequenceConverter::operator()(const framework::proto::OpDesc &op,
+                                      const framework::BlockDesc &block_desc,
                                       const framework::Scope &scope,
                                       bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
diff --git a/paddle/fluid/inference/anakin/convert/im2sequence.h b/paddle/fluid/inference/anakin/convert/im2sequence.h
index 3003eac2c6f416663c3e7c4c3e297b6347edfb47..714679c1d9601136f1f54287bb58d611e852f3fe 100644
--- a/paddle/fluid/inference/anakin/convert/im2sequence.h
+++ b/paddle/fluid/inference/anakin/convert/im2sequence.h
@@ -25,6 +25,7 @@ class Im2SequenceConverter : public AnakinOpConverter {
   Im2SequenceConverter() = default;
 
   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~Im2SequenceConverter() {}
diff --git a/paddle/fluid/inference/anakin/convert/op_converter.h b/paddle/fluid/inference/anakin/convert/op_converter.h
index 45db4221747128cd7f6d26c8830fa75ebf81ac72..1ca62658ef26ffebcc068c91ece7d9bbed0a348f 100644
--- a/paddle/fluid/inference/anakin/convert/op_converter.h
+++ b/paddle/fluid/inference/anakin/convert/op_converter.h
@@ -40,8 +40,10 @@ class AnakinOpConverter {
   AnakinOpConverter() = default;
 
   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope, bool test_mode) {}
   void ConvertOp(const framework::proto::OpDesc &op,
+                 const framework::BlockDesc &block_desc,
                  const std::unordered_set<std::string> &parameters,
                  const framework::Scope &scope, AnakinNvEngine *engine,
                  bool test_mode = false) {
@@ -58,16 +60,17 @@ class AnakinOpConverter {
     }
     PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", op_type);
     it->SetEngine(engine);
-    (*it)(op, scope, test_mode);
+    (*it)(op, block_desc, scope, test_mode);
   }
 
-  void ConvertBlock(const framework::proto::BlockDesc &block,
+  void ConvertBlock(framework::BlockDesc *block_desc,
                     const std::unordered_set<std::string> &parameters,
                     const framework::Scope &scope, AnakinNvEngine *engine) {
     std::unique_lock<std::mutex> lock(mutex_);
-    for (auto i = 0; i < block.ops_size(); i++) {
-      auto &op = block.ops(i);
-      ConvertOp(op, parameters, scope, engine);
+    framework::proto::BlockDesc *block = block_desc->Proto();
+    for (auto i = 0; i < block->ops_size(); i++) {
+      auto &op = block->ops(i);
+      ConvertOp(op, *block_desc, parameters, scope, engine);
     }
   }
 
@@ -77,9 +80,7 @@ class AnakinOpConverter {
       const std::vector<std::string> &inputs,
       const std::unordered_set<std::string> &parameters,
       const std::vector<std::string> &outputs, AnakinNvEngine *engine) {
-    framework::proto::BlockDesc *block_proto = block_desc->Proto();
-    ConvertBlock(*block_proto, parameters, *scope, engine);
-
+    ConvertBlock(block_desc, parameters, *scope, engine);
     engine->Freeze();
     // if the max_batch size
     int max_batch_size = engine->GetMaxBatchSize();
diff --git a/paddle/fluid/inference/anakin/convert/pool2d.cc b/paddle/fluid/inference/anakin/convert/pool2d.cc
index 9b01d56a126b2ebc194f5b5bb5b2f52c298a316e..87eefe712a5ad2acd8c9b5abe521c832ad2c1ef2 100644
--- a/paddle/fluid/inference/anakin/convert/pool2d.cc
+++ b/paddle/fluid/inference/anakin/convert/pool2d.cc
@@ -31,6 +31,7 @@ namespace inference {
 namespace anakin {
 
 void Pool2dOpConverter::operator()(const framework::proto::OpDesc &op,
+                                   const framework::BlockDesc &block_desc,
                                    const framework::Scope &scope,
                                    bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
diff --git a/paddle/fluid/inference/anakin/convert/pool2d.h b/paddle/fluid/inference/anakin/convert/pool2d.h
index 1931a03c7ac236b4e57236cd1eb2947110f279a8..ec28e48ac848eff1d37c39063725624bf7d65723 100644
--- a/paddle/fluid/inference/anakin/convert/pool2d.h
+++ b/paddle/fluid/inference/anakin/convert/pool2d.h
@@ -25,6 +25,7 @@ class Pool2dOpConverter : public AnakinOpConverter {
   Pool2dOpConverter() = default;
 
   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~Pool2dOpConverter() {}
diff --git a/paddle/fluid/inference/anakin/convert/relu.cc b/paddle/fluid/inference/anakin/convert/relu.cc
index 2ce96db1804a3d6d6d1afac79e4e1fc55ed4c35d..993437d014b1f951dac94da7a3179b4bcb63466d 100644
--- a/paddle/fluid/inference/anakin/convert/relu.cc
+++ b/paddle/fluid/inference/anakin/convert/relu.cc
@@ -26,6 +26,7 @@ namespace inference {
 namespace anakin {
 
 void ReluOpConverter::operator()(const framework::proto::OpDesc &op,
+                                 const framework::BlockDesc &block_desc,
                                  const framework::Scope &scope,
                                  bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
diff --git a/paddle/fluid/inference/anakin/convert/relu.h b/paddle/fluid/inference/anakin/convert/relu.h
index 54c4c2316eb32ef70696a2477211008e04892552..6ede506511917c80faa59d40ee0a7bfff194da97 100644
--- a/paddle/fluid/inference/anakin/convert/relu.h
+++ b/paddle/fluid/inference/anakin/convert/relu.h
@@ -27,6 +27,7 @@ class ReluOpConverter : public AnakinOpConverter {
   ReluOpConverter() = default;
 
   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~ReluOpConverter() {}
diff --git a/paddle/fluid/inference/anakin/convert/reshape.cc b/paddle/fluid/inference/anakin/convert/reshape.cc
index eee36d2f37ea79c841ac8bf60c6e533069d06240..17e0a1acb5f4e08e848e91bbb051757d85796c0a 100644
--- a/paddle/fluid/inference/anakin/convert/reshape.cc
+++ b/paddle/fluid/inference/anakin/convert/reshape.cc
@@ -26,6 +26,7 @@ namespace inference {
 namespace anakin {
 
 void ReshapeOpConverter::operator()(const framework::proto::OpDesc &op,
+                                    const framework::BlockDesc &block_desc,
                                     const framework::Scope &scope,
                                     bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
diff --git a/paddle/fluid/inference/anakin/convert/reshape.h b/paddle/fluid/inference/anakin/convert/reshape.h
index 970e8ce5572572bd18c34eeffa902fa2495c1cce..9ce2ea2a4f3f8802225fe8ca8ed602c9f7d27968 100644
--- a/paddle/fluid/inference/anakin/convert/reshape.h
+++ b/paddle/fluid/inference/anakin/convert/reshape.h
@@ -25,6 +25,7 @@ class ReshapeOpConverter : public AnakinOpConverter {
   ReshapeOpConverter() = default;
 
   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~ReshapeOpConverter() {}
diff --git a/paddle/fluid/inference/anakin/convert/scale.cc b/paddle/fluid/inference/anakin/convert/scale.cc
index 6f3aa8c5d1111dc2829e241c9331eeb521003c03..dd68af4f79a6d1e8add04bde6a6890bca1b00d14 100644
--- a/paddle/fluid/inference/anakin/convert/scale.cc
+++ b/paddle/fluid/inference/anakin/convert/scale.cc
@@ -26,6 +26,7 @@ namespace inference {
 namespace anakin {
 
 void ScaleOpConverter::operator()(const framework::proto::OpDesc &op,
+                                  const framework::BlockDesc &block_desc,
                                   const framework::Scope &scope,
                                   bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
diff --git a/paddle/fluid/inference/anakin/convert/scale.h b/paddle/fluid/inference/anakin/convert/scale.h
index b858e3c512494f80c7c3818a570e43d90d65251b..ba3bcdd21494a4eeb6190aa8383e17e1b828b5f3 100644
--- a/paddle/fluid/inference/anakin/convert/scale.h
+++ b/paddle/fluid/inference/anakin/convert/scale.h
@@ -27,6 +27,7 @@ class ScaleOpConverter : public AnakinOpConverter {
   ScaleOpConverter() = default;
 
   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~ScaleOpConverter() {}
diff --git a/paddle/fluid/inference/anakin/convert/softmax.cc b/paddle/fluid/inference/anakin/convert/softmax.cc
index d5cd8908ebf623f0334a3b4df2b19147c63f77a3..a6c1e971b16fa7fe6a074bcb2cdf391410f8871f 100644
--- a/paddle/fluid/inference/anakin/convert/softmax.cc
+++ b/paddle/fluid/inference/anakin/convert/softmax.cc
@@ -24,6 +24,7 @@ namespace inference {
 namespace anakin {
 
 void SoftMaxOpConverter::operator()(const framework::proto::OpDesc &op,
+                                    const framework::BlockDesc &block_desc,
                                     const framework::Scope &scope,
                                     bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
@@ -32,8 +33,16 @@ void SoftMaxOpConverter::operator()(const framework::proto::OpDesc &op,
   auto input = op_desc.Input("X").front();
   auto output = op_desc.Output("Out").front();
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+
+  auto input_var_desc = block_desc.FindVar(input);
+  PADDLE_ENFORCE(input_var_desc,
+                 "Cant find %s variable When runing Anakin Softmax converter.",
+                 input);
+  auto input_shape_in_fluid = input_var_desc->GetShape();
+  size_t input_dims = input_shape_in_fluid.size();
+
   engine_->AddOp(op_name, "Softmax", {input}, {output});
-  engine_->AddOpAttr(op_name, "axis", 2);
+  engine_->AddOpAttr(op_name, "axis", static_cast<int>(input_dims - 1));
 }
 
 }  // namespace anakin
diff --git a/paddle/fluid/inference/anakin/convert/softmax.h b/paddle/fluid/inference/anakin/convert/softmax.h
index 0508da0c6fecaf29b7376005904235dadf04ea28..a16356d5bb61ac2f3b4f7751e257ce36ca604bf1 100644
--- a/paddle/fluid/inference/anakin/convert/softmax.h
+++ b/paddle/fluid/inference/anakin/convert/softmax.h
@@ -25,6 +25,7 @@ class SoftMaxOpConverter : public AnakinOpConverter {
   SoftMaxOpConverter() = default;
 
   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~SoftMaxOpConverter() {}
diff --git a/paddle/fluid/inference/anakin/convert/split.cc b/paddle/fluid/inference/anakin/convert/split.cc
index b8464a766d21e93426eb4a00b8caab2af5470055..ec582c1812623cd4bcefa2097015ba258f6bacbb 100644
--- a/paddle/fluid/inference/anakin/convert/split.cc
+++ b/paddle/fluid/inference/anakin/convert/split.cc
@@ -30,6 +30,7 @@ namespace inference {
 namespace anakin {
 
 void SplitOpConverter::operator()(const framework::proto::OpDesc &op,
+                                  const framework::BlockDesc &block_desc,
                                   const framework::Scope &scope,
                                   bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
diff --git a/paddle/fluid/inference/anakin/convert/split.h b/paddle/fluid/inference/anakin/convert/split.h
index a4c6a14e62168ffaf5ff67b5cf953d477ff9e34d..184112e589e2bbdb30bc7a5d2cd053b7f3732a58 100644
--- a/paddle/fluid/inference/anakin/convert/split.h
+++ b/paddle/fluid/inference/anakin/convert/split.h
@@ -25,6 +25,7 @@ class SplitOpConverter : public AnakinOpConverter {
   SplitOpConverter() = default;
 
   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~SplitOpConverter() {}
diff --git a/paddle/fluid/inference/anakin/convert/sum.cc b/paddle/fluid/inference/anakin/convert/sum.cc
index df9104cf4631d86e0cbd87cb0e93a96d984953f5..2a4178e2371389b44557d44ea526c7cc4a731d16 100644
--- a/paddle/fluid/inference/anakin/convert/sum.cc
+++ b/paddle/fluid/inference/anakin/convert/sum.cc
@@ -31,6 +31,7 @@ namespace inference {
 namespace anakin {
 
 void SumOpConverter::operator()(const framework::proto::OpDesc &op,
+                                const framework::BlockDesc &block_desc,
                                 const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 2);
diff --git a/paddle/fluid/inference/anakin/convert/sum.h b/paddle/fluid/inference/anakin/convert/sum.h
index ddecc4b3bcb84f83af95e77399847f191c785563..b5d402b77fcf555ffaf910f8c9d1b7337181a64b 100644
--- a/paddle/fluid/inference/anakin/convert/sum.h
+++ b/paddle/fluid/inference/anakin/convert/sum.h
@@ -25,6 +25,7 @@ class SumOpConverter : public AnakinOpConverter {
   SumOpConverter() = default;
 
   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~SumOpConverter() {}
diff --git a/paddle/fluid/inference/anakin/convert/transpose.cc b/paddle/fluid/inference/anakin/convert/transpose.cc
index 6a887401034f9d8c0b8b6aa3eeffb6579e395029..f35372fe5c315ec68bc80a6d03c5931899ff7555 100644
--- a/paddle/fluid/inference/anakin/convert/transpose.cc
+++ b/paddle/fluid/inference/anakin/convert/transpose.cc
@@ -28,6 +28,7 @@ namespace inference {
 namespace anakin {
 
 void TransposeOpConverter::operator()(const framework::proto::OpDesc &op,
+                                      const framework::BlockDesc &block_desc,
                                       const framework::Scope &scope,
                                       bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
diff --git a/paddle/fluid/inference/anakin/convert/transpose.h b/paddle/fluid/inference/anakin/convert/transpose.h
index 62d26b6a9cc9885682f5750df32018596f014b33..bacbf152bc12319e6296677500b17d55d9772412 100644
--- a/paddle/fluid/inference/anakin/convert/transpose.h
+++ b/paddle/fluid/inference/anakin/convert/transpose.h
@@ -25,6 +25,7 @@ class TransposeOpConverter : public AnakinOpConverter {
   TransposeOpConverter() = default;
 
   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~TransposeOpConverter() {}
diff --git a/paddle/fluid/inference/anakin/convert/ut_helper.h b/paddle/fluid/inference/anakin/convert/ut_helper.h
index e0371d95347a521f499dd9454d284907b3048a04..029aff6704ff1015e5c2378a2202c94043df990d 100644
--- a/paddle/fluid/inference/anakin/convert/ut_helper.h
+++ b/paddle/fluid/inference/anakin/convert/ut_helper.h
@@ -22,6 +22,7 @@ limitations under the License. */
 #include <unordered_set>
 #include <vector>
 
+#include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
@@ -112,6 +113,17 @@ class AnakinConvertValidation {
     auto* x_tensor = x->GetMutable<framework::LoDTensor>();
     x_tensor->Resize(framework::make_ddim(dim_vec));
     RandomizeTensor(x_tensor, place_, ctx);
+
+    std::vector<int64_t> dim_vec_int64;
+    for (auto& ele : dim_vec) {
+      dim_vec_int64.push_back(static_cast<int64_t>(ele));
+    }
+
+    // Add var_desc to block_desc
+    auto* block_desc = program_desc_.MutableBlock(framework::kRootBlockIndex);
+
+    auto* var_desc = block_desc->Var(name);
+    var_desc->SetShape(dim_vec_int64);
   }
 
   void SetOp(const framework::proto::OpDesc& desc) {
@@ -119,8 +131,10 @@ class AnakinConvertValidation {
     op_desc_.reset(new framework::OpDesc(desc, nullptr));
     // should init anakin engine here.
 
+    auto& block_desc = program_desc_.Block(framework::kRootBlockIndex);
     Singleton<AnakinOpConverter>::Global().ConvertOp(
-        desc, parameters_, *scope_, engine_.get(), true /*test_mode*/);
+        desc, block_desc, parameters_, *scope_, engine_.get(),
+        true /*test_mode*/);
     engine_->Freeze();
 
     std::map<std::string, std::vector<int>> temp_max_input_shape;
@@ -194,6 +208,7 @@ class AnakinConvertValidation {
   cudaStream_t stream_;
   std::unique_ptr<framework::OperatorBase> op_;
   std::unique_ptr<framework::OpDesc> op_desc_;
+  framework::ProgramDesc program_desc_;
   const std::unordered_set<std::string>& parameters_;
   framework::Scope* scope_;
   platform::CUDAPlace place_;
diff --git a/paddle/fluid/inference/anakin/engine.cc b/paddle/fluid/inference/anakin/engine.cc
index ccf78ad7e56306d24af829c45c888021f4e3fbc4..ba044c9401a5f0fb5a839c1766fdd9d412d42212 100644
--- a/paddle/fluid/inference/anakin/engine.cc
+++ b/paddle/fluid/inference/anakin/engine.cc
@@ -91,7 +91,6 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
                    " or equal to the real input shape, Please set the max "
                    "input shape using EnableAnakinEngine");
     anakin_input->reshape(fluid_input_shape);
-
     ::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0,
                                                        fluid_input_shape);
     anakin_input->copy_from(tmp_anakin_tensor);
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 29f16943e0c13fbe080e8e073b081583f1d14d11..a736ca393ccb7168a9faf650a6bce13f35fffca8 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -168,6 +168,7 @@ struct Argument {
   DECL_ARGUMENT_FIELD(anakin_max_input_shape, AnakinMaxInputShape,
                       anakin_max_shape_t);
   DECL_ARGUMENT_FIELD(anakin_max_batch_size, AnakinMaxBatchSize, int);
+  DECL_ARGUMENT_FIELD(anakin_min_subgraph_size, AnakinMinSubgraphSize, int);
   DECL_ARGUMENT_FIELD(use_anakin, UseAnakin, bool);
 
   // Memory optimized related.
diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
index 38612d5cc3d093885144f3b1cd6107232885b645..b8d8b6fed8ca237e87cfc67979ec6ddd340b8916 100644
--- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
@@ -151,13 +151,20 @@ void AnakinSubgraphPass::CreateAnakinOp(
   op_desc->SetType("anakin_engine");
 
   std::unordered_map<std::string, std::string> output_name_map;
+  std::unordered_map<std::string, framework::ir::Node *> graph_var_map;
+
+  for (framework::ir::Node *node : graph->Nodes()) {
+    if (node->IsVar() && node->Var()) {
+      graph_var_map[node->Name()] = node;
+    }
+  }
   auto &subgraph_nodes = *Agent(node).subgraph();
 
   // The following procedure is used to rename all the intermediate
   // variables and the output variables of the subgraph.
   RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id,
                       &output_names_with_id, &output_names, &output_name_map,
-                      false);
+                      graph_var_map, false);
 
   // When anakin engine runs at the end of the operation,
   // output_mapping help us copy the data from the renamed ITensor
@@ -168,13 +175,6 @@ void AnakinSubgraphPass::CreateAnakinOp(
     output_mapping.push_back(output_name_map[name]);
   }
 
-  auto *vars = block_desc.Proto()->mutable_vars();
-  for (framework::ir::Node *node : graph->Nodes()) {
-    if (node->IsVar() && node->Var()) {
-      *vars->Add() = *node->Var()->Proto();
-    }
-  }
-
   PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
                  "the block has no var-desc");
   PADDLE_ENFORCE(!output_mapping.empty());
diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc
index a17ee1b707a7f950cddc62373a9a57c793d5528f..7c4aab06a1d2b3fadc76b46c7e95cea7818c56e2 100644
--- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc
@@ -60,6 +60,7 @@ void RenameAndGetOutputs(
     std::set<std::string> *output_names_with_id,
     std::set<std::string> *output_names,
     std::unordered_map<std::string, std::string> *output_name_map,
+    const std::unordered_map<std::string, framework::ir::Node *> &graph_var_map,
     bool is_trt) {
   //// In the normal case, the paddle-trt exists bug when runing the googlenet.
   // When there are more than two convolutions of 1 * 1 with the same input, the
@@ -69,6 +70,15 @@ void RenameAndGetOutputs(
   std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
       same_hierarchy_conv2d_num_map;
 
+  auto add_block_var = [&](const std::string &graph_arg,
+                           const std::string &block_arg) {
+    auto arg_var_node = graph_var_map.find(graph_arg);
+    PADDLE_ENFORCE(arg_var_node != graph_var_map.end());
+    auto *var_t = block_desc->Var(block_arg);
+    var_t->SetShape(arg_var_node->second->Var()->GetShape());
+    var_t->SetDataType(arg_var_node->second->Var()->GetDataType());
+  };
+
   for (size_t index = 0; index < block_desc->OpSize(); ++index) {
     framework::proto::OpDesc *op = block_desc->Op(index)->Proto();
     framework::OpDesc op_desc(*op, nullptr);
@@ -87,13 +97,20 @@ void RenameAndGetOutputs(
       auto *in_var = op->mutable_inputs(i);
       std::vector<std::string> replaced_names;
       for (int k = 0; k < in_var->arguments_size(); k++) {  // all the arguments
-        std::string arg_value = in_var->arguments(k);
-        std::string arg_value_with_id =
+        const std::string arg_value = in_var->arguments(k);
+        const std::string arg_value_with_id =
             arg_value + std::to_string(var2id[arg_value]);
+
         if (input_names_with_id.count(arg_value_with_id)) {
           replaced_names.push_back(arg_value);
+          if (graph_var_map.count(arg_value)) {
+            add_block_var(arg_value, arg_value);
+          }
         } else {
           replaced_names.push_back(arg_value_with_id);
+          if (graph_var_map.count(arg_value)) {
+            add_block_var(arg_value, arg_value_with_id);
+          }
         }
       }
       in_var->clear_arguments();
@@ -105,7 +122,6 @@ void RenameAndGetOutputs(
     for (auto out_var : correspond_node->outputs) {
       var2id[out_var->Name()] = out_var->id();
     }
-
     if (op_desc.Type() == "conv2d" && is_trt) {
       auto input_var_name = op_desc.Input("Input").front();
       auto filter_var_name = op_desc.Input("Filter").front();
@@ -125,15 +141,18 @@ void RenameAndGetOutputs(
         same_hierarchy_conv2d_num_map[input_var_name] += 1;
       }
     }
-
     // rename for the output variables of op inside subgraph
     for (int i = 0; i < op->outputs_size(); i++) {
       framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
       std::vector<std::string> replaced_names;
       for (int k = 0; k < out_var->arguments_size(); k++) {
-        std::string arg_value = out_var->arguments(k);
-        std::string arg_value_with_id =
+        const std::string arg_value = out_var->arguments(k);
+        const std::string arg_value_with_id =
             arg_value + std::to_string(var2id[arg_value]);
+
+        if (graph_var_map.count(arg_value)) {
+          add_block_var(arg_value, arg_value_with_id);
+        }
         if (output_names_with_id->count(arg_value_with_id)) {
           (*output_name_map)[arg_value] = arg_value_with_id;
         }
diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h
index 3cf21bf5f426a7142626e6ae1db6ee478418d08a..bb445027821096689965096c69b8183dd9da403c 100644
--- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h
+++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h
@@ -42,6 +42,7 @@ void RenameAndGetOutputs(
     std::set<std::string> *output_names_with_id,
     std::set<std::string> *output_names,
     std::unordered_map<std::string, std::string> *output_name_map,
+    const std::unordered_map<std::string, framework::ir::Node *> &graph_var_map,
     bool is_trt = true);
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index 019098a5dd0d372a690955698a2ab6a4039a2416..67650a352d8b8239da228462c21877ff440147b8 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -142,6 +142,13 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   }
 
   std::unordered_map<std::string, std::string> output_name_map;
+  std::unordered_map<std::string, framework::ir::Node *> graph_var_map;
+
+  for (framework::ir::Node *node : graph->Nodes()) {
+    if (node->IsVar() && node->Var()) {
+      graph_var_map[node->Name()] = node;
+    }
+  }
   auto &subgraph_nodes = *Agent(node).subgraph();
 
   // The following procedure is used to rename all the intermediate
@@ -157,7 +164,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   // So we have to rename the variable in the subgraph to make sure
   // it is either an OP's input or an OP's output.
   RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id,
-                      &output_names_with_id, &output_names, &output_name_map);
+                      &output_names_with_id, &output_names, &output_name_map,
+                      graph_var_map);
 
   // When tensorrt engine runs at the end of the operation,
   // output_mapping help us copy the data from the renamed ITensor
@@ -168,14 +176,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
     output_mapping.push_back(output_name_map[name]);
   }
   PADDLE_ENFORCE(!output_mapping.empty());
-
-  auto *vars = block_desc.Proto()->mutable_vars();
-  for (framework::ir::Node *node : graph->Nodes()) {
-    if (node->IsVar() && node->Var()) {
-      *vars->Add() = *node->Var()->Proto();
-    }
-  }
-
   PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
                  "the block has no var-desc");
 
@@ -213,7 +213,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
   SetAttr(op_desc->Proto(), "engine_key", engine_key);
   std::string trt_engine_serialized_data = "";
-
   SetAttr(op_desc->Proto(), "engine_serialized_data",
           trt_engine_serialized_data);
 
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 85910c10e7409ac629c8fb5265550a5cfa6bc2c2..0109b4a4fa7617880f642f5a39639bca38475515 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -115,6 +115,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(use_anakin_);
   CP_MEMBER(anakin_max_batchsize_);
   CP_MEMBER(anakin_max_input_shape_);
+  CP_MEMBER(anakin_min_subgraph_size_);
 
   // Ir related.
   CP_MEMBER(enable_ir_optim_);
@@ -315,6 +316,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << specify_input_name_;
   ss << cpu_math_library_num_threads_;
   ss << use_anakin_;
+  ss << anakin_min_subgraph_size_;
   return ss.str();
 }
 
@@ -386,10 +388,11 @@ void AnalysisConfig::SwitchIrDebug(int x) {
   Update();
 }
 void AnalysisConfig::EnableAnakinEngine(
-    int max_batch_size,
-    std::map<std::string, std::vector<int>> max_input_shape) {
+    int max_batch_size, std::map<std::string, std::vector<int>> max_input_shape,
+    int min_subgraph_size) {
   anakin_max_batchsize_ = max_batch_size;
   anakin_max_input_shape_ = max_input_shape;
+  anakin_min_subgraph_size_ = min_subgraph_size;
   use_anakin_ = true;
   Update();
 }
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 7d8e9fe8bfada743388afd3ae4eedb5d84961706..6942604b0723f8665f0e8b058d48a5356a1a01f4 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -385,6 +385,7 @@ void AnalysisPredictor::PrepareArgument() {
   if (config_.use_gpu() && config_.anakin_engine_enabled()) {
     argument_.SetAnakinMaxBatchSize(config_.anakin_max_batchsize_);
     argument_.SetAnakinMaxInputShape(config_.anakin_max_input_shape_);
+    argument_.SetAnakinMinSubgraphSize(config_.anakin_min_subgraph_size_);
     LOG(INFO) << "Anakin subgraph engine is enabled";
   }
 
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 2ad4add2945d65037829e0bb453372e38a04421c..c67c4b5bd0bfeea6d022f9e821f6d0b877c71d7a 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -151,7 +151,8 @@ struct AnalysisConfig {
    */
   void EnableAnakinEngine(
       int max_batch_size = 1,
-      std::map<std::string, std::vector<int>> max_input_shape = {});
+      std::map<std::string, std::vector<int>> max_input_shape = {},
+      int min_subgraph_size = 6);
 
   /** A boolean state indicating whether the Anakin sub-graph engine is used.
   */
@@ -288,6 +289,7 @@ struct AnalysisConfig {
 
   bool use_anakin_{false};
   int anakin_max_batchsize_;
+  int anakin_min_subgraph_size_{6};
   std::map<std::string, std::vector<int>> anakin_max_input_shape_;
   std::map<std::string, std::string> engine_opt_info_;
 
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 87e02a02caebd93d701dfd9e51c35fb974c770ed..3d72295be4b779693a56d1ddb6bc4aad7c2c82c9 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -86,7 +86,8 @@ const std::vector<std::string> kAnakinSubgraphPasses({
 
 GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
   passes_.assign({
-    "infer_clean_graph_pass",  //
+    "infer_clean_graph_pass",          //
+        "runtime_context_cache_pass",  //
         //   "identity_scale_op_clean_pass",              //
         "conv_affine_channel_fuse_pass",             //
         "conv_eltwiseadd_affine_channel_fuse_pass",  //
@@ -96,7 +97,6 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
         "conv_elementwise_add_act_fuse_pass",   //
         "conv_elementwise_add2_act_fuse_pass",  //
         "conv_elementwise_add_fuse_pass",       //
-        "runtime_context_cache_pass",           //
 #endif                                          //
         "transpose_flatten_concat_fuse_pass",
   });
@@ -116,7 +116,11 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
   // NOTE the large fusions should be located in the front, so that they will
   // not be damaged by smaller ones.
   passes_.assign({
-      "infer_clean_graph_pass",         //
+      "infer_clean_graph_pass",  //
+      // TODO(luotao): runtime_context_cache_pass should be located in the
+      // front, see https://github.com/PaddlePaddle/Paddle/issues/16609,
+      // will enhance this pass later.
+      "runtime_context_cache_pass",     //
       "attention_lstm_fuse_pass",       //
       "seqpool_concat_fuse_pass",       //
       "seqconv_eltadd_relu_fuse_pass",  //
@@ -132,8 +136,6 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
       "conv_bn_fuse_pass",             //
       "conv_eltwiseadd_bn_fuse_pass",  //
       "is_test_pass",                  //
-      "identity_scale_op_clean_pass",  //
-      "runtime_context_cache_pass",    //
   });
   use_gpu_ = false;
 }
diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc
index ece094717b8076321c68d7fdd29f07c4da6b0ed4..fbf67d933786e3ee2baab7a20911da2837cdce4d 100644
--- a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc
@@ -23,18 +23,11 @@ namespace analysis {
 
 void SetConfig(AnalysisConfig *cfg) {
   cfg->SetModel(FLAGS_infer_model);
-  cfg->SetProgFile("__model__");
   cfg->DisableGpu();
   cfg->SwitchIrOptim();
-  cfg->SwitchSpecifyInputNames(false);
+  cfg->SwitchSpecifyInputNames();
   cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
   cfg->EnableMKLDNN();
-  cfg->pass_builder()->SetPasses(
-      {"infer_clean_graph_pass", "mkldnn_placement_pass",
-       "depthwise_conv_mkldnn_pass", "conv_bn_fuse_pass",
-       "conv_eltwiseadd_bn_fuse_pass", "conv_bias_mkldnn_fuse_pass",
-       "conv_elementwise_add_mkldnn_fuse_pass", "conv_relu_mkldnn_fuse_pass",
-       "fc_fuse_pass", "is_test_pass"});
 }
 
 template <typename T>
@@ -84,13 +77,13 @@ std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData(
           std::to_string(num_images) + " is bigger than all test data size.");
 
   PaddleTensor images;
-  images.name = "input";
+  images.name = "image";
   images.shape = {num_images, 3, 224, 224};
   images.dtype = PaddleDType::FLOAT32;
   images.data.Resize(sizeof(float) * num_images * 3 * 224 * 224);
 
   PaddleTensor labels;
-  labels.name = "labels";
+  labels.name = "label";
   labels.shape = {num_images, 1};
   labels.dtype = PaddleDType::INT64;
   labels.data.Resize(sizeof(int64_t) * num_images);
@@ -132,7 +125,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs,
       images_offset_in_file + sizeof(float) * total_images * 3 * 224 * 224;
 
   TensorReader<float> image_reader(file, images_offset_in_file,
-                                   image_batch_shape, "input");
+                                   image_batch_shape, "image");
   TensorReader<int64_t> label_reader(file, labels_offset_in_file,
                                      label_batch_shape, "label");
 
diff --git a/paddle/fluid/inference/tests/api/int8_mkldnn_quantization.md b/paddle/fluid/inference/tests/api/int8_mkldnn_quantization.md
new file mode 100644
index 0000000000000000000000000000000000000000..46ca08992f00bf48bfa1fb9768fe1e4b60b16d73
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/int8_mkldnn_quantization.md
@@ -0,0 +1,70 @@
+# INT8 MKL-DNN quantization 
+
+This document describes how to use Paddle inference Engine to convert the FP32 model to INT8 model on ResNet-50 and MobileNet-V1. We provide the instructions on enabling INT8 MKL-DNN quantization in Paddle inference and show the ResNet-50 and MobileNet-V1 results in accuracy and performance.
+
+## 0. Install PaddlePaddle 
+Follow PaddlePaddle [installation instruction](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification#installation) to install PaddlePaddle. If you build PaddlePaddle yourself, please use the following cmake arguments. 
+```
+cmake ..  -DWITH_TESTING=ON -WITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_MKL=ON  -WITH_SWIG_PY=OFF -DWITH_INFERENCE_API_TEST=ON -DON_INFER=ON
+
+```  
+Note: MKL-DNN and MKL are required.
+
+## 1. Enable INT8 MKL-DNN quantization 
+For reference, please examine the code of unit test enclosed in [analyzer_int8_image_classification_tester.cc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc).
+
+* ### Create Analysis config
+INT8 quantization is one of the optimizations in analysis config. More information about analysis config can be found [here](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/advanced_usage/deploy/inference/native_infer_en.md#upgrade-performance-based-on-contribanalysisconfig-prerelease) 
+
+* ### Create quantize config by analysis config
+We enable the MKL-DNN quantization procedure by calling an appropriate method from analysis config. Afterwards, all the required quantization parameters (quantization op names, quantization strategies etc.) can be set through quantizer config which is present in the analysis config. It is also necessary to specify a pre-processed warmup dataset and desired batch size.
+
+```cpp
+//Enable MKL-DNN quantization
+cfg.EnableMkldnnQuantizer();
+
+//use analysis config to call the MKL-DNN quantization config
+cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data); 
+cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(100);
+```
+
+## 2. Accuracy and Performance benchmark
+
+We provide the results of accuracy and performance measured on Intel(R) Xeon(R) Gold 6271 on single core.
+
+   >**I. Top-1 Accuracy on Intel(R) Xeon(R) Gold 6271**
+
+| Model  | Dataset  | FP32 Accuracy  | INT8 Accuracy  | Accuracy Diff  |
+| :------------: | :------------: | :------------: | :------------: | :------------: |
+| ResNet-50  | Full ImageNet Val  | 76.63%  | 76.48%  | 0.15% |
+| MobileNet-V1 | Full ImageNet Val  | 70.78%  | 70.36%  | 0.42%  |
+
+   >**II. Throughput on Intel(R) Xeon(R) Gold 6271 (batch size 1 on single core)**
+
+| Model  | Dataset  | FP32 Throughput  | INT8 Throughput  |  Ratio(INT8/FP32)  |
+| :------------: | :------------: | :------------: | :------------: | :------------: |
+| ResNet-50  | Full ImageNet Val  |  13.17 images/s | 49.84 images/s | 3.78 |
+| MobileNet-V1 | Full ImageNet Val  | 75.49 images/s | 232.38 images/s | 3.07  |
+
+Notes:
+* Measurement of accuracy requires a model which accepts two inputs: data and labels.
+* Different sampling batch data may cause slight difference on INT8 top accuracy.
+* C-API performance data is better than Python API performance data because of the python overhead. Especially for the small computational model, python overhead will be more obvious. 
+
+
+## 3. Commands to reproduce the above accuracy and performance benchmark
+* #### Full dataset (Single core)
+   * ##### Download full ImageNet Validation Dataset
+```bash
+cd /PATH/TO/PADDLE/build
+python ../paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py
+```
+The converted data binary file is saved by default in ~/.cache/paddle/dataset/int8/download/int8_full_val.bin
+   * ##### ResNet50 Full dataset benchmark
+```bash
+./paddle/fluid/inference/tests/api/test_analyzer_int8_resnet50 --infer_model=third_party/inference_demo/int8v2/resnet50/model --infer_data=/path/to/converted/int8_full_val.bin --batch_size=1 --paddle_num_threads=1
+```
+   * ##### Mobilenet-v1 Full dataset benchmark
+```bash
+./paddle/fluid/inference/tests/api/test_analyzer_int8_mobilenet --infer_model=third_party/inference_demo/int8v2/mobilenet/model --infer_data=/path/to/converted/int8_full_val.bin --batch_size=1 --paddle_num_threads=1
+```
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index 9a0dcc722cf00984b8c0e3ac20f13849e2904102..5cc54ed299c50b48c83de2742b715b16cf1f8cd0 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -316,7 +316,8 @@ void PredictionRun(PaddlePredictor *predictor,
                    int num_threads, int tid) {
   int num_times = FLAGS_repeat;
   int iterations = inputs.size();  // process the whole dataset ...
-  if (FLAGS_iterations > 0 && FLAGS_iterations < inputs.size())
+  if (FLAGS_iterations > 0 &&
+      FLAGS_iterations < static_cast<int64_t>(inputs.size()))
     iterations =
         FLAGS_iterations;  // ... unless the number of iterations is set
   outputs->resize(iterations);
@@ -329,14 +330,14 @@ void PredictionRun(PaddlePredictor *predictor,
 #endif
   if (!FLAGS_zero_copy) {
     run_timer.tic();
-    for (size_t i = 0; i < iterations; i++) {
+    for (int i = 0; i < iterations; i++) {
       for (int j = 0; j < num_times; j++) {
         predictor->Run(inputs[i], &(*outputs)[i], FLAGS_batch_size);
       }
     }
     elapsed_time = run_timer.toc();
   } else {
-    for (size_t i = 0; i < iterations; i++) {
+    for (int i = 0; i < iterations; i++) {
       ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[i]);
       run_timer.tic();
       for (int j = 0; j < num_times; j++) {
@@ -366,9 +367,8 @@ void TestOneThreadPrediction(
     const std::vector<std::vector<PaddleTensor>> &inputs,
     std::vector<std::vector<PaddleTensor>> *outputs, bool use_analysis = true) {
   auto predictor = CreateTestPredictor(config, use_analysis);
-  PredictionWarmUp(predictor.get(), inputs, outputs, FLAGS_paddle_num_threads,
-                   0);
-  PredictionRun(predictor.get(), inputs, outputs, FLAGS_paddle_num_threads, 0);
+  PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0);
+  PredictionRun(predictor.get(), inputs, outputs, 1, 0);
 }
 
 void TestMultiThreadPrediction(
diff --git a/paddle/fluid/operators/anakin/anakin_engine_op.h b/paddle/fluid/operators/anakin/anakin_engine_op.h
index 9d5b4f6f54ccfc9802cef6abac428e28a72ac293..e4feb14b2271a50c8e8fb7ce4c81dd6c99042e21 100644
--- a/paddle/fluid/operators/anakin/anakin_engine_op.h
+++ b/paddle/fluid/operators/anakin/anakin_engine_op.h
@@ -120,40 +120,8 @@ class AnakinEngineOp : public framework::OperatorBase {
           inference::Singleton<inference::anakin::AnakinEngineManager>::Global()
               .Get(engine_key_);
     }
-
     return anakin_engine_;
   }
-
-  void Prepare(const framework::Scope &scope, const platform::Place &dev_place,
-               AnakinNvEngineT *engine) const {
-    LOG(INFO) << "Prepare Anakin engine (Optimize model structure, Select OP "
-                 "kernel etc). This process may cost a lot of time.";
-    framework::proto::BlockDesc block_desc;
-    block_desc.ParseFromString(Attr<std::string>("subgraph"));
-
-    std::vector<std::string> output_maps =
-        Attr<std::vector<std::string>>("output_name_mapping");
-
-    inference::Singleton<inference::anakin::AnakinOpConverter>::Global()
-        .ConvertBlock(block_desc, param_names_, scope, engine);
-    engine->Freeze();
-    for (const auto &x : Inputs("Xs")) {
-      if (param_names_.count(x)) continue;
-      auto &t =
-          inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
-      auto t_shape = framework::vectorize2int(t.dims());
-      // all input shape should be 4 dims
-      if (t_shape.size() == 2) {
-        t_shape.push_back(1);
-        t_shape.push_back(1);
-      }
-      engine->SetInputShape(x, t_shape);
-    }
-
-    engine->Optimize();
-
-    engine->InitGraph();
-  }
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.h b/paddle/fluid/operators/dgc_clip_by_norm_op.h
index bd22d16f7a21877af4e78c30f7e0985c64b543f2..197bf59b2a470e1f6e4e31c6706d1e3f8e73fbbc 100644
--- a/paddle/fluid/operators/dgc_clip_by_norm_op.h
+++ b/paddle/fluid/operators/dgc_clip_by_norm_op.h
@@ -24,18 +24,21 @@ class DGCClipByNormKernel : public ClipByNormKernel<DeviceContext, T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto rampup_begin_step = context.Attr<float>("rampup_begin_step");
-    if (static_cast<int>(rampup_begin_step) >= 0) {
-      auto current_step_tensor =
-          context.Input<framework::Tensor>("current_step");
-      auto* current_step = current_step_tensor->data<T>();
-
-      if (static_cast<int>(*current_step) <
-          static_cast<int>(rampup_begin_step)) {
-        VLOG(10) << "current_step:" << *current_step
-                 << " < rampup_begin_step:" << rampup_begin_step
-                 << " so does't use dgc_clip_by_norm";
-        return;
-      }
+    if (static_cast<int>(rampup_begin_step) < 0) {
+      return;
+    }
+
+    auto current_step_tensor = context.Input<framework::Tensor>("current_step");
+    auto* current_step = current_step_tensor->data<T>();
+
+    VLOG(10) << "current_step:" << *current_step
+             << ", rampup_begin_step:" << rampup_begin_step;
+
+    if (static_cast<int>(*current_step) < static_cast<int>(rampup_begin_step)) {
+      VLOG(10) << "current_step:" << *current_step
+               << " < rampup_begin_step:" << rampup_begin_step
+               << " so does't use dgc_clip_by_norm";
+      return;
     }
 
     return ClipByNormKernel<DeviceContext, T>::Compute(context);
diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc
index d30fa014ed5fbac9ed71f3185ce0443d33f4a281..875d4f864353c131ca4d72b5176adcae8aff724a 100644
--- a/paddle/fluid/operators/jit/test.cc
+++ b/paddle/fluid/operators/jit/test.cc
@@ -991,15 +991,17 @@ TEST(JITKernel_pool, jitpool) {
 
 TEST(JITKernel_pool, more) {
   const auto& kers = jit::KernelPool::Instance().AllKernels();
-#if defined(__APPLE__) || defined(__OSX__)
-  EXPECT_EQ(kers.size(), 10UL);
-#else
-#ifdef PADDLE_WITH_MKLML
-  EXPECT_EQ(kers.size(), 22UL);
-#else
-  EXPECT_EQ(kers.size(), 8UL);
+  size_t target_num = 8;
+
+#ifdef __AVX__
+  target_num += 2;
 #endif
+
+#ifdef PADDLE_WITH_MKLML
+  target_num += 12;
 #endif
+
+  EXPECT_EQ(kers.size(), target_num);
 }
 
 TEST(JITKernel_pool, refer) {
diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc
index 656728c609eb19f90390d9dec72d9e30fd3040fd..435c755df3642ae0ba5144a89ed30ed6e0b63258 100644
--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
@@ -29,7 +29,7 @@ class LoadOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     framework::OpKernelType kt = framework::OpKernelType(
-        framework::proto::VarType::FP32, platform::CPUPlace());
+        framework::proto::VarType::FP32, ctx.GetPlace());
     return kt;
   }
 };
diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.cc b/paddle/fluid/operators/ngraph/ngraph_engine.cc
index 9f73bbc1fdc72766a0b57bc72c62d208277c2f20..5ef385d2fcbaf01dce5c9b85321b41c103e5655a 100644
--- a/paddle/fluid/operators/ngraph/ngraph_engine.cc
+++ b/paddle/fluid/operators/ngraph/ngraph_engine.cc
@@ -75,6 +75,7 @@ std::vector<std::string> NgraphEngine::feed_vars = {};
 std::vector<std::string> NgraphEngine::fetch_vars = {};
 framework::Variable* NgraphEngine::pre_var_ptr = nullptr;
 const framework::BlockDesc* NgraphEngine::p_bdesc = nullptr;
+bool NgraphEngine::is_training = false;
 
 std::unordered_map<std::string, EngineCache> NgraphEngine::engine_cache = {};
 std::unordered_map<std::string,
@@ -93,11 +94,13 @@ static std::vector<std::vector<int>> NgraphOpIntervals(
   int size = ops->size();
   int left = 0;
   while (left < size && ops->at(left)->Type() != framework::kFeedOpType &&
+         ops->at(left)->Type() != "read" &&
          ops->at(left)->Type() != framework::kFetchOpType) {
     ++left;
   }
 
-  while (left < size && ops->at(left)->Type() == framework::kFeedOpType) {
+  while (left < size && (ops->at(left)->Type() == framework::kFeedOpType ||
+                         ops->at(left)->Type() == "read")) {
     for (auto& var_name_item : ops->at(left)->Outputs()) {
       for (auto& var_name : var_name_item.second) {
         NgraphEngine::feed_vars.emplace_back(var_name);
@@ -270,6 +273,7 @@ void NgraphEngine::Prepare(const std::vector<int>& interval) {
 
   for (auto op_desc : ops_desc) {
     if (op_desc->Type().find("_grad") != std::string::npos) {
+      is_training = true;
       this->is_test_ = false;
       break;
     }
@@ -590,7 +594,7 @@ void NgraphEngine::Run(const framework::Scope& scope,
       }
       bool is_persistable =
           (p_persistables->find(vi) != p_persistables->end()) ? true : false;
-      if (is_test && is_persistable) {
+      if (!is_training && is_test && is_persistable) {
         ti->set_stale(false);
       }
       (*p_t_in).emplace_back(ti);
diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.h b/paddle/fluid/operators/ngraph/ngraph_engine.h
index b6532519e947bc59f0605c4f2008270f5e51b0e0..19400ac5b0ecd9d3254583b8db9889fc6cf8bc0f 100644
--- a/paddle/fluid/operators/ngraph/ngraph_engine.h
+++ b/paddle/fluid/operators/ngraph/ngraph_engine.h
@@ -57,6 +57,7 @@ class NgraphEngine {
 
   void Run(const framework::Scope& scope, const platform::Place& place) const;
 
+  static bool is_training;
   static const framework::BlockDesc* p_bdesc;
   static std::vector<std::string> feed_vars, fetch_vars;
 
diff --git a/python/paddle/fluid/dygraph/layer_object_helper.py b/python/paddle/fluid/dygraph/layer_object_helper.py
index c56652e103ce93bf5459b30b66c7b1f04e7c14d0..f8e607aab8491a45958843745bd7aa7e3021fc15 100644
--- a/python/paddle/fluid/dygraph/layer_object_helper.py
+++ b/python/paddle/fluid/dygraph/layer_object_helper.py
@@ -91,6 +91,10 @@ class LayerObjectHelper(LayerHelperBase):
 
         Returns input, param_attr
         """
+        param_attr_in = ParamAttr._to_attr(param_attr_in)
+        if isinstance(param_attr_in, bool):
+            raise ValueError('Param_attr should not be False in {}'.format(
+                self.name))
         inputs = inputs_in if (inputs_in is not None) else []
         inputs = self._multiple_input(inputs)
         param_attrs = self._multiple_param_attr(len(inputs), param_attr_in)
diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py
index ddf0de9a29b3b9e5346711329d5613d1f2241b3c..527c37cb2c4f1540fb8c464dfdbe061b2899f678 100644
--- a/python/paddle/fluid/dygraph/nn.py
+++ b/python/paddle/fluid/dygraph/nn.py
@@ -511,46 +511,69 @@ class FC(layers.Layer):
         self._param_attr = param_attr
         self._bias_attr = bias_attr
         self._act = act
+        self.__w = list()
 
-    def _build_once(self, input):
-        input_shape = input.shape
-        param_shape = [
-            reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1)
-        ] + [self._size]
-        self._w = self.create_parameter(
-            attr=self._param_attr,
-            shape=param_shape,
-            dtype=self._dtype,
-            is_bias=False)
+    @property
+    def _w(self, i=0):
+        return self.__w[i]
 
-        if self._bias_attr:
-            size = list([self._size])
-            self._b = self.create_parameter(
-                attr=self._bias_attr,
-                shape=size,
-                dtype=self._dtype,
-                is_bias=True)
-        else:
-            self._b = None
+    @_w.setter
+    def _w(self, value, i=0):
+        assert isinstance(value, Parameter)
+        self.__w[i] = value
 
-    def forward(self, input):
-        tmp = self._helper.create_variable_for_type_inference(self._dtype)
-        self._helper.append_op(
-            type="mul",
-            inputs={"X": input,
-                    "Y": self._w},
-            outputs={"Out": tmp},
-            attrs={
-                "x_num_col_dims": self._num_flatten_dims,
-                "y_num_col_dims": 1
-            })
+    def _build_once(self, input):
+        i = 0
+        for inp, param in self._helper.iter_inputs_and_params(input,
+                                                              self._param_attr):
+            input_shape = inp.shape
+
+            param_shape = [
+                reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:],
+                       1)
+            ] + [self._size]
+            self.__w.append(
+                self.add_parameter(
+                    '_w%d' % i,
+                    self.create_parameter(
+                        attr=param,
+                        shape=param_shape,
+                        dtype=self._dtype,
+                        is_bias=False)))
+            i += 1
+
+        size = list([self._size])
+        self._b = self.create_parameter(
+            attr=self._bias_attr, shape=size, dtype=self._dtype, is_bias=True)
 
-        pre_bias = self._helper.create_variable_for_type_inference(self._dtype)
-        self._helper.append_op(
-            type="sum",
-            inputs={"X": [tmp]},
-            outputs={"Out": pre_bias},
-            attrs={"use_mkldnn": False})
+    def forward(self, input):
+        mul_results = list()
+        i = 0
+        for inp, param in self._helper.iter_inputs_and_params(input,
+                                                              self._param_attr):
+            tmp = self._helper.create_variable_for_type_inference(self._dtype)
+            self._helper.append_op(
+                type="mul",
+                inputs={"X": inp,
+                        "Y": self.__w[i]},
+                outputs={"Out": tmp},
+                attrs={
+                    "x_num_col_dims": self._num_flatten_dims,
+                    "y_num_col_dims": 1
+                })
+            i += 1
+            mul_results.append(tmp)
+
+        if len(mul_results) == 1:
+            pre_bias = mul_results[0]
+        else:
+            pre_bias = self._helper.create_variable_for_type_inference(
+                self._dtype)
+            self._helper.append_op(
+                type="sum",
+                inputs={"X": mul_results},
+                outputs={"Out": pre_bias},
+                attrs={"use_mkldnn": False})
 
         if self._b:
             pre_activation = self._helper.create_variable_for_type_inference(
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 0f5a8f51463a63dff0ef9f7aaac992c31753b584..7953d98bcbb826267fa21f6503e55049c8aff5ba 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -493,7 +493,8 @@ class Variable(object):
         self._ivar._run_backward()
 
     def _gradient(self):
-        return np.array(self._ivar._grad_value())
+        new_ivar = self._ivar._grad_ivar()._copy_to(core.CPUPlace(), True)
+        return np.array(new_ivar.value().get_tensor())
 
     def _clear_gradient(self):
         self._ivar._clear_gradient()
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 79accabe87869c832b7467acbaf70d11cbca8a96..94bc3d0854d5b17de4811aca07c35fbaa0e382ca 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -752,7 +752,7 @@ class DGCMomentumOptimizer(MomentumOptimizer):
             force_cpu=True)
 
         for param_var, grad_var in param_and_grads:
-            var_numel = reduce(lambda x, y: x * y, param_var.shape)
+            var_numel = abs(reduce(lambda x, y: x * y, param_var.shape))
             if var_numel < 16384 or \
                 param_var.type == core.VarDesc.VarType.SELECTED_ROWS  or \
                 grad_var.type == core.VarDesc.VarType.SELECTED_ROWS  or  \
@@ -832,7 +832,7 @@ class DGCMomentumOptimizer(MomentumOptimizer):
             type=x.type, name=name, dtype=x.dtype, persistable=False)
 
         helper.append_op(
-            type="clip_by_norm",
+            type="dgc_clip_by_norm",
             inputs={"X": x,
                     "current_step": self._global_step_var},
             attrs={
@@ -845,7 +845,7 @@ class DGCMomentumOptimizer(MomentumOptimizer):
     def _append_clip_norm(self, grad_var, clip_norm):
         with grad_var.block.program._backward_role_guard():
             return self._clip_by_norm(
-                x=grad_var, max_norm=clip_norm, name=grad_var.name + "@DGC")
+                x=grad_var, max_norm=clip_norm, name=grad_var.name)
 
     def _dgc_op(self, param_var, clip_var, grad_var, u_var, v_var, k_var,
                 encoded_var):
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 6b88e7a99fd78f6a7670ba55bc678e85d229ddf4..1a91dafb8a5048c293a3001d6043282775a4cddd 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -104,10 +104,11 @@ class ParallelExecutor(object):
         self._scope = scope if scope is not None else executor.global_scope()
 
         if main_program is not None and main_program._enable_dgc:
+            assert num_trainers > 1, "dgc is not useful for single trainer training."
             assert build_strategy.reduce_strategy == BuildStrategy.ReduceStrategy.AllReduce
             assert num_trainers * len(
-                self._places) > 1, "dgc is not useful for single card training"
-            assert use_cuda
+                self._places) > 1, "dgc is not useful for single card training."
+            assert use_cuda, "dgc only used when cuda is used."
 
         main_program = main_program if main_program is not None \
             else framework.default_main_program()
@@ -123,6 +124,11 @@ class ParallelExecutor(object):
             exec_strategy=exec_strategy,
             share_vars_from=share_vars_from._compiled_program
             if share_vars_from else None)
+
+        # FIXME(gongwb): I will move dgc from dist mode to allreduce mode in next pr.
+        if main_program._enable_dgc:
+            self._compiled_program._build_strategy.is_distribution = True
+
         self._place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
         self._exe = executor.Executor(self._place)
         self._compiled_program._compile(place=self._place, scope=self._scope)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index 9c0efe6d905929f87106f18ecf74a7915e39eba9..e75c4cb30a1b084e44f07119d4590785850c5d9c 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -139,8 +139,7 @@ class TestDistRunnerBase(object):
         pass_builder = None
         if args.batch_merge_repeat > 1:
             pass_builder = build_stra._finalize_strategy_and_create_passes()
-            mypass = pass_builder.insert_pass(
-                len(pass_builder.all_passes()) - 3, "multi_batch_merge_pass")
+            mypass = pass_builder.insert_pass(0, "multi_batch_merge_pass")
             mypass.set("num_repeats", args.batch_merge_repeat)
 
         if args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer":
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
index 998c675815ece9236c819bffc4a4b74d44ff790e..eb8a82430f062003a66c159c679fe51d7994971a 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 
 import unittest
 import paddle.fluid as fluid
+from paddle.fluid import core
 from paddle.fluid.dygraph.nn import Embedding
 import paddle.fluid.framework as framework
 from paddle.fluid.optimizer import SGDOptimizer
@@ -200,8 +201,6 @@ class PtbModel(fluid.dygraph.Layer):
             rnn_out, shape=[-1, self.num_steps, self.hidden_size])
         projection = fluid.layers.matmul(rnn_out, self.softmax_weight)
         projection = fluid.layers.elementwise_add(projection, self.softmax_bias)
-        projection = fluid.layers.reshape(
-            projection, shape=[-1, self.vocab_size])
         projection = fluid.layers.reshape(
             projection, shape=[-1, self.vocab_size])
         loss = fluid.layers.softmax_with_cross_entropy(
@@ -223,6 +222,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
         num_steps = 3
         init_scale = 0.1
         batch_size = 4
+        batch_num = 200
 
         with fluid.dygraph.guard():
             fluid.default_startup_program().random_seed = seed
@@ -242,7 +242,6 @@ class TestDygraphPtbRnn(unittest.TestCase):
             dy_loss = None
             last_hidden = None
             last_cell = None
-            batch_num = 200
 
             for i in range(batch_num):
                 x_data = np.arange(12).reshape(4, 3).astype('int64')
@@ -280,9 +279,11 @@ class TestDygraphPtbRnn(unittest.TestCase):
                 num_steps=num_steps,
                 init_scale=init_scale)
 
-            exe = fluid.Executor(fluid.CPUPlace())
+            exe = fluid.Executor(fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
             sgd = SGDOptimizer(learning_rate=1e-3)
-            x = fluid.layers.data(name="x", shape=[-1, 3, 1], dtype='int64')
+            x = fluid.layers.data(
+                name="x", shape=[-1, num_steps, 1], dtype='int64')
             y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
             init_hidden = fluid.layers.data(
                 name="init_hidden", shape=[1], dtype='float32')
@@ -332,7 +333,6 @@ class TestDygraphPtbRnn(unittest.TestCase):
                     for k in range(3, len(out)):
                         static_param_updated[static_param_name_list[k -
                                                                     3]] = out[k]
-
         self.assertTrue(np.allclose(static_loss_value, dy_loss._numpy()))
         self.assertTrue(np.allclose(static_last_cell_value, last_cell._numpy()))
         self.assertTrue(
@@ -340,13 +340,11 @@ class TestDygraphPtbRnn(unittest.TestCase):
         for key, value in six.iteritems(static_param_init):
             # print("static_init name: {}, value {}".format(key, value))
             # print("dy_init name: {}, value {}".format(key, dy_param_init[key]))
-            self.assertTrue(np.allclose(value, dy_param_init[key], atol=1e-5))
+            self.assertTrue(np.allclose(value, dy_param_init[key]))
         for key, value in six.iteritems(static_param_updated):
             # print("static name: {}, value {}".format(key, value))
             # print("dy name: {}, value {}".format(key, dy_param_updated[key]))
-            self.assertTrue(
-                np.allclose(
-                    value, dy_param_updated[key], atol=1e-5))
+            self.assertTrue(np.allclose(value, dy_param_updated[key]))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer.py
index 3bdf3349730b0c9916449cfe0658d5a3c88834ed..ef9d3ffca2dfcef4719184f1829ff42989d93166 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer.py
@@ -302,8 +302,7 @@ use_py_reader = False
 # if we run sync mode
 sync = False
 
-# how many batches we use
-batch_num = 2
+batch_num = 5
 
 np.random.seed = 1
 src_word_np = np.random.randint(
@@ -335,24 +334,6 @@ lbl_word_np = np.random.randint(
     dtype='int64')
 lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32')
 
-# np.random.seed = 1
-# src_word_np = np.arange(0, 10).reshape([batch_size, seq_len, 1]).astype('int64')
-# src_pos_np = np.random.randint(
-#     1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
-# src_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
-#                                        seq_len, seq_len).astype('float32')
-#
-# trg_word_np =  np.arange(0, 10).reshape([batch_size, seq_len, 1]).astype('int64')
-# trg_pos_np = np.random.randint(
-#     1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
-# trg_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
-#                                        seq_len, seq_len).astype('float32')
-# trg_src_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
-#                                        seq_len, seq_len).astype('float32')
-#
-# lbl_word_np =  np.arange(0, 10).reshape([batch_size * seq_len, 1]).astype('int64')
-# lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32')
-#
 pos_inp1 = position_encoding_init(ModelHyperParams.max_length,
                                   ModelHyperParams.d_model)
 pos_inp2 = position_encoding_init(ModelHyperParams.max_length,
@@ -739,7 +720,7 @@ class DecoderSubLayer(Layer):
         enc_attn_output_pp = self._multihead_attention_layer2(
             pre_process_rlt2, enc_output, enc_output, dec_enc_attn_bias)
         enc_attn_output = self._post_process_layer2(
-            slf_attn_output, enc_attn_output_pp, self._postprocess_cmd,
+            slf_attn_output_pp, enc_attn_output_pp, self._postprocess_cmd,
             self._prepostprcess_dropout)
         pre_process_rlt3 = self._pre_process_layer3(None, enc_attn_output,
                                                     self._preprocess_cmd,
@@ -1076,20 +1057,17 @@ class TestDygraphTransformer(unittest.TestCase):
                                                                     4]] = out[k]
 
         self.assertTrue(
-            np.allclose(static_avg_cost_value, dy_avg_cost._numpy()))
+            np.array_equal(static_avg_cost_value, dy_avg_cost._numpy()))
         self.assertTrue(
-            np.allclose(static_sum_cost_value, dy_sum_cost._numpy()))
+            np.array_equal(static_sum_cost_value, dy_sum_cost._numpy()))
         self.assertTrue(
-            np.allclose(
-                static_predict_value, dy_predict._numpy(), atol=1e-5))
+            np.array_equal(static_predict_value, dy_predict._numpy()))
         self.assertTrue(
-            np.allclose(static_token_num_value, dy_token_num._numpy()))
+            np.array_equal(static_token_num_value, dy_token_num._numpy()))
         for key, value in six.iteritems(static_param_init):
-            self.assertTrue(np.allclose(value, dy_param_init[key]))
+            self.assertTrue(np.array_equal(value, dy_param_init[key]))
         for key, value in six.iteritems(static_param_updated):
-            self.assertTrue(
-                np.allclose(
-                    value, dy_param_updated[key], atol=1e-4))
+            self.assertTrue(np.array_equal(value, dy_param_updated[key]))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
index bda8b666dcde22b0e4bacdb5db252267f4c7e34b..645b0188d5f45935ace074ba343de246af476b41 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
@@ -38,7 +38,15 @@ def Lenet(data, class_dim):
 
 
 class TestFetchAndFeed(unittest.TestCase):
-    def parallel_exe(self, use_cuda, run_parallel_exe, seed=1):
+    @classmethod
+    def setUpClass(cls):
+        os.environ['CPU_NUM'] = str(4)
+
+    def parallel_exe(self,
+                     use_cuda,
+                     run_parallel_exe,
+                     use_experimental_executor=False,
+                     seed=1):
         main_program = fluid.Program()
         startup = fluid.Program()
         startup.random_seed = seed
@@ -63,8 +71,12 @@ class TestFetchAndFeed(unittest.TestCase):
         build_strategy = fluid.BuildStrategy()
         build_strategy.enable_inplace = False
         build_strategy.memory_optimize = False
+        exec_strategy = fluid.ExecutionStrategy()
+        exec_strategy.use_experimental_executor = use_experimental_executor
         train_cp = compiler.CompiledProgram(main_program).with_data_parallel(
-            loss_name=loss.name, build_strategy=build_strategy)
+            loss_name=loss.name,
+            build_strategy=build_strategy,
+            exec_strategy=exec_strategy)
 
         run_parallel_exe(train_cp, exe, use_cuda, data, label, loss)
 
@@ -131,8 +143,7 @@ class TestFetchAndFeed(unittest.TestCase):
             if batch_id == 2:
                 break
 
-    def test_fetch(self):
-        os.environ['CPU_NUM'] = str(4)
+    def test_fetch_with_threaded_executor(self):
         if core.is_compiled_with_cuda():
             self.parallel_exe(
                 use_cuda=True,
@@ -140,8 +151,18 @@ class TestFetchAndFeed(unittest.TestCase):
         self.parallel_exe(
             use_cuda=False, run_parallel_exe=self.run_parallel_exe_with_fetch)
 
+    def test_fetch_with_fast_threaded_executor(self):
+        if core.is_compiled_with_cuda():
+            self.parallel_exe(
+                use_cuda=True,
+                run_parallel_exe=self.run_parallel_exe_with_fetch,
+                use_experimental_executor=True)
+        self.parallel_exe(
+            use_cuda=False,
+            run_parallel_exe=self.run_parallel_exe_with_fetch,
+            use_experimental_executor=True)
+
     def test_feed(self):
-        os.environ['CPU_NUM'] = str(4)
         if core.is_compiled_with_cuda():
             self.parallel_exe(
                 use_cuda=True, run_parallel_exe=self.run_parallel_exe_with_feed)