Commit b90c08c3 authored by: lujun

merge conflict, test=release/1.4

@@ -201,7 +201,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
     SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64")
   ENDIF()

-  SET(PROTOBUF_REPO "https://github.com/google/protobuf.git")
+  SET(PROTOBUF_REPO "https://github.com/protocolbuffers/protobuf.git")
   SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546")

   ExternalProject_Add(
...
@@ -13,44 +13,160 @@
 // limitations under the License.

 #include <algorithm>
-#include <memory>
+#include <map>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
+#include <utility>
 #include <vector>

-#include "paddle/fluid/framework/details/all_reduce_deps_pass.h"
 #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
+#include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/op_graph_view.h"
-#include "paddle/fluid/framework/details/var_handle.h"
+#include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/op_proto_maker.h"

 namespace paddle {
 namespace framework {
 namespace details {

-VarHandle* GetValidInput(const OpHandleBase* a) {
-  for (auto p : a->Inputs()) {
-    VarHandle* b = dynamic_cast<VarHandle*>(p);
-    if (b) {
-      return b;
-    }
-  }
-  return nullptr;
-}
-
-void AllReduceDepsPass::ApplyImpl(ir::Graph* graph) const {
-  auto graph_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
+class AllReduceDepsPass : public ir::Pass {
+ protected:
+  void ApplyImpl(ir::Graph* graph) const override {
+    std::vector<AllReduceOpHandle*> all_reduce_op_handles =
+        GetSortedAllReduceOps(*graph);
+
+    for (size_t i = 1; i < all_reduce_op_handles.size(); ++i) {
+      auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());
+      graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
+      all_reduce_op_handles[i - 1]->AddOutput(dep_var);
+      all_reduce_op_handles[i]->AddInput(dep_var);
+    }
+
+    if (VLOG_IS_ON(10)) {
+      DebugString(*graph, all_reduce_op_handles);
+    }
+  }
+
+  std::vector<AllReduceOpHandle*> GetSortedAllReduceOps(
+      const ir::Graph& graph) const {
+    std::vector<AllReduceOpHandle*> all_reduce_op_handles;
+    std::unordered_map<OpHandleBase*, size_t> pending_ops;
+    std::unordered_set<OpHandleBase*> ready_ops;
+    std::unordered_set<OpHandleBase*> next_ready_ops;
+
+    auto op_handles = ir::FilterByNodeWrapper<OpHandleBase>(graph);
+    size_t num_of_ops = op_handles.size();
+    for (OpHandleBase* op : op_handles) {
+      size_t not_ready_vars = op->NotReadyInputSize();
+      if (not_ready_vars) {
+        pending_ops.insert({op, not_ready_vars});
+      } else {
+        ready_ops.insert(op);
+      }
+    }
+
+    GetSortedAllReduceOps(ready_ops, &all_reduce_op_handles);
+
+    size_t has_run_ops = ready_ops.size();
+    while (has_run_ops != num_of_ops) {
+      for (auto* op : ready_ops) {
+        for (auto& ready_var : op->Outputs()) {
+          for (auto* pend_op : ready_var->PendingOps()) {
+            auto& deps = --pending_ops[pend_op];
+            if (deps == 0) {
+              next_ready_ops.insert(pend_op);
+            }
+          }
+        }
+      }
+
+      PADDLE_ENFORCE_NE(next_ready_ops.size(), 0, "There maybe have a cycle.");
+      ready_ops.clear();
+      std::swap(ready_ops, next_ready_ops);
+      GetSortedAllReduceOps(ready_ops, &all_reduce_op_handles);
+      has_run_ops += ready_ops.size();
+    }
+    return all_reduce_op_handles;
+  }
+
+  void GetSortedAllReduceOps(
+      const std::unordered_set<OpHandleBase*>& ready_ops,
+      std::vector<AllReduceOpHandle*>* all_reduce_op_handles) const {
+    std::vector<AllReduceOpHandle*> current_all_reduce_op_handles;
+    for (auto& op_handle : ready_ops) {
+      auto all_reduce_op_handle = dynamic_cast<AllReduceOpHandle*>(op_handle);
+      if (all_reduce_op_handle) {
+        current_all_reduce_op_handles.emplace_back(all_reduce_op_handle);
+      }
+    }
+
+    // NOTE(zcd): For distributed training, it is important to keep the order of
+    // allReduce on each node consistent. Otherwise, hang may occur.
+    // Sort the current_all_reduce_op_handles according to the name of input.
+    sort(current_all_reduce_op_handles.begin(),
+         current_all_reduce_op_handles.end(),
+         [](const AllReduceOpHandle* left,
+            const AllReduceOpHandle* right) -> bool {
+           auto left_in_vars = DynamicCast<VarHandle>(left->Inputs());
+           auto right_in_vars = DynamicCast<VarHandle>(right->Inputs());
+           PADDLE_ENFORCE_GT(left_in_vars.size(), 0);
+           PADDLE_ENFORCE_EQ(left_in_vars.size(), right_in_vars.size());
+           return left_in_vars[0]->Name() > right_in_vars[0]->Name();
+         });
+
+    all_reduce_op_handles->insert(all_reduce_op_handles->end(),
+                                  current_all_reduce_op_handles.begin(),
+                                  current_all_reduce_op_handles.end());
+  }
+
+  void DebugString(
+      const ir::Graph& graph,
+      const std::vector<AllReduceOpHandle*>& all_reduce_op_handles) const {
     // get vars order
+    std::map<int, std::vector<std::string>> vars =
+        GetSoredGradientsFromStaleProgram(graph);
+    std::stringstream out;
+    size_t grads_of_stale_program = 0;
+    out << "Get Order From kStaleProgramOpDescs: ";
+    for (auto& var : vars) {
+      out << "Order " << var.first << " [";
+      for (auto& var_name : var.second) {
+        out << var_name << ", ";
+        ++grads_of_stale_program;
+      }
+      out << "], ";
+    }
+    VLOG(10) << out.str();
+
+    std::stringstream out2;
+    out2 << "Get Order From Topological order: ";
+    for (auto& op : all_reduce_op_handles) {
+      bool find_valid_input = false;
+      for (auto& in_var : op->Inputs()) {
+        if (dynamic_cast<VarHandle*>(in_var)) {
+          out2 << in_var->Name() << ", ";
+          find_valid_input = true;
+          break;
+        }
+      }
+      PADDLE_ENFORCE(find_valid_input, "Doesn't find valid input.");
+    }
+    VLOG(10) << out2.str();
+    if (grads_of_stale_program != all_reduce_op_handles.size()) {
+      VLOG(10)
+          << "The gradients number of stale program and graph is not equal.";
+    }
+  }
+
+  std::map<int, std::vector<std::string>> GetSoredGradientsFromStaleProgram(
+      const ir::Graph& graph) const {
+    std::map<int, std::vector<std::string>> vars;
+    auto ops = graph.Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
     int order = 0;
-  std::unordered_map<std::string, int> vars;
-  // TODO(gongwb): use graph topology sort to find the order of operators.
-  // Note that must assert topology sort is stable
-  auto& ops = graph->Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
     for (auto* op_desc : ops) {
       try {
         bool is_bk_op =
@@ -62,76 +178,21 @@ void AllReduceDepsPass::ApplyImpl(ir::Graph* graph) const {
         auto backward_vars =
             boost::get<std::vector<std::string>>(op_desc->GetNullableAttr(
                 OpProtoAndCheckerMaker::OpRoleVarAttrName()));
-      PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
-      auto outputs = op_desc->Outputs();
-      for (auto& o_it : outputs) {
-        for (auto& v : o_it.second) {  // values
-          vars[v] = order;
-          VLOG(10) << "in all_reduce_deps_pass:" << v;
-        }
-      }
-      order++;
-    } catch (boost::bad_get e) {
-    }
-  }
+        if (backward_vars.empty()) continue;
+
+        PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
+        for (size_t i = 1; i < backward_vars.size(); i += 2) {
+          vars[order].emplace_back(backward_vars[i]);
+          VLOG(1) << "get parameter and gradient: " << backward_vars[i - 1]
+                  << ", " << backward_vars[i];
+        }
+        order++;
+      } catch (boost::bad_get e) {
+      }
+    }
+    return vars;
+  }
+};
-
-  std::vector<OpHandleBase*> dist_ops;
-  // get allreduce ops.
-  for (auto& op : graph_ops) {
-    // FIXME(gongwb):add broad cast.
-    if (op->Name() == "all_reduce" || op->Name() == "reduce") {
-      dist_ops.push_back(op);
-    }
-  }
-
-  VLOG(10) << "dist_ops size:" << dist_ops.size()
-           << ", outputs size:" << vars.size() << ", ops size:" << ops.size();
-
-  std::sort(dist_ops.begin(), dist_ops.end(), [&](OpHandleBase* op1,
-                                                  OpHandleBase* op2) {
-    VarHandle* i0 = dynamic_cast<VarHandle*>(GetValidInput(op1));
-    VarHandle* i1 = dynamic_cast<VarHandle*>(GetValidInput(op2));
-    PADDLE_ENFORCE(i0 != nullptr && i1 != nullptr, "%s convert to %s error",
-                   op1->DebugString(), op2->DebugString());
-    auto l_it = vars.find(i0->name());
-    auto r_it = vars.find(i1->name());
-    PADDLE_ENFORCE(l_it != vars.end() && r_it != vars.end(),
-                   "can't find var's name %s and %s in opdesc", i0->name(),
-                   i1->name());
-    if (l_it->second < r_it->second) return true;
-    if (l_it->second == r_it->second) {
-      return i0->name() < i1->name();
-    }
-    return false;
-  });
-
-  // add dependency.
-  auto& sorted_ops = dist_ops;
-  for (size_t i = 1; i < sorted_ops.size(); ++i) {
-    auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());
-    auto* pre_op = sorted_ops[i - 1];
-    auto* op = sorted_ops[i];
-    pre_op->AddOutput(dep_var);
-    op->AddInput(dep_var);
-    graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
-    VLOG(10) << "add all_reduce sequential dependencies between " << pre_op
-             << " and " << op;
-    VLOG(10) << "pre_op:" << pre_op->DebugString()
-             << ", op:" << op->DebugString();
-  }
-}

 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
...
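Note on the hunk above: the heart of the rewritten pass is the loop in ApplyImpl that wires consecutive all-reduce handles together through dummy control-dependency variables, so that they always run in the same name-sorted order on every node. A minimal standalone sketch of that chaining idea, using hypothetical stand-in types rather than Paddle's real OpHandleBase/DummyVarHandle classes:

#include <cstddef>
#include <memory>
#include <string>
#include <vector>

struct DepVar {};  // stand-in for DummyVarHandle: carries no data, only ordering

struct Op {        // stand-in for an op handle with input/output dependency slots
  std::string name;
  std::vector<DepVar*> inputs;
  std::vector<DepVar*> outputs;
};

// Chain ops[0] -> ops[1] -> ... by giving each op a dummy output that the next
// op consumes as an input, which forces the chain to execute sequentially.
void ChainSequentially(const std::vector<Op*>& sorted_ops,
                       std::vector<std::unique_ptr<DepVar>>* owned_deps) {
  for (size_t i = 1; i < sorted_ops.size(); ++i) {
    owned_deps->push_back(std::make_unique<DepVar>());
    DepVar* dep = owned_deps->back().get();
    sorted_ops[i - 1]->outputs.push_back(dep);
    sorted_ops[i]->inputs.push_back(dep);
  }
}

Chaining the sorted handles this way only adds ordering edges; it does not change what each all-reduce computes.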
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace details {
// TODO(gongwb): overlap allreduce with backward computation.
class AllReduceDepsPass : public ir::Pass {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace details
} // namespace framework
} // namespace paddle
@@ -28,7 +28,7 @@
 // asynchronous nccl allreduce or synchronous issue:
 // https://github.com/PaddlePaddle/Paddle/issues/15049
 DEFINE_bool(
-    sync_nccl_allreduce, false,
+    sync_nccl_allreduce, true,
    "If set true, will call `cudaStreamSynchronize(nccl_stream)`"
    "after allreduce, this mode can get better performance in some scenarios.");
@@ -53,6 +53,10 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
       this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
     }
   }
+  // TODO(gongwb) :polish them!
+  if (is_encoded) {
+    VLOG(1) << "Use dgc allreduce mode";
+  }
 }
 #else
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
@@ -86,7 +90,7 @@ void AllReduceOpHandle::RunImplEncoded() {
         paddle::framework::GradOriginalVarName(in_var_handles[i]->name());
     auto encode_var_name = original_name + g_dgc_encoded;
     auto *in_var = local_scope->FindVar(encode_var_name);
-    PADDLE_ENFORCE_NOT_NULL(in_var);
+    PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name);
     auto &in = in_var->Get<LoDTensor>();
     ins.emplace_back(&in);
...
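A brief, hedged aside on the flag change above: sync_nccl_allreduce is a plain gflags boolean, so, assuming the usual gflags conventions used elsewhere in this codebase, other translation units consult it after a DECLARE_bool, and the synchronize call it guards is the one described in the flag's help string. The helper name below is illustrative only, not part of the diff:

#include <cuda_runtime.h>
#include "gflags/gflags.h"

DECLARE_bool(sync_nccl_allreduce);  // defined by the DEFINE_bool shown above

// Illustrative helper: block until the asynchronous NCCL all-reduce issued on
// `stream` has finished, but only when the flag asks for it.
void MaybeSyncAfterAllReduce(cudaStream_t stream) {
  if (FLAGS_sync_nccl_allreduce) {
    cudaStreamSynchronize(stream);
  }
}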
@@ -163,15 +163,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
           "graph_printer", new details::GraphvizSSAGraphPrinter);
     }

-    // Verify that the graph is correct for multi-device executor.
-    AppendPass("multi_devices_check_pass");
-
-    if (VLOG_IS_ON(2)) {
-      AppendPass("all_reduce_deps_pass");
-    }
-
-    if (SeqOnlyAllReduceOps(strategy_)) {
-      VLOG(10) << "Add all_reduce_deps_pass";
+    // experimental shows that the program will be faster if append
+    // all_reduce_deps_pass here.
+    if (!strategy_.enable_parallel_graph_ &&
+        (SeqOnlyAllReduceOps(strategy_) ||
+         strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce)) {
       AppendPass("all_reduce_deps_pass");
     }
@@ -179,6 +175,9 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
       VLOG(10) << "Add modify_op_lock_and_record_event_pass";
       AppendPass("modify_op_lock_and_record_event_pass");
     }
+
+    // Verify that the graph is correct for multi-device executor.
+    AppendPass("multi_devices_check_pass");
   }

   // Convert graph to run on multi-devices.
...
@@ -56,6 +56,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
   fetches.resize(fetch_tensors.size());
   std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
   std::vector<FetchOpHandle *> fetch_ops;
+  std::vector<OpHandleBase *> ready_fetch_ops;
   for (auto &fetch_var_name : fetch_tensors) {
     for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
@@ -70,8 +71,9 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
     auto &var_name = fetch_tensors[i];
     auto fetched_var_it = fetched_vars.find(var_name);
     PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(),
-                   "Cannot find fetched variable.(Perhaps the main_program "
-                   "is not set to ParallelExecutor)");
+                   "Cannot find fetched variable(%s).(Perhaps the main_program "
+                   "is not set to ParallelExecutor)",
+                   var_name);
     auto &vars = fetched_var_it->second;
@@ -88,7 +90,11 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
       op->AddInput(var);
     }
-    (*op_deps)[op] = static_cast<int>(op->NotReadyInputSize());
+    int dep = static_cast<int>(op->NotReadyInputSize());
+    (*op_deps)[op] = dep;
+    if (dep == 0) {
+      ready_fetch_ops.emplace_back(op);
+    }
   }

   size_t num_complete = 0;
@@ -97,7 +103,9 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
   for (auto op : bootstrap_ops_) {
     RunOpAsync(op_deps.get(), op, complete_q);
   }
+  for (auto op : ready_fetch_ops) {
+    RunOpAsync(op_deps.get(), op, complete_q);
+  }
   while (num_complete != op_deps->size()) {
     size_t num_comp = complete_q->Pop();
     if (num_comp == -1UL) {
...
@@ -13,9 +13,9 @@
 // limitations under the License.

 #include "paddle/fluid/framework/details/fetch_op_handle.h"
 #include <string>
 #include <vector>
+#include "paddle/fluid/platform/profiler.h"

 namespace paddle {
 namespace framework {
@@ -44,6 +44,7 @@ void FetchOpHandle::WaitAndMergeCPUTensors() const {
 }

 void FetchOpHandle::RunImpl() {
+  platform::RecordEvent record_event(Name());
   WaitInputVarGenerated(platform::CPUPlace());

   tensors_.resize(inputs_.size());
@@ -62,7 +63,8 @@ void FetchOpHandle::RunImpl() {
     auto &t = var->Get<framework::LoDTensor>();
     if (platform::is_gpu_place(t.place())) {
 #ifdef PADDLE_WITH_CUDA
-      TensorCopySync(t, cpu, &tensors_[i]);
+      TensorCopy(t, cpu, *dev_ctxes_.at(t.place()), &tensors_[i]);
+      dev_ctxes_.at(t.place())->Wait();
 #endif
     } else {
       tensors_[i].ShareDataWith(t);
...
@@ -305,6 +305,12 @@ void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
     VLOG(4) << "Try to inplace " << in_var_name << " with " << out_var_name;

+    if (var_nodes_[in_var_name].back() != in_node) {
+      VLOG(4) << "SKIP since " << in_var_name
+              << " is also used as output by other ops";
+      continue;
+    }
+
     bool can_replace = true;
     if (in_var_name == out_var_name) {
       can_replace = false;
@@ -527,6 +533,9 @@ void GraphView::Build(ir::Graph* g) {
   };
   for (auto& node : g->Nodes()) {
     if (!node->IsOp()) continue;
+    // avoid optimize the variable used in sub-blocks
+    if (OpHasSubBlock(node->Op())) update_skip_set(node);
+
     if (node->Name() == "send") update_skip_set(node);
     if (node->Name() == "recv") update_skip_set(node);
     if (node->Name() == "prefetch") update_skip_set(node);
...
@@ -131,16 +131,7 @@ size_t NodeSize(const VarDesc& node) {
   return type_size * std::abs(size);
 }

-size_t NodeSize(ir::Node* n) {
-  VarDesc* desc = nullptr;
-  // some op do not have block pointer
-  if (n->inputs[0]->Op() != nullptr) {
-    desc = FindVarDescInBlock(n);
-  } else {
-    desc = n->Var();
-  }
-  return NodeSize(*desc);
-}
+size_t NodeSize(ir::Node* n) { return NodeSize(*(n->Var())); }

 std::string DebugStringImpl(VarDesc* var) {
   std::stringstream ss;
@@ -163,24 +154,22 @@ std::string DebugStringImpl(VarDesc* var) {
 }

 std::string DebugString(ir::Node* var) {
-  return DebugStringImpl(FindVarDescInBlock(var));
+  return DebugStringImpl(GetVarDesc(var));
 }

 // NOTE(dzh): based ir node, if a large node has been reused
 // by a small size node, then next time it appear in pool, it will
 // have the small size. Find the original node shap from blockdesc.
-VarDesc* FindVarDescInBlock(ir::Node* n) {
+VarDesc* GetVarDesc(ir::Node* n) {
   PADDLE_ENFORCE(n->IsVar() && !n->IsCtrlVar() && n->inputs.size() == 1);
-  BlockDesc* block = n->inputs[0]->Op()->Block();
-  PADDLE_ENFORCE(block->HasVar(n->Name()),
-                 string::Sprintf("Block do not has var %s", n->Name()));
-  return block->FindVar(n->Name());
+  return n->Var();
 }

 struct NodeComparator {
   bool operator()(ir::Node* lhs, ir::Node* rhs) const {
-    auto* lhs_desc = FindVarDescInBlock(lhs);
-    auto* rhs_desc = FindVarDescInBlock(rhs);
+    if (lhs->Var()->GetType() != rhs->Var()->GetType()) return false;
+    auto* lhs_desc = GetVarDesc(lhs);
+    auto* rhs_desc = GetVarDesc(rhs);
     // match data type
     if (lhs_desc->GetDataType() != rhs_desc->GetDataType()) {
       return false;
@@ -204,7 +193,7 @@ void OrderedSet::Insert(ir::Node* var) {
     return;
   }

-  auto* var_desc = FindVarDescInBlock(var);
+  auto* var_desc = var->Var();
   auto var_shape = var_desc->GetShape();
   int batch_size = static_cast<int>(var_shape[0]);
@@ -212,7 +201,7 @@ void OrderedSet::Insert(ir::Node* var) {
   Iter it = nodes_.begin();
   while (it != nodes_.end()) {
     auto& prev = it->front();
-    auto* cache_desc = FindVarDescInBlock(prev);
+    auto* cache_desc = GetVarDesc(prev);
     int cache_batch_size = cache_desc->GetShape()[0];
     if ((cache_batch_size == -1 && batch_size == -1) ||
         (cache_batch_size != -1 && batch_size != -1)) {
@@ -336,10 +325,16 @@ int MinChunkSize() {
 bool NodeCanReused(const VarDesc& node) {
   auto type = node.GetType();
   // only these types holds bulk of gpu memory
-  if (!(type == proto::VarType::LOD_TENSOR ||
-        type == proto::VarType::LOD_TENSOR_ARRAY)) {
-    return false;
-  }
+  // FIXME(liuwei1031) did not find good ways to test SELECTED_ROWS and
+  // LOD_TENSOR_ARRAY re-use logic,
+  // disable them in version 1.4
+  // if (!(type == proto::VarType::LOD_TENSOR ||
+  //       type == proto::VarType::SELECTED_ROWS ||
+  //       type == proto::VarType::LOD_TENSOR_ARRAY)) {
+  //   return false;
+  // }
+  if (type != proto::VarType::LOD_TENSOR) return false;
   // persistable variable is parameter
   if (node.Persistable()) {
     return false;
...
@@ -20,6 +20,7 @@
 #include <map>
 #include <set>
 #include <string>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
@@ -140,11 +141,7 @@ size_t NodeSize(const VarDesc&);

 std::string DebugString(ir::Node* var);

-// NOTE(dzhwinter)
-// after node reuse, the replaced node shape is
-// different with its VarDesc. So need to find the
-// correct VarDesc in Block.
-VarDesc* FindVarDescInBlock(ir::Node* n);
+VarDesc* GetVarDesc(ir::Node* n);

 static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
   return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
...
@@ -68,7 +68,7 @@ void OpHandleBase::Run(bool use_cuda) {
       if (out_var_handle) {
         PADDLE_ENFORCE(
             platform::is_same_place(place, out_var_handle->place()),
-            "The place of input(%s) is not consistent with the "
+            "The place of output(%s) is not consistent with the "
            "place of current op(%s).",
            out_var_handle->Name(), Name());
        out_var_handle->SetGenerateEvent(events_.at(dev_id));
...
@@ -68,7 +68,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     }
     set.clear();
   };
-  auto run_all_op = [&](OpHandleBase *op) { RunOp(ready_vars, op); };

   // Clean run context
   run_op_futures_.clear();
   exception_holder_.Clear();
@@ -102,7 +102,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
         auto &deps = pending_ops[op];
         --deps;
         if (deps == 0) {
-          run_all_op(op);
+          ready_ops.insert(op);
         }
       }
     }
...
@@ -84,7 +84,8 @@ void BatchMergePass::ApplyImpl(ir::Graph* graph) const {
   // 1. record op nodes of different roles
   for (auto node : nodes) {
-    if (node->IsVar()) continue;
+    if (!node->IsOp()) continue;
+    PADDLE_ENFORCE(node->Op(), "must find opdesc");
     int op_role = boost::get<int>(node->Op()->GetAttr(
         framework::OpProtoAndCheckerMaker::OpRoleAttrName()));
     if ((op_role == static_cast<int>(framework::OpRole::kForward)) ||
...
@@ -19,17 +19,14 @@ limitations under the License. */
 #include <tuple>
 #include <utility>
 #include <vector>

-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/details/all_reduce_deps_pass.h"
 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/reference_count_pass_helper.h"
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/platform/profiler.h"

 #ifdef WITH_GPERFTOOLS
...
@@ -122,14 +122,14 @@ class Autograd {
       std::map<std::string, std::vector<VarBase*>> input_grads =
           ready_op->ApplyGrad();

-      for (auto it : input_grads) {
-        const std::vector<VarBase*>& ingrads = it.second;
+      for (auto it = input_grads.rbegin(); it != input_grads.rend(); ++it) {
+        const std::vector<VarBase*>& ingrads = it->second;
         for (size_t i = 0; i < ingrads.size(); ++i) {
           if (!ingrads[i]) continue;
-          if (ready_op->input_vars_[it.first][i]->IsStopGradient()) {
+          if (ready_op->input_vars_[it->first][i]->IsStopGradient()) {
             continue;
           }
-          OpBase* pre_op = ready_op->pre_ops_[it.first][i];
+          OpBase* pre_op = ready_op->pre_ops_[it->first][i];
           if (!pre_op) continue;
           dep_counts[pre_op] -= 1;
...
-cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc
-elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc DEPS anakin_engine framework_proto scope op_registry)
+cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc DEPS anakin_engine framework_proto scope op_registry)
 cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op SERIAL)
 cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv SERIAL)
...
@@ -34,6 +34,7 @@ ActivationOpConverter::ActivationOpConverter(const std::string &op_type)
 }

 void ActivationOpConverter::operator()(const framework::proto::OpDesc &op,
+                                       const framework::BlockDesc &block_desc,
                                        const framework::Scope &scope,
                                        bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -27,6 +27,7 @@ class ActivationOpConverter : public AnakinOpConverter {
   explicit ActivationOpConverter(const std::string &op_type);

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~ActivationOpConverter() {}
...
@@ -29,6 +29,7 @@ namespace inference {
 namespace anakin {

 void BatchNormOpConverter::operator()(const framework::proto::OpDesc &op,
+                                      const framework::BlockDesc &block_desc,
                                       const framework::Scope &scope,
                                       bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -25,6 +25,7 @@ class BatchNormOpConverter : public AnakinOpConverter {
   BatchNormOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~BatchNormOpConverter() {}
...
@@ -29,6 +29,7 @@ namespace inference {
 namespace anakin {

 void ConcatOpConverter::operator()(const framework::proto::OpDesc &op,
+                                   const framework::BlockDesc &block_desc,
                                    const framework::Scope &scope,
                                    bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -25,6 +25,7 @@ class ConcatOpConverter : public AnakinOpConverter {
   ConcatOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~ConcatOpConverter() {}
...
@@ -28,6 +28,7 @@ namespace inference {
 namespace anakin {

 void Conv2dOpConverter::operator()(const framework::proto::OpDesc &op,
+                                   const framework::BlockDesc &block_desc,
                                    const framework::Scope &scope,
                                    bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -25,6 +25,7 @@ class Conv2dOpConverter : public AnakinOpConverter {
   Conv2dOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~Conv2dOpConverter() {}
...
@@ -28,6 +28,7 @@ namespace inference {
 namespace anakin {

 void Conv2dFusionOpConverter::operator()(const framework::proto::OpDesc &op,
+                                         const framework::BlockDesc &block_desc,
                                          const framework::Scope &scope,
                                          bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -25,6 +25,7 @@ class Conv2dFusionOpConverter : public AnakinOpConverter {
   Conv2dFusionOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~Conv2dFusionOpConverter() {}
...
@@ -27,9 +27,9 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc& op,
-                                            const framework::Scope& scope,
-                                            bool test_mode) {
+void DensityPriorBoxOpConverter::operator()(
+    const framework::proto::OpDesc& op, const framework::BlockDesc& block_desc,
+    const framework::Scope& scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   auto input_name = op_desc.Input("Input").front();
   auto image_name = op_desc.Input("Image").front();
...
@@ -27,6 +27,7 @@ class DensityPriorBoxOpConverter : public AnakinOpConverter {
   DensityPriorBoxOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~DensityPriorBoxOpConverter() {}
...
@@ -26,6 +26,7 @@ namespace inference {
 namespace anakin {

 void DetectionOutOpConverter::operator()(const framework::proto::OpDesc &op,
+                                         const framework::BlockDesc &block_desc,
                                          const framework::Scope &scope,
                                          bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -27,6 +27,7 @@ class DetectionOutOpConverter : public AnakinOpConverter {
   DetectionOutOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~DetectionOutOpConverter() {}
...
@@ -31,6 +31,7 @@ namespace inference {
 namespace anakin {

 void DropoutOpConverter::operator()(const framework::proto::OpDesc &op,
+                                    const framework::BlockDesc &block_desc,
                                     const framework::Scope &scope,
                                     bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -25,6 +25,7 @@ class DropoutOpConverter : public AnakinOpConverter {
   DropoutOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~DropoutOpConverter() {}
...
@@ -30,9 +30,9 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-void ElementwiseAddOpConverter::operator()(const framework::proto::OpDesc &op,
-                                           const framework::Scope &scope,
-                                           bool test_mode) {
+void ElementwiseAddOpConverter::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
   PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);
@@ -50,9 +50,9 @@ void ElementwiseAddOpConverter::operator()(const framework::proto::OpDesc &op,
   engine_->AddOpAttr<PTuple<float>>(op_name, "coeff", coeff);
 }

-void ElementwiseMulOpConverter::operator()(const framework::proto::OpDesc &op,
-                                           const framework::Scope &scope,
-                                           bool test_mode) {
+void ElementwiseMulOpConverter::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
   PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);
...
@@ -25,6 +25,7 @@ class ElementwiseAddOpConverter : public AnakinOpConverter {
   ElementwiseAddOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~ElementwiseAddOpConverter() {}
@@ -37,6 +38,7 @@ class ElementwiseMulOpConverter : public AnakinOpConverter {
   ElementwiseMulOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~ElementwiseMulOpConverter() {}
...
@@ -27,6 +27,7 @@ namespace inference {
 namespace anakin {

 void FcBaseOpConverter::operator()(const framework::proto::OpDesc &op,
+                                   const framework::BlockDesc &block_desc,
                                    const framework::Scope &scope,
                                    bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -25,6 +25,7 @@ class FcBaseOpConverter : public AnakinOpConverter {
   FcBaseOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~FcBaseOpConverter() {}
...
@@ -26,6 +26,7 @@ namespace inference {
 namespace anakin {

 void FlattenOpConverter::operator()(const framework::proto::OpDesc &op,
+                                    const framework::BlockDesc &block_desc,
                                     const framework::Scope &scope,
                                     bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -25,6 +25,7 @@ class FlattenOpConverter : public AnakinOpConverter {
   FlattenOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~FlattenOpConverter() {}
...
@@ -31,6 +31,7 @@ namespace inference {
 namespace anakin {

 void Im2SequenceConverter::operator()(const framework::proto::OpDesc &op,
+                                      const framework::BlockDesc &block_desc,
                                       const framework::Scope &scope,
                                       bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -25,6 +25,7 @@ class Im2SequenceConverter : public AnakinOpConverter {
   Im2SequenceConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~Im2SequenceConverter() {}
...
@@ -40,8 +40,10 @@ class AnakinOpConverter {
   AnakinOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope, bool test_mode) {}
   void ConvertOp(const framework::proto::OpDesc &op,
+                 const framework::BlockDesc &block_desc,
                  const std::unordered_set<std::string> &parameters,
                  const framework::Scope &scope, AnakinNvEngine *engine,
                  bool test_mode = false) {
@@ -58,16 +60,17 @@ class AnakinOpConverter {
     }
     PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", op_type);
     it->SetEngine(engine);
-    (*it)(op, scope, test_mode);
+    (*it)(op, block_desc, scope, test_mode);
   }

-  void ConvertBlock(const framework::proto::BlockDesc &block,
+  void ConvertBlock(framework::BlockDesc *block_desc,
                     const std::unordered_set<std::string> &parameters,
                     const framework::Scope &scope, AnakinNvEngine *engine) {
     std::unique_lock<std::mutex> lock(mutex_);
-    for (auto i = 0; i < block.ops_size(); i++) {
-      auto &op = block.ops(i);
-      ConvertOp(op, parameters, scope, engine);
+    framework::proto::BlockDesc *block = block_desc->Proto();
+    for (auto i = 0; i < block->ops_size(); i++) {
+      auto &op = block->ops(i);
+      ConvertOp(op, *block_desc, parameters, scope, engine);
     }
   }
@@ -77,9 +80,7 @@ class AnakinOpConverter {
                           const std::vector<std::string> &inputs,
                           const std::unordered_set<std::string> &parameters,
                           const std::vector<std::string> &outputs, AnakinNvEngine *engine) {
-    framework::proto::BlockDesc *block_proto = block_desc->Proto();
-    ConvertBlock(*block_proto, parameters, *scope, engine);
+    ConvertBlock(block_desc, parameters, *scope, engine);
     engine->Freeze();
     // if the max_batch size
     int max_batch_size = engine->GetMaxBatchSize();
...
@@ -31,6 +31,7 @@ namespace inference {
 namespace anakin {

 void Pool2dOpConverter::operator()(const framework::proto::OpDesc &op,
+                                   const framework::BlockDesc &block_desc,
                                    const framework::Scope &scope,
                                    bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -25,6 +25,7 @@ class Pool2dOpConverter : public AnakinOpConverter {
   Pool2dOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~Pool2dOpConverter() {}
...
@@ -26,6 +26,7 @@ namespace inference {
 namespace anakin {

 void ReluOpConverter::operator()(const framework::proto::OpDesc &op,
+                                 const framework::BlockDesc &block_desc,
                                  const framework::Scope &scope,
                                  bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -27,6 +27,7 @@ class ReluOpConverter : public AnakinOpConverter {
   ReluOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~ReluOpConverter() {}
...
@@ -26,6 +26,7 @@ namespace inference {
 namespace anakin {

 void ReshapeOpConverter::operator()(const framework::proto::OpDesc &op,
+                                    const framework::BlockDesc &block_desc,
                                     const framework::Scope &scope,
                                     bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -25,6 +25,7 @@ class ReshapeOpConverter : public AnakinOpConverter {
   ReshapeOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~ReshapeOpConverter() {}
...
@@ -26,6 +26,7 @@ namespace inference {
 namespace anakin {

 void ScaleOpConverter::operator()(const framework::proto::OpDesc &op,
+                                  const framework::BlockDesc &block_desc,
                                   const framework::Scope &scope,
                                   bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -27,6 +27,7 @@ class ScaleOpConverter : public AnakinOpConverter {
   ScaleOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~ScaleOpConverter() {}
...
@@ -24,6 +24,7 @@ namespace inference {
 namespace anakin {

 void SoftMaxOpConverter::operator()(const framework::proto::OpDesc &op,
+                                    const framework::BlockDesc &block_desc,
                                     const framework::Scope &scope,
                                     bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
@@ -32,8 +33,16 @@ void SoftMaxOpConverter::operator()(const framework::proto::OpDesc &op,
   auto input = op_desc.Input("X").front();
   auto output = op_desc.Output("Out").front();
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+
+  auto input_var_desc = block_desc.FindVar(input);
+  PADDLE_ENFORCE(input_var_desc,
+                 "Cant find %s variable When runing Anakin Softmax converter.",
+                 input);
+  auto input_shape_in_fluid = input_var_desc->GetShape();
+  size_t input_dims = input_shape_in_fluid.size();
+
   engine_->AddOp(op_name, "Softmax", {input}, {output});
-  engine_->AddOpAttr(op_name, "axis", 2);
+  engine_->AddOpAttr(op_name, "axis", static_cast<int>(input_dims - 1));
 }

 }  // namespace anakin
...
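The softmax change above replaces the hard-coded Anakin attribute axis = 2 with the last dimension of the input shape recorded in the BlockDesc. A small worked example with a hypothetical 3-D input shape (not taken from the diff):

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Suppose the BlockDesc records the softmax input as [batch, seq_len, classes].
  std::vector<int64_t> input_shape_in_fluid = {8, 32, 1000};
  size_t input_dims = input_shape_in_fluid.size();  // 3
  int axis = static_cast<int>(input_dims - 1);      // softmax over the last dim
  assert(axis == 2);
  return 0;
}

For a rank-2 input such as [batch, classes] the computed axis is 1, whereas the old code would still have emitted 2.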
@@ -25,6 +25,7 @@ class SoftMaxOpConverter : public AnakinOpConverter {
   SoftMaxOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~SoftMaxOpConverter() {}
...
@@ -30,6 +30,7 @@ namespace inference {
 namespace anakin {

 void SplitOpConverter::operator()(const framework::proto::OpDesc &op,
+                                  const framework::BlockDesc &block_desc,
                                   const framework::Scope &scope,
                                   bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -25,6 +25,7 @@ class SplitOpConverter : public AnakinOpConverter {
   SplitOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~SplitOpConverter() {}
...
@@ -31,6 +31,7 @@ namespace inference {
 namespace anakin {

 void SumOpConverter::operator()(const framework::proto::OpDesc &op,
+                                const framework::BlockDesc &block_desc,
                                 const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 2);
...
@@ -25,6 +25,7 @@ class SumOpConverter : public AnakinOpConverter {
   SumOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~SumOpConverter() {}
...
@@ -28,6 +28,7 @@ namespace inference {
 namespace anakin {

 void TransposeOpConverter::operator()(const framework::proto::OpDesc &op,
+                                      const framework::BlockDesc &block_desc,
                                       const framework::Scope &scope,
                                       bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -25,6 +25,7 @@ class TransposeOpConverter : public AnakinOpConverter {
   TransposeOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~TransposeOpConverter() {}
...
...@@ -22,6 +22,7 @@ limitations under the License. */ ...@@ -22,6 +22,7 @@ limitations under the License. */
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
...@@ -112,6 +113,17 @@ class AnakinConvertValidation { ...@@ -112,6 +113,17 @@ class AnakinConvertValidation {
auto* x_tensor = x->GetMutable<framework::LoDTensor>(); auto* x_tensor = x->GetMutable<framework::LoDTensor>();
x_tensor->Resize(framework::make_ddim(dim_vec)); x_tensor->Resize(framework::make_ddim(dim_vec));
RandomizeTensor(x_tensor, place_, ctx); RandomizeTensor(x_tensor, place_, ctx);
std::vector<int64_t> dim_vec_int64;
for (auto& ele : dim_vec) {
dim_vec_int64.push_back(static_cast<int64_t>(ele));
}
// Add var_desc to block_desc
auto* block_desc = program_desc_.MutableBlock(framework::kRootBlockIndex);
auto* var_desc = block_desc->Var(name);
var_desc->SetShape(dim_vec_int64);
} }
void SetOp(const framework::proto::OpDesc& desc) { void SetOp(const framework::proto::OpDesc& desc) {
...@@ -119,8 +131,10 @@ class AnakinConvertValidation { ...@@ -119,8 +131,10 @@ class AnakinConvertValidation {
op_desc_.reset(new framework::OpDesc(desc, nullptr)); op_desc_.reset(new framework::OpDesc(desc, nullptr));
// should init anakin engine here. // should init anakin engine here.
auto& block_desc = program_desc_.Block(framework::kRootBlockIndex);
Singleton<AnakinOpConverter>::Global().ConvertOp( Singleton<AnakinOpConverter>::Global().ConvertOp(
desc, parameters_, *scope_, engine_.get(), true /*test_mode*/); desc, block_desc, parameters_, *scope_, engine_.get(),
true /*test_mode*/);
engine_->Freeze(); engine_->Freeze();
std::map<std::string, std::vector<int>> temp_max_input_shape; std::map<std::string, std::vector<int>> temp_max_input_shape;
...@@ -194,6 +208,7 @@ class AnakinConvertValidation { ...@@ -194,6 +208,7 @@ class AnakinConvertValidation {
cudaStream_t stream_; cudaStream_t stream_;
std::unique_ptr<framework::OperatorBase> op_; std::unique_ptr<framework::OperatorBase> op_;
std::unique_ptr<framework::OpDesc> op_desc_; std::unique_ptr<framework::OpDesc> op_desc_;
framework::ProgramDesc program_desc_;
const std::unordered_set<std::string>& parameters_; const std::unordered_set<std::string>& parameters_;
framework::Scope* scope_; framework::Scope* scope_;
platform::CUDAPlace place_; platform::CUDAPlace place_;
......
...@@ -91,7 +91,6 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Execute( ...@@ -91,7 +91,6 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
" or equal to the real input shape, Please set the max " " or equal to the real input shape, Please set the max "
"input shape using EnableAnakinEngine"); "input shape using EnableAnakinEngine");
anakin_input->reshape(fluid_input_shape); anakin_input->reshape(fluid_input_shape);
::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0, ::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0,
fluid_input_shape); fluid_input_shape);
anakin_input->copy_from(tmp_anakin_tensor); anakin_input->copy_from(tmp_anakin_tensor);
......
...@@ -168,6 +168,7 @@ struct Argument { ...@@ -168,6 +168,7 @@ struct Argument {
DECL_ARGUMENT_FIELD(anakin_max_input_shape, AnakinMaxInputShape, DECL_ARGUMENT_FIELD(anakin_max_input_shape, AnakinMaxInputShape,
anakin_max_shape_t); anakin_max_shape_t);
DECL_ARGUMENT_FIELD(anakin_max_batch_size, AnakinMaxBatchSize, int); DECL_ARGUMENT_FIELD(anakin_max_batch_size, AnakinMaxBatchSize, int);
DECL_ARGUMENT_FIELD(anakin_min_subgraph_size, AnakinMinSubgraphSize, int);
DECL_ARGUMENT_FIELD(use_anakin, UseAnakin, bool); DECL_ARGUMENT_FIELD(use_anakin, UseAnakin, bool);
// Memory optimized related. // Memory optimized related.
......
...@@ -151,13 +151,20 @@ void AnakinSubgraphPass::CreateAnakinOp( ...@@ -151,13 +151,20 @@ void AnakinSubgraphPass::CreateAnakinOp(
op_desc->SetType("anakin_engine"); op_desc->SetType("anakin_engine");
std::unordered_map<std::string, std::string> output_name_map; std::unordered_map<std::string, std::string> output_name_map;
std::unordered_map<std::string, framework::ir::Node *> graph_var_map;
for (framework::ir::Node *node : graph->Nodes()) {
if (node->IsVar() && node->Var()) {
graph_var_map[node->Name()] = node;
}
}
auto &subgraph_nodes = *Agent(node).subgraph(); auto &subgraph_nodes = *Agent(node).subgraph();
// The following procedure is used to rename all the intermediate // The following procedure is used to rename all the intermediate
// variables and the output variables of the subgraph. // variables and the output variables of the subgraph.
RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id, RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id,
&output_names_with_id, &output_names, &output_name_map, &output_names_with_id, &output_names, &output_name_map,
false); graph_var_map, false);
// When anakin engine runs at the end of the operation, // When anakin engine runs at the end of the operation,
// output_mapping help us copy the data from the renamed ITensor // output_mapping help us copy the data from the renamed ITensor
...@@ -168,13 +175,6 @@ void AnakinSubgraphPass::CreateAnakinOp( ...@@ -168,13 +175,6 @@ void AnakinSubgraphPass::CreateAnakinOp(
output_mapping.push_back(output_name_map[name]); output_mapping.push_back(output_name_map[name]);
} }
auto *vars = block_desc.Proto()->mutable_vars();
for (framework::ir::Node *node : graph->Nodes()) {
if (node->IsVar() && node->Var()) {
*vars->Add() = *node->Var()->Proto();
}
}
PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
"the block has no var-desc"); "the block has no var-desc");
PADDLE_ENFORCE(!output_mapping.empty()); PADDLE_ENFORCE(!output_mapping.empty());
......
...@@ -60,6 +60,7 @@ void RenameAndGetOutputs( ...@@ -60,6 +60,7 @@ void RenameAndGetOutputs(
std::set<std::string> *output_names_with_id, std::set<std::string> *output_names_with_id,
std::set<std::string> *output_names, std::set<std::string> *output_names,
std::unordered_map<std::string, std::string> *output_name_map, std::unordered_map<std::string, std::string> *output_name_map,
const std::unordered_map<std::string, framework::ir::Node *> &graph_var_map,
bool is_trt) { bool is_trt) {
//// In the normal case, the paddle-trt exists bug when runing the googlenet. //// In the normal case, the paddle-trt exists bug when runing the googlenet.
// When there are more than two convolutions of 1 * 1 with the same input, the // When there are more than two convolutions of 1 * 1 with the same input, the
...@@ -69,6 +70,15 @@ void RenameAndGetOutputs( ...@@ -69,6 +70,15 @@ void RenameAndGetOutputs(
std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/> std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
same_hierarchy_conv2d_num_map; same_hierarchy_conv2d_num_map;
auto add_block_var = [&](const std::string &graph_arg,
const std::string &block_arg) {
auto arg_var_node = graph_var_map.find(graph_arg);
PADDLE_ENFORCE(arg_var_node != graph_var_map.end());
auto *var_t = block_desc->Var(block_arg);
var_t->SetShape(arg_var_node->second->Var()->GetShape());
var_t->SetDataType(arg_var_node->second->Var()->GetDataType());
};
for (size_t index = 0; index < block_desc->OpSize(); ++index) { for (size_t index = 0; index < block_desc->OpSize(); ++index) {
framework::proto::OpDesc *op = block_desc->Op(index)->Proto(); framework::proto::OpDesc *op = block_desc->Op(index)->Proto();
framework::OpDesc op_desc(*op, nullptr); framework::OpDesc op_desc(*op, nullptr);
...@@ -87,13 +97,20 @@ void RenameAndGetOutputs( ...@@ -87,13 +97,20 @@ void RenameAndGetOutputs(
auto *in_var = op->mutable_inputs(i); auto *in_var = op->mutable_inputs(i);
std::vector<std::string> replaced_names; std::vector<std::string> replaced_names;
for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments
std::string arg_value = in_var->arguments(k); const std::string arg_value = in_var->arguments(k);
std::string arg_value_with_id = const std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]); arg_value + std::to_string(var2id[arg_value]);
if (input_names_with_id.count(arg_value_with_id)) { if (input_names_with_id.count(arg_value_with_id)) {
replaced_names.push_back(arg_value); replaced_names.push_back(arg_value);
if (graph_var_map.count(arg_value)) {
add_block_var(arg_value, arg_value);
}
} else { } else {
replaced_names.push_back(arg_value_with_id); replaced_names.push_back(arg_value_with_id);
if (graph_var_map.count(arg_value)) {
add_block_var(arg_value, arg_value_with_id);
}
} }
} }
in_var->clear_arguments(); in_var->clear_arguments();
...@@ -105,7 +122,6 @@ void RenameAndGetOutputs( ...@@ -105,7 +122,6 @@ void RenameAndGetOutputs(
for (auto out_var : correspond_node->outputs) { for (auto out_var : correspond_node->outputs) {
var2id[out_var->Name()] = out_var->id(); var2id[out_var->Name()] = out_var->id();
} }
if (op_desc.Type() == "conv2d" && is_trt) { if (op_desc.Type() == "conv2d" && is_trt) {
auto input_var_name = op_desc.Input("Input").front(); auto input_var_name = op_desc.Input("Input").front();
auto filter_var_name = op_desc.Input("Filter").front(); auto filter_var_name = op_desc.Input("Filter").front();
...@@ -125,15 +141,18 @@ void RenameAndGetOutputs( ...@@ -125,15 +141,18 @@ void RenameAndGetOutputs(
same_hierarchy_conv2d_num_map[input_var_name] += 1; same_hierarchy_conv2d_num_map[input_var_name] += 1;
} }
} }
// rename for the output variables of op inside subgraph // rename for the output variables of op inside subgraph
for (int i = 0; i < op->outputs_size(); i++) { for (int i = 0; i < op->outputs_size(); i++) {
framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i); framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
std::vector<std::string> replaced_names; std::vector<std::string> replaced_names;
for (int k = 0; k < out_var->arguments_size(); k++) { for (int k = 0; k < out_var->arguments_size(); k++) {
std::string arg_value = out_var->arguments(k); const std::string arg_value = out_var->arguments(k);
std::string arg_value_with_id = const std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]); arg_value + std::to_string(var2id[arg_value]);
if (graph_var_map.count(arg_value)) {
add_block_var(arg_value, arg_value_with_id);
}
if (output_names_with_id->count(arg_value_with_id)) { if (output_names_with_id->count(arg_value_with_id)) {
(*output_name_map)[arg_value] = arg_value_with_id; (*output_name_map)[arg_value] = arg_value_with_id;
} }
......
...@@ -42,6 +42,7 @@ void RenameAndGetOutputs( ...@@ -42,6 +42,7 @@ void RenameAndGetOutputs(
std::set<std::string> *output_names_with_id, std::set<std::string> *output_names_with_id,
std::set<std::string> *output_names, std::set<std::string> *output_names,
std::unordered_map<std::string, std::string> *output_name_map, std::unordered_map<std::string, std::string> *output_name_map,
const std::unordered_map<std::string, framework::ir::Node *> &graph_var_map,
bool is_trt = true); bool is_trt = true);
} // namespace analysis } // namespace analysis
......
...@@ -142,6 +142,13 @@ void TensorRtSubgraphPass::CreateTensorRTOp( ...@@ -142,6 +142,13 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
} }
std::unordered_map<std::string, std::string> output_name_map; std::unordered_map<std::string, std::string> output_name_map;
std::unordered_map<std::string, framework::ir::Node *> graph_var_map;
for (framework::ir::Node *node : graph->Nodes()) {
if (node->IsVar() && node->Var()) {
graph_var_map[node->Name()] = node;
}
}
auto &subgraph_nodes = *Agent(node).subgraph(); auto &subgraph_nodes = *Agent(node).subgraph();
// The following procedure is used to rename all the intermediate // The following procedure is used to rename all the intermediate
...@@ -157,7 +164,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp( ...@@ -157,7 +164,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
// So we have to rename the variable in the subgraph to make sure // So we have to rename the variable in the subgraph to make sure
// it is either an OP's input or an OP's output. // it is either an OP's input or an OP's output.
RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id, RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id,
&output_names_with_id, &output_names, &output_name_map); &output_names_with_id, &output_names, &output_name_map,
graph_var_map);
// When tensorrt engine runs at the end of the operation, // When tensorrt engine runs at the end of the operation,
// output_mapping help us copy the data from the renamed ITensor // output_mapping help us copy the data from the renamed ITensor
...@@ -168,14 +176,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp( ...@@ -168,14 +176,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
output_mapping.push_back(output_name_map[name]); output_mapping.push_back(output_name_map[name]);
} }
PADDLE_ENFORCE(!output_mapping.empty()); PADDLE_ENFORCE(!output_mapping.empty());
auto *vars = block_desc.Proto()->mutable_vars();
for (framework::ir::Node *node : graph->Nodes()) {
if (node->IsVar() && node->Var()) {
*vars->Add() = *node->Var()->Proto();
}
}
PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
"the block has no var-desc"); "the block has no var-desc");
...@@ -213,7 +213,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp( ...@@ -213,7 +213,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
SetAttr(op_desc->Proto(), "enable_int8", enable_int8); SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
SetAttr(op_desc->Proto(), "engine_key", engine_key); SetAttr(op_desc->Proto(), "engine_key", engine_key);
std::string trt_engine_serialized_data = ""; std::string trt_engine_serialized_data = "";
SetAttr(op_desc->Proto(), "engine_serialized_data", SetAttr(op_desc->Proto(), "engine_serialized_data",
trt_engine_serialized_data); trt_engine_serialized_data);
......
...@@ -115,6 +115,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { ...@@ -115,6 +115,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(use_anakin_); CP_MEMBER(use_anakin_);
CP_MEMBER(anakin_max_batchsize_); CP_MEMBER(anakin_max_batchsize_);
CP_MEMBER(anakin_max_input_shape_); CP_MEMBER(anakin_max_input_shape_);
CP_MEMBER(anakin_min_subgraph_size_);
// Ir related. // Ir related.
CP_MEMBER(enable_ir_optim_); CP_MEMBER(enable_ir_optim_);
...@@ -315,6 +316,7 @@ std::string AnalysisConfig::SerializeInfoCache() { ...@@ -315,6 +316,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << specify_input_name_; ss << specify_input_name_;
ss << cpu_math_library_num_threads_; ss << cpu_math_library_num_threads_;
ss << use_anakin_; ss << use_anakin_;
ss << anakin_min_subgraph_size_;
return ss.str(); return ss.str();
} }
...@@ -386,10 +388,11 @@ void AnalysisConfig::SwitchIrDebug(int x) { ...@@ -386,10 +388,11 @@ void AnalysisConfig::SwitchIrDebug(int x) {
Update(); Update();
} }
void AnalysisConfig::EnableAnakinEngine( void AnalysisConfig::EnableAnakinEngine(
int max_batch_size, int max_batch_size, std::map<std::string, std::vector<int>> max_input_shape,
std::map<std::string, std::vector<int>> max_input_shape) { int min_subgraph_size) {
anakin_max_batchsize_ = max_batch_size; anakin_max_batchsize_ = max_batch_size;
anakin_max_input_shape_ = max_input_shape; anakin_max_input_shape_ = max_input_shape;
anakin_min_subgraph_size_ = min_subgraph_size;
use_anakin_ = true; use_anakin_ = true;
Update(); Update();
} }
......
...@@ -385,6 +385,7 @@ void AnalysisPredictor::PrepareArgument() { ...@@ -385,6 +385,7 @@ void AnalysisPredictor::PrepareArgument() {
if (config_.use_gpu() && config_.anakin_engine_enabled()) { if (config_.use_gpu() && config_.anakin_engine_enabled()) {
argument_.SetAnakinMaxBatchSize(config_.anakin_max_batchsize_); argument_.SetAnakinMaxBatchSize(config_.anakin_max_batchsize_);
argument_.SetAnakinMaxInputShape(config_.anakin_max_input_shape_); argument_.SetAnakinMaxInputShape(config_.anakin_max_input_shape_);
argument_.SetAnakinMinSubgraphSize(config_.anakin_min_subgraph_size_);
LOG(INFO) << "Anakin subgraph engine is enabled"; LOG(INFO) << "Anakin subgraph engine is enabled";
} }
......
...@@ -151,7 +151,8 @@ struct AnalysisConfig { ...@@ -151,7 +151,8 @@ struct AnalysisConfig {
*/ */
void EnableAnakinEngine( void EnableAnakinEngine(
int max_batch_size = 1, int max_batch_size = 1,
std::map<std::string, std::vector<int>> max_input_shape = {}); std::map<std::string, std::vector<int>> max_input_shape = {},
int min_subgraph_size = 6);
/** A boolean state indicating whether the Anakin sub-graph engine is used. /** A boolean state indicating whether the Anakin sub-graph engine is used.
*/ */
...@@ -288,6 +289,7 @@ struct AnalysisConfig { ...@@ -288,6 +289,7 @@ struct AnalysisConfig {
bool use_anakin_{false}; bool use_anakin_{false};
int anakin_max_batchsize_; int anakin_max_batchsize_;
int anakin_min_subgraph_size_{6};
std::map<std::string, std::vector<int>> anakin_max_input_shape_; std::map<std::string, std::vector<int>> anakin_max_input_shape_;
std::map<std::string, std::string> engine_opt_info_; std::map<std::string, std::string> engine_opt_info_;
......
...@@ -87,6 +87,7 @@ const std::vector<std::string> kAnakinSubgraphPasses({ ...@@ -87,6 +87,7 @@ const std::vector<std::string> kAnakinSubgraphPasses({
GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
passes_.assign({ passes_.assign({
"infer_clean_graph_pass", // "infer_clean_graph_pass", //
"runtime_context_cache_pass", //
// "identity_scale_op_clean_pass", // // "identity_scale_op_clean_pass", //
"conv_affine_channel_fuse_pass", // "conv_affine_channel_fuse_pass", //
"conv_eltwiseadd_affine_channel_fuse_pass", // "conv_eltwiseadd_affine_channel_fuse_pass", //
...@@ -96,7 +97,6 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { ...@@ -96,7 +97,6 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
"conv_elementwise_add_act_fuse_pass", // "conv_elementwise_add_act_fuse_pass", //
"conv_elementwise_add2_act_fuse_pass", // "conv_elementwise_add2_act_fuse_pass", //
"conv_elementwise_add_fuse_pass", // "conv_elementwise_add_fuse_pass", //
"runtime_context_cache_pass", //
#endif // #endif //
"transpose_flatten_concat_fuse_pass", "transpose_flatten_concat_fuse_pass",
}); });
...@@ -117,6 +117,10 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { ...@@ -117,6 +117,10 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
// not be damaged by smaller ones. // not be damaged by smaller ones.
passes_.assign({ passes_.assign({
"infer_clean_graph_pass", // "infer_clean_graph_pass", //
// TODO(luotao): runtime_context_cache_pass should be located in the
// front, see https://github.com/PaddlePaddle/Paddle/issues/16609,
// will enhance this pass later.
"runtime_context_cache_pass", //
"attention_lstm_fuse_pass", // "attention_lstm_fuse_pass", //
"seqpool_concat_fuse_pass", // "seqpool_concat_fuse_pass", //
"seqconv_eltadd_relu_fuse_pass", // "seqconv_eltadd_relu_fuse_pass", //
...@@ -132,8 +136,6 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { ...@@ -132,8 +136,6 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
"conv_bn_fuse_pass", // "conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", // "conv_eltwiseadd_bn_fuse_pass", //
"is_test_pass", // "is_test_pass", //
"identity_scale_op_clean_pass", //
"runtime_context_cache_pass", //
}); });
use_gpu_ = false; use_gpu_ = false;
} }
......
...@@ -23,18 +23,11 @@ namespace analysis { ...@@ -23,18 +23,11 @@ namespace analysis {
void SetConfig(AnalysisConfig *cfg) { void SetConfig(AnalysisConfig *cfg) {
cfg->SetModel(FLAGS_infer_model); cfg->SetModel(FLAGS_infer_model);
cfg->SetProgFile("__model__");
cfg->DisableGpu(); cfg->DisableGpu();
cfg->SwitchIrOptim(); cfg->SwitchIrOptim();
cfg->SwitchSpecifyInputNames(false); cfg->SwitchSpecifyInputNames();
cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
cfg->EnableMKLDNN(); cfg->EnableMKLDNN();
cfg->pass_builder()->SetPasses(
{"infer_clean_graph_pass", "mkldnn_placement_pass",
"depthwise_conv_mkldnn_pass", "conv_bn_fuse_pass",
"conv_eltwiseadd_bn_fuse_pass", "conv_bias_mkldnn_fuse_pass",
"conv_elementwise_add_mkldnn_fuse_pass", "conv_relu_mkldnn_fuse_pass",
"fc_fuse_pass", "is_test_pass"});
} }
template <typename T> template <typename T>
...@@ -84,13 +77,13 @@ std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData( ...@@ -84,13 +77,13 @@ std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData(
std::to_string(num_images) + " is bigger than all test data size."); std::to_string(num_images) + " is bigger than all test data size.");
PaddleTensor images; PaddleTensor images;
images.name = "input"; images.name = "image";
images.shape = {num_images, 3, 224, 224}; images.shape = {num_images, 3, 224, 224};
images.dtype = PaddleDType::FLOAT32; images.dtype = PaddleDType::FLOAT32;
images.data.Resize(sizeof(float) * num_images * 3 * 224 * 224); images.data.Resize(sizeof(float) * num_images * 3 * 224 * 224);
PaddleTensor labels; PaddleTensor labels;
labels.name = "labels"; labels.name = "label";
labels.shape = {num_images, 1}; labels.shape = {num_images, 1};
labels.dtype = PaddleDType::INT64; labels.dtype = PaddleDType::INT64;
labels.data.Resize(sizeof(int64_t) * num_images); labels.data.Resize(sizeof(int64_t) * num_images);
...@@ -132,7 +125,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs, ...@@ -132,7 +125,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs,
images_offset_in_file + sizeof(float) * total_images * 3 * 224 * 224; images_offset_in_file + sizeof(float) * total_images * 3 * 224 * 224;
TensorReader<float> image_reader(file, images_offset_in_file, TensorReader<float> image_reader(file, images_offset_in_file,
image_batch_shape, "input"); image_batch_shape, "image");
TensorReader<int64_t> label_reader(file, labels_offset_in_file, TensorReader<int64_t> label_reader(file, labels_offset_in_file,
label_batch_shape, "label"); label_batch_shape, "label");
......
# INT8 MKL-DNN quantization
This document describes how to use the Paddle inference engine to convert an FP32 model to an INT8 model, taking ResNet-50 and MobileNet-V1 as examples. We provide instructions for enabling INT8 MKL-DNN quantization in Paddle inference and report the resulting accuracy and performance of ResNet-50 and MobileNet-V1.
## 0. Install PaddlePaddle
Follow the PaddlePaddle [installation instructions](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification#installation) to install PaddlePaddle. If you build PaddlePaddle yourself, please use the following CMake arguments.
```bash
cmake .. -DWITH_TESTING=ON -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_MKL=ON -DWITH_SWIG_PY=OFF -DWITH_INFERENCE_API_TEST=ON -DON_INFER=ON
```
Note: MKL-DNN and MKL are required.
## 1. Enable INT8 MKL-DNN quantization
For reference, please examine the code of the unit test in [analyzer_int8_image_classification_tester.cc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc).
* ### Create Analysis config
INT8 quantization is one of the optimizations available in the analysis config. More information about the analysis config can be found [here](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/advanced_usage/deploy/inference/native_infer_en.md#upgrade-performance-based-on-contribanalysisconfig-prerelease).
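For orientation, a minimal sketch of such a config follows. It mirrors the `SetConfig` helper in the unit test; `FLAGS_infer_model` and `FLAGS_paddle_num_threads` are the test's gflags and stand in for your own model path and thread count.
```cpp
// Minimal CPU analysis config for INT8 MKL-DNN quantization (sketch only,
// adapted from the SetConfig helper in the unit test).
AnalysisConfig cfg;
cfg.SetModel(FLAGS_infer_model);    // directory containing the inference model
cfg.DisableGpu();                   // INT8 MKL-DNN quantization runs on CPU
cfg.SwitchIrOptim();                // enable IR optimization passes
cfg.SwitchSpecifyInputNames();
cfg.SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
cfg.EnableMKLDNN();                 // MKL-DNN kernels are required for INT8
```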
* ### Create quantize config by analysis config
We enable the MKL-DNN quantization procedure by calling the appropriate method on the analysis config. Afterwards, all the required quantization parameters (quantization op names, quantization strategies, etc.) can be set through the quantizer config held by the analysis config. It is also necessary to specify a pre-processed warmup dataset and the desired batch size.
```cpp
// Enable MKL-DNN quantization
cfg.EnableMkldnnQuantizer();
// Use the analysis config to access the MKL-DNN quantizer config
cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data);
cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(100);
```
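The warmup data passed to `SetWarmupData` is a `std::shared_ptr<std::vector<PaddleTensor>>`; in the unit test it is produced by the `GetWarmupData` helper from the pre-processed validation binary. Purely as a hand-written sketch (assuming the model's inputs are named `image` and `label`, and that `PaddleBuf::data()` exposes the buffer to be filled):
```cpp
// Hypothetical warmup-data assembly; the unit test uses GetWarmupData() on the
// pre-processed ImageNet binary instead of filling tensors by hand.
int num_images = 100;  // warmup batch size

PaddleTensor images;
images.name = "image";
images.shape = {num_images, 3, 224, 224};
images.dtype = PaddleDType::FLOAT32;
images.data.Resize(sizeof(float) * num_images * 3 * 224 * 224);
// ... copy pre-processed pixel values into images.data.data() ...

PaddleTensor labels;
labels.name = "label";
labels.shape = {num_images, 1};
labels.dtype = PaddleDType::INT64;
labels.data.Resize(sizeof(int64_t) * num_images);
// ... copy ground-truth labels into labels.data.data() ...

auto warmup_data = std::make_shared<std::vector<PaddleTensor>>();
warmup_data->push_back(images);
warmup_data->push_back(labels);
cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data);
cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(num_images);
```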
## 2. Accuracy and Performance benchmark
We provide accuracy and performance results measured on Intel(R) Xeon(R) Gold 6271 on a single core.
>**I. Top-1 Accuracy on Intel(R) Xeon(R) Gold 6271**
| Model | Dataset | FP32 Accuracy | INT8 Accuracy | Accuracy Diff |
| :------------: | :------------: | :------------: | :------------: | :------------: |
| ResNet-50 | Full ImageNet Val | 76.63% | 76.48% | 0.15% |
| MobileNet-V1 | Full ImageNet Val | 70.78% | 70.36% | 0.42% |
>**II. Throughput on Intel(R) Xeon(R) Gold 6271 (batch size 1 on single core)**
| Model | Dataset | FP32 Throughput | INT8 Throughput | Ratio(INT8/FP32) |
| :------------: | :------------: | :------------: | :------------: | :------------: |
| ResNet-50 | Full ImageNet Val | 13.17 images/s | 49.84 images/s | 3.78 |
| MobileNet-V1 | Full ImageNet Val | 75.49 images/s | 232.38 images/s | 3.07 |
Notes:
* Measurement of accuracy requires a model which accepts two inputs: data and labels.
* Different sampling of the batch data may cause slight differences in INT8 top accuracy.
* C-API performance is better than Python API performance because of the Python overhead; this is especially noticeable for small, computationally light models.
## 3. Commands to reproduce the above accuracy and performance benchmark
* #### Full dataset (Single core)
* ##### Download full ImageNet Validation Dataset
```bash
cd /PATH/TO/PADDLE/build
python ../paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py
```
The converted data binary file is saved by default in ~/.cache/paddle/dataset/int8/download/int8_full_val.bin.
* ##### ResNet-50 full dataset benchmark
```bash
./paddle/fluid/inference/tests/api/test_analyzer_int8_resnet50 --infer_model=third_party/inference_demo/int8v2/resnet50/model --infer_data=/path/to/converted/int8_full_val.bin --batch_size=1 --paddle_num_threads=1
```
* ##### MobileNet-V1 full dataset benchmark
```bash
./paddle/fluid/inference/tests/api/test_analyzer_int8_mobilenet --infer_model=third_party/inference_demo/int8v2/mobilenet/model --infer_data=/path/to/converted/int8_full_val.bin --batch_size=1 --paddle_num_threads=1
```
...@@ -316,7 +316,8 @@ void PredictionRun(PaddlePredictor *predictor, ...@@ -316,7 +316,8 @@ void PredictionRun(PaddlePredictor *predictor,
int num_threads, int tid) { int num_threads, int tid) {
int num_times = FLAGS_repeat; int num_times = FLAGS_repeat;
int iterations = inputs.size(); // process the whole dataset ... int iterations = inputs.size(); // process the whole dataset ...
if (FLAGS_iterations > 0 && FLAGS_iterations < inputs.size()) if (FLAGS_iterations > 0 &&
FLAGS_iterations < static_cast<int64_t>(inputs.size()))
iterations = iterations =
FLAGS_iterations; // ... unless the number of iterations is set FLAGS_iterations; // ... unless the number of iterations is set
outputs->resize(iterations); outputs->resize(iterations);
...@@ -329,14 +330,14 @@ void PredictionRun(PaddlePredictor *predictor, ...@@ -329,14 +330,14 @@ void PredictionRun(PaddlePredictor *predictor,
#endif #endif
if (!FLAGS_zero_copy) { if (!FLAGS_zero_copy) {
run_timer.tic(); run_timer.tic();
for (size_t i = 0; i < iterations; i++) { for (int i = 0; i < iterations; i++) {
for (int j = 0; j < num_times; j++) { for (int j = 0; j < num_times; j++) {
predictor->Run(inputs[i], &(*outputs)[i], FLAGS_batch_size); predictor->Run(inputs[i], &(*outputs)[i], FLAGS_batch_size);
} }
} }
elapsed_time = run_timer.toc(); elapsed_time = run_timer.toc();
} else { } else {
for (size_t i = 0; i < iterations; i++) { for (int i = 0; i < iterations; i++) {
ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[i]); ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[i]);
run_timer.tic(); run_timer.tic();
for (int j = 0; j < num_times; j++) { for (int j = 0; j < num_times; j++) {
...@@ -366,9 +367,8 @@ void TestOneThreadPrediction( ...@@ -366,9 +367,8 @@ void TestOneThreadPrediction(
const std::vector<std::vector<PaddleTensor>> &inputs, const std::vector<std::vector<PaddleTensor>> &inputs,
std::vector<std::vector<PaddleTensor>> *outputs, bool use_analysis = true) { std::vector<std::vector<PaddleTensor>> *outputs, bool use_analysis = true) {
auto predictor = CreateTestPredictor(config, use_analysis); auto predictor = CreateTestPredictor(config, use_analysis);
PredictionWarmUp(predictor.get(), inputs, outputs, FLAGS_paddle_num_threads, PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0);
0); PredictionRun(predictor.get(), inputs, outputs, 1, 0);
PredictionRun(predictor.get(), inputs, outputs, FLAGS_paddle_num_threads, 0);
} }
void TestMultiThreadPrediction( void TestMultiThreadPrediction(
......
...@@ -120,40 +120,8 @@ class AnakinEngineOp : public framework::OperatorBase { ...@@ -120,40 +120,8 @@ class AnakinEngineOp : public framework::OperatorBase {
inference::Singleton<inference::anakin::AnakinEngineManager>::Global() inference::Singleton<inference::anakin::AnakinEngineManager>::Global()
.Get(engine_key_); .Get(engine_key_);
} }
return anakin_engine_; return anakin_engine_;
} }
void Prepare(const framework::Scope &scope, const platform::Place &dev_place,
AnakinNvEngineT *engine) const {
LOG(INFO) << "Prepare Anakin engine (Optimize model structure, Select OP "
"kernel etc). This process may cost a lot of time.";
framework::proto::BlockDesc block_desc;
block_desc.ParseFromString(Attr<std::string>("subgraph"));
std::vector<std::string> output_maps =
Attr<std::vector<std::string>>("output_name_mapping");
inference::Singleton<inference::anakin::AnakinOpConverter>::Global()
.ConvertBlock(block_desc, param_names_, scope, engine);
engine->Freeze();
for (const auto &x : Inputs("Xs")) {
if (param_names_.count(x)) continue;
auto &t =
inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
auto t_shape = framework::vectorize2int(t.dims());
// all input shape should be 4 dims
if (t_shape.size() == 2) {
t_shape.push_back(1);
t_shape.push_back(1);
}
engine->SetInputShape(x, t_shape);
}
engine->Optimize();
engine->InitGraph();
}
}; };
} // namespace operators } // namespace operators
......
...@@ -24,19 +24,22 @@ class DGCClipByNormKernel : public ClipByNormKernel<DeviceContext, T> { ...@@ -24,19 +24,22 @@ class DGCClipByNormKernel : public ClipByNormKernel<DeviceContext, T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto rampup_begin_step = context.Attr<float>("rampup_begin_step"); auto rampup_begin_step = context.Attr<float>("rampup_begin_step");
if (static_cast<int>(rampup_begin_step) >= 0) { if (static_cast<int>(rampup_begin_step) < 0) {
auto current_step_tensor = return;
context.Input<framework::Tensor>("current_step"); }
auto current_step_tensor = context.Input<framework::Tensor>("current_step");
auto* current_step = current_step_tensor->data<T>(); auto* current_step = current_step_tensor->data<T>();
if (static_cast<int>(*current_step) < VLOG(10) << "current_step:" << *current_step
static_cast<int>(rampup_begin_step)) { << ", rampup_begin_step:" << rampup_begin_step;
if (static_cast<int>(*current_step) < static_cast<int>(rampup_begin_step)) {
VLOG(10) << "current_step:" << *current_step VLOG(10) << "current_step:" << *current_step
<< " < rampup_begin_step:" << rampup_begin_step << " < rampup_begin_step:" << rampup_begin_step
<< " so does't use dgc_clip_by_norm"; << " so does't use dgc_clip_by_norm";
return; return;
} }
}
return ClipByNormKernel<DeviceContext, T>::Compute(context); return ClipByNormKernel<DeviceContext, T>::Compute(context);
}; };
......
...@@ -991,15 +991,17 @@ TEST(JITKernel_pool, jitpool) { ...@@ -991,15 +991,17 @@ TEST(JITKernel_pool, jitpool) {
TEST(JITKernel_pool, more) { TEST(JITKernel_pool, more) {
const auto& kers = jit::KernelPool::Instance().AllKernels(); const auto& kers = jit::KernelPool::Instance().AllKernels();
#if defined(__APPLE__) || defined(__OSX__) size_t target_num = 8;
EXPECT_EQ(kers.size(), 10UL);
#else #ifdef __AVX__
#ifdef PADDLE_WITH_MKLML target_num += 2;
EXPECT_EQ(kers.size(), 22UL);
#else
EXPECT_EQ(kers.size(), 8UL);
#endif #endif
#ifdef PADDLE_WITH_MKLML
target_num += 12;
#endif #endif
EXPECT_EQ(kers.size(), target_num);
} }
TEST(JITKernel_pool, refer) { TEST(JITKernel_pool, refer) {
......
...@@ -29,7 +29,7 @@ class LoadOp : public framework::OperatorWithKernel { ...@@ -29,7 +29,7 @@ class LoadOp : public framework::OperatorWithKernel {
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
framework::OpKernelType kt = framework::OpKernelType( framework::OpKernelType kt = framework::OpKernelType(
framework::proto::VarType::FP32, platform::CPUPlace()); framework::proto::VarType::FP32, ctx.GetPlace());
return kt; return kt;
} }
}; };
......
...@@ -75,6 +75,7 @@ std::vector<std::string> NgraphEngine::feed_vars = {}; ...@@ -75,6 +75,7 @@ std::vector<std::string> NgraphEngine::feed_vars = {};
std::vector<std::string> NgraphEngine::fetch_vars = {}; std::vector<std::string> NgraphEngine::fetch_vars = {};
framework::Variable* NgraphEngine::pre_var_ptr = nullptr; framework::Variable* NgraphEngine::pre_var_ptr = nullptr;
const framework::BlockDesc* NgraphEngine::p_bdesc = nullptr; const framework::BlockDesc* NgraphEngine::p_bdesc = nullptr;
bool NgraphEngine::is_training = false;
std::unordered_map<std::string, EngineCache> NgraphEngine::engine_cache = {}; std::unordered_map<std::string, EngineCache> NgraphEngine::engine_cache = {};
std::unordered_map<std::string, std::unordered_map<std::string,
...@@ -93,11 +94,13 @@ static std::vector<std::vector<int>> NgraphOpIntervals( ...@@ -93,11 +94,13 @@ static std::vector<std::vector<int>> NgraphOpIntervals(
int size = ops->size(); int size = ops->size();
int left = 0; int left = 0;
while (left < size && ops->at(left)->Type() != framework::kFeedOpType && while (left < size && ops->at(left)->Type() != framework::kFeedOpType &&
ops->at(left)->Type() != "read" &&
ops->at(left)->Type() != framework::kFetchOpType) { ops->at(left)->Type() != framework::kFetchOpType) {
++left; ++left;
} }
while (left < size && ops->at(left)->Type() == framework::kFeedOpType) { while (left < size && (ops->at(left)->Type() == framework::kFeedOpType ||
ops->at(left)->Type() == "read")) {
for (auto& var_name_item : ops->at(left)->Outputs()) { for (auto& var_name_item : ops->at(left)->Outputs()) {
for (auto& var_name : var_name_item.second) { for (auto& var_name : var_name_item.second) {
NgraphEngine::feed_vars.emplace_back(var_name); NgraphEngine::feed_vars.emplace_back(var_name);
...@@ -270,6 +273,7 @@ void NgraphEngine::Prepare(const std::vector<int>& interval) { ...@@ -270,6 +273,7 @@ void NgraphEngine::Prepare(const std::vector<int>& interval) {
for (auto op_desc : ops_desc) { for (auto op_desc : ops_desc) {
if (op_desc->Type().find("_grad") != std::string::npos) { if (op_desc->Type().find("_grad") != std::string::npos) {
is_training = true;
this->is_test_ = false; this->is_test_ = false;
break; break;
} }
...@@ -590,7 +594,7 @@ void NgraphEngine::Run(const framework::Scope& scope, ...@@ -590,7 +594,7 @@ void NgraphEngine::Run(const framework::Scope& scope,
} }
bool is_persistable = bool is_persistable =
(p_persistables->find(vi) != p_persistables->end()) ? true : false; (p_persistables->find(vi) != p_persistables->end()) ? true : false;
if (is_test && is_persistable) { if (!is_training && is_test && is_persistable) {
ti->set_stale(false); ti->set_stale(false);
} }
(*p_t_in).emplace_back(ti); (*p_t_in).emplace_back(ti);
......
...@@ -57,6 +57,7 @@ class NgraphEngine { ...@@ -57,6 +57,7 @@ class NgraphEngine {
void Run(const framework::Scope& scope, const platform::Place& place) const; void Run(const framework::Scope& scope, const platform::Place& place) const;
static bool is_training;
static const framework::BlockDesc* p_bdesc; static const framework::BlockDesc* p_bdesc;
static std::vector<std::string> feed_vars, fetch_vars; static std::vector<std::string> feed_vars, fetch_vars;
......
...@@ -91,6 +91,10 @@ class LayerObjectHelper(LayerHelperBase): ...@@ -91,6 +91,10 @@ class LayerObjectHelper(LayerHelperBase):
Returns input, param_attr Returns input, param_attr
""" """
param_attr_in = ParamAttr._to_attr(param_attr_in)
if isinstance(param_attr_in, bool):
raise ValueError('Param_attr should not be False in {}'.format(
self.name))
inputs = inputs_in if (inputs_in is not None) else [] inputs = inputs_in if (inputs_in is not None) else []
inputs = self._multiple_input(inputs) inputs = self._multiple_input(inputs)
param_attrs = self._multiple_param_attr(len(inputs), param_attr_in) param_attrs = self._multiple_param_attr(len(inputs), param_attr_in)
......
...@@ -511,44 +511,67 @@ class FC(layers.Layer): ...@@ -511,44 +511,67 @@ class FC(layers.Layer):
self._param_attr = param_attr self._param_attr = param_attr
self._bias_attr = bias_attr self._bias_attr = bias_attr
self._act = act self._act = act
self.__w = list()
@property
def _w(self, i=0):
return self.__w[i]
@_w.setter
def _w(self, value, i=0):
assert isinstance(value, Parameter)
self.__w[i] = value
def _build_once(self, input): def _build_once(self, input):
input_shape = input.shape i = 0
for inp, param in self._helper.iter_inputs_and_params(input,
self._param_attr):
input_shape = inp.shape
param_shape = [ param_shape = [
reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1) reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:],
1)
] + [self._size] ] + [self._size]
self._w = self.create_parameter( self.__w.append(
attr=self._param_attr, self.add_parameter(
'_w%d' % i,
self.create_parameter(
attr=param,
shape=param_shape, shape=param_shape,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False)))
i += 1
if self._bias_attr:
size = list([self._size]) size = list([self._size])
self._b = self.create_parameter( self._b = self.create_parameter(
attr=self._bias_attr, attr=self._bias_attr, shape=size, dtype=self._dtype, is_bias=True)
shape=size,
dtype=self._dtype,
is_bias=True)
else:
self._b = None
def forward(self, input): def forward(self, input):
mul_results = list()
i = 0
for inp, param in self._helper.iter_inputs_and_params(input,
self._param_attr):
tmp = self._helper.create_variable_for_type_inference(self._dtype) tmp = self._helper.create_variable_for_type_inference(self._dtype)
self._helper.append_op( self._helper.append_op(
type="mul", type="mul",
inputs={"X": input, inputs={"X": inp,
"Y": self._w}, "Y": self.__w[i]},
outputs={"Out": tmp}, outputs={"Out": tmp},
attrs={ attrs={
"x_num_col_dims": self._num_flatten_dims, "x_num_col_dims": self._num_flatten_dims,
"y_num_col_dims": 1 "y_num_col_dims": 1
}) })
i += 1
mul_results.append(tmp)
pre_bias = self._helper.create_variable_for_type_inference(self._dtype) if len(mul_results) == 1:
pre_bias = mul_results[0]
else:
pre_bias = self._helper.create_variable_for_type_inference(
self._dtype)
self._helper.append_op( self._helper.append_op(
type="sum", type="sum",
inputs={"X": [tmp]}, inputs={"X": mul_results},
outputs={"Out": pre_bias}, outputs={"Out": pre_bias},
attrs={"use_mkldnn": False}) attrs={"use_mkldnn": False})
......
...@@ -493,7 +493,8 @@ class Variable(object): ...@@ -493,7 +493,8 @@ class Variable(object):
self._ivar._run_backward() self._ivar._run_backward()
def _gradient(self): def _gradient(self):
return np.array(self._ivar._grad_value()) new_ivar = self._ivar._grad_ivar()._copy_to(core.CPUPlace(), True)
return np.array(new_ivar.value().get_tensor())
def _clear_gradient(self): def _clear_gradient(self):
self._ivar._clear_gradient() self._ivar._clear_gradient()
......
...@@ -752,7 +752,7 @@ class DGCMomentumOptimizer(MomentumOptimizer): ...@@ -752,7 +752,7 @@ class DGCMomentumOptimizer(MomentumOptimizer):
force_cpu=True) force_cpu=True)
for param_var, grad_var in param_and_grads: for param_var, grad_var in param_and_grads:
var_numel = reduce(lambda x, y: x * y, param_var.shape) var_numel = abs(reduce(lambda x, y: x * y, param_var.shape))
if var_numel < 16384 or \ if var_numel < 16384 or \
param_var.type == core.VarDesc.VarType.SELECTED_ROWS or \ param_var.type == core.VarDesc.VarType.SELECTED_ROWS or \
grad_var.type == core.VarDesc.VarType.SELECTED_ROWS or \ grad_var.type == core.VarDesc.VarType.SELECTED_ROWS or \
...@@ -832,7 +832,7 @@ class DGCMomentumOptimizer(MomentumOptimizer): ...@@ -832,7 +832,7 @@ class DGCMomentumOptimizer(MomentumOptimizer):
type=x.type, name=name, dtype=x.dtype, persistable=False) type=x.type, name=name, dtype=x.dtype, persistable=False)
helper.append_op( helper.append_op(
type="clip_by_norm", type="dgc_clip_by_norm",
inputs={"X": x, inputs={"X": x,
"current_step": self._global_step_var}, "current_step": self._global_step_var},
attrs={ attrs={
...@@ -845,7 +845,7 @@ class DGCMomentumOptimizer(MomentumOptimizer): ...@@ -845,7 +845,7 @@ class DGCMomentumOptimizer(MomentumOptimizer):
def _append_clip_norm(self, grad_var, clip_norm): def _append_clip_norm(self, grad_var, clip_norm):
with grad_var.block.program._backward_role_guard(): with grad_var.block.program._backward_role_guard():
return self._clip_by_norm( return self._clip_by_norm(
x=grad_var, max_norm=clip_norm, name=grad_var.name + "@DGC") x=grad_var, max_norm=clip_norm, name=grad_var.name)
def _dgc_op(self, param_var, clip_var, grad_var, u_var, v_var, k_var, def _dgc_op(self, param_var, clip_var, grad_var, u_var, v_var, k_var,
encoded_var): encoded_var):
......
...@@ -104,10 +104,11 @@ class ParallelExecutor(object): ...@@ -104,10 +104,11 @@ class ParallelExecutor(object):
self._scope = scope if scope is not None else executor.global_scope() self._scope = scope if scope is not None else executor.global_scope()
if main_program is not None and main_program._enable_dgc: if main_program is not None and main_program._enable_dgc:
assert num_trainers > 1, "dgc is not useful for single trainer training."
assert build_strategy.reduce_strategy == BuildStrategy.ReduceStrategy.AllReduce assert build_strategy.reduce_strategy == BuildStrategy.ReduceStrategy.AllReduce
assert num_trainers * len( assert num_trainers * len(
self._places) > 1, "dgc is not useful for single card training" self._places) > 1, "dgc is not useful for single card training."
assert use_cuda assert use_cuda, "dgc only used when cuda is used."
main_program = main_program if main_program is not None \ main_program = main_program if main_program is not None \
else framework.default_main_program() else framework.default_main_program()
...@@ -123,6 +124,11 @@ class ParallelExecutor(object): ...@@ -123,6 +124,11 @@ class ParallelExecutor(object):
exec_strategy=exec_strategy, exec_strategy=exec_strategy,
share_vars_from=share_vars_from._compiled_program share_vars_from=share_vars_from._compiled_program
if share_vars_from else None) if share_vars_from else None)
# FIXME(gongwb): I will move dgc from dist mode to allreduce mode in next pr.
if main_program._enable_dgc:
self._compiled_program._build_strategy.is_distribution = True
self._place = core.CUDAPlace(0) if use_cuda else core.CPUPlace() self._place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
self._exe = executor.Executor(self._place) self._exe = executor.Executor(self._place)
self._compiled_program._compile(place=self._place, scope=self._scope) self._compiled_program._compile(place=self._place, scope=self._scope)
......
...@@ -139,8 +139,7 @@ class TestDistRunnerBase(object): ...@@ -139,8 +139,7 @@ class TestDistRunnerBase(object):
pass_builder = None pass_builder = None
if args.batch_merge_repeat > 1: if args.batch_merge_repeat > 1:
pass_builder = build_stra._finalize_strategy_and_create_passes() pass_builder = build_stra._finalize_strategy_and_create_passes()
mypass = pass_builder.insert_pass( mypass = pass_builder.insert_pass(0, "multi_batch_merge_pass")
len(pass_builder.all_passes()) - 3, "multi_batch_merge_pass")
mypass.set("num_repeats", args.batch_merge_repeat) mypass.set("num_repeats", args.batch_merge_repeat)
if args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer": if args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer":
......
...@@ -16,6 +16,7 @@ from __future__ import print_function ...@@ -16,6 +16,7 @@ from __future__ import print_function
import unittest import unittest
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.dygraph.nn import Embedding from paddle.fluid.dygraph.nn import Embedding
import paddle.fluid.framework as framework import paddle.fluid.framework as framework
from paddle.fluid.optimizer import SGDOptimizer from paddle.fluid.optimizer import SGDOptimizer
...@@ -200,8 +201,6 @@ class PtbModel(fluid.dygraph.Layer): ...@@ -200,8 +201,6 @@ class PtbModel(fluid.dygraph.Layer):
rnn_out, shape=[-1, self.num_steps, self.hidden_size]) rnn_out, shape=[-1, self.num_steps, self.hidden_size])
projection = fluid.layers.matmul(rnn_out, self.softmax_weight) projection = fluid.layers.matmul(rnn_out, self.softmax_weight)
projection = fluid.layers.elementwise_add(projection, self.softmax_bias) projection = fluid.layers.elementwise_add(projection, self.softmax_bias)
projection = fluid.layers.reshape(
projection, shape=[-1, self.vocab_size])
projection = fluid.layers.reshape( projection = fluid.layers.reshape(
projection, shape=[-1, self.vocab_size]) projection, shape=[-1, self.vocab_size])
loss = fluid.layers.softmax_with_cross_entropy( loss = fluid.layers.softmax_with_cross_entropy(
...@@ -223,6 +222,7 @@ class TestDygraphPtbRnn(unittest.TestCase): ...@@ -223,6 +222,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
num_steps = 3 num_steps = 3
init_scale = 0.1 init_scale = 0.1
batch_size = 4 batch_size = 4
batch_num = 200
with fluid.dygraph.guard(): with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed fluid.default_startup_program().random_seed = seed
...@@ -242,7 +242,6 @@ class TestDygraphPtbRnn(unittest.TestCase): ...@@ -242,7 +242,6 @@ class TestDygraphPtbRnn(unittest.TestCase):
dy_loss = None dy_loss = None
last_hidden = None last_hidden = None
last_cell = None last_cell = None
batch_num = 200
for i in range(batch_num): for i in range(batch_num):
x_data = np.arange(12).reshape(4, 3).astype('int64') x_data = np.arange(12).reshape(4, 3).astype('int64')
...@@ -280,9 +279,11 @@ class TestDygraphPtbRnn(unittest.TestCase): ...@@ -280,9 +279,11 @@ class TestDygraphPtbRnn(unittest.TestCase):
num_steps=num_steps, num_steps=num_steps,
init_scale=init_scale) init_scale=init_scale)
exe = fluid.Executor(fluid.CPUPlace()) exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
sgd = SGDOptimizer(learning_rate=1e-3) sgd = SGDOptimizer(learning_rate=1e-3)
x = fluid.layers.data(name="x", shape=[-1, 3, 1], dtype='int64') x = fluid.layers.data(
name="x", shape=[-1, num_steps, 1], dtype='int64')
y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32') y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
init_hidden = fluid.layers.data( init_hidden = fluid.layers.data(
name="init_hidden", shape=[1], dtype='float32') name="init_hidden", shape=[1], dtype='float32')
...@@ -332,7 +333,6 @@ class TestDygraphPtbRnn(unittest.TestCase): ...@@ -332,7 +333,6 @@ class TestDygraphPtbRnn(unittest.TestCase):
for k in range(3, len(out)): for k in range(3, len(out)):
static_param_updated[static_param_name_list[k - static_param_updated[static_param_name_list[k -
3]] = out[k] 3]] = out[k]
self.assertTrue(np.allclose(static_loss_value, dy_loss._numpy())) self.assertTrue(np.allclose(static_loss_value, dy_loss._numpy()))
self.assertTrue(np.allclose(static_last_cell_value, last_cell._numpy())) self.assertTrue(np.allclose(static_last_cell_value, last_cell._numpy()))
self.assertTrue( self.assertTrue(
...@@ -340,13 +340,11 @@ class TestDygraphPtbRnn(unittest.TestCase): ...@@ -340,13 +340,11 @@ class TestDygraphPtbRnn(unittest.TestCase):
for key, value in six.iteritems(static_param_init): for key, value in six.iteritems(static_param_init):
# print("static_init name: {}, value {}".format(key, value)) # print("static_init name: {}, value {}".format(key, value))
# print("dy_init name: {}, value {}".format(key, dy_param_init[key])) # print("dy_init name: {}, value {}".format(key, dy_param_init[key]))
self.assertTrue(np.allclose(value, dy_param_init[key], atol=1e-5)) self.assertTrue(np.allclose(value, dy_param_init[key]))
for key, value in six.iteritems(static_param_updated): for key, value in six.iteritems(static_param_updated):
# print("static name: {}, value {}".format(key, value)) # print("static name: {}, value {}".format(key, value))
# print("dy name: {}, value {}".format(key, dy_param_updated[key])) # print("dy name: {}, value {}".format(key, dy_param_updated[key]))
self.assertTrue( self.assertTrue(np.allclose(value, dy_param_updated[key]))
np.allclose(
value, dy_param_updated[key], atol=1e-5))
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -302,8 +302,7 @@ use_py_reader = False ...@@ -302,8 +302,7 @@ use_py_reader = False
# if we run sync mode # if we run sync mode
sync = False sync = False
# how many batches we use batch_num = 5
batch_num = 2
np.random.seed = 1 np.random.seed = 1
src_word_np = np.random.randint( src_word_np = np.random.randint(
...@@ -335,24 +334,6 @@ lbl_word_np = np.random.randint( ...@@ -335,24 +334,6 @@ lbl_word_np = np.random.randint(
dtype='int64') dtype='int64')
lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32') lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32')
# np.random.seed = 1
# src_word_np = np.arange(0, 10).reshape([batch_size, seq_len, 1]).astype('int64')
# src_pos_np = np.random.randint(
# 1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
# src_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
# seq_len, seq_len).astype('float32')
#
# trg_word_np = np.arange(0, 10).reshape([batch_size, seq_len, 1]).astype('int64')
# trg_pos_np = np.random.randint(
# 1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
# trg_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
# seq_len, seq_len).astype('float32')
# trg_src_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
# seq_len, seq_len).astype('float32')
#
# lbl_word_np = np.arange(0, 10).reshape([batch_size * seq_len, 1]).astype('int64')
# lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32')
#
pos_inp1 = position_encoding_init(ModelHyperParams.max_length, pos_inp1 = position_encoding_init(ModelHyperParams.max_length,
ModelHyperParams.d_model) ModelHyperParams.d_model)
pos_inp2 = position_encoding_init(ModelHyperParams.max_length, pos_inp2 = position_encoding_init(ModelHyperParams.max_length,
...@@ -739,7 +720,7 @@ class DecoderSubLayer(Layer): ...@@ -739,7 +720,7 @@ class DecoderSubLayer(Layer):
enc_attn_output_pp = self._multihead_attention_layer2( enc_attn_output_pp = self._multihead_attention_layer2(
pre_process_rlt2, enc_output, enc_output, dec_enc_attn_bias) pre_process_rlt2, enc_output, enc_output, dec_enc_attn_bias)
enc_attn_output = self._post_process_layer2( enc_attn_output = self._post_process_layer2(
slf_attn_output, enc_attn_output_pp, self._postprocess_cmd, slf_attn_output_pp, enc_attn_output_pp, self._postprocess_cmd,
self._prepostprcess_dropout) self._prepostprcess_dropout)
pre_process_rlt3 = self._pre_process_layer3(None, enc_attn_output, pre_process_rlt3 = self._pre_process_layer3(None, enc_attn_output,
self._preprocess_cmd, self._preprocess_cmd,
...@@ -1076,20 +1057,17 @@ class TestDygraphTransformer(unittest.TestCase): ...@@ -1076,20 +1057,17 @@ class TestDygraphTransformer(unittest.TestCase):
4]] = out[k] 4]] = out[k]
self.assertTrue( self.assertTrue(
np.allclose(static_avg_cost_value, dy_avg_cost._numpy())) np.array_equal(static_avg_cost_value, dy_avg_cost._numpy()))
self.assertTrue( self.assertTrue(
np.allclose(static_sum_cost_value, dy_sum_cost._numpy())) np.array_equal(static_sum_cost_value, dy_sum_cost._numpy()))
self.assertTrue( self.assertTrue(
np.allclose( np.array_equal(static_predict_value, dy_predict._numpy()))
static_predict_value, dy_predict._numpy(), atol=1e-5))
self.assertTrue( self.assertTrue(
np.allclose(static_token_num_value, dy_token_num._numpy())) np.array_equal(static_token_num_value, dy_token_num._numpy()))
for key, value in six.iteritems(static_param_init): for key, value in six.iteritems(static_param_init):
self.assertTrue(np.allclose(value, dy_param_init[key])) self.assertTrue(np.array_equal(value, dy_param_init[key]))
for key, value in six.iteritems(static_param_updated): for key, value in six.iteritems(static_param_updated):
self.assertTrue( self.assertTrue(np.array_equal(value, dy_param_updated[key]))
np.allclose(
value, dy_param_updated[key], atol=1e-4))
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -38,7 +38,15 @@ def Lenet(data, class_dim): ...@@ -38,7 +38,15 @@ def Lenet(data, class_dim):
class TestFetchAndFeed(unittest.TestCase): class TestFetchAndFeed(unittest.TestCase):
def parallel_exe(self, use_cuda, run_parallel_exe, seed=1): @classmethod
def setUpClass(cls):
os.environ['CPU_NUM'] = str(4)
def parallel_exe(self,
use_cuda,
run_parallel_exe,
use_experimental_executor=False,
seed=1):
main_program = fluid.Program() main_program = fluid.Program()
startup = fluid.Program() startup = fluid.Program()
startup.random_seed = seed startup.random_seed = seed
...@@ -63,8 +71,12 @@ class TestFetchAndFeed(unittest.TestCase): ...@@ -63,8 +71,12 @@ class TestFetchAndFeed(unittest.TestCase):
build_strategy = fluid.BuildStrategy() build_strategy = fluid.BuildStrategy()
build_strategy.enable_inplace = False build_strategy.enable_inplace = False
build_strategy.memory_optimize = False build_strategy.memory_optimize = False
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.use_experimental_executor = use_experimental_executor
train_cp = compiler.CompiledProgram(main_program).with_data_parallel( train_cp = compiler.CompiledProgram(main_program).with_data_parallel(
loss_name=loss.name, build_strategy=build_strategy) loss_name=loss.name,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
run_parallel_exe(train_cp, exe, use_cuda, data, label, loss) run_parallel_exe(train_cp, exe, use_cuda, data, label, loss)
...@@ -131,8 +143,7 @@ class TestFetchAndFeed(unittest.TestCase): ...@@ -131,8 +143,7 @@ class TestFetchAndFeed(unittest.TestCase):
if batch_id == 2: if batch_id == 2:
break break
def test_fetch(self): def test_fetch_with_threaded_executor(self):
os.environ['CPU_NUM'] = str(4)
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
self.parallel_exe( self.parallel_exe(
use_cuda=True, use_cuda=True,
...@@ -140,8 +151,18 @@ class TestFetchAndFeed(unittest.TestCase): ...@@ -140,8 +151,18 @@ class TestFetchAndFeed(unittest.TestCase):
self.parallel_exe( self.parallel_exe(
use_cuda=False, run_parallel_exe=self.run_parallel_exe_with_fetch) use_cuda=False, run_parallel_exe=self.run_parallel_exe_with_fetch)
def test_fetch_with_fast_threaded_executor(self):
if core.is_compiled_with_cuda():
self.parallel_exe(
use_cuda=True,
run_parallel_exe=self.run_parallel_exe_with_fetch,
use_experimental_executor=True)
self.parallel_exe(
use_cuda=False,
run_parallel_exe=self.run_parallel_exe_with_fetch,
use_experimental_executor=True)
def test_feed(self): def test_feed(self):
os.environ['CPU_NUM'] = str(4)
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
self.parallel_exe( self.parallel_exe(
use_cuda=True, run_parallel_exe=self.run_parallel_exe_with_feed) use_cuda=True, run_parallel_exe=self.run_parallel_exe_with_feed)
......