diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 4ee2fdcf2db6bfa373f814ee4c0ab4d708486ea8..e3d856fb30d8103f50ebcb6dc16153c8ed2a97a6 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -134,6 +134,7 @@ if(WITH_GPU) message(WARNING "Anakin needs CUDNN >= 7.0 to compile. Force WITH_ANAKIN=OFF") set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDNN >= 7.0." FORCE) endif() + add_definitions(-DWITH_ANAKIN) endif() if(WITH_ANAKIN) # NOTICE(minqiyang): the end slash is important because $CUDNN_INCLUDE_DIR diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake index 9da657b7d78f2287ae253b48c5e18d7eb43abbaa..799d9c309f329f5f10364d794a7964ec3b02eeb4 100644 --- a/cmake/external/ngraph.cmake +++ b/cmake/external/ngraph.cmake @@ -37,7 +37,7 @@ INCLUDE(GNUInstallDirs) INCLUDE(ExternalProject) SET(NGRAPH_PROJECT "extern_ngraph") -SET(NGRAPH_GIT_TAG "v0.10.1") +SET(NGRAPH_GIT_TAG "08851c2c45fcf9fa9c74871dd3dbc3fe38f37cc9") SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph) SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph) SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index f65b3598b0e0a6b862e286408fd21c1f4234c93d..df0ff772c9d35c88ec5a6112525c56aa92d359b9 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/details/memory_reuse_types.h" -#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" +#include "paddle/fluid/framework/details/multi_devices_graph_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" #include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/sequential_execution_pass.h" @@ -86,10 +86,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { if (strategy.memory_optimize_) { auto analysis_var_pass = AppendPass("analysis_var_pass"); } - // Convert graph to run on multi-devices. - auto multi_devices_pass = AppendPass("multi_devices_pass"); - multi_devices_pass->SetNotOwned("strategy", - &strategy_); + + AppendMultiDevPass(strategy); // Add a graph print pass to record a graph with device info. if (!strategy_.debug_graphviz_path_.empty()) { @@ -115,6 +113,25 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { } } + // Convert graph to run on multi-devices. 
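
For reference, the dispatch that AppendMultiDevPass (below) implements reduces to picking one of the three newly registered pass names from the build strategy. A minimal standalone sketch, using only names from this patch; the free function itself is hypothetical:

    // Hypothetical distillation of AppendMultiDevPass: distributed builds win,
    // otherwise the reduce strategy selects which multi-devices pass to append.
    static std::string SelectMultiDevPassName(const BuildStrategy &strategy) {
      if (strategy.is_distribution_) return "dist_multi_devices_pass";
      switch (strategy.reduce_) {
        case BuildStrategy::ReduceStrategy::kAllReduce:
          return "allreduce_mode_multi_devices_pass";
        case BuildStrategy::ReduceStrategy::kReduce:
          return "reduce_mode_multi_devices_pass";
        default:
          PADDLE_THROW("Unknown reduce strategy.");
      }
    }
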
+ void AppendMultiDevPass(const BuildStrategy &strategy) { + ir::Pass *multi_devices_pass; + if (strategy_.is_distribution_) { + multi_devices_pass = AppendPass("dist_multi_devices_pass").get(); + } else { + if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { + multi_devices_pass = + AppendPass("allreduce_mode_multi_devices_pass").get(); + } else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { + multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get(); + } else { + PADDLE_THROW("Unknown reduce strategy."); + } + } + multi_devices_pass->SetNotOwned("strategy", + &strategy_); + } + private: BuildStrategy strategy_; }; @@ -131,6 +148,10 @@ std::shared_ptr BuildStrategy::CreatePassesFromStrategy( return pass_builder_; } +bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const { + return framework::details::MultiDevSSAGraphBuilder().count(pass_name) > 0; +} + std::unique_ptr BuildStrategy::Apply( const ProgramDesc &main_program, const std::vector &places, const std::string &loss_var_name, const std::vector &local_scopes, @@ -145,22 +166,23 @@ std::unique_ptr BuildStrategy::Apply( std::unique_ptr graph(new ir::Graph(main_program)); for (std::shared_ptr &pass : pass_builder_->AllPasses()) { - if (pass->Type() == "multi_devices_pass") { - pass->Erase("places"); - pass->SetNotOwned>("places", &places); - pass->Erase("loss_var_name"); - pass->SetNotOwned("loss_var_name", &loss_var_name); - pass->Erase("local_scopes"); - pass->SetNotOwned>("local_scopes", + if (IsMultiDevPass(pass->Type())) { + pass->Erase(kPlaces); + pass->SetNotOwned>(kPlaces, &places); + pass->Erase(kLossVarName); + pass->SetNotOwned(kLossVarName, &loss_var_name); + pass->Erase(kLocalScopes); + pass->SetNotOwned>(kLocalScopes, &local_scopes); - pass->Erase("nranks"); - pass->Set("nranks", new size_t(nranks)); + pass->Erase(kNRanks); + pass->Set(kNRanks, new size_t(nranks)); #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; pass->Erase("nccl_ctxs"); pass->SetNotOwned("nccl_ctxs", nctx); #endif + } else if (pass->Type() == "analysis_var_pass") { const std::vector *all_op_descs = new std::vector(main_program.Block(0).AllOps()); @@ -201,7 +223,9 @@ std::unique_ptr BuildStrategy::Apply( USE_PASS(fuse_elewise_add_act_pass); USE_PASS(graph_viz_pass); USE_PASS(multi_batch_merge_pass); -USE_PASS(multi_devices_pass); +USE_PASS(reduce_mode_multi_devices_pass); +USE_PASS(allreduce_mode_multi_devices_pass); +USE_PASS(dist_multi_devices_pass); USE_PASS(multi_devices_check_pass); USE_PASS(multi_devices_print_pass); USE_PASS(analysis_var_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index b75c01c4852590cd6400c273cf13e7307816618c..15c2e01b6142571883c759efb1e26b609be9adb4 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -74,8 +74,6 @@ struct BuildStrategy { bool fuse_elewise_add_act_ops_{false}; - bool enable_data_balance_{false}; - bool memory_optimize_{false}; bool memory_early_delete_{false}; @@ -84,6 +82,10 @@ struct BuildStrategy { bool fuse_broadcast_op_{false}; + // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode, + // num_trainers is 1, so the current fields of build_strategy doesn't tell if + // it's distributed model. 
+ bool is_distribution_{false}; int num_trainers_{1}; int trainer_id_{0}; std::vector trainers_endpoints_; @@ -104,6 +106,8 @@ struct BuildStrategy { bool IsFinalized() const { return is_finalized_; } + bool IsMultiDevPass(const std::string &pass_name) const; + // Apply the passes built by the pass_builder_. The passes will be // applied to the Program and output an ir::Graph. std::unique_ptr Apply(const ProgramDesc &main_program, diff --git a/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc index c8ea18804630fea4ada98062256730dbf4c24860..a4bb1e26d933946b7ca36196d1c0e8a0a4ec54e2 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" #include +#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" @@ -21,68 +21,78 @@ namespace paddle { namespace framework { namespace details { -bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const { - std::unordered_map pending_ops; - std::unordered_set pending_vars; - std::unordered_set ready_vars; - std::unordered_set ready_ops; +class SSAGraghBuilderWithChecker : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override { + PADDLE_ENFORCE(IsValidGraph(graph.get())); + return graph; + } - auto insert_pending_var = [&](VarHandleBase *var) { - pending_vars.insert(var); - if (var->GeneratedOp() == nullptr) { - ready_vars.emplace(var); - } - }; + bool IsValidGraph(const ir::Graph *graph) const { + std::unordered_map pending_ops; + std::unordered_set pending_vars; + std::unordered_set ready_vars; + std::unordered_set ready_ops; - for (auto &var_map : graph->Get(kGraphVars)) { - for (auto &name_pair : var_map) { - for (auto &version_pair : name_pair.second) { - insert_pending_var(version_pair); + auto insert_pending_var = [&](VarHandleBase *var) { + pending_vars.insert(var); + if (var->GeneratedOp() == nullptr) { + ready_vars.emplace(var); } - } - } + }; - for (auto &var : graph->Get(kGraphDepVars)) { - insert_pending_var(var); - } + for (auto &var_map : graph->Get(kGraphVars)) { + for (auto &name_pair : var_map) { + for (auto &version_pair : name_pair.second) { + insert_pending_var(version_pair); + } + } + } - for (OpHandleBase *op : ir::FilterByNodeWrapper(*graph)) { - if (op->Inputs().empty()) { - ready_ops.insert(op); - } else { - pending_ops.insert({op, op->NoDupInputSize()}); + for (auto &var : graph->Get(kGraphDepVars)) { + insert_pending_var(var); } - } - auto run_all_ops = [&](std::unordered_set &set) { - for (auto *op : set) { - for (auto out : op->Outputs()) { - ready_vars.emplace(out); + for (OpHandleBase *op : ir::FilterByNodeWrapper(*graph)) { + if (op->Inputs().empty()) { + ready_ops.insert(op); + } else { + pending_ops.insert({op, op->NoDupInputSize()}); } } - set.clear(); - }; - while (!pending_vars.empty()) { - run_all_ops(ready_ops); + auto run_all_ops = [&](std::unordered_set &set) { + for (auto *op : set) { + for (auto out : op->Outputs()) { + ready_vars.emplace(out); + } + } + set.clear(); + }; - if (ready_vars.empty()) { - return false; - } + while (!pending_vars.empty()) { + run_all_ops(ready_ops); - 
for (auto ready_var : ready_vars) { - pending_vars.erase(ready_var); - for (auto *op : ready_var->PendingOps()) { - auto &deps = --pending_ops[op]; - if (deps == 0) { - ready_ops.insert(op); + if (ready_vars.empty()) { + return false; + } + + for (auto ready_var : ready_vars) { + pending_vars.erase(ready_var); + for (auto *op : ready_var->PendingOps()) { + auto &deps = --pending_ops[op]; + if (deps == 0) { + ready_ops.insert(op); + } } } + ready_vars.clear(); } - ready_vars.clear(); + return true; } - return true; -} +}; + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_check_pass.h b/paddle/fluid/framework/details/multi_devices_graph_check_pass.h deleted file mode 100644 index 1e2b1867c376956d7d2dac465c13e2f3f64ba7eb..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/multi_devices_graph_check_pass.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/fluid/framework/details/multi_devices_helper.h" - -#include - -namespace paddle { -namespace framework { -namespace details { - -class SSAGraghBuilderWithChecker : public ir::Pass { - protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override { - PADDLE_ENFORCE(IsValidGraph(graph.get())); - return graph; - } - - bool IsValidGraph(const ir::Graph* graph) const; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 761c9ab9041fe70012a7524924b27b1021e97477..d91993bd4f8c04539cd189a4145350498911c513 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -134,15 +134,8 @@ void AddOutputToLeafOps(ir::Graph *graph) { } } // namespace -static const char kLossVarName[] = "loss_var_name"; -static const char kPlaces[] = "places"; -static const char kLocalScopes[] = "local_scopes"; -static const char kStrategy[] = "strategy"; -static const char kNRanks[] = "nranks"; - -void MultiDevSSAGraphBuilder::Init() const { +void MultiDevSSAGraphBuilderBase::Init() const { all_vars_.clear(); - balance_vars_.clear(); loss_var_name_ = Get(kLossVarName); places_ = Get>(kPlaces); @@ -151,31 +144,16 @@ void MultiDevSSAGraphBuilder::Init() const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) nccl_ctxs_ = &Get("nccl_ctxs"); #endif - - balance_vars_.resize(places_.size(), 0); - - if (strategy_.enable_data_balance_ && places_.size() == 1) { - LOG(WARNING) << "It is no need to enable data balance when there is only " - "one place. 
enable_data_balance is set to False."; - strategy_.enable_data_balance_ = false; - } } -std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( +std::unique_ptr MultiDevSSAGraphBuilderBase::ApplyImpl( std::unique_ptr graph) const { Init(); - // Give the topology sort order and rebuild the graph structure. - std::vector sorted_ops = ir::TopologySortOperations(*graph); - - if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { - sorted_ops = SortForReduceMode(sorted_ops); - } + std::vector sorted_ops = SortOperations(*graph); auto nodes = graph->ReleaseNodes(); ir::Graph &result = *graph; - size_t nranks = Get(kNRanks); - for (auto &node : nodes) { if (node->IsVar() && node->Var()) { all_vars_.emplace(node->Name(), node->Var()); @@ -187,146 +165,61 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( result.Set(kGraphDepVars, new GraphDepVars); result.Set(kGraphOps, new GraphOps); - std::vector> bcast_var_name_set; - bcast_var_name_set.resize(places_.size()); - bool is_forwarding = true; - bool is_dist_train = false; - - std::unordered_map sharded_var_device; + bool insert_collection_ops = NeedCollectiveOps(); for (ir::Node *node : sorted_ops) { - if (OpHaveRole(*node, OpRole::kRPC)) { - int op_dev_id = CreateRPCOp(&result, node, &sharded_var_device); - PADDLE_ENFORCE(op_dev_id != -1, - "Can not schedule the RPC operator to the right place."); - if (node->Op()->Type() == "recv") { - auto recv_vars_attr = - boost::get>(node->Op()->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - PADDLE_ENFORCE(recv_vars_attr.size() == 2UL); // [parameter, gradient] - if (recv_vars_attr[0].find(".block") == std::string::npos) { - bcast_var_name_set[op_dev_id].emplace(recv_vars_attr[0]); - } - } - is_dist_train = true; - } else if (OpHaveRole(*node, OpRole::kDist)) { - int op_dev_id = CreateDistTrainOp(&result, node, &sharded_var_device); - if (node->Op()->Type() == "concat") { - auto origin_param_name = node->Op()->OutputArgumentNames()[0]; - bcast_var_name_set[op_dev_id].emplace(origin_param_name); - } - } else if (IsScaleLossOp(node)) { - // user can customize loss@grad if not use_default_grad_scale_ - if (strategy_.gradient_scale_ != - BuildStrategy::GradientScaleStrategy::kCustomized) { - // TODO(paddle-dev): Why is there no input for this op_handle? - auto loss_grad_name = node->Op()->OutputArgumentNames()[0]; - auto out_dtype = all_vars_.at(loss_grad_name)->GetDataType(); - CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0], - out_dtype); - } - // This assumes the backward generating code will ensure IsScaleLossOp - // is true only for the op that scale the final scalar loss. - // It also assumes backward op will always follow the forward op in - // the block. - is_forwarding = false; + if (DealWithSpecialOp(&result, node)) { + continue; } else { - int op_dev_id = GetOpDeviceID(node, sharded_var_device); - if (op_dev_id != -1) { // This op only runs on one specific device. - CreateComputationalOp(&result, node, op_dev_id); - for (ir::Node *n : node->outputs) { - sharded_var_device.emplace(n->Name(), op_dev_id); - } + // This op runs on all devices + if (IsScaleLossOp(node)) { + // user can customize loss@grad if not use_default_grad_scale_ + InsertScaleLossGradOp(&result, node); + // This assumes the backward generating code will ensure IsScaleLossOp + // is true only for the op that scale the final scalar loss. + // It also assumes backward op will always follow the forward op in + // the block. 
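
Condensed, the rewritten main loop hands mode-specific work to the subclass hooks and keeps only the forward/backward bookkeeping in the base class; a simplified sketch (collective insertion and error handling elided):

    bool is_forwarding = true;
    for (ir::Node *node : sorted_ops) {
      if (DealWithSpecialOp(&result, node)) continue;  // subclass handled it
      if (IsScaleLossOp(node)) {
        InsertScaleLossGradOp(&result, node);
        is_forwarding = false;  // ops after the loss scale are backward ops
      } else {
        CreateComputationalOps(&result, node, places_.size());
      }
      // backward ops then get InsertCollectiveOp(...) when nranks > 1
    }
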
+ is_forwarding = false; } else { - // This op runs on all devices, and its output may have parameter's - // gradients. - // TODO(paddle-dev): Why is so special about "read" op? - if (node->Op()->Type() == "read" && strategy_.enable_data_balance_) { - node->Op()->SetAttr("throw_eof_exp", false); - CreateComputationalOps(&result, node, places_.size()); - const auto &data_var_names = node->Op()->Output("Out"); - InsertDataBalanceOp(&result, data_var_names); - } else { - CreateComputationalOps(&result, node, places_.size()); - } + CreateComputationalOps(&result, node, places_.size()); + } - if (!is_forwarding && nranks > 1UL) { + // Insert collection ops + if (!is_forwarding && insert_collection_ops) { + try { bool is_bk_op = static_cast(boost::get(node->Op()->GetAttr( OpProtoAndCheckerMaker::OpRoleAttrName())) & static_cast(OpRole::kBackward)); if (!is_bk_op) continue; + // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. - try { - auto backward_vars = boost::get>( - node->Op()->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - - PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); - - for (size_t i = 0; i < backward_vars.size(); i += 2) { - auto &p_name = backward_vars[i]; - auto &g_name = backward_vars[i + 1]; - VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; - size_t cur_device_id = -1; - switch (strategy_.reduce_) { - case BuildStrategy::ReduceStrategy::kReduce: - cur_device_id = GetAppropriateDeviceID({g_name}); - CreateReduceOp(&result, g_name, cur_device_id); - sharded_var_device.emplace(g_name, cur_device_id); - if (!is_dist_train) { - bcast_var_name_set[cur_device_id].emplace(p_name); - } - break; - case BuildStrategy::ReduceStrategy::kAllReduce: - if (IsSparseGradient(g_name)) { - CreateReduceOp(&result, g_name, 0); - CreateBroadcastOp(&result, g_name, 0); - } else { - InsertAllReduceOp(&result, g_name); - } - break; - default: - LOG(FATAL) << "Unknown reduce strategy "; - break; - } - } - } catch (boost::bad_get e) { + auto backward_vars = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + auto &p_name = backward_vars[i]; + auto &g_name = backward_vars[i + 1]; + VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; + + InsertCollectiveOp(&result, p_name, g_name); } + } catch (boost::bad_get e) { } } } } - bool use_gpu = false; -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - use_gpu = nccl_ctxs_ != nullptr; -#endif - // Insert broadcast operators principle: - // 1. Broadcast optimized parameters in Reduce strategy; - // 2. No need broadcast optimized parameters in AllReduce strategy because of - // the optimization sub-graph would be run on every GPU; - // 3. Allways broadcast received parameters in Distribute Training. - if ((use_gpu && - strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) || - is_dist_train) { - if (strategy_.fuse_broadcast_op_) { - CreateFusedBroadcastOp(&result, bcast_var_name_set); - } else { - for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { - auto &to_bcast_set = bcast_var_name_set[dev_id]; - for (auto &bcast_name : to_bcast_set) { - CreateBroadcastOp(&result, bcast_name, dev_id); - } - } - } - } + InsertPostprocessOps(&result); + /* Dependency graph has been constructed. However, there are still data hazards need to be handled. 
- */ + */ PolishGraphToSupportDataHazards(&result); /* @@ -337,67 +230,54 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( return graph; } -std::vector MultiDevSSAGraphBuilder::SortForReduceMode( - const std::vector &topo_ops) const { - std::unordered_map sharded_var_device; - std::vector sorted_ops; - std::unordered_map> delayed_op; - sorted_ops.reserve(topo_ops.size()); - - auto insert_delayed_op = [&](const std::string &var_name, int dev_id) { - sharded_var_device.emplace(var_name, dev_id); - if (delayed_op.count(var_name)) { - auto &ops = delayed_op.at(var_name); - sorted_ops.insert(sorted_ops.end(), ops.begin(), ops.end()); - delayed_op.at(var_name).clear(); - } - }; +void MultiDevSSAGraphBuilderBase::InsertScaleLossGradOp( + ir::Graph *result, const ir::Node *node) const { + // user can customize loss@grad if not use_default_grad_scale_ + size_t loss_scale = 0; + switch (this->strategy_.gradient_scale_) { + case BuildStrategy::GradientScaleStrategy::kOne: + loss_scale = 1; + break; + case BuildStrategy::GradientScaleStrategy::kCoeffNumDevice: + loss_scale = Get(kNRanks); + break; + case BuildStrategy::GradientScaleStrategy::kCustomized: + loss_scale = 0; + break; + default: + LOG(FATAL) << "Unknown gradient scale strategy."; + break; + } + + if (loss_scale) { + // TODO(paddle-dev): Why is there no input for this op_handle? + auto loss_grad_name = node->Op()->OutputArgumentNames()[0]; + auto out_dtype = this->all_vars_.at(loss_grad_name)->GetDataType(); + this->CreateScaleLossGradOp(result, loss_grad_name, node->outputs[0], + loss_scale, out_dtype); + } +} - for (ir::Node *node : topo_ops) { - int op_dev_id = GetOpDeviceID(node, sharded_var_device, &delayed_op); - if (op_dev_id > -1) { - // This op only runs on one specific device. - sorted_ops.emplace_back(node); - for (ir::Node *n : node->outputs) { - insert_delayed_op(n->Name(), op_dev_id); - } - } else if (op_dev_id == -1) { - // This op runs on all devices, and its output may have parameter's - // gradients. - sorted_ops.emplace_back(node); - bool is_bk_op = - static_cast(boost::get(node->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) & - static_cast(OpRole::kBackward)); - if (!is_bk_op) continue; - // Currently, we assume that once gradient is generated, it can be - // broadcast, and each gradient is only broadcast once. - std::vector backward_vars; - try { - backward_vars = - boost::get>(node->Op()->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - } catch (boost::bad_get e) { - } - PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); +std::vector MultiDevSSAGraphBuilderBase::SortOperations( + const ir::Graph &graph) const { + return ir::TopologySortOperations(graph); +} - for (size_t i = 0; i < backward_vars.size(); i += 2) { - auto &g_name = backward_vars[i + 1]; - size_t cur_device_id = GetAppropriateDeviceID({g_name}); - insert_delayed_op(g_name, static_cast(cur_device_id)); - } - } else if (op_dev_id == -2) { - // The Op on which the Op depends has not yet been generated. 
- } - } +bool MultiDevSSAGraphBuilderBase::UseGPU() const { + bool use_gpu = false; +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + use_gpu = nccl_ctxs_ != nullptr; +#endif + return use_gpu; +} - PADDLE_ENFORCE_EQ(sorted_ops.size(), topo_ops.size()); - return sorted_ops; +bool MultiDevSSAGraphBuilderBase::NeedCollectiveOps() const { + return Get(kNRanks) > 1; } -void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result, - ir::Node *node, - size_t place_id) const { +void MultiDevSSAGraphBuilderBase::CreateOpHandleIOs(ir::Graph *result, + ir::Node *node, + size_t place_id) const { auto p = places_[place_id]; auto *op_handle = result->Get(kGraphOps).back(); op_handle->SetDeviceContext(p, @@ -420,28 +300,7 @@ void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result, } } -size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( - const std::vector &var_names) const { - int64_t numel_sum = 0; - for (auto var_name : var_names) { - if (all_vars_.find(var_name) == all_vars_.end()) continue; - auto var_desc = all_vars_.at(var_name); - PADDLE_ENFORCE_NOT_NULL(var_desc); - auto dim = framework::make_ddim(var_desc->GetShape()); - int64_t numel = framework::product(dim); - PADDLE_ENFORCE_GT(numel, 0); - numel_sum += numel; - } - - auto smallest = - std::min_element(std::begin(balance_vars_), std::end(balance_vars_)); - size_t dev_id = - static_cast(std::distance(std::begin(balance_vars_), smallest)); - balance_vars_[dev_id] += numel_sum; - return dev_id; -} - -void MultiDevSSAGraphBuilder::SetCommunicationContext( +void MultiDevSSAGraphBuilderBase::SetCommunicationContext( OpHandleBase *op_handle, const platform::Place &p) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (nccl_ctxs_ == nullptr) { @@ -454,9 +313,9 @@ void MultiDevSSAGraphBuilder::SetCommunicationContext( #endif } -void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, - const std::string &p_name, - size_t src_dev_id) const { +void MultiDevSSAGraphBuilderBase::CreateBroadcastOp(ir::Graph *result, + const std::string &p_name, + size_t src_dev_id) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *op_handle = new BroadcastOpHandle( result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation), @@ -484,7 +343,7 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, } } -void MultiDevSSAGraphBuilder::CreateFusedBroadcastOp( +void MultiDevSSAGraphBuilderBase::CreateFusedBroadcastOp( ir::Graph *result, const std::vector> &bcast_varnames) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) @@ -522,17 +381,17 @@ void MultiDevSSAGraphBuilder::CreateFusedBroadcastOp( } } -void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, - ir::Node *node, - int dev_id) const { +void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result, + ir::Node *node, + int dev_id) const { result->Get(kGraphOps).emplace_back( new ComputationOpHandle(result->CreateOpNode(node->Op()), local_scopes_[dev_id], places_[dev_id], dev_id)); CreateOpHandleIOs(result, node, dev_id); } -void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, - const std::string &og) const { +void MultiDevSSAGraphBuilderBase::CreateAllReduceOp( + ir::Graph *result, const std::string &og) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), @@ -560,102 +419,15 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, } } -void 
MultiDevSSAGraphBuilder::InsertDataBalanceOp( - ir::Graph *result, const std::vector &datas) const { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - result->Get(kGraphOps).emplace_back(new DataBalanceOpHandle( - result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), - local_scopes_, places_, nccl_ctxs_)); -#else - result->Get(kGraphOps).emplace_back(new DataBalanceOpHandle( - result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), - local_scopes_, places_)); -#endif - auto *op_handle = result->Get(kGraphOps).back(); - for (size_t i = 0; i < places_.size(); ++i) { - auto &p = places_[i]; - SetCommunicationContext(op_handle, p); - for (const std::string &d_name : datas) { - auto &vars = result->Get(kGraphVars)[i][d_name]; - PADDLE_ENFORCE(!vars.empty()); - op_handle->AddInput(vars.back()); - auto var = new VarHandle( - result->CreateEmptyNode(d_name, ir::Node::Type::kVariable), - vars.size(), i, d_name, p); - vars.emplace_back(var); - op_handle->AddOutput(var); - } - } -} - -int MultiDevSSAGraphBuilder::GetOpDeviceID( - ir::Node *node, - const std::unordered_map &sharded_var_device, - std::unordered_map> *delay_ops) const { - if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { - return -1; - } - - if (!OpHaveRole(*node, framework::OpRole::kOptimize)) { - return -1; - } - - auto param_grad = boost::get>( - node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); - - PADDLE_ENFORCE_EQ(param_grad.size(), 2U); - int dev_id = GetVarDeviceID(param_grad[1], sharded_var_device); - - if (dev_id == -1) { - (*delay_ops)[param_grad[1]].push_back(node); - return -2; - } - return dev_id; -} - -int MultiDevSSAGraphBuilder::GetOpDeviceID( - ir::Node *node, - const std::unordered_map &sharded_var_device) const { - if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { - return -1; - } - - if (!OpHaveRole(*node, framework::OpRole::kOptimize)) { - return -1; - } - auto param_grad = boost::get>( - node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); - - PADDLE_ENFORCE_EQ(param_grad.size(), 2U); - int dev_id = GetVarDeviceID(param_grad[1], sharded_var_device); - PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]", - node->Op()->Type(), param_grad[0], param_grad[1]); - return dev_id; -} - -int MultiDevSSAGraphBuilder::GetVarDeviceID( - const std::string &varname, - const std::unordered_map &sharded_var_device) const { - auto got = sharded_var_device.find(varname); - if (got == sharded_var_device.end()) { - auto pos = varname.find(framework::kNewGradSuffix); - if (pos != std::string::npos) { - got = sharded_var_device.find(varname.substr(0, pos)); - } - } - return got == sharded_var_device.end() ? 
-1 : got->second; -} - -void MultiDevSSAGraphBuilder::CreateScaleLossGradOp( +void MultiDevSSAGraphBuilderBase::CreateScaleLossGradOp( ir::Graph *result, const std::string &loss_grad_name, - ir::Node *out_var_node, proto::VarType::Type dtype) const { - size_t nranks = Get("nranks"); + ir::Node *out_var_node, size_t loss_scale, + proto::VarType::Type dtype) const { for (size_t i = 0; i < places_.size(); ++i) { - // Insert ScaleCost OpHandle auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]); auto *op_handle = new ScaleLossGradOpHandle( result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation), - nranks, local_scopes_[i], places_[i], dev_ctx, dtype); + loss_scale, local_scopes_[i], places_[i], dev_ctx, dtype); result->Get(kGraphOps).emplace_back(op_handle); // FIXME: Currently ScaleLossGradOp only use device_count as scale @@ -669,9 +441,8 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp( } } -void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, - ir::Node *node, - size_t num_places) const { +void MultiDevSSAGraphBuilderBase::CreateComputationalOps( + ir::Graph *result, ir::Node *node, size_t num_places) const { for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) { auto p = places_[scope_idx]; auto s = local_scopes_[scope_idx]; @@ -681,9 +452,9 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, } } -VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, - const std::string &og, - int dst_dev_id) const { +VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp(ir::Graph *result, + const std::string &og, + int dst_dev_id) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) result->Get(kGraphOps).emplace_back(new ReduceOpHandle( result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), @@ -712,51 +483,273 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, return var; } -int MultiDevSSAGraphBuilder::CreateDistTrainOp( - ir::Graph *result, ir::Node *node, - std::unordered_map *sharded_var_device) const { - int op_dev_id = -1; - std::vector input_var_names; - std::vector output_var_names; - for (ir::Node *input : node->inputs) { - input_var_names.push_back(input->Name()); +bool MultiDevSSAGraphBuilderBase::IsScaleLossOp(ir::Node *node) const { + return boost::get( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == + (static_cast(OpRole::kBackward) | + static_cast(OpRole::kLoss)) && + !loss_var_name_.empty(); // If loss_var is empty. This is test mode +} + +bool MultiDevSSAGraphBuilderBase::IsSparseGradient( + const std::string &og) const { + PADDLE_ENFORCE(all_vars_.count(og) != 0); + if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { + return true; } - for (ir::Node *output : node->outputs) { - output_var_names.push_back(output->Name()); + return false; +} + +void AllReduceSSAGraphBuilder::InsertCollectiveOp( + ir::Graph *result, const std::string &p_name, + const std::string &g_name) const { + if (IsSparseGradient(g_name)) { + CreateReduceOp(result, g_name, 0); + CreateBroadcastOp(result, g_name, 0); + } else { + CreateAllReduceOp(result, g_name); } +} - if (node->Op()->Type() == "split_byref" || - node->Op()->Type() == "split_selected_rows" || - node->Op()->Type() == "split_ids") { - // TODO(paddle-dev): getting the first var is not safe. 
- op_dev_id = GetVarDeviceID(input_var_names[0], *sharded_var_device); - if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { - op_dev_id = GetAppropriateDeviceID(input_var_names); - for (auto &varname : input_var_names) { - sharded_var_device->emplace(varname, op_dev_id); +int BalanceVarSSAGraphBuilder::GetVarDeviceID( + const std::string &varname) const { + auto got = sharded_var_device_.find(varname); + if (got == sharded_var_device_.end()) { + auto pos = varname.find(framework::kNewGradSuffix); + if (pos != std::string::npos) { + got = sharded_var_device_.find(varname.substr(0, pos)); + } + } + return got == sharded_var_device_.end() ? -1 : got->second; +} + +int BalanceVarSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const { + if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { + return -1; + } + if (!OpHaveRole(*node, framework::OpRole::kOptimize)) { + return -1; + } + auto param_grad = boost::get>( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(param_grad.size(), 2U); + int dev_id = GetVarDeviceID(param_grad[1]); + PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]", + node->Op()->Type(), param_grad[0], param_grad[1]); + return dev_id; +} + +size_t BalanceVarSSAGraphBuilder::GetAppropriateDeviceID( + const std::vector &var_names) const { + int64_t numel_sum = 0; + for (auto var_name : var_names) { + if (all_vars_.find(var_name) == all_vars_.end()) continue; + auto var_desc = all_vars_.at(var_name); + PADDLE_ENFORCE_NOT_NULL(var_desc); + auto dim = framework::make_ddim(var_desc->GetShape()); + int64_t numel = framework::product(dim); + PADDLE_ENFORCE_GT(numel, 0); + numel_sum += numel; + } + + auto smallest = + std::min_element(std::begin(balance_vars_), std::end(balance_vars_)); + size_t dev_id = + static_cast(std::distance(std::begin(balance_vars_), smallest)); + balance_vars_[dev_id] += numel_sum; + return dev_id; +} + +void BalanceVarSSAGraphBuilder::ResetState() const { + balance_vars_.clear(); + sharded_var_device_.clear(); + + balance_vars_.resize(places_.size(), 0); +} + +void ReduceSSAGraphBuilder::Init() const { + MultiDevSSAGraphBuilderBase::Init(); + ResetState(); +} + +void ReduceSSAGraphBuilder::ResetState() const { + BalanceVarSSAGraphBuilder::ResetState(); + bcast_var_name_set_.clear(); + bcast_var_name_set_.resize(places_.size()); +} + +void ReduceSSAGraphBuilder::InsertCollectiveOp( + ir::Graph *result, const std::string &p_name, + const std::string &g_name) const { + size_t cur_device_id = GetAppropriateDeviceID({g_name}); + CreateReduceOp(result, g_name, cur_device_id); + sharded_var_device_.emplace(g_name, cur_device_id); + bcast_var_name_set_[cur_device_id].emplace(p_name); +} + +bool ReduceSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, + ir::Node *node) const { + int op_dev_id = BalanceVarSSAGraphBuilder::GetOpDeviceID(node); + if (op_dev_id != -1) { + // This op only runs on one specific device. 
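
GetAppropriateDeviceID, moved onto BalanceVarSSAGraphBuilder above, is a greedy balancer: it sums the element counts of the variables to place and picks the least-loaded device. A free-function sketch of the core step (the function name is hypothetical):

    // Greedy placement: choose the device whose running numel total is
    // smallest, then charge the new variables' numel_sum against it.
    size_t PickDevice(int64_t numel_sum, std::vector<int64_t> *balance_vars) {
      auto smallest =
          std::min_element(balance_vars->begin(), balance_vars->end());
      size_t dev_id =
          static_cast<size_t>(std::distance(balance_vars->begin(), smallest));
      (*balance_vars)[dev_id] += numel_sum;
      return dev_id;
    }
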
+ CreateComputationalOp(result, node, op_dev_id); + for (ir::Node *n : node->outputs) { + sharded_var_device_.emplace(n->Name(), op_dev_id); + } + return true; + } + return false; +} + +void ReduceSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const { + if (UseGPU()) { + if (strategy_.fuse_broadcast_op_) { + CreateFusedBroadcastOp(result, bcast_var_name_set_); + } else { + for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) { + auto &to_bcast_set = bcast_var_name_set_[dev_id]; + for (auto &bcast_name : to_bcast_set) { + CreateBroadcastOp(result, bcast_name, dev_id); + } } } - for (auto &varname : output_var_names) { - sharded_var_device->emplace(varname, op_dev_id); + } +} + +int ReduceSSAGraphBuilder::GetOpDeviceID( + ir::Node *node, + std::unordered_map> *delay_ops) const { + if (!OpHaveRole(*node, framework::OpRole::kOptimize)) { + return -1; + } + + auto param_grad = boost::get>( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(param_grad.size(), 2U); + int dev_id = GetVarDeviceID(param_grad[1]); + + if (dev_id == -1) { + (*delay_ops)[param_grad[1]].push_back(node); + return -2; + } + return dev_id; +} + +std::vector ReduceSSAGraphBuilder::SortOperations( + const ir::Graph &graph) const { + std::vector sorted_ops = ir::TopologySortOperations(graph); + return SortForReduceMode(sorted_ops); +} + +std::vector ReduceSSAGraphBuilder::SortForReduceMode( + const std::vector &topo_ops) const { + std::vector sorted_ops; + std::unordered_map> delayed_op; + sorted_ops.reserve(topo_ops.size()); + ResetState(); + + auto insert_delayed_op = [&](const std::string &var_name, int dev_id) { + sharded_var_device_.emplace(var_name, dev_id); + if (delayed_op.count(var_name)) { + auto &ops = delayed_op.at(var_name); + sorted_ops.insert(sorted_ops.end(), ops.begin(), ops.end()); + delayed_op.at(var_name).clear(); } - } else if (node->Op()->Type() == "concat") { - op_dev_id = GetVarDeviceID(input_var_names[0], *sharded_var_device); - for (auto &varname : output_var_names) { - sharded_var_device->emplace(varname, op_dev_id); + }; + + for (ir::Node *node : topo_ops) { + int op_dev_id = GetOpDeviceID(node, &delayed_op); + if (op_dev_id > -1) { + // This op only runs on one specific device. + sorted_ops.emplace_back(node); + for (ir::Node *n : node->outputs) { + insert_delayed_op(n->Name(), op_dev_id); + } + } else if (op_dev_id == -1) { + // This op runs on all devices, and its output may have parameter's + // gradients. + sorted_ops.emplace_back(node); + bool is_bk_op = + static_cast(boost::get(node->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward)); + if (!is_bk_op) continue; + // Currently, we assume that once gradient is generated, it can be + // broadcast, and each gradient is only broadcast once. + std::vector backward_vars; + try { + backward_vars = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + } catch (boost::bad_get e) { + } + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + auto &g_name = backward_vars[i + 1]; + size_t cur_device_id = GetAppropriateDeviceID({g_name}); + insert_delayed_op(g_name, static_cast(cur_device_id)); + } + } else if (op_dev_id == -2) { + // The Op on which the Op depends has not yet been generated. 
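
The device-id convention threaded through SortForReduceMode is tri-state; an annotated skeleton of the dispatch (the comments are interpretation, not new behavior):

    int dev_id = GetOpDeviceID(node, &delayed_op);
    if (dev_id >= 0) {
      // Pinned to one device: emit now and record its outputs' placement.
    } else if (dev_id == -1) {
      // Runs on every device: emit now, then pre-assign devices to the
      // gradients it produces so their optimize ops can be placed later.
    } else {  // dev_id == -2
      // The gradient it consumes has no device yet: the op is parked in
      // delayed_op and re-emitted by insert_delayed_op once it does.
    }
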
} - } else { - LOG(ERROR) << "got unexpected dist op: " << node->Op()->Type(); - PADDLE_THROW( - "the distribute training related op should be in [split_byref, " - "concat]."); } - PADDLE_ENFORCE(op_dev_id != -1, - "can not find right place for distributed op: %s", - node->Op()->Type()); + PADDLE_ENFORCE_EQ(sorted_ops.size(), topo_ops.size()); - CreateComputationalOp(result, node, op_dev_id); - return op_dev_id; + ResetState(); + return sorted_ops; +} + +void DistSSAGraphBuilder::Init() const { + MultiDevSSAGraphBuilderBase::Init(); + ResetState(); +} + +void DistSSAGraphBuilder::ResetState() const { + BalanceVarSSAGraphBuilder::ResetState(); + bcast_var_name_set_.clear(); + bcast_var_name_set_.resize(places_.size()); +} + +bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, + ir::Node *node) const { + bool insert_op = false; + if (OpHaveRole(*node, OpRole::kRPC)) { + int op_dev_id = CreateRPCOp(result, node); + PADDLE_ENFORCE(op_dev_id != -1, + "Can not schedule the RPC operator to the right place."); + if (node->Op()->Type() == "recv") { + auto recv_vars_attr = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE(recv_vars_attr.size() == 2UL); // [parameter, gradient] + if (recv_vars_attr[0].find(".block") == std::string::npos) { + bcast_var_name_set_[op_dev_id].emplace(recv_vars_attr[0]); + } + } + insert_op = true; + need_broadcast_var_ = true; + } else if (OpHaveRole(*node, OpRole::kDist)) { + int op_dev_id = CreateDistTrainOp(result, node); + if (node->Op()->Type() == "concat") { + auto origin_param_name = node->Op()->OutputArgumentNames()[0]; + bcast_var_name_set_[op_dev_id].emplace(origin_param_name); + } + insert_op = true; + } else { + int op_dev_id = GetOpDeviceID(node); + if (op_dev_id != -1) { // This op only runs on one specific device. + CreateComputationalOp(result, node, op_dev_id); + for (ir::Node *n : node->outputs) { + sharded_var_device_.emplace(n->Name(), op_dev_id); + } + insert_op = true; + } + } + return insert_op; } void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) { @@ -775,13 +768,11 @@ void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) { } // Create RPC related op handles that connects its in ops and out ops. -int MultiDevSSAGraphBuilder::CreateRPCOp( - ir::Graph *result, ir::Node *node, - std::unordered_map *sharded_var_device) const { +int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const { int op_dev_id = -1; if (node->Op()->Type() == "send") { // TODO(paddle-dev): getting the first var is not safe. 
- op_dev_id = GetVarDeviceID(node->inputs[0]->Name(), *sharded_var_device); + op_dev_id = GetVarDeviceID(node->inputs[0]->Name()); PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]), "This hack no longer holds, please fix."); // the variable name which contains .block means it was splited by @@ -799,9 +790,9 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( VLOG(10) << "send grad " << input_var_names[0] << " origin " << send_param_grad[1] << " place: " << op_dev_id; for (auto &varname : input_var_names) { - sharded_var_device->emplace(varname, op_dev_id); + sharded_var_device_.emplace(varname, op_dev_id); } - sharded_var_device->emplace(send_param_grad[1], op_dev_id); + sharded_var_device_.emplace(send_param_grad[1], op_dev_id); } } else if (node->Op()->Type() == "recv") { std::vector output_var_names; @@ -811,7 +802,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( auto recv_param_grad = boost::get>( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); if (recv_param_grad.size() == 2U) { - op_dev_id = GetVarDeviceID(recv_param_grad[1], *sharded_var_device); + op_dev_id = GetVarDeviceID(recv_param_grad[1]); VLOG(10) << "recv param " << recv_param_grad[0] << " get grad place: " << recv_param_grad[1] << " place: " << op_dev_id; @@ -819,7 +810,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( op_dev_id = GetAppropriateDeviceID(output_var_names); } for (auto &varname : output_var_names) { - sharded_var_device->emplace(varname, op_dev_id); + sharded_var_device_.emplace(varname, op_dev_id); } } else { // send_barrier, fetch_barrier will run on place 0; @@ -846,7 +837,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( for (ir::Node *output : node->outputs) { int outvar_dev_id = op_dev_id; if (node->Op()->Type() == "fetch_barrier") { - outvar_dev_id = GetVarDeviceID(output->Name(), *sharded_var_device); + outvar_dev_id = GetVarDeviceID(output->Name()); PADDLE_ENFORCE_NE(outvar_dev_id, -1, "output name %s", output->Name()); } p = places_[outvar_dev_id]; @@ -863,29 +854,124 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( return op_dev_id; } -bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const { - PADDLE_ENFORCE(all_vars_.count(og) != 0); - if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { - return true; +int DistSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, + ir::Node *node) const { + int op_dev_id = -1; + std::vector input_var_names; + std::vector output_var_names; + for (ir::Node *input : node->inputs) { + input_var_names.push_back(input->Name()); } - return false; + for (ir::Node *output : node->outputs) { + output_var_names.push_back(output->Name()); + } + + if (node->Op()->Type() == "split_byref" || + node->Op()->Type() == "split_selected_rows" || + node->Op()->Type() == "split_ids") { + // TODO(paddle-dev): getting the first var is not safe. 
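
GetVarDeviceID, now stateful on BalanceVarSSAGraphBuilder, resolves renamed gradients by retrying with framework::kNewGradSuffix stripped; a standalone sketch of the lookup (the function name is hypothetical):

    // -1 means the variable has not been sharded onto a single device.
    int LookupVarDevice(const std::string &varname,
                        const std::unordered_map<std::string, int> &sharded) {
      auto got = sharded.find(varname);
      if (got == sharded.end()) {
        auto pos = varname.find(framework::kNewGradSuffix);
        if (pos != std::string::npos) {
          got = sharded.find(varname.substr(0, pos));
        }
      }
      return got == sharded.end() ? -1 : got->second;
    }
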
+ op_dev_id = GetVarDeviceID(input_var_names[0]); + if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { + op_dev_id = GetAppropriateDeviceID(input_var_names); + for (auto &varname : input_var_names) { + sharded_var_device_.emplace(varname, op_dev_id); + } + } + for (auto &varname : output_var_names) { + sharded_var_device_.emplace(varname, op_dev_id); + } + } else if (node->Op()->Type() == "concat") { + op_dev_id = GetVarDeviceID(input_var_names[0]); + for (auto &varname : output_var_names) { + sharded_var_device_.emplace(varname, op_dev_id); + } + } else { + LOG(ERROR) << "got unexpected dist op: " << node->Op()->Type(); + PADDLE_THROW( + "the distribute training related op should be in [split_byref, " + "concat]."); + } + + PADDLE_ENFORCE(op_dev_id != -1, + "can not find right place for distributed op: %s", + node->Op()->Type()); + + CreateComputationalOp(result, node, op_dev_id); + return op_dev_id; } -bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const { - return boost::get( - node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == - (static_cast(OpRole::kBackward) | - static_cast(OpRole::kLoss)) && - !loss_var_name_.empty(); // If loss_var is empty. This is test mode +void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result, + const std::string &p_name, + const std::string &g_name) const { + size_t cur_device_id = 0; + switch (strategy_.reduce_) { + case BuildStrategy::ReduceStrategy::kReduce: + cur_device_id = GetAppropriateDeviceID({g_name}); + CreateReduceOp(result, g_name, cur_device_id); + sharded_var_device_.emplace(g_name, cur_device_id); + break; + case BuildStrategy::ReduceStrategy::kAllReduce: + if (IsSparseGradient(g_name)) { + CreateReduceOp(result, g_name, 0); + CreateBroadcastOp(result, g_name, 0); + } else { + CreateAllReduceOp(result, g_name); + } + break; + default: + LOG(FATAL) << "Unknown reduce strategy."; + break; + } +} + +void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const { + if (need_broadcast_var_ || + (UseGPU() && + strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce)) { + if (strategy_.fuse_broadcast_op_) { + CreateFusedBroadcastOp(result, bcast_var_name_set_); + } else { + for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) { + auto &to_bcast_set = bcast_var_name_set_[dev_id]; + for (auto &bcast_name : to_bcast_set) { + CreateBroadcastOp(result, bcast_name, dev_id); + } + } + } + } +} + +std::unordered_set &MultiDevSSAGraphBuilder() { + static std::unordered_set regs; + return regs; } + +static int MultiDevSSAGraphBuilderRegister(const std::string &builder_mode) { + MultiDevSSAGraphBuilder().insert(builder_mode); + return 0; +} + } // namespace details } // namespace framework } // namespace paddle -REGISTER_PASS(multi_devices_pass, - paddle::framework::details::MultiDevSSAGraphBuilder) - .RequirePassAttr(paddle::framework::details::kLossVarName) - .RequirePassAttr(paddle::framework::details::kPlaces) - .RequirePassAttr(paddle::framework::details::kLocalScopes) - .RequirePassAttr(paddle::framework::details::kStrategy) - .RequirePassAttr(paddle::framework::details::kNRanks); +#define REGISTER_MULTI_DEVICES_PASS(pass_name, pass_class) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + _reg_ssa_graph_builder_##pass_name, \ + "REGISTER_MULTI_DEVICES_PASS must be called in global namespace."); \ + int _reg_ssa_graph_builder_entry_##pass_name = \ + paddle::framework::details::MultiDevSSAGraphBuilderRegister(#pass_name); \ + REGISTER_PASS(pass_name, pass_class) \ + 
.RequirePassAttr(paddle::framework::details::kLossVarName) \ + .RequirePassAttr(paddle::framework::details::kPlaces) \ + .RequirePassAttr(paddle::framework::details::kLocalScopes) \ + .RequirePassAttr(paddle::framework::details::kStrategy) \ + .RequirePassAttr(paddle::framework::details::kNRanks) + +REGISTER_MULTI_DEVICES_PASS(reduce_mode_multi_devices_pass, + paddle::framework::details::ReduceSSAGraphBuilder); +REGISTER_MULTI_DEVICES_PASS( + allreduce_mode_multi_devices_pass, + paddle::framework::details::AllReduceSSAGraphBuilder); +REGISTER_MULTI_DEVICES_PASS(dist_multi_devices_pass, + paddle::framework::details::DistSSAGraphBuilder); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 7029e9dc18cbacf0c5f0d7c6430d84fb72d6a0a3..6d4386538ea7d0cc318647c92282af9d598fa699 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include #include #include @@ -30,78 +31,70 @@ namespace framework { class Scope; namespace details { -class MultiDevSSAGraphBuilder : public ir::Pass { +constexpr char kLossVarName[] = "loss_var_name"; +constexpr char kPlaces[] = "places"; +constexpr char kLocalScopes[] = "local_scopes"; +constexpr char kStrategy[] = "strategy"; +constexpr char kNRanks[] = "nranks"; + +class MultiDevSSAGraphBuilderBase : public ir::Pass { protected: std::unique_ptr ApplyImpl( std::unique_ptr graph) const override; - private: - void CreateOpHandleIOs(ir::Graph *result, ir::Node *node, - size_t device_id) const; - void Init() const; + virtual void Init() const; -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - mutable platform::NCCLContextMap *nccl_ctxs_; -#endif + virtual std::vector SortOperations(const ir::Graph &graph) const; - int GetVarDeviceID( - const std::string &varname, - const std::unordered_map &sharded_var_device) const; + virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const = 0; - bool IsScaleLossOp(ir::Node *node) const; + virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const = 0; + + virtual void InsertPostprocessOps(ir::Graph *result) const = 0; - int CreateRPCOp( - ir::Graph *result, ir::Node *node, - std::unordered_map *sharded_var_device) const; - int CreateDistTrainOp( - ir::Graph *result, ir::Node *node, - std::unordered_map *sharded_var_device) const; + bool UseGPU() const; + + bool NeedCollectiveOps() const; + + bool IsScaleLossOp(ir::Node *node) const; void CreateComputationalOps(ir::Graph *result, ir::Node *node, size_t num_places) const; void CreateScaleLossGradOp(ir::Graph *result, const std::string &loss_grad_name, - ir::Node *out_var_node, + ir::Node *out_var_node, size_t loss_scale, proto::VarType::Type dtype) const; VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og, int dst_dev_id) const; + void CreateComputationalOp(ir::Graph *result, ir::Node *node, int dev_id) const; - int GetOpDeviceID( - ir::Node *node, - const std::unordered_map &sharded_var_device) const; - - void InsertAllReduceOp(ir::Graph *result, const std::string &og) const; + bool IsSparseGradient(const std::string &og) const; - void InsertDataBalanceOp(ir::Graph *result, - const std::vector &datas) const; + void CreateAllReduceOp(ir::Graph *result, const std::string &og) const; void CreateBroadcastOp(ir::Graph *result, const std::string &p_name, size_t src_dev_id) 
const; + void InsertScaleLossGradOp(ir::Graph *result, const ir::Node *node) const; + void CreateFusedBroadcastOp( ir::Graph *result, const std::vector> &bcast_varnames) const; - bool IsSparseGradient(const std::string &og) const; - - size_t GetAppropriateDeviceID( - const std::vector &var_names) const; - void SetCommunicationContext(OpHandleBase *op_handle, const platform::Place &p) const; - std::vector SortForReduceMode( - const std::vector &) const; + void CreateOpHandleIOs(ir::Graph *result, ir::Node *node, + size_t device_id) const; - int GetOpDeviceID( - ir::Node *node, - const std::unordered_map &shared_var_device, - std::unordered_map> *delay_ops) - const; +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + mutable platform::NCCLContextMap *nccl_ctxs_; +#endif mutable std::string loss_var_name_; mutable std::vector places_; @@ -109,8 +102,83 @@ class MultiDevSSAGraphBuilder : public ir::Pass { mutable BuildStrategy strategy_; mutable std::unordered_map all_vars_; +}; + +class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { + protected: + virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const; + + virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const { + return false; + } + + virtual void InsertPostprocessOps(ir::Graph *result) const {} +}; + +class BalanceVarSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { + protected: + int GetVarDeviceID(const std::string &varname) const; + + int GetOpDeviceID(ir::Node *node) const; + + size_t GetAppropriateDeviceID( + const std::vector &var_names) const; + + virtual void ResetState() const; + + mutable std::unordered_map sharded_var_device_; mutable std::vector balance_vars_; }; + +class ReduceSSAGraphBuilder : public BalanceVarSSAGraphBuilder { + protected: + virtual void Init() const; + + virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const; + + virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const; + + virtual void InsertPostprocessOps(ir::Graph *result) const; + + virtual std::vector SortOperations(const ir::Graph &graph) const; + + virtual void ResetState() const; + + int GetOpDeviceID(ir::Node *node, + std::unordered_map> + *delay_ops) const; + + std::vector SortForReduceMode( + const std::vector &topo_ops) const; + + mutable std::vector> bcast_var_name_set_; +}; + +class DistSSAGraphBuilder : public BalanceVarSSAGraphBuilder { + protected: + virtual void Init() const; + + virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const; + + virtual void InsertPostprocessOps(ir::Graph *result) const; + + virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const; + + virtual void ResetState() const; + + int CreateRPCOp(ir::Graph *result, ir::Node *node) const; + + int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const; + + mutable std::vector> bcast_var_name_set_; + mutable bool need_broadcast_var_{false}; +}; + +std::unordered_set &MultiDevSSAGraphBuilder(); + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index f1642bc0d2b10f97295e80ee201db8f83bfd06ef..86e6b1f7d92bc7bc97180e05f6a7c14ab375f92f 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -40,14 +40,14 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc 
&program_desc, void NaiveExecutor::Run() { #ifndef PADDLE_ON_INFERENCE - LOG_FIRST_N(WARNING, 15) << "The NaiveExecutor can not work properly if the " - "cmake flag ON_INFER is not set."; - LOG_FIRST_N(WARNING, 15) << "Unlike the training phase, all the scopes and " - "variables will be reused to save the allocation " - "overhead."; - LOG_FIRST_N(WARNING, 15) << "Please re-compile the inference library by " - "setting the cmake flag ON_INFER=ON if you are " - "running Paddle Inference"; + LOG_FIRST_N(WARNING, 5) << "The NaiveExecutor can not work properly if the " + "cmake flag ON_INFER is not set."; + LOG_FIRST_N(WARNING, 5) << "Unlike the training phase, all the scopes and " + "variables will be reused to save the allocation " + "overhead."; + LOG_FIRST_N(WARNING, 5) << "Please re-compile the inference library by " + "setting the cmake flag ON_INFER=ON if you are " + "running Paddle Inference"; #endif // PADDLE_ON_INFERENCE for (auto &op : ops_) { VLOG(3) << std::this_thread::get_id() << " run " << op->Type() diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index 57345f12ccc5d59c84001f1c5c1ebdacadc97ed5..7e174c7def1ffa4089a94d9cc504b18843557c53 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -539,7 +539,7 @@ void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const { } } - backend_->call(ngraph_function_, t_out, t_in); + backend_->call(backend_->compile(ngraph_function_), t_out, t_in); } // NgraphEngine::RunImpl } // namespace framework } // namespace paddle diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 2db5705d0944b2ab10defdda9a7b616daa8fd47e..2d8980b1d15d89cdf9c243a57188a0acb354940d 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -123,8 +123,6 @@ struct Argument { DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int); DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool); - DECL_ARGUMENT_FIELD(tensorrt_node_teller, TensorRtNodeTeller, - std::function); DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int); DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index b8c9426ed3b62d35f78247269cb32d2f6344b092..e37fea38bcb2b1f514347ecbfe7072abb6f07455 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -49,13 +49,6 @@ void IRPassManager::CreatePasses(Argument *argument, for (const std::string &pass_name : passes) { auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); - // Set some pass attributes. - if (pass_name == "ir_analysis_pass") { - pass->Set("tensorrt_node_teller", - new SubgraphDetector::NodeInsideSubgraphTeller( - argument->tensorrt_node_teller())); - } - if (pass_name == "graph_viz_pass") { std::string dot_file_path = std::to_string(pass_num) + "_ir_" + (pre_pass.empty() ? 
"origin" : pre_pass) + @@ -70,9 +63,6 @@ void IRPassManager::CreatePasses(Argument *argument, } if (pass_name == "tensorrt_subgraph_pass") { - PADDLE_ENFORCE(argument->tensorrt_node_teller_valid()); - pass->SetNotOwned("tensorrt_node_teller", - argument->tensorrt_node_teller_ptr()); pass->Set("workspace_size", new int(argument->tensorrt_workspace_size())); pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); pass->Set("min_subgraph_size", diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index 822c7799bb3ae6d79da6cf2a7b3c8c9b20353ed7..9ae5b8aa173b85904df360eb196aefe5af08c6aa 100644 --- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -1,9 +1,13 @@ cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc) -cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector) -set(analysis_deps ${analysis_deps} - subgraph_detector tensorrt_subgraph_pass - CACHE INTERNAL "") -set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) -file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n") -set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "") +if (TENSORRT_FOUND) + cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller) + + set(analysis_deps ${analysis_deps} + subgraph_detector tensorrt_subgraph_pass + CACHE INTERNAL "") + + set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) + file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n") + set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "") +endif() diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index ad10010e42be9717e3298fc88c89764e4ae2690b..bc06e78ae6997b0d4d0456c15d6e4158efdad300 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" #include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" +#include "paddle/fluid/inference/tensorrt/op_teller.h" namespace paddle { namespace inference { @@ -35,8 +36,10 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( std::unique_ptr graph) const { framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get()); - auto teller = - Get("tensorrt_node_teller"); + auto teller = [](const framework::ir::Node *node) { + if (!node->IsOp() || !node->Op()) return false; + return tensorrt::OpTeller::Global().Tell(node->Op()->Type(), *node->Op()); + }; SubGraphFuser fuser(graph.get(), teller, Get("min_subgraph_size") /*min subgraph size*/); @@ -232,7 +235,6 @@ std::vector ExtractParameters( REGISTER_PASS(tensorrt_subgraph_pass, paddle::inference::analysis::TensorRtSubgraphPass) - .RequirePassAttr("tensorrt_node_teller") .RequirePassAttr("max_batch_size") .RequirePassAttr("workspace_size") .RequirePassAttr("min_subgraph_size"); diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc index 
c3a2b3ca1d3b09e71921fde0b0bad8d195aaa38f..490189e550760b4de62724e685dd07f6e521445e 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc @@ -27,9 +27,6 @@ namespace analysis { void IrAnalysisComposePass::RunImpl(Argument *argument) { ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes); - if (argument->use_tensorrt_valid() && argument->use_tensorrt()) { - InitTensorRTAttrs(argument); - } ApplyIrPasses(argument); CollectFusionStatis(argument); } @@ -38,26 +35,6 @@ std::string IrAnalysisComposePass::repr() const { return "ir-analysis-compose-pass"; } -void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) { - if (argument->use_tensorrt_valid() && argument->use_tensorrt()) { - LOG(INFO) << "Initing TensorRT pass"; - argument->SetTensorRtNodeTeller([](const framework::ir::Node *node) { - std::unordered_set teller_set( - {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", - "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", - "elementwise_add", "elementwise_mul", "dropout", "split", "prelu", - "conv2d_transpose", "leaky_relu"}); - if (!node->IsOp()) return false; - - if (teller_set.count(node->Op()->Type())) { - return true; - } else { - return false; - } - }); - } -} - void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) { std::vector passes({ "ir_graph_build_pass", "ir_analysis_pass", diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h index 53e2ebb0038a5c105f68a0146b3da90a6ae34af8..16c6b7d84df88d0ebbc06b547c75a45dcb0c2440 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h @@ -33,8 +33,6 @@ class IrAnalysisComposePass : public AnalysisPass { std::string repr() const override; private: - void InitTensorRTAttrs(Argument* argument); - void ApplyIrPasses(Argument* argument); void CollectFusionStatis(Argument* argument); diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 6d6e799fdec9c67b4714f203b91b8bccb61510ba..211c691504de2c0bd8ff50f34b92cbc01397d5c9 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -14,86 +14,101 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_pass_builder.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle_pass_builder.h" // NOLINT +#include "paddle/fluid/platform/gpu_info.h" namespace paddle { PassStrategy *contrib::AnalysisConfig::pass_builder() const { - PADDLE_ENFORCE( - pass_builder_.get(), - "Should call constructor first, that will init the pass_builder_."); + if (!pass_builder_.get()) { + if (use_gpu_) { + LOG(INFO) << "Create GPU IR passes"; + pass_builder_.reset(new GpuPassStrategy); + } else { + LOG(INFO) << "Create CPU IR passes"; + pass_builder_.reset(new CpuPassStrategy); + } + } else if (pass_builder_->use_gpu() ^ use_gpu()) { + LOG(WARNING) << "The use_gpu flag is not compatible between Config and " + "PassBuilder, the flags are " + << use_gpu() << " " << pass_builder_->use_gpu(); + LOG(WARNING) << "Please make them compatible, still use the existing " + "PassBuilder."; + } + return 
pass_builder_.get(); } -contrib::AnalysisConfig::AnalysisConfig(bool use_gpu) { - this->use_gpu = use_gpu; - if (use_gpu) { - pass_builder_.reset(new GpuPassStrategy); - } else { - pass_builder_.reset(new CpuPassStrategy); - } +contrib::AnalysisConfig::AnalysisConfig(const std::string &model_dir) { + model_dir_ = model_dir; +} +contrib::AnalysisConfig::AnalysisConfig(const std::string &prog_file, + const std::string &params_file) { + prog_file_ = prog_file; + params_file_ = params_file; +} +void contrib::AnalysisConfig::SetModel(const std::string &prog_file_path, + const std::string &params_file_path) { + prog_file_ = prog_file_path; + params_file_ = params_file_path; +} +void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, + int device_id) { +#ifdef PADDLE_WITH_CUDA + use_gpu_ = true; + memory_pool_init_size_mb_ = memory_pool_init_size_mb; + device_id_ = device_id; +#else + LOG(ERROR) << "Please compile with GPU support to use EnableUseGpu"; + use_gpu_ = false; +#endif } +void contrib::AnalysisConfig::DisableGpu() { use_gpu_ = false; } contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { - // fields from Config - model_dir = other.model_dir; - // fields from NativeConfig - use_gpu = other.use_gpu; - device = other.device; - fraction_of_gpu_memory = other.fraction_of_gpu_memory; - prog_file = other.prog_file; - param_file = other.param_file; - specify_input_name = other.specify_input_name; - cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_; - // fields from this. - enable_ir_optim = other.enable_ir_optim; - // For mkldnn - use_mkldnn_ = other.use_mkldnn_; - mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_; - - use_feed_fetch_ops = other.use_feed_fetch_ops; - use_tensorrt_ = other.use_tensorrt_; - tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; - tensorrt_workspace_size_ = other.tensorrt_workspace_size_; - tensorrt_min_subgraph_size_ = other.tensorrt_min_subgraph_size_; - model_from_memory_ = other.model_from_memory_; - - if (use_gpu) { +#define CP_MEMBER(member__) member__ = other.member__; + + // Model related. + CP_MEMBER(model_dir_); + CP_MEMBER(prog_file_); + CP_MEMBER(params_file_); + CP_MEMBER(model_from_memory_); // the memory model reuses prog_file_ and + // params_file_ fields. + // GPU related. + CP_MEMBER(use_gpu_); + CP_MEMBER(device_id_); + CP_MEMBER(memory_pool_init_size_mb_); + // TensorRT related. + CP_MEMBER(use_tensorrt_); + CP_MEMBER(tensorrt_workspace_size_); + CP_MEMBER(tensorrt_max_batchsize_); + CP_MEMBER(tensorrt_min_subgraph_size_); + // MKLDNN related. + CP_MEMBER(use_mkldnn_); + CP_MEMBER(mkldnn_enabled_op_types_); + + // IR related. + CP_MEMBER(enable_ir_optim_); + CP_MEMBER(use_feed_fetch_ops_); + CP_MEMBER(ir_debug_); + CP_MEMBER(specify_input_name_); + + CP_MEMBER(cpu_math_library_num_threads_); + + CP_MEMBER(serialized_info_cache_); + + if (use_gpu_) { pass_builder_.reset(new GpuPassStrategy( *static_cast<GpuPassStrategy *>(other.pass_builder()))); } else { pass_builder_.reset(new CpuPassStrategy( *static_cast<CpuPassStrategy *>(other.pass_builder()))); } -} -contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) { - // fields from Config - model_dir = other.model_dir; - // fields from NativeConfig - use_gpu = other.use_gpu; - device = other.device; - fraction_of_gpu_memory = other.fraction_of_gpu_memory; - prog_file = other.prog_file; - param_file = other.param_file; - specify_input_name = other.specify_input_name; - cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_; - // fields from this. - enable_ir_optim = other.enable_ir_optim; - // For mkldnn - use_mkldnn_ = other.use_mkldnn_; - mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_; - - use_feed_fetch_ops = other.use_feed_fetch_ops; - use_tensorrt_ = other.use_tensorrt_; - tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; - tensorrt_workspace_size_ = other.tensorrt_workspace_size_; - tensorrt_min_subgraph_size_ = other.tensorrt_min_subgraph_size_; - model_from_memory_ = other.model_from_memory_; - - pass_builder_ = std::move(other.pass_builder_); +#undef CP_MEMBER } void contrib::AnalysisConfig::EnableMKLDNN() { @@ -112,17 +127,90 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, use_tensorrt_ = true; tensorrt_workspace_size_ = workspace_size; tensorrt_max_batchsize_ = max_batch_size; - tensorrt_min_subgraph_size_ = min_subgraph_size; - // Append after the conv+affine_channel fuse pass. - pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); +} + +void contrib::AnalysisConfig::Update() { + auto info = SerializeInfoCache(); + if (info == serialized_info_cache_) return; + + if (use_gpu_) { + pass_builder_.reset(new GpuPassStrategy); + } else { + pass_builder_.reset(new CpuPassStrategy); + } + + if (use_tensorrt_) { + if (!use_gpu_) { + LOG(ERROR) + << "TensorRT engine is not available when EnableUseGpu() is not called."; + } else { + // Append after the infer_clean pass. + pass_builder()->InsertPass(1, "tensorrt_subgraph_pass"); + } + } + + if (use_mkldnn_) { + if (!enable_ir_optim_) { + LOG(ERROR) + << "EnableMKLDNN() only works when IR optimization is enabled."; + } +#ifdef PADDLE_WITH_MKLDNN + pass_builder()->EnableMKLDNN(); + use_mkldnn_ = true; +#else + LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN"; + use_mkldnn_ = false; +#endif + } + + if (ir_debug_) { + pass_builder()->TurnOnDebug(); + } +} + +std::string contrib::AnalysisConfig::SerializeInfoCache() { + std::stringstream ss; + ss << use_gpu_; + ss << memory_pool_init_size_mb_; + + ss << use_tensorrt_; + ss << tensorrt_workspace_size_; + ss << tensorrt_max_batchsize_; + + ss << use_mkldnn_; + ss << enable_ir_optim_; + ss << use_feed_fetch_ops_; + ss << ir_debug_; + + return ss.str(); +} + +void contrib::AnalysisConfig::SetCpuMathLibraryNumThreads( + int cpu_math_library_num_threads) { + cpu_math_library_num_threads_ = cpu_math_library_num_threads; +} + +float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const { +#ifdef PADDLE_WITH_CUDA + // Get the GPU memory details and calculate the fraction of memory for the + // GPU memory pool. + size_t gpu_used, gpu_available; + platform::GpuMemoryUsage(&gpu_used, &gpu_available); + double total_gpu_memory = (gpu_used + gpu_available) / 1024.
/ 1024.; + float fraction_of_gpu_memory = + static_cast(memory_pool_init_size_mb()) / total_gpu_memory; + return fraction_of_gpu_memory; +#else + return 0.; +#endif } void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, size_t prog_buffer_size, const char *param_buffer, size_t param_buffer_size) { - prog_file = std::string(prog_buffer, prog_buffer + prog_buffer_size); - param_file = std::string(param_buffer, param_buffer + param_buffer_size); + prog_file_ = std::string(prog_buffer, prog_buffer + prog_buffer_size); + params_file_ = std::string(param_buffer, param_buffer + param_buffer_size); model_from_memory_ = true; } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 3aaec10ee2d442f834c490d51d73a58421d2c38f..585634fae9c85f77cc77d774ac166891014a025c 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -33,6 +33,7 @@ #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/profiler.h" DECLARE_bool(profile); @@ -59,8 +60,8 @@ bool AnalysisPredictor::Init( if (FLAGS_profile) { LOG(WARNING) << "Profiler is actived, might affect the performance"; LOG(INFO) << "You can turn off by set gflags '-profile false'"; - auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll - : platform::ProfilerState::kCPU; + auto tracking_device = config_.use_gpu() ? platform::ProfilerState::kAll + : platform::ProfilerState::kCPU; platform::EnableProfiler(tracking_device); } @@ -112,7 +113,7 @@ bool AnalysisPredictor::PrepareProgram( // Optimize the program, and load parameters and modify them in the // scope_. // This will change the scope_ address. 
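The fraction_of_gpu_memory_for_pool() helper defined above converts the user-facing pool size in MB into the legacy fraction flag consumed by the allocator. A minimal sketch of that arithmetic, with hypothetical numbers (the real total comes from platform::GpuMemoryUsage at runtime):

// Illustrative numbers only: a 100 MB pool on a card whose used + available
// memory totals ~8000 MB yields 100 / 8000 = 0.0125, which is later forwarded
// as --fraction_of_gpu_memory_to_use=0.0125 during predictor creation.
uint64_t pool_mb = 100;    // as requested via AnalysisConfig::EnableUseGpu(100, 0)
double total_mb = 8000.0;  // (gpu_used + gpu_available) / 1024. / 1024.
float fraction = static_cast<float>(pool_mb / total_mb);  // == 0.0125f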
- if (config_.enable_ir_optim) { + if (config_.ir_optim()) { status_ir_optim_enabled_ = true; OptimizeInferenceProgram(); } else { @@ -140,9 +141,9 @@ bool AnalysisPredictor::PrepareProgram( return true; } bool AnalysisPredictor::CreateExecutor() { - if (config_.use_gpu) { + if (config_.use_gpu_) { status_use_gpu_ = true; - place_ = paddle::platform::CUDAPlace(config_.device); + place_ = paddle::platform::CUDAPlace(config_.device_id_); } else { place_ = paddle::platform::CPUPlace(); } @@ -151,7 +152,7 @@ bool AnalysisPredictor::CreateExecutor() { } bool AnalysisPredictor::PrepareExecutor() { executor_->Prepare(sub_scope_, *inference_program_, 0, - config_.use_feed_fetch_ops); + config_.use_feed_fetch_ops_); PADDLE_ENFORCE_NOT_NULL(sub_scope_); @@ -250,7 +251,7 @@ bool AnalysisPredictor::SetFeed(const std::vector &inputs, } input.set_lod(lod); int idx = -1; - if (config_.specify_input_name) { + if (config_.specify_input_name_) { auto name = inputs[i].name; if (feed_names_.find(name) == feed_names_.end()) { LOG(ERROR) << "feed names from program do not have name: [" << name @@ -314,22 +315,22 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, void AnalysisPredictor::OptimizeInferenceProgram() { status_program_optimized_ = true; - argument_.SetUseGPU(config_.use_gpu); - argument_.SetGPUDeviceId(config_.device); + argument_.SetUseGPU(config_.use_gpu()); + argument_.SetGPUDeviceId(config_.gpu_device_id()); argument_.SetModelFromMemory(config_.model_from_memory_); // Analyze inference_program - if (!config_.model_dir.empty()) { - argument_.SetModelDir(config_.model_dir); + if (!config_.model_dir().empty()) { + argument_.SetModelDir(config_.model_dir()); } else { PADDLE_ENFORCE( - !config_.param_file.empty(), + !config_.params_file().empty(), "Either model_dir or (param_file, prog_file) should be set."); - PADDLE_ENFORCE(!config_.prog_file.empty()); - argument_.SetModelProgramPath(config_.prog_file); - argument_.SetModelParamsPath(config_.param_file); + PADDLE_ENFORCE(!config_.prog_file().empty()); + argument_.SetModelProgramPath(config_.prog_file()); + argument_.SetModelParamsPath(config_.params_file()); } - if (config_.use_gpu && config_.use_tensorrt_) { + if (config_.use_gpu() && config_.tensorrt_engine_enabled()) { argument_.SetUseTensorRT(true); argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_); argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); @@ -341,7 +342,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { } auto passes = config_.pass_builder()->AllPasses(); - if (!config_.enable_ir_optim) passes.clear(); + if (!config_.ir_optim()) passes.clear(); argument_.SetIrAnalysisPasses(passes); argument_.SetScopeNotOwned(const_cast(scope_.get())); Analyzer().Run(&argument_); @@ -358,18 +359,26 @@ template <> std::unique_ptr CreatePaddlePredictor< AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) { VLOG(3) << "create AnalysisConfig"; - if (config.use_gpu) { + if (config.use_gpu()) { // 1. 
GPU memory - PADDLE_ENFORCE_GT( - config.fraction_of_gpu_memory, 0.f, - "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); - PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); + PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f); + PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d", + config.gpu_device_id()); std::vector<std::string> flags; - if (config.fraction_of_gpu_memory >= 0.0f || - config.fraction_of_gpu_memory <= 0.95f) { + + float fraction_of_gpu_memory = config.fraction_of_gpu_memory_for_pool(); + if (fraction_of_gpu_memory > 0.95f) { + LOG(ERROR) + << "Allocated too much memory for the GPU memory pool, assigned " + << config.memory_pool_init_size_mb() << " MB"; + LOG(ERROR) + << "Try to shrink the value by setting AnalysisConfig::EnableUseGpu(...)"; + } + + if (fraction_of_gpu_memory >= 0.0f && fraction_of_gpu_memory <= 0.95f) { flags.push_back("dummpy"); std::string flag = "--fraction_of_gpu_memory_to_use=" + - std::to_string(config.fraction_of_gpu_memory); + std::to_string(fraction_of_gpu_memory); flags.push_back(flag); VLOG(3) << "set flag: " << flag; framework::InitGflags(flags); @@ -443,22 +452,22 @@ bool AnalysisPredictor::ZeroCopyRun() { bool AnalysisPredictor::LoadProgramDesc() { // Initialize the inference program std::string filename; - if (!config_.model_dir.empty()) { - filename = config_.model_dir + "/__model__"; - } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { + if (!config_.model_dir().empty()) { + filename = config_.model_dir() + "/__model__"; + } else if (!config_.prog_file().empty() && !config_.params_file().empty()) { // All parameters are saved in a single file. // The file names should be consistent with that used // in Python API `fluid.io.save_inference_model`.
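The two branches above distinguish the two on-disk layouts that the new SetModel() overloads can describe; a minimal usage sketch (all paths are placeholders):

// Layout 1: a directory holding __model__ plus one file per parameter.
paddle::contrib::AnalysisConfig dir_cfg;
dir_cfg.SetModel("/path/to/model_dir");

// Layout 2: one program file plus a single combined parameters file, as
// written by fluid.io.save_inference_model when filenames are given.
paddle::contrib::AnalysisConfig file_cfg;
file_cfg.SetModel("/path/to/__model__", "/path/to/__params__");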
- filename = config_.prog_file; + filename = config_.prog_file(); } else { - if (config_.model_dir.empty() && config_.prog_file.empty()) { + if (config_.model_dir().empty() && config_.prog_file().empty()) { LOG(ERROR) << "Either model_dir or (prog_file, param_file) should be set."; return false; } LOG(ERROR) << string::Sprintf( - "not valid model path '%s' or program path '%s'.", config_.model_dir, - config_.param_file); + "not valid model path '%s' or program path '%s'.", config_.model_dir(), + config_.params_file()); return false; } @@ -478,7 +487,7 @@ bool AnalysisPredictor::LoadProgramDesc() { proto.ParseFromString(pb_content); } else { - proto.ParseFromString(config_.prog_file); + proto.ParseFromString(config_.prog_file()); } inference_program_.reset(new framework::ProgramDesc(proto)); return true; @@ -508,27 +517,27 @@ bool AnalysisPredictor::LoadParameters() { new_var->SetLoDLevel(var->GetLoDLevel()); new_var->SetPersistable(true); - if (!config_.param_file.empty()) { + if (!config_.params_file().empty()) { params.push_back(new_var->Name()); } else { // append_op framework::OpDesc *op = load_block->AppendOp(); op->SetType("load"); op->SetOutput("Out", {new_var->Name()}); - op->SetAttr("file_path", {config_.model_dir + "/" + new_var->Name()}); + op->SetAttr("file_path", {config_.model_dir() + "/" + new_var->Name()}); op->CheckAttrs(); } } } - if (!config_.param_file.empty()) { + if (!config_.params_file().empty()) { // sort paramlist to have consistent ordering std::sort(params.begin(), params.end()); // append just the load_combine op framework::OpDesc *op = load_block->AppendOp(); op->SetType("load_combine"); op->SetOutput("Out", params); - op->SetAttr("file_path", {config_.param_file}); + op->SetAttr("file_path", {config_.params_file()}); op->CheckAttrs(); } diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index a361b34437ade36dfba2c99db800a7d77ada8704..6169e60541e4a14d560e719d56624b3219dbcefd 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -25,9 +25,9 @@ namespace paddle { using contrib::AnalysisConfig; TEST(AnalysisPredictor, analysis_off) { - AnalysisConfig config(false); - config.model_dir = FLAGS_dirname; - config.enable_ir_optim = false; + AnalysisConfig config; + config.SetModel(FLAGS_dirname); + config.SwitchIrOptim(false); auto _predictor = CreatePaddlePredictor(config); auto* predictor = static_cast(_predictor.get()); @@ -55,14 +55,14 @@ TEST(AnalysisPredictor, analysis_off) { } TEST(AnalysisPredictor, analysis_on) { + AnalysisConfig config; + config.SetModel(FLAGS_dirname); + config.SwitchIrOptim(true); #ifdef PADDLE_WITH_CUDA - AnalysisConfig config(true); - config.fraction_of_gpu_memory = 0.15; + config.EnableUseGpu(100, 0); #else - AnalysisConfig config; + config.DisableGpu(); #endif - config.model_dir = FLAGS_dirname; - config.enable_ir_optim = true; auto _predictor = CreatePaddlePredictor(config); auto* predictor = static_cast(_predictor.get()); @@ -89,7 +89,8 @@ TEST(AnalysisPredictor, analysis_on) { } // compare with NativePredictor - auto naive_predictor = CreatePaddlePredictor(config); + auto naive_predictor = + CreatePaddlePredictor(config.ToNativeConfig()); std::vector naive_outputs; ASSERT_TRUE(naive_predictor->Run(inputs, &naive_outputs)); ASSERT_EQ(naive_outputs.size(), 1UL); @@ -98,9 +99,8 @@ TEST(AnalysisPredictor, analysis_on) { TEST(AnalysisPredictor, ZeroCopy) { AnalysisConfig config; - 
config.model_dir = FLAGS_dirname; - config.use_feed_fetch_ops = false; - + config.SetModel(FLAGS_dirname); + config.SwitchUseFeedFetchOps(false); auto predictor = CreatePaddlePredictor(config); auto w0 = predictor->GetInputTensor("firstw"); @@ -137,9 +137,9 @@ TEST(AnalysisPredictor, ZeroCopy) { TEST(AnalysisPredictor, Clone) { AnalysisConfig config; - config.model_dir = FLAGS_dirname; - config.use_feed_fetch_ops = true; - config.enable_ir_optim = true; + config.SetModel(FLAGS_dirname); + config.SwitchUseFeedFetchOps(true); + config.SwitchIrOptim(true); std::vector> predictors; predictors.emplace_back(CreatePaddlePredictor(config)); diff --git a/paddle/fluid/inference/api/api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h index 6a8b81cc57281b12cd3a4c89c863b20a824ce34a..e14d93de2c41f740bc175c8e59412d7b828dd381 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.h +++ b/paddle/fluid/inference/api/api_anakin_engine.h @@ -19,8 +19,6 @@ limitations under the License. */ #pragma once -#define WITH_ANAKIN - #include #include "framework/core/net/net.h" diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 102147a493ed1454db1a78124200f163f68e555b..85e250aaaf4a18a261a4bfc5271670f93565a336 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -288,7 +288,7 @@ std::unique_ptr CreatePaddlePredictor< VLOG(3) << "create NativePaddlePredictor"; if (config.use_gpu) { // 1. GPU memeroy - PADDLE_ENFORCE_GT( + PADDLE_ENFORCE_GE( config.fraction_of_gpu_memory, 0.f, "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index 78396397397c3125c3990073d6b2887ebb477ff2..54895679ca37362c7267677af80274b8de95e296 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -295,7 +295,8 @@ TEST(inference_api_native, image_classification_gpu) { #endif TEST(PassBuilder, Delete) { - contrib::AnalysisConfig config(false); + contrib::AnalysisConfig config; + config.DisableGpu(); config.pass_builder()->DeletePass("attention_lstm_fuse_pass"); const auto& passes = config.pass_builder()->AllPasses(); auto it = std::find(passes.begin(), passes.end(), "attention_lstm_fuse_pass"); diff --git a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc index 61ecd7bce683e40bbf89a343bfdbaa2b7051ae73..30215e480f908f353f00cbc9077e6c057222423a 100644 --- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc @@ -36,12 +36,11 @@ namespace demo { */ void Main() { std::unique_ptr predictor; - paddle::contrib::AnalysisConfig config(true); - config.param_file = FLAGS_modeldir + "/__params__"; - config.prog_file = FLAGS_modeldir + "/__model__"; - config.device = 0; + paddle::contrib::AnalysisConfig config; + config.EnableUseGpu(100, 0); + config.SetModel(FLAGS_modeldir + "/__params__", + FLAGS_modeldir + "/__model__"); config.EnableTensorRtEngine(); - config.fraction_of_gpu_memory = 0.1; // set by yourself predictor = CreatePaddlePredictor(config); VLOG(3) << "begin to process data"; diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc index 
bc8891455dc8e4a30ddfcc5f89792296e59c2548..5320992b7e78f4aa0ea8950af03038c1953dd027 100644 --- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -40,15 +40,14 @@ using contrib::AnalysisConfig; */ void Main(bool use_gpu) { std::unique_ptr predictor, analysis_predictor; - AnalysisConfig config(use_gpu); - config.param_file = FLAGS_modeldir + "/__params__"; - config.prog_file = FLAGS_modeldir + "/__model__"; - config.device = 0; - if (FLAGS_use_gpu) { - config.fraction_of_gpu_memory = 0.1; // set by yourself + AnalysisConfig config; + if (use_gpu) { + config.EnableUseGpu(100, 0); } + config.SetModel(FLAGS_modeldir + "/__model__", + FLAGS_modeldir + "/__params__"); - predictor = CreatePaddlePredictor(config); + predictor = CreatePaddlePredictor(config.ToNativeConfig()); analysis_predictor = CreatePaddlePredictor(config); // Just a single batch of data. diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index e7ccea6587a250d9d931fa0e85146e32af714d26..2d61098f933f9e391bc7b2bee9f8fd8518302168 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -34,26 +34,67 @@ class AnalysisPredictor; namespace contrib { // NOTE WIP, not stable yet. -struct AnalysisConfig : public NativeConfig { - explicit AnalysisConfig(bool use_gpu = false); +struct AnalysisConfig { + AnalysisConfig() = default; explicit AnalysisConfig(const AnalysisConfig& other); - explicit AnalysisConfig(AnalysisConfig&& other); + explicit AnalysisConfig(const std::string& model_dir); + explicit AnalysisConfig(const std::string& prog_file, + const std::string& params_file); + + // Model path related. + void SetModel(const std::string& model_dir) { model_dir_ = model_dir; } + void SetModel(const std::string& prog_file_path, + const std::string& params_file_path); + void SetProgFile(const std::string& x) { prog_file_ = x; } + void SetParamsFile(const std::string& x) { params_file_ = x; } + const std::string& model_dir() const { return model_dir_; } + const std::string& prog_file() const { return prog_file_; } + const std::string& params_file() const { return params_file_; } + + // GPU related. + void EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id = 0); + void DisableGpu(); + bool use_gpu() const { return use_gpu_; } + int gpu_device_id() const { return device_id_; } + int memory_pool_init_size_mb() const { return memory_pool_init_size_mb_; } + float fraction_of_gpu_memory_for_pool() const; // Determine whether to perform graph optimization. - bool enable_ir_optim = true; + void SwitchIrOptim(int x = true) { enable_ir_optim_ = x; } + bool ir_optim() const { return enable_ir_optim_; } - // Get a pass builder for customize the passes in IR analysis phase. - PassStrategy* pass_builder() const; + void SwitchUseFeedFetchOps(int x = true) { use_feed_fetch_ops_ = x; } + bool use_feed_fetch_ops_enabled() const { return use_feed_fetch_ops_; } - // NOT stable yet. 
- bool use_feed_fetch_ops{true}; + void SwitchSpecifyInputNames(bool x = true) { specify_input_name_ = x; } + bool specify_input_name() const { return specify_input_name_; } void EnableTensorRtEngine(int workspace_size = 1 << 20, int max_batch_size = 1, int min_subgraph_size = 3); - bool use_tensorrt() const { return use_tensorrt_; } + bool tensorrt_engine_enabled() const { return use_tensorrt_; } + + void SwitchIrDebug(int x = true) { ir_debug_ = x; } void EnableMKLDNN(); - bool use_mkldnn() const { return use_mkldnn_; } + bool mkldnn_enabled() const { return use_mkldnn_; } + + // Set and get the number of CPU math library threads. + void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads); + int cpu_math_library_num_threads() const { + return cpu_math_library_num_threads_; + } + + NativeConfig ToNativeConfig() const { + NativeConfig config; + config.model_dir = model_dir_; + config.prog_file = prog_file_; + config.param_file = params_file_; + config.use_gpu = use_gpu_; + config.device = device_id_; + config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool(); + config.specify_input_name = specify_input_name_; + return config; + } void SetMKLDNNOp(std::unordered_set<std::string> op_list) { mkldnn_enabled_op_types_ = op_list; } @@ -65,10 +106,29 @@ struct AnalysisConfig : public NativeConfig { friend class ::paddle::AnalysisPredictor; + // NOTE: just for developers; not an official API and easily broken. + // Get a pass builder for customizing the passes in the IR analysis phase. + PassStrategy* pass_builder() const; + + protected: + // Update the config. + void Update(); + + std::string SerializeInfoCache(); + protected: + // Model paths. + std::string model_dir_; + std::string prog_file_; + std::string params_file_; + + // GPU related. + bool use_gpu_{false}; + int device_id_{0}; + uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB. + + // TensorRT related. bool use_tensorrt_{false}; - bool use_mkldnn_{false}; - std::unordered_set<std::string> mkldnn_enabled_op_types_; // For workspace_size, refer it from here: // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting int tensorrt_workspace_size_; @@ -82,17 +142,24 @@ struct AnalysisConfig : public NativeConfig { // We set this variable to control the minimum number of nodes in the // subgraph, 3 as default value. int tensorrt_min_subgraph_size_{3}; - std::unique_ptr<PassStrategy> pass_builder_; + + bool use_mkldnn_{false}; + std::unordered_set<std::string> mkldnn_enabled_op_types_; + bool model_from_memory_{false}; -}; -// Configurations for Anakin engine. -struct AnakinConfig : public PaddlePredictor::Config { - enum TargetType { NVGPU = 0, X86 }; - int device; - std::string model_file; - int max_batch_size{-1}; - TargetType target_type; + bool enable_ir_optim_{true}; + bool use_feed_fetch_ops_{true}; + bool ir_debug_{false}; + + bool specify_input_name_{false}; + + int cpu_math_library_num_threads_{1}; + + // A runtime cache, shouldn't be transferred to others. + std::string serialized_info_cache_; + + mutable std::unique_ptr<PassStrategy> pass_builder_; }; } // namespace contrib diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 92fb51d647cf4e2c8a4914d8df2e8b7b6318d1d1..1785bd520a17d5f5060d789b2e4e4f1eda26aa6a 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -26,9 +26,8 @@ limitations under the License.
*/ #include #include -#include "paddle_api.h" // NOLINT -#ifndef WITH_ANAKIN #include "paddle_analysis_config.h" // NOLINT -#else +#include "paddle_api.h" // NOLINT +#ifdef WITH_ANAKIN #include "paddle_anakin_config.h" // NOLINT #endif diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 1062ac5f58b90d8649dae8bacc9ce154b8b9d844..b4cbc40e0fa0313412af0acb5a4cc620d9a8ae50 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -62,7 +62,12 @@ class PassStrategy : public PaddlePassBuilder { // still some CPU kernels running in CPU mode. virtual void EnableMKLDNN() = 0; + bool use_gpu() const { return use_gpu_; } + virtual ~PassStrategy() = default; + + protected: + bool use_gpu_{false}; }; /* @@ -88,6 +93,7 @@ class CpuPassStrategy : public PassStrategy { "conv_eltwiseadd_bn_fuse_pass", // "is_test_pass", // }); + use_gpu_ = false; } virtual ~CpuPassStrategy() = default; @@ -126,10 +132,14 @@ class GpuPassStrategy : public PassStrategy { "conv_elementwise_add2_act_fuse_pass", // "conv_elementwise_add_fuse_pass", // }); + + use_gpu_ = true; } GpuPassStrategy(const GpuPassStrategy &other) - : PassStrategy(other.AllPasses()) {} + : PassStrategy(other.AllPasses()) { + use_gpu_ = true; + } void EnableMKLDNN() override; diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index 17f6c6d9f10abf99fd93364d1356e2b3ef1b3934..9afeafd176c70bc03166ec7732ae5e2faf67ea54 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -1,4 +1,5 @@ nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context) +nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto) nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine) add_subdirectory(plugin) diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc new file mode 100644 index 0000000000000000000000000000000000000000..9fecad6eb3889f48f2e0012a718ed0d04f34ae66 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tensorrt/op_teller.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +// Just tell by the op_types. 
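For reference, the call-site pattern for the teller defined below, mirroring the lambda added in tensorrt_subgraph_pass.cc earlier in this patch (node is assumed to be a valid framework::ir::Node*):

// A node is convertible to a TensorRT layer iff some registered teller accepts it.
bool convertible =
    node->IsOp() && node->Op() &&
    paddle::inference::tensorrt::OpTeller::Global().Tell(node->Op()->Type(),
                                                         *node->Op());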
+struct SimpleOpTypeSetTeller : public Teller { + SimpleOpTypeSetTeller() {} + + bool operator()(const std::string& op_type, + const framework::OpDesc& desc) override { + return teller_set.count(op_type); + } + + private: + std::unordered_set<std::string> teller_set{ + {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", + "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", + "elementwise_add", "elementwise_mul", "dropout", "split", "prelu", + "conv2d_transpose", "leaky_relu"}}; +}; + +bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) { + for (auto& teller : tellers_) { + if ((*teller)(op_type, desc)) return true; + } + return false; +} + +OpTeller::OpTeller() { tellers_.emplace_back(new SimpleOpTypeSetTeller); } + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/op_teller.h b/paddle/fluid/inference/tensorrt/op_teller.h new file mode 100644 index 0000000000000000000000000000000000000000..b98f052bf2478098d74f19858ec79823d5ab1e2d --- /dev/null +++ b/paddle/fluid/inference/tensorrt/op_teller.h @@ -0,0 +1,68 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include <string> +#include <vector> +#include "paddle/fluid/framework/op_desc.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Single Op teller definition. + * One can override this and define a more complex tell logic, considering more + * issues such as op_desc. + */ +struct Teller { + virtual bool operator()(const std::string& op_type, + const framework::OpDesc& desc) = 0; + + virtual ~Teller() = default; +}; +/* + * A real example: + * + * struct SomeTeller : public Teller { + * bool operator()(const std::string& op_type, + * const framework::OpDesc& desc) override { + * return op_type == "fc" && desc.Inputs().size() == 2; + * } +*}; + */ + +/* + * class OpTeller helps to tell whether a fluid + * operator can be transformed to a TensorRT layer. + */ +class OpTeller { + public: + static OpTeller& Global() { + static std::unique_ptr<OpTeller> x(new OpTeller); + return *x; + } + + bool Tell(const std::string& op_type, const framework::OpDesc& desc); + + private: + OpTeller(); + + private: + std::vector<std::unique_ptr<Teller>> tellers_; +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index 12d61d06ce188a2478448373427f2defae5a2524..5ad6e4a8570b309e94375234d673e27698999cb7 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -165,12 +165,9 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, } void SetConfig(contrib::AnalysisConfig *cfg) { - cfg->prog_file = FLAGS_infer_model + "/__model__"; - cfg->param_file = FLAGS_infer_model + "/param"; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param"); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(true); } void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index 2213971c1764b1a0bddfce5830bbdf2ffedd61ee..b9666e01adb23e0cbd9257bc55081c3a5001e887 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -105,11 +105,10 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data, } void SetConfig(AnalysisConfig *cfg) { - cfg->model_dir = FLAGS_infer_model; - cfg->use_gpu = false; - cfg->device = 0;
- cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc index 764ae5ed8506a7ed7dc51a5c36d0dd7e9df925f3..629981d565f1b6eeabc192287cb9f892df21b8e4 100644 --- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc @@ -21,12 +21,10 @@ namespace inference { namespace analysis { void SetConfig(AnalysisConfig *cfg) { - cfg->param_file = FLAGS_infer_model + "/params"; - cfg->prog_file = FLAGS_infer_model + "/model"; - cfg->use_gpu = false; - cfg->device = 0; - cfg->enable_ir_optim = true; - cfg->specify_input_name = true; + cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params"); + cfg->DisableGpu(); + cfg->SwitchIrOptim(); + cfg->SwitchSpecifyInputNames(); cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); } diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index 17f4587a5093a2f1cd2d8acc0e17f2129ad36353..3c52afbfb8f60b1e9389d416a5640c9685d8e764 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -204,12 +204,10 @@ void PrepareZeroCopyInputs(ZeroCopyTensor *lod_attention_tensor, } void SetConfig(AnalysisConfig *cfg) { - cfg->prog_file = FLAGS_infer_model + "/__model__"; - cfg->param_file = FLAGS_infer_model + "/param"; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param"); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { @@ -225,10 +223,10 @@ void SetInput(std::vector> *inputs) { // Easy for profiling independently. TEST(Analyzer_rnn1, profile) { - contrib::AnalysisConfig cfg(false); + contrib::AnalysisConfig cfg; SetConfig(&cfg); - cfg.fraction_of_gpu_memory = 0.1; - cfg.pass_builder()->TurnOnDebug(); + cfg.DisableGpu(); + cfg.SwitchIrDebug(); std::vector outputs; std::vector> input_slots_all; @@ -293,16 +291,18 @@ TEST(Analyzer_rnn1, multi_thread) { TEST(Analyzer_rnn1, ZeroCopy) { AnalysisConfig config; SetConfig(&config); - config.use_feed_fetch_ops = false; + config.SwitchUseFeedFetchOps(false); PaddlePlace place; auto predictor = CreatePaddlePredictor(config); - config.use_feed_fetch_ops = true; - auto native_predictor = CreatePaddlePredictor(config); + config.SwitchUseFeedFetchOps(true); + auto native_predictor = + CreatePaddlePredictor(config.ToNativeConfig()); - config.use_feed_fetch_ops = true; // the analysis predictor needs feed/fetch. + config.SwitchUseFeedFetchOps( + true); // the analysis predictor needs feed/fetch. 
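The zero-copy path exercised by this test, in condensed form; a sketch assuming the same word-vector model with a "firstw" input that these tests load, with shape and values illustrative only:

paddle::contrib::AnalysisConfig config;
config.SetModel(FLAGS_dirname);       // model location comes from the test flag
config.SwitchUseFeedFetchOps(false);  // zero-copy requires feed/fetch ops off
auto predictor = paddle::CreatePaddlePredictor(config);

auto input = predictor->GetInputTensor("firstw");
input->Reshape({4, 1});  // hypothetical batch of four word ids
auto *data = input->mutable_data<int64_t>(paddle::PaddlePlace::kCPU);
for (int i = 0; i < 4; ++i) data[i] = static_cast<int64_t>(i);
ASSERT_TRUE(predictor->ZeroCopyRun());  // results are read via GetOutputTensor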
auto analysis_predictor = CreatePaddlePredictor(config); #define NEW_TENSOR(name__) \ @@ -362,7 +362,7 @@ TEST(Analyzer_rnn1, ZeroCopy) { TEST(Analyzer_rnn1, ZeroCopyMultiThread) { AnalysisConfig config; SetConfig(&config); - config.use_feed_fetch_ops = false; + config.SwitchUseFeedFetchOps(false); #define NEW_TENSOR(name__) \ auto name__##_tensor = predictor->GetInputTensor(#name__); diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc index f8354e76871e7f489fd21f2f74e7402db01845c3..007f9f0b66a7b276f5f2e8500a3001788ad41e79 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc @@ -105,12 +105,10 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } void SetConfig(AnalysisConfig *cfg) { - cfg->prog_file = FLAGS_infer_model + "/__model__"; - cfg->param_file = FLAGS_infer_model + "/param"; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param"); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc index e6d6cd2960b394e8cd20b473bed90ce511f806be..47c1d7375843e4bad212c1d7d621c9e6d45e5982 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -89,11 +89,10 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } void SetConfig(AnalysisConfig *cfg) { - cfg->model_dir = FLAGS_infer_model; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc index 1c251e0c22f1ec88f0e59c71d623e4e0585db795..a1742f606819334e7b15e644f8b9e330795bf16e 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -122,12 +122,9 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data) { } void SetConfig(AnalysisConfig *cfg) { - cfg->param_file = FLAGS_infer_model + "/params"; - cfg->prog_file = FLAGS_infer_model + "/model"; - cfg->use_gpu = false; - cfg->device = 0; - cfg->enable_ir_optim = true; - cfg->specify_input_name = true; + cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params"); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); cfg->pass_builder()->TurnOnDebug(); cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); } diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc index 79f3c81ade450fa00419b652042b2cfc79b08e4c..7b448a3200351f902df277f7a653cf7114becba0 100644 --- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc @@ -47,11 +47,10 @@ struct DataReader { }; void SetConfig(AnalysisConfig *cfg) { - cfg->model_dir = FLAGS_infer_model; - cfg->use_gpu = 
false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index d73bccefd5fc8a8ad8679b7de3feac50f786daed..5a77b53a8513cdbef5620d36ba5e0722ae993916 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -51,12 +51,11 @@ Record ProcessALine(const std::string &line) { } void SetConfig(AnalysisConfig *cfg) { - cfg->param_file = FLAGS_infer_model + "/__params__"; - cfg->prog_file = FLAGS_infer_model + "/__model__"; - cfg->use_gpu = false; - cfg->device = 0; - cfg->enable_ir_optim = true; - cfg->specify_input_name = true; + cfg->SetModel(FLAGS_infer_model + "/__model__", + FLAGS_infer_model + "/__params__"); + cfg->DisableGpu(); + cfg->SwitchIrDebug(); + cfg->SwitchSpecifyInputNames(); // TODO(TJ): fix fusion gru cfg->pass_builder()->DeletePass("fc_gru_fuse_pass"); } diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h index 7046bce303e2bd46197ab512ae273500b9af88bf..cf0f1d5c18c79e34c96d4301dbf13c924ae2a3f0 100644 --- a/paddle/fluid/inference/tests/api/config_printer.h +++ b/paddle/fluid/inference/tests/api/config_printer.h @@ -64,19 +64,21 @@ std::ostream &operator<<(std::ostream &os, num_spaces++; os << *reinterpret_cast<const NativeConfig *>(&config); if (!config.model_from_memory()) { - os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n"; - os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n"; + os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file() << "\n"; + os << GenSpaces(num_spaces) << "param_file: " << config.params_file() + << "\n"; } else { os << GenSpaces(num_spaces) << "prog_file and param_file: load from memory \n"; } - os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.enable_ir_optim + os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim() << "\n"; + os << GenSpaces(num_spaces) + << "use_feed_fetch_ops: " << config.use_feed_fetch_ops_enabled() << "\n"; os << GenSpaces(num_spaces) - << "use_feed_fetch_ops: " << config.use_feed_fetch_ops << "\n"; - os << GenSpaces(num_spaces) << "use_tensorrt: " << config.use_tensorrt() + << "use_tensorrt: " << config.tensorrt_engine_enabled() << "\n"; + os << GenSpaces(num_spaces) << "use_mkldnn: " << config.mkldnn_enabled() << "\n"; - os << GenSpaces(num_spaces) << "use_mkldnn: " << config.use_mkldnn() << "\n"; num_spaces--; os << GenSpaces(num_spaces) << "}\n"; return os; diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 7eb44d9f4ea6e27a504984eac4f960bddc9032e1..41d033df85811a4730cab8b3275aaffd1ba338e5 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -328,7 +328,10 @@ void CompareNativeAndAnalysis( const std::vector<std::vector<PaddleTensor>> &inputs) { PrintConfig(config, true); std::vector<PaddleTensor> native_outputs, analysis_outputs; - TestOneThreadPrediction(config, inputs, &native_outputs, false); + const auto *analysis_config = + reinterpret_cast<const contrib::AnalysisConfig *>(config); + auto native_config = analysis_config->ToNativeConfig(); +
TestOneThreadPrediction(&native_config, inputs, &native_outputs, false); TestOneThreadPrediction(config, inputs, &analysis_outputs, true); CompareResult(analysis_outputs, native_outputs); } diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index d3bd035c1c49c926fc9f5ed83085b2e6d9ca8c93..21df6eab814dad5e2f654bf6d9558a2f9859d5ae 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -46,22 +46,20 @@ void SetConfig(contrib::AnalysisConfig* config, std::string model_dir, bool use_gpu, bool use_tensorrt, int batch_size) { if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { - config->prog_file = model_dir + "/" + FLAGS_prog_filename; - config->param_file = model_dir + "/" + FLAGS_param_filename; + config->SetModel(model_dir + "/" + FLAGS_prog_filename, + model_dir + "/" + FLAGS_param_filename); } else { - config->model_dir = model_dir; + config->SetModel(model_dir); } if (use_gpu) { - config->use_gpu = true; - config->device = 0; - config->fraction_of_gpu_memory = 0.15; + config->EnableUseGpu(100, 0); if (use_tensorrt) { config->EnableTensorRtEngine(1 << 10, batch_size); config->pass_builder()->DeletePass("conv_bn_fuse_pass"); config->pass_builder()->DeletePass("fc_fuse_pass"); config->pass_builder()->TurnOnDebug(); } else { - config->enable_ir_optim = true; + config->SwitchIrOptim(); } } } @@ -77,7 +75,8 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) { std::vector outputs; if (use_analysis || use_tensorrt) { - contrib::AnalysisConfig config(true); + contrib::AnalysisConfig config; + config.EnableUseGpu(100, 0); config.pass_builder()->TurnOnDebug(); SetConfig(&config, model_dir, true, use_tensorrt, FLAGS_batch_size); @@ -109,7 +108,8 @@ void compare(std::string model_dir, bool use_tensorrt) { &native_outputs, false); std::vector analysis_outputs; - contrib::AnalysisConfig analysis_config(true); + contrib::AnalysisConfig analysis_config; + analysis_config.EnableUseGpu(50, 0); SetConfig(&analysis_config, model_dir, true, use_tensorrt, FLAGS_batch_size); TestOneThreadPrediction( @@ -154,9 +154,9 @@ TEST(TensorRT_mobilenet, analysis) { TEST(AnalysisPredictor, use_gpu) { std::string model_dir = FLAGS_infer_model + "/" + "mobilenet"; - AnalysisConfig config(true); - config.model_dir = model_dir; - config.fraction_of_gpu_memory = 0.15; + AnalysisConfig config; + config.EnableUseGpu(100, 0); + config.SetModel(model_dir); config.pass_builder()->TurnOnDebug(); std::vector> inputs_all; diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 0f2bb8c65cf92aea55bfe77d03e0789cb193296e..03d9d466c3238c6c853bca75f5b9791a0841ff78 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -319,6 +319,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector dilations = ctx.Attr>("dilations"); int groups = ctx.Attr("groups"); + bool fuse_relu = ctx.Attr("fuse_relu"); + bool force_fp32_output = ctx.Attr("force_fp32_output"); bool is_conv3d = strides.size() == 3U; @@ -329,6 +331,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { dilations[2] == 1 : dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, "dilation in convolution is not implemented yet"); + PADDLE_ENFORCE(is_conv3d != true, "int8 does not support conv3d currently"); const T* input_data = input->data(); @@ -340,15 +343,24 
@@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { GetWeightsTz(weights_tz, g, is_conv3d); std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + mkldnn::memory::data_type src_dt = + paddle::framework::ToMKLDNNDataType(input->type()); + auto dst_dt = fuse_relu ? paddle::framework::ToMKLDNNDataType( + framework::DataTypeTrait::DataType) + : paddle::framework::ToMKLDNNDataType( + framework::DataTypeTrait::DataType); + + if (force_fp32_output) { + dst_dt = paddle::framework::ToMKLDNNDataType( + framework::DataTypeTrait::DataType); + } + // Get unique name for storing MKLDNN primitives std::string key; key.reserve(MaxKeyLength); - mkldnn::memory::data_type src_dt = - paddle::framework::ToMKLDNNDataType(input->type()); platform::ConvMKLDNNHandler::AppendKey( &key, src_tz, weights_tz, strides, paddings, dilations, groups, src_dt, - input->format(), ctx.op().Output("Output")); - + input->format(), dst_dt, ctx.op().Output("Output")); const std::string key_conv_pd = key + "@conv_pd"; std::shared_ptr conv_p = nullptr; @@ -413,13 +425,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { platform::MKLDNNMemDesc(src_tz, src_dt, chosen_memory_format); auto weights_md = platform::MKLDNNMemDesc( weights_tz, memory::data_type::s8, chosen_memory_format); - - auto dst_dt = force_fp32_output - ? paddle::framework::ToMKLDNNDataType( - framework::DataTypeTrait::DataType) - : paddle::framework::ToMKLDNNDataType( - framework::DataTypeTrait::DataType); - auto dst_md = platform::MKLDNNMemDesc(dst_tz, dst_dt, chosen_memory_format); // create a conv primitive descriptor and save it for usage in backward @@ -429,11 +434,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { memory::format::x); conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine, - output_shift_scale, is_test); + fuse_relu, output_shift_scale, is_test); } else { - conv_pd = - ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, - mkldnn_engine, output_shift_scale, is_test); + conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, + paddings, mkldnn_engine, fuse_relu, + output_shift_scale, is_test); } // Save conv_pd/src_memory/weights_memory for backward pass dev_ctx.SetBlob(key_conv_pd, conv_pd); @@ -459,7 +464,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { mask_reorder); if (!force_fp32_output) { - dst_memory_p = platform::SetDstMemory(ctx, output, handler); + if (fuse_relu) { + dst_memory_p = platform::SetDstMemory(ctx, output, handler); + } else { + dst_memory_p = platform::SetDstMemory(ctx, output, handler); + } } else { dst_memory_p = platform::SetDstMemory(ctx, output, handler); } @@ -518,8 +527,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { mkldnn_engine, key)); } if (!force_fp32_output) { - dst_memory_p = - platform::SetDstMemoryHandler(ctx, output, handler); + if (fuse_relu) { + dst_memory_p = + platform::SetDstMemoryHandler(ctx, output, handler); + } else { + dst_memory_p = + platform::SetDstMemoryHandler(ctx, output, handler); + } } else { dst_memory_p = platform::SetDstMemoryHandler(ctx, output, handler); @@ -563,11 +577,18 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { } mkldnn::primitive_attr CreatePostOps( - const std::vector output_shift_scale) const { + bool fuse_relu, const std::vector output_shift_scale) const { mkldnn::primitive_attr conv_attr; mkldnn::post_ops post_operations; int mask = output_shift_scale.size() > 1 ? 
1 << 1 : 0; conv_attr.set_output_scales(mask, output_shift_scale); + if (fuse_relu) { + constexpr float scale = 1.0f; + constexpr float negative_slope = 0.0f; + constexpr float placeholder = 1.0f; // beta + post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu, + negative_slope, placeholder); + } conv_attr.set_post_ops(post_operations); return conv_attr; } @@ -600,7 +621,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, const memory::desc& dst, const std::vector<int>& strides, const std::vector<int>& paddings, - const mkldnn::engine& engine, + const mkldnn::engine& engine, const bool fuse_relu, const std::vector<float> output_shift_scale, bool is_test) const { memory::dims stride_dims = {strides[0], strides[1]}; @@ -613,7 +634,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { propagation, mkldnn::convolution_direct, src, weights, dst, stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); - mkldnn::primitive_attr conv_attr = CreatePostOps(output_shift_scale); + mkldnn::primitive_attr conv_attr = + CreatePostOps(fuse_relu, output_shift_scale); auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( conv_desc, conv_attr, engine); @@ -652,7 +674,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { const memory::desc& bias, const memory::desc& dst, const std::vector<int>& strides, const std::vector<int>& paddings, - const mkldnn::engine& engine, + const mkldnn::engine& engine, const bool fuse_relu, const std::vector<float> output_shift_scale, bool is_test) const { memory::dims stride_dims = {strides[0], strides[1]}; @@ -665,7 +687,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { propagation, mkldnn::convolution_direct, src, weights, bias, dst, stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); - mkldnn::primitive_attr conv_attr = CreatePostOps(output_shift_scale); + mkldnn::primitive_attr conv_attr = + CreatePostOps(fuse_relu, output_shift_scale); auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( conv_desc, conv_attr, engine); diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc index 998b7f09c3146dcdd57fda13d7834473693eaf9c..1da14631e35608d479e1b861228d52d6d57def79 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.cc +++ b/paddle/fluid/operators/linear_chain_crf_op.cc @@ -230,10 +230,12 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel { if (ctx->HasOutput(framework::GradVarName("Emission"))) { ctx->SetOutputDim(framework::GradVarName("Emission"), emission_exps_dims); + ctx->ShareLoD("Emission", framework::GradVarName("Emission")); } if (ctx->HasOutput(framework::GradVarName("Transition"))) { ctx->SetOutputDim(framework::GradVarName("Transition"), transition_exps_dims); + ctx->ShareLoD("Transition", framework::GradVarName("Transition")); } } diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index d35073029a3440d8a17e383ce97fcfc582663888..58f7be12ce6b5d447e93cf86c4954a86fccf48ef 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -62,27 +62,19 @@ struct CUBlas<float> { cudaDataType_t Atype, int lda, const void *B, cudaDataType_t Btype, int ldb, const float *beta, void *C, cudaDataType_t Ctype, int ldc) { - // Because the gcc 4.8 doesn't expand template parameter pack that - // appears in a lambda-expression, I can not use template
parameter pack - // here. - auto cublas_call = [&]() { +// Because gcc 4.8 doesn't expand a template parameter pack that +// appears in a lambda-expression, we cannot use a template parameter +// pack here. #if CUDA_VERSION >= 8000 - VLOG(5) << "use_tensor_op_math: " - << (platform::TensorCoreAvailable() ? "True" : "False"); + VLOG(5) << "use_tensor_op_math: " + << (dev_ctx->tensor_core_available() ? "True" : "False"); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { PADDLE_ENFORCE(platform::dynload::cublasSgemmEx( - dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype, - lda, B, Btype, ldb, beta, C, Ctype, ldc)); + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc)); + }); #else - PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0"); -#endif - }; - -#if CUDA_VERSION >= 9000 - // NOTES: To use Tensor Core, we should change the cublas config, - // but the cublas may be hold by multi-thread. - dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); -#else - cublas_call(); + PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0"); #endif } }; @@ -170,32 +162,24 @@ struct CUBlas<platform::float16> { cudaDataType_t Btype, int ldb, const void *beta, void *C, cudaDataType_t Ctype, int ldc, cudaDataType_t computeType) { - auto cublas_call = [&]() { #if CUDA_VERSION >= 8000 - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 - bool use_tensor_op_math = platform::TensorCoreAvailable(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); + bool use_tensor_op_math = dev_ctx->tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); #endif // CUDA_VERSION >= 9000 + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { PADDLE_ENFORCE(platform::dynload::cublasGemmEx( - dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype, - lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo)); + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc, computeType, algo)); + }); #else - PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0"); -#endif - }; - -#if CUDA_VERSION >= 9000 - // NOTES: To use Tensor Core, we should change the cublas config, - // but the cublas may be hold by multi-thread.
- dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); -#else - cublas_call(); + PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0"); #endif } }; @@ -223,9 +207,10 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, CUDA_R_32F, N); } else { #endif // CUDA_VERSION >= 8000 - - CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, - &alpha, B, ldb, A, lda, &beta, C, N); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, N); + }); #if CUDA_VERSION >= 8000 } @@ -266,9 +251,12 @@ inline void Blas::GEMM( CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, CUDA_R_32F); #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, - N, M, K, &h_alpha, h_B, ldb, h_A, lda, - &h_beta, h_C, N); + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, + &h_alpha, h_B, ldb, h_A, lda, &h_beta, h_C, + N); + }); #endif // CUDA_VERSION >= 8000 } @@ -292,8 +280,10 @@ void Blas::GEMM(bool transA, bool transB, int M, } else { #endif // CUDA_VERSION >= 8000 - CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, - &alpha, B, ldb, A, lda, &beta, C, ldc); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, ldc); + }); #if CUDA_VERSION >= 8000 } @@ -311,16 +301,19 @@ inline void Blas::GEMM( cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, - N, M, K, &alpha, B, ldb, A, lda, &beta, C, - ldc); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, + B, ldb, A, lda, &beta, C, ldc); + }); } template <> template void Blas::AXPY(int n, T alpha, const T *x, T *y) const { - CUBlas::AXPY(context_.cublas_handle(), n, &alpha, x, 1, y, 1); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); + }); } template <> @@ -330,8 +323,9 @@ void Blas::GEMV(bool trans_a, int M, int N, T beta, T *C) const { cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBlas::GEMV(context_.cublas_handle(), cuTransA, N, M, &alpha, A, N, B, 1, - &beta, C, 1); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); + }); } template <> @@ -353,28 +347,28 @@ void Blas::BatchedGEMM( #if CUDA_VERSION >= 9010 if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { - auto cublas_call = [&]() { - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = platform::TensorCoreAvailable(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); - + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? 
"True" : "False"); + + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { PADDLE_ENFORCE(platform::dynload::cublasGemmStridedBatchedEx( - context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, - CUDA_R_32F, ldb, strideB, A, CUDA_R_32F, lda, strideA, &beta, C, - CUDA_R_32F, ldc, strideC, batchCount, CUDA_R_32F, algo)); - }; - auto &dev_ctx = const_cast(context_); - dev_ctx.CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); + handle, cuTransB, cuTransA, N, M, K, &alpha, B, CUDA_R_32F, ldb, + strideB, A, CUDA_R_32F, lda, strideA, &beta, C, CUDA_R_32F, ldc, + strideC, batchCount, CUDA_R_32F, algo)); + }); } else { #endif // CUDA_VERSION >= 9010 - CUBlas::GEMM_STRIDED_BATCH(context_.cublas_handle(), cuTransB, cuTransA, - N, M, K, &alpha, B, ldb, strideB, A, lda, - strideA, &beta, C, ldc, strideC, batchCount); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_STRIDED_BATCH(handle, cuTransB, cuTransA, N, M, K, &alpha, + B, ldb, strideB, A, lda, strideA, &beta, C, + ldc, strideC, batchCount); + }); #if CUDA_VERSION >= 9010 } diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 5c559484ec95e794ebbbe0e713cb9e26b5c01b98..61b9384f8422cb531a94096875434ffe36ecdbce 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -424,16 +424,23 @@ class AdamOpKernel : public framework::OpKernel { } } + framework::SelectedRows cpu_grad_merge; const framework::SelectedRows* grad_merge_ptr; if (is_strict_sorted) { grad_merge_ptr = &grad; } else { // merge duplicated rows if any. // The rows of grad_merge have been sorted inside MergeAdd functor + framework::SelectedRows* grad_merge_var; scatter::MergeAdd merge_func; - auto* grad_merge_var = const_cast(ctx.scope()) - .Var() - ->GetMutable(); + if (platform::is_cpu_place(ctx.GetPlace())) { + grad_merge_var = &cpu_grad_merge; + } else { + // FIXME(qiao): GPU also need to fix this + grad_merge_var = const_cast(ctx.scope()) + .Var() + ->GetMutable(); + } merge_func(ctx.template device_context(), grad, grad_merge_var, true); grad_merge_ptr = grad_merge_var; diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..122de72e15d587cf33b5d9856ac8b1243f666881 --- /dev/null +++ b/paddle/fluid/platform/cuda_helper.h @@ -0,0 +1,58 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include <mutex> // NOLINT + +#include "paddle/fluid/platform/dynload/cublas.h" +#include "paddle/fluid/platform/macros.h" + +#if CUDA_VERSION < 9000 +enum cublasMath_t { CUBLAS_DEFAULT_MATH = 0 }; +#endif + +namespace paddle { +namespace platform { + +class CublasHandleHolder { + public: + CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) { + PADDLE_ENFORCE(dynload::cublasCreate(&handle_)); + PADDLE_ENFORCE(dynload::cublasSetStream(handle_, stream)); +#if CUDA_VERSION >= 9000 + if (math_type == CUBLAS_TENSOR_OP_MATH) { + PADDLE_ENFORCE( + dynload::cublasSetMathMode(handle_, CUBLAS_TENSOR_OP_MATH)); + } +#endif + } + + ~CublasHandleHolder() { PADDLE_ENFORCE(dynload::cublasDestroy(handle_)); } + + template <typename Callback> + inline void Call(Callback &&callback) const { + std::lock_guard<std::mutex> guard(mtx_); + callback(handle_); + } + + private: + DISABLE_COPY_AND_ASSIGN(CublasHandleHolder); + + cublasHandle_t handle_; + mutable std::mutex mtx_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 022afb686b29c2c493cfd05600ee372470cbc710..be7f4949d65cef36d61b726c1c656f177e298fcc 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -245,8 +245,15 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) eigen_stream_.reset(new EigenCudaStreamDevice()); eigen_stream_->Reinitialize(&stream_, place); eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); - PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_)); - PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_)); + cublas_handle_.reset(new CublasHandleHolder(stream_, CUBLAS_DEFAULT_MATH)); + + if (TensorCoreAvailable()) { +#if CUDA_VERSION >= 9000 + cublas_tensor_core_handle_.reset( + new CublasHandleHolder(stream_, CUBLAS_TENSOR_OP_MATH)); +#endif + } + if (dynload::HasCUDNN()) { cudnn_holder_.reset(new CudnnHolder(&stream_, place)); } @@ -306,7 +313,8 @@ CUDADeviceContext::~CUDADeviceContext() { SetDeviceId(place_.device); Wait(); WaitStreamCallback(); - PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_)); + cublas_handle_.reset(); + cublas_tensor_core_handle_.reset(); eigen_stream_.reset(); eigen_device_.reset(); PADDLE_ENFORCE(cudaStreamDestroy(stream_)); @@ -335,8 +343,8 @@ Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { return eigen_device_.get(); } -cublasHandle_t CUDADeviceContext::cublas_handle() const { - return cublas_handle_; +bool CUDADeviceContext::tensor_core_available() const { + return cublas_tensor_core_handle_ != nullptr; } cudnnHandle_t CUDADeviceContext::cudnn_handle() const { diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 7e875801893f3b73f8efaf33af690f8c855beee4..c81d17380cf894631d06588c007c2e11ce5c7836 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -20,6 +20,7 @@ limitations under the License.
*/ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/temporary_allocator.h" #ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/gpu_info.h" @@ -209,39 +210,6 @@ class CudnnWorkspaceHandle { std::unique_ptr<std::lock_guard<std::mutex>> guard_; }; -#if CUDA_VERSION >= 9000 -class ScopedCublasMathMode { - public: - ScopedCublasMathMode(cublasHandle_t handle, cublasMath_t new_math_mode) - : handle_(handle) { - need_reset = false; - PADDLE_ENFORCE( - platform::dynload::cublasGetMathMode(handle_, &old_math_mode_), - "Failed to get old cublas math mode"); - if (old_math_mode_ != new_math_mode) { - PADDLE_ENFORCE( - platform::dynload::cublasSetMathMode(handle_, new_math_mode), - "Failed to set old cublas math mode"); - need_reset = true; - } - } - - ~ScopedCublasMathMode() { - if (need_reset) { - PADDLE_ENFORCE( - platform::dynload::cublasSetMathMode(handle_, old_math_mode_), - "Failed to set old cublas math mode"); - } - } - - private: - cublasHandle_t handle_; - cublasMath_t old_math_mode_; - bool need_reset; -}; - -#endif - class CUDADeviceContext : public DeviceContext { public: explicit CUDADeviceContext(CUDAPlace place); @@ -262,8 +230,25 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Return eigen device in the device context. */ Eigen::GpuDevice* eigen_device() const; - /*! \brief Return cublas handle in the device context. */ - cublasHandle_t cublas_handle() const; + /*! \brief Call cublas function safely. */ + template <typename Callback> + inline void CublasCall(Callback&& callback) const { + cublas_handle_->Call(std::forward<Callback>(callback)); + } + + /*! \brief Check whether tensor core is supported */ + bool tensor_core_available() const; + + /*! \brief Call cublas function with Tensor Core safely. If + Tensor Core is not available, use DEFAULT_MATH instead. */ + template <typename Callback> + inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const { + if (cublas_tensor_core_handle_) { + cublas_tensor_core_handle_->Call(std::forward<Callback>(callback)); + } else { + cublas_handle_->Call(std::forward<Callback>(callback)); + } + } /*! \brief Return cudnn handle in the device context. */ cudnnHandle_t cudnn_handle() const; @@ -282,7 +267,6 @@ class CUDADeviceContext : public DeviceContext { template <typename Callback> void RecordEvent(cudaEvent_t ev, Callback callback) { - std::lock_guard<std::mutex> guard(mtx_); callback(); PADDLE_ENFORCE(cudaEventRecord(ev, stream_)); } @@ -294,18 +278,6 @@ void WaitStreamCallback() const { callback_manager_->Wait(); } -#if CUDA_VERSION >= 9000 - /*! \brief CublasCall may need to change cublas's config, - * but the cublas may be hold by multi-thread, so we should - * add lock here.
*/ - template <typename Callback> - void CublasCall(Callback callback, cublasMath_t new_math) { - std::lock_guard<std::mutex> guard(cublas_mtx_); - ScopedCublasMathMode scoped_cublas_math(cublas_handle_, new_math); - callback(); - } -#endif - private: CUDAPlace place_; @@ -313,7 +285,9 @@ std::unique_ptr<EigenCudaStreamDevice> eigen_stream_; std::unique_ptr<CudnnHolder> cudnn_holder_; cudaStream_t stream_; - cublasHandle_t cublas_handle_; + + std::unique_ptr<CublasHandleHolder> cublas_handle_; + std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_; int compute_capability_; int runtime_version_; @@ -321,12 +295,10 @@ int multi_process_; int max_threads_per_mp_; - mutable std::mutex mtx_; - // StreamCallbackManager is thread-safe std::unique_ptr<StreamCallbackManager> callback_manager_; - mutable std::mutex cublas_mtx_; + DISABLE_COPY_AND_ASSIGN(CUDADeviceContext); }; template <> diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu index 171d2979a0218ad5e22112190a59866b3e0b617f..5b3aa98efb46b51d6c3edb6d2cbd4200bd0a35c6 100644 --- a/paddle/fluid/platform/device_context_test.cu +++ b/paddle/fluid/platform/device_context_test.cu @@ -43,9 +43,6 @@ TEST(Device, CUDADeviceContext) { ASSERT_NE(nullptr, gpu_device); cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); ASSERT_NE(nullptr, cudnn_handle); - cublasHandle_t cublas_handle = device_context->cublas_handle(); - ASSERT_NE(nullptr, cublas_handle); - ASSERT_NE(nullptr, device_context->stream()); delete device_context; } } diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 98d1242a1691608d7c16a0f148cc9d0be7d4f22e..b3d20736a8e70d2f57ee5d6dc97cb490b5cfee44 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -214,16 +214,18 @@ class MKLDNNHandler { std::string* key, const mkldnn::memory::dims& input_dims, const mkldnn::memory::dims& weights_dims, const std::vector<int>& strides, const std::vector<int>& paddings, const std::vector<int>& dilations, - const int& groups, const mkldnn::memory::data_type& type, - const mkldnn::memory::format& format, const std::string& suffix) { + const int& groups, const mkldnn::memory::data_type& srcdt, + const mkldnn::memory::format& format, + const mkldnn::memory::data_type& dstdt, const std::string& suffix) { AppendKeyDims(key, input_dims); AppendKeyDims(key, weights_dims); AppendKeyVec(key, strides); AppendKeyVec(key, paddings); AppendKeyVec(key, dilations); AppendKey(key, std::to_string(groups)); - AppendKey(key, std::to_string(type)); + AppendKey(key, std::to_string(srcdt)); AppendKey(key, std::to_string(format)); + AppendKey(key, std::to_string(dstdt)); AppendKey(key, suffix); } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 3b81d59ad965b7532ca729682e7aeb8eb96194a8..dce755c91a58d3291d740bd05c1cf835cbfbf1f0 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -946,13 +946,6 @@ All parameter, weight, gradient are variables in Paddle. R"DOC(The type is STR, debug_graphviz_path indicate the path that writing the SSA Graph to file in the form of graphviz, you. It is useful for debugging.
Default "")DOC") - .def_property( - "enable_data_balance", - [](const BuildStrategy &self) { return self.enable_data_balance_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); - self.enable_data_balance_ = b; - }) // FIXME(chengudo): enable_data_balance seems not important .def_property( "enable_sequential_execution", [](const BuildStrategy &self) { @@ -1007,6 +1000,10 @@ All parameter, weight, gradient are variables in Paddle. "memory_optimize", [](const BuildStrategy &self) { return self.memory_optimize_; }, [](BuildStrategy &self, bool b) { self.memory_optimize_ = b; }) + .def_property( + "is_distribution", + [](const BuildStrategy &self) { return self.is_distribution_; }, + [](BuildStrategy &self, bool b) { self.is_distribution_ = b; }) .def_property( "memory_early_delete", [](const BuildStrategy &self) { return self.memory_early_delete_; }, diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index c97a93ec36d4f4a7ff6a9f097551e2d21022d5b1..3b066eda110275dc02e451dfaf0cfe28a3fb7a53 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -29,6 +29,15 @@ ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy BuildStrategy = core.ParallelExecutor.BuildStrategy +def _is_pserver_mode(main_program): + main = main_program if main_program \ + else framework.default_main_program() + for op in main.global_block().ops: + if op.type in ["send", "recv"]: + return True + return False + + class ParallelExecutor(object): """ ParallelExecutor is designed for data parallelism, which focuses on distributing @@ -128,6 +137,11 @@ class ParallelExecutor(object): build_strategy = BuildStrategy() build_strategy.num_trainers = num_trainers build_strategy.trainer_id = trainer_id + # FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode, + # num_trainers is 1, so the current fields of build_strategy doesn't tell if + # it's distributed model. 
+ build_strategy.is_distribution = _is_pserver_mode( main_program) or num_trainers > 1 # step4: get main_program, scope, local_scopes main = main_program if main_program \ else framework.default_main_program() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py index ca35adc1a36249aebf1518a6669d316db5c1d909..def188bfa632b5b1bb6b2621091d0526ffa345dc 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py @@ -47,7 +47,8 @@ class TestConv2dInt8Op(TestConv2dOp): self.init_group() self.init_dilation() self.init_test_case() - self.init_dtype() + self.init_fuse_relu() + self.init_data_type() conv2d_param = { 'stride': self.stride, @@ -78,7 +79,11 @@ class TestConv2dInt8Op(TestConv2dOp): np.round((input_shift) * self.scale_in).astype(np.int32), filter_int, self.groups, conv2d_param).astype(np.float32) * scale_output_shift - output = np.round(output1 - output2).astype(self.dsttype) + if self.fuse_relu: + output = np.maximum(np.round(output1 - output2), + 0).astype(self.dsttype) + else: + output = np.round(output1 - output2).astype(self.dsttype) else: filter_int = np.round(filter * self.scale_weights[0]).astype(np.int32) @@ -87,7 +92,15 @@ class TestConv2dInt8Op(TestConv2dOp): output1 = conv2d_forward_refer( input.astype(np.int32), filter_int, self.groups, conv2d_param).astype(np.float32) - output = np.round(output1 * scale_output_shift).astype(self.dsttype) + if self.fuse_relu: + output = np.maximum( + np.round(output1 * (self.scale_out / ( + self.scale_in * self.scale_weights[0]))), + 0).astype(self.dsttype) + else: + output = np.round(output1 * (self.scale_out / ( + self.scale_in * + self.scale_weights[0]))).astype(self.dsttype) self.inputs = { 'Input': @@ -106,6 +119,7 @@ class TestConv2dInt8Op(TestConv2dOp): 'Scale_in': self.scale_in, 'Scale_out': self.scale_out, 'Scale_weights': self.scale_weights, + 'fuse_relu': self.fuse_relu } self.outputs = {'Output': output} @@ -129,12 +143,15 @@ class TestConv2dInt8Op(TestConv2dOp): self.scale_out = 0.5 self.scale_weights = [10.0] - def init_dtype(self): + def init_data_type(self): self.srctype = np.uint8 self.dsttype = np.int8 + def init_fuse_relu(self): + self.fuse_relu = True -#--------------------test conv2d u8 in and s8 out-------------------- + +#--------------------test conv2d u8 in and u8 out-------------------- class TestConv2d(TestConv2dInt8Op): @@ -203,18 +220,45 @@ class TestWithInput1x1Filter1x1(TestConv2dInt8Op): self.groups = 3 -#--------------------test conv2d s8 in and s8 out-------------------- +def init_data_type_with_fusion(self, input_dt, fuse_relu): + self.srctype = input_dt + self.dsttype = np.uint8 if fuse_relu else np.int8 + # fuse_relu must be assigned here as well, so the attribute passed to + # the op stays consistent with the expected output computed in setUp. + self.fuse_relu = fuse_relu def create_test_int8_class(parent): - class TestInt8Case(parent): - def init_dtype(self): - self.srctype = np.int8 - self.dsttype = np.int8 - - cls_name = "{0}_{1}".format(parent.__name__, "s8s8") - TestInt8Case.__name__ = cls_name - globals()[cls_name] = TestInt8Case + + #--------------------test conv2d s8 in and u8 out-------------------- + + class TestS8U8Case(parent): + def init_data_type(self): + init_data_type_with_fusion(self, np.int8, True) + + #--------------------test conv2d s8 in and s8 out-------------------- + + class TestS8S8Case(parent): + def init_data_type(self): + init_data_type_with_fusion(self, np.int8, False) + + #--------------------test conv2d u8 in and s8 out--------------------
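+ # (u8 input with fuse_relu=True is the default setup of the parent + # classes above, so only the three remaining combinations are generated)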
+ + class TestU8S8Case(parent): + def init_data_type(self): + init_data_type_with_fusion(self, np.uint8, False) + + # The three names must be distinct: reusing one would overwrite an + # earlier entry in globals() and silently drop that test class. + cls_name_s8u8 = "{0}_{1}_relu_{2}".format(parent.__name__, "s8u8", "1") + cls_name_s8s8 = "{0}_{1}_relu_{2}".format(parent.__name__, "s8s8", "0") + cls_name_u8s8 = "{0}_{1}_relu_{2}".format(parent.__name__, "u8s8", "0") + TestS8U8Case.__name__ = cls_name_s8u8 + TestS8S8Case.__name__ = cls_name_s8s8 + TestU8S8Case.__name__ = cls_name_u8s8 + globals()[cls_name_s8u8] = TestS8U8Case + globals()[cls_name_s8s8] = TestS8S8Case + globals()[cls_name_u8s8] = TestU8S8Case create_test_int8_class(TestConv2dInt8Op) diff --git a/python/paddle/fluid/tests/unittests/test_reader_reset.py b/python/paddle/fluid/tests/unittests/test_reader_reset.py index e97a05b6f929821f82d96b462598a5ff03cf0a48..7eeffa1039a1e14a8883c4a78305d253a4518b26 100644 --- a/python/paddle/fluid/tests/unittests/test_reader_reset.py +++ b/python/paddle/fluid/tests/unittests/test_reader_reset.py @@ -75,8 +75,6 @@ class TestReaderReset(unittest.TestCase): exe.run(startup_prog) build_strategy = fluid.BuildStrategy() - if with_double_buffer: - build_strategy.enable_data_balance = True exec_strategy = fluid.ExecutionStrategy() parallel_exe = fluid.ParallelExecutor( use_cuda=self.use_cuda,